From 46bb394a500e913cf3fe8ede329deea023c78eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 11 Apr 2023 12:05:04 +0200 Subject: [PATCH 01/96] Fix links on GitHub issue templates (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- .github/ISSUE_TEMPLATE/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index af7ecdbe0..53557f844 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,10 +1,10 @@ blank_issues_enabled: false contact_links: - - name: Discord server + - name: Slack workspace about: Come chat with the community! - url: https://discord.gg/akJDeVaxnB + url: https://slack.kedro.org - name: Documentation - url: https://kedro.readthedocs.io/en/stable/ + url: https://docs.kedro.org about: To learn more about how Kedro works - name: Case studies, articles and video tutorials url: https://github.com/kedro-org/kedro-community From c9421aed8f642fa4562b1069ffe00fa08ca9f183 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Wed, 12 Apr 2023 13:41:49 +0100 Subject: [PATCH 02/96] add spark_stream_dataset.py Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py new file mode 100644 index 000000000..6844e04cf --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -0,0 +1,128 @@ +"""SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +from typing import Any, Dict + +import pyspark +import yaml +from kedro.io import AbstractDataSet +from pyspark import SparkConf +from pyspark.sql import SparkSession +from yaml.loader import SafeLoader + + +class SparkStreamingDataSet(AbstractDataSet): + """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. + + Example usage for the + `YAML API `_: + .. code-block:: yaml + + raw.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + + int.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/02_intermediate/inventory/ + file_format: csv + save_args: + output_mode: append + checkpoint: data/04_checkpoint/int_new_inventory + header: True + load_args: + header: True + + """ + + def __init__( + self, + filepath: str = "", + file_format: str = "", + save_args: Dict[str, str] = {}, + load_args: Dict[str, str] = {}, + ): + """Creates a new instance of SparkStreamingDataSet. + + Args: + filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks + specify ``filepath``s starting with ``/dbfs/``. For message brokers such as + Kafka and all filepath is not required. + file_format: File format used during load and save + operations. These are formats supported by the running + SparkContext include parquet, csv, delta. For a list of supported + formats please refer to Apache Spark documentation at + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + load_args: Load args passed to Spark DataFrameReader load method. 
+ It is dependent on the selected file format. You can find + a list of read options for each supported format + in Spark DataFrame read documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + save_args: Save args passed to Spark DataFrame write options. + Similar to load_args this is dependent on the selected file + format. You can pass ``mode`` and ``partitionBy`` to specify + your overwrite mode and partitioning respectively. You can find + a list of options for each format in Spark DataFrame + write documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + """ + self._filepath_ = filepath + self.file_format = file_format + self._save_args = save_args + self._load_args = load_args + self.output_format = [ + "kafka" + ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + + # read spark configuration from spark yml file and create a spark context + with open("conf/base/spark.yml") as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + + # Initialise the spark session + self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) + self.spark = self.spark_session_conf.getOrCreate() + + def _load(self) -> pyspark.sql.DataFrame: + """Loads data from filepath. + If the connector type is kafka then no file_path is required + + Returns: + Data from filepath as pyspark dataframe. + """ + input_constructor = self.spark.readStream.format(self.file_format).options( + **self._load_args + ) + return ( + input_constructor.load() + if self.file_format + in self.output_format # if the connector type is message broker + else input_constructor.load(self._filepath_) + ) + + def _save(self, data: pyspark.sql.DataFrame) -> None: + """Saves pyspark dataframe. + + Args: + data: PySpark streaming dataframe for saving + + """ + + output_constructor = data.writeStream.format(self.file_format) + + # for message brokers path is not needed + if self.file_format not in self.output_format: + output_constructor = output_constructor.option("path", self._filepath_) + + ( + output_constructor.option( + "checkpointLocation", self._save_args.pop("checkpoint") + ) + .outputMode(self._save_args.pop("output_mode")) + .options(**self._save_args) + .start() + ) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return None From 63f578aceee5aec43b8dde0e60a5c16c49ed2f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 12 Apr 2023 14:22:14 +0200 Subject: [PATCH 03/96] Migrate most of `kedro-datasets` metadata to `pyproject.toml` (#161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Include missing requirements files in sdist Fix gh-86. Signed-off-by: Juan Luis Cano Rodríguez * Migrate most project metadata to `pyproject.toml` See https://github.com/kedro-org/kedro/issues/2334. 
Signed-off-by: Juan Luis Cano Rodríguez * Move requirements to `pyproject.toml` Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- Makefile | 2 +- kedro-datasets/pyproject.toml | 25 +++++++++++++++++++++++ kedro-datasets/requirements.txt | 1 - kedro-datasets/setup.py | 35 +-------------------------------- 4 files changed, 27 insertions(+), 36 deletions(-) delete mode 100644 kedro-datasets/requirements.txt diff --git a/Makefile b/Makefile index 86daa6313..be653ed59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ package: cd $(plugin);\ rm -Rf dist;\ - python setup.py sdist bdist_wheel + python -m build pypi: python -m pip install twine -U diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 6df7bd372..0f0ad2fc3 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,28 @@ +[project] +name = "kedro-datasets" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Datasets is where you can find all of Kedro's data connectors." +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.4", +] +dynamic = ["readme", "version", "optional-dependencies"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets" +Documentation = "https://docs.kedro.org" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[tool.setuptools.packages.find] +include = ["kedro_datasets*"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_datasets.__version__"} + [tool.black] [tool.isort] diff --git a/kedro-datasets/requirements.txt b/kedro-datasets/requirements.txt deleted file mode 100644 index b5edbb617..000000000 --- a/kedro-datasets/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -kedro~=0.18.4 diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 6d88fe50e..4840b8535 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -1,12 +1,6 @@ -import re -from codecs import open from itertools import chain -from os import path -from setuptools import find_packages, setup - -name = "kedro-datasets" -here = path.abspath(path.dirname(__file__)) +from setuptools import setup # at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec PANDAS = "pandas>=1.3, <3.0" @@ -15,21 +9,6 @@ S3FS = "s3fs>=0.3.0, <0.5" POLARS = "polars~=0.15.16" -with open("requirements.txt", "r", encoding="utf-8") as f: - install_requires = [x.strip() for x in f if x.strip()] - -with open("test_requirements.txt", "r", encoding="utf-8") as f: - tests_require = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - def _collect_requirements(requires): return sorted(set(chain.from_iterable(requires.values()))) @@ -145,17 +124,5 @@ def _collect_requirements(requires): extras_require["all"] = _collect_requirements(extras_require) setup( - name=name, - version=version, - description="Kedro-Datasets is where you can find all of Kedro's data connectors.", - long_description=readme, - long_description_content_type="text/markdown", - 
url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets", - install_requires=install_requires, - tests_require=tests_require, - author="Kedro", - python_requires=">=3.7, <3.11", - license="Apache Software License (Apache 2.0)", - packages=find_packages(exclude=["tests*"]), extras_require=extras_require, ) From 4b387ff1a5e44ebded6d70df640ab600999c135b Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 11:48:57 +0100 Subject: [PATCH 04/96] restructure the strean dataset to align with the other spark dataset Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 57 ++++++++++++++----- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 6844e04cf..0992ab5ce 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,13 +1,13 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" from typing import Any, Dict - -import pyspark +from copy import deepcopy import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pyspark.sql import SparkSession +from pathlib import PurePosixPath +from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader - +from kedro_datasets.spark.spark_dataset import _split_filepath class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -35,13 +35,16 @@ class SparkStreamingDataSet(AbstractDataSet): """ + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + def __init__( self, filepath: str = "", file_format: str = "", - save_args: Dict[str, str] = {}, - load_args: Dict[str, str] = {}, - ): + save_args: Dict[str, Any] = None, + load_args: Dict[str, Any] = None, + ) -> None: """Creates a new instance of SparkStreamingDataSet. Args: @@ -74,23 +77,46 @@ def __init__( "kafka" ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + fs_prefix, filepath = _split_filepath(filepath) + + self._fs_prefix = fs_prefix + self._filepath = PurePosixPath(filepath) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + } + + @staticmethod + def _get_spark(self): # read spark configuration from spark yml file and create a spark context with open("conf/base/spark.yml") as f: self.parameters = yaml.load(f, Loader=SafeLoader) self.spark_conf = SparkConf().setAll(self.parameters.items()) # Initialise the spark session - self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) - self.spark = self.spark_session_conf.getOrCreate() + return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() - def _load(self) -> pyspark.sql.DataFrame: + def _load(self) -> DataFrame: """Loads data from filepath. 
If the connector type is kafka then no file_path is required Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self.spark.readStream.format(self.file_format).options( + input_constructor = self._get_spark().readStream.format(self.file_format).options( **self._load_args ) return ( @@ -100,7 +126,7 @@ def _load(self) -> pyspark.sql.DataFrame: else input_constructor.load(self._filepath_) ) - def _save(self, data: pyspark.sql.DataFrame) -> None: + def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: @@ -123,6 +149,7 @@ def _save(self, data: pyspark.sql.DataFrame) -> None: .start() ) - def _describe(self) -> Dict[str, Any]: - """Returns a dict that describes attributes of the dataset.""" - return None + + + + From 39ad9fd56f116d421b85d0a6e4f5a4b0eface6a1 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 17:43:56 +0100 Subject: [PATCH 05/96] adding README.md for specification Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 38 +++++++++++++++++++ .../spark/spark_stream_dataset.py | 17 +++++---- 2 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/spark/README.md diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md new file mode 100644 index 000000000..bded31532 --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -0,0 +1,38 @@ +# Spark Streaming + +``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. + +To work with multiple streaming nodes, 2 hook are required for: + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Running streaming query without termination unless exception + +#### Example SparkStreamsHook: + +```python +from kedro.framework.hooks import hook_impl +from pyspark.sql import SparkSession + +class SparkStreamsHook: + @hook_impl + def after_pipeline_run(self) -> None: + """Starts a spark streaming await session + once the pipeline reaches the last node + """ + + spark = SparkSession.builder.getOrCreate() + spark.streams.awaitAnyTermination() +``` +To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. 
+ +#### Example spark.yml: + +```yaml +spark.driver.maxResultSize: 3g +spark.scheduler.mode: FAIR +spark.sql.streaming.schemaInference: True +spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context +spark.sql.streaming.stateStore.stateSchemaCheck: false # since schema is not mentioned explicitly +spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) + +``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 0992ab5ce..382c45286 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,4 +1,5 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import os from typing import Any, Dict from copy import deepcopy import yaml @@ -101,13 +102,15 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(self): - # read spark configuration from spark yml file and create a spark context - with open("conf/base/spark.yml") as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - - # Initialise the spark session - return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + spark_conf_path = "conf/base/spark.yml" + if os.path.exists(spark_conf_path): + with open(spark_conf_path) as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + else: + spark = SparkSession.builder.getOrCreate() + return spark def _load(self) -> DataFrame: """Loads data from filepath. From 69eb8bea26b750fb9ffeca48ee1220e372547826 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Fri, 14 Apr 2023 01:40:10 +0800 Subject: [PATCH 06/96] Update kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py Co-authored-by: Nok Lam Chan Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 382c45286..77bf62f40 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -19,7 +19,7 @@ class SparkStreamingDataSet(AbstractDataSet): .. 
code-block:: yaml raw.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json From 3106068ce458d81849df4b513662d0088d3a860b Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 18:41:12 +0100 Subject: [PATCH 07/96] rename the dataset Signed-off-by: Tingting_Wan --- .../spark/{spark_stream_dataset.py => spark_streaming_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kedro-datasets/kedro_datasets/spark/{spark_stream_dataset.py => spark_streaming_dataset.py} (100%) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py similarity index 100% rename from kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py rename to kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py From b8141a7eef96e140e5abd19e7d9a6a16da0f6a47 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 10:19:43 +0100 Subject: [PATCH 08/96] resolve comments Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 2 ++ kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 3dede09aa..c93d3f0df 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -12,3 +12,5 @@ from .spark_jdbc_dataset import SparkJDBCDataSet with suppress(ImportError): from .deltatable_dataset import DeltaTableDataSet +with suppress(ImportError): + from .spark_streaming_dataset import SparkStreamingDataSet diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 77bf62f40..1ee271e87 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,7 +24,7 @@ class SparkStreamingDataSet(AbstractDataSet): file_format: json int.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ file_format: csv save_args: From 738625e63a48365dc05fe1905cc08db9c40a4aa6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 11:42:05 +0100 Subject: [PATCH 09/96] fix format and pylint Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 1ee271e87..fa6fc9c7e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -2,14 +2,15 @@ import os from typing import Any, Dict from copy import deepcopy +from pathlib import PurePosixPath import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pathlib import PurePosixPath from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. 
@@ -71,12 +72,10 @@ def __init__( https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ self._filepath_ = filepath - self.file_format = file_format + self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = [ - "kafka" - ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -101,13 +100,15 @@ def _describe(self) -> Dict[str, Any]: } @staticmethod - def _get_spark(self): + def _get_spark(): spark_conf_path = "conf/base/spark.yml" if os.path.exists(spark_conf_path): - with open(spark_conf_path) as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + with open( + spark_conf_path, encoding="utf-8" + ) as File: # pylint: disable=invalid-name + parameters = yaml.load(File, Loader=SafeLoader) + spark_conf = SparkConf().setAll(parameters.items()) + spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() else: spark = SparkSession.builder.getOrCreate() return spark @@ -119,12 +120,14 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self._get_spark().readStream.format(self.file_format).options( - **self._load_args + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) ) return ( input_constructor.load() - if self.file_format + if self._file_format in self.output_format # if the connector type is message broker else input_constructor.load(self._filepath_) ) @@ -137,10 +140,10 @@ def _save(self, data: DataFrame) -> None: """ - output_constructor = data.writeStream.format(self.file_format) + output_constructor = data.writeStream.format(self._file_format) # for message brokers path is not needed - if self.file_format not in self.output_format: + if self._file_format not in self.output_format: output_constructor = output_constructor.option("path", self._filepath_) ( @@ -151,8 +154,3 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - - - - - From a54cc676df0a957c536994084769bc6e72244417 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:21:08 +0800 Subject: [PATCH 10/96] Update kedro-datasets/kedro_datasets/spark/README.md Co-authored-by: Deepyaman Datta Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bded31532..f222df00a 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -1,6 +1,6 @@ # Spark Streaming -``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
To work with multiple streaming nodes, 2 hook are required for: From b924ad6e6fc58c7bf8e4556163787350c9d3da80 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 21 Apr 2023 23:02:52 +0100 Subject: [PATCH 11/96] add unit tests and SparkStreamingDataset in init.py Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/__init__.py | 2 +- .../spark/spark_streaming_dataset.py | 14 +++++- .../spark/test_spark_streaming_dataset.py | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 kedro-datasets/tests/spark/test_spark_streaming_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index c93d3f0df..0c46a7fc3 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,6 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet"] +__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index fa6fc9c7e..10680d661 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -6,9 +6,10 @@ import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf +from pyspark.errors.exceptions.captured import AnalysisException from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): @@ -154,3 +155,14 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) + def _exists(self) -> bool: + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + + try: + self._get_spark().read.load(path=load_path, format="delta") + except AnalysisException as exception: + if "is not a Delta table" in exception.desc: + return False + raise + + return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py new file mode 100644 index 000000000..47a427742 --- /dev/null +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -0,0 +1,47 @@ +import pytest +import time +from pyspark.sql import SparkSession +from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + +@pytest.fixture +def sample_spark_streaming_df_one(): + schema = StructType( + [ + StructField("sku", StringType(), True), + StructField("new_stock", IntegerType(), True), + ] + ) + data = [("0001", 2), ("0001", 7), ("0002", 4)] + + return SparkSession.builder.getOrCreate() \ + .createDataFrame(data, schema) + + +class TestStreamingDataSet: + def test_load(self,tmp_path, sample_spark_streaming_df_one): + filepath = (tmp_path / "test_streams").as_posix() + spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) + spark_json_ds.save(sample_spark_streaming_df_one) + loaded_with_spark = spark_json_ds.load() + + stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() + 
assert stream_df.isStreaming + + stream_query = stream_df.writeStream.format("memory").queryName("test").start() + assert stream_query.isActive + time.sleep(3) + stream_query.stop() + loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + + assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + + + def test_save(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_streams").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) + assert not streaming_ds.exists() + + From 743b823110102c36e0f6a665e3718dc4f9eaa5a7 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:20:02 +0100 Subject: [PATCH 12/96] add unit tests Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 76 +++++++++++++++---- .../spark/test_spark_streaming_dataset.py | 66 ++++++++++------ 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 10680d661..a508a3903 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,13 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import json import os from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath import yaml -from kedro.io import AbstractDataSet + +import fsspec +from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path from pyspark import SparkConf -from pyspark.errors.exceptions.captured import AnalysisException +from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -91,6 +95,37 @@ def __init__( if save_args is not None: self._save_args.update(save_args) + # Handle schema load argument + self._schema = self._load_args.pop("schema", None) + if self._schema is not None: + if isinstance(self._schema, dict): + self._schema = self._load_schema_from_file(self._schema) + + @staticmethod + def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: + filepath = schema.get("filepath") + if not filepath: + raise DataSetError( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + credentials = deepcopy(schema.get("credentials")) or {} + protocol, schema_path = get_protocol_and_path(filepath) + file_system = fsspec.filesystem(protocol, **credentials) + pure_posix_path = PurePosixPath(schema_path) + load_path = get_filepath_str(pure_posix_path, protocol) + + # Open schema file + with file_system.open(load_path, encoding='utf-8') as fs_file: + try: + return StructType.fromJson(json.loads(fs_file.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) from exc + def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { @@ -116,16 +151,23 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. 
- If the connector type is kafka then no file_path is required + If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args Returns: Data from filepath as pyspark dataframe. """ - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + if self._schema: + input_constructor = ( + self._get_spark() + .readStream.schema(self._schema).format(self._file_format) + .options(**self._load_args) + ) + else: + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format @@ -155,14 +197,22 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self) -> bool: - load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + def _exists(self, schema_path:str) -> bool: + """Check the existence of pyspark dataframe. + Args: + schema_path: schema of saved streaming dataframe + """ + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + with open(schema_path, encoding='utf-8') as f: + schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().read.load(path=load_path, format="delta") + self._get_spark().readStream.schema(schema).load(load_path, self._file_format) except AnalysisException as exception: - if "is not a Delta table" in exception.desc: + if ( + exception.desc.startswith("Path does not exist:") + or "is not a Streaming data" in exception.desc + ): return False raise - return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 47a427742..2d936b1ce 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,27 @@ +import json import pytest import time from pyspark.sql import SparkSession -from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from kedro_datasets.pandas import ParquetDataSet +from kedro.io.core import DataSetError +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +def sample_schema(schema_path): + with open(schema_path, encoding='utf-8') as f: + try: + return StructType.fromJson(json.loads(f.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. " + f"Schema is required for streaming data load, Please provide a valid schema_path." 
+ ) from exc + @pytest.fixture -def sample_spark_streaming_df_one(): +def sample_spark_streaming_df(tmp_path): schema = StructType( [ StructField("sku", StringType(), True), @@ -14,34 +29,41 @@ def sample_spark_streaming_df_one(): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - - return SparkSession.builder.getOrCreate() \ - .createDataFrame(data, schema) + schema_path = (tmp_path / "test.json").as_posix() + with open(schema_path, "w") as f: + json.dump(schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame(data, schema) class TestStreamingDataSet: - def test_load(self,tmp_path, sample_spark_streaming_df_one): + def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) - spark_json_ds.save(sample_spark_streaming_df_one) - loaded_with_spark = spark_json_ds.load() + schema_path = (tmp_path / "test.json").as_posix() - stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() - assert stream_df.isStreaming + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - stream_query = stream_df.writeStream.format("memory").queryName("test").start() - assert stream_query.isActive - time.sleep(3) - stream_query.stop() - loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema - assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + def test_save(self, tmp_path, sample_spark_streaming_df): + filepath = (tmp_path / "test_streams_input").as_posix() + schema_path = (tmp_path / "test.json").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - def test_save(self, tmp_path, sample_spark_df): - filepath = (tmp_path / "test_streams").as_posix() - checkpoint_path = (tmp_path / "checkpoint").as_posix() - streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) - assert not streaming_ds.exists() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + ) + assert streaming_ds._exists(schema_path) From 3bb371789fd841ac5a3b5ecd98ebb91121b06eae Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:41:59 +0100 Subject: [PATCH 13/96] update test_save Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 2d936b1ce..fa3b0fec8 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -52,18 +52,26 @@ def test_load(self, tmp_path, sample_spark_streaming_df): assert streaming_ds.schema == schema def test_save(self, tmp_path, sample_spark_streaming_df): - filepath = (tmp_path / 
"test_streams_input").as_posix() + filepath_json = (tmp_path / "test_streams").as_posix() + filepath_output = (tmp_path / "test_streams_output").as_posix() schema_path = (tmp_path / "test.json").as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) + loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( - filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} ) + assert not streaming_ds._exists(schema_path) + + streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) + From ae3bc87d1612a0bb2b856ef74e013c1c81de110e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Mon, 17 Apr 2023 10:48:36 +0200 Subject: [PATCH 14/96] Upgrade Polars (#171) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Upgrade Polars Signed-off-by: Juan Luis Cano Rodríguez * Update Polars to 0.17.x --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-datasets/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 4840b8535..99c30938e 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -7,7 +7,7 @@ SPARK = "pyspark>=2.2, <4.0" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -POLARS = "polars~=0.15.16" +POLARS = "polars~=0.17.0" def _collect_requirements(requires): From eb634a100d44a4cca5267ff336f98e434dd3ac14 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 17 Apr 2023 14:47:16 +0100 Subject: [PATCH 15/96] if release is failed, it return exit code and fail the CI (#158) Signed-off-by: Tingting_Wan --- tools/circleci/circleci_release.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index 88c4ed1d0..dd05d4c5a 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -4,6 +4,7 @@ """ import os +import sys import requests from requests.structures import CaseInsensitiveDict @@ -33,12 +34,6 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke headers["Circle-Token"] = circle_release_token resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) - print(f"Status Code: {resp.status_code}") - if resp.status_code == 201: - print("Creating CircleCI Pipeline successfully") - print(resp.content) - else: - print("Failed to create CircleCI Pipeline") return resp @@ -70,6 +65,14 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke print(package_name, package_version) if check_no_version_pypi(pypi_endpoint, package_name, package_version): - circleci_release( + res = circleci_release( PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN ) + print(f"Status Code: {resp.status_code}") + if resp.status_code == 201: + print("Creating CircleCI Pipeline successfully") + else: + print("Failed to create CircleCI Pipeline") + print(resp.content) 
+ if resp.status_code != 201: + sys.exit(1) From 115940bd52af08966aa2c80cbe08bbbf2224c381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:25:21 +0200 Subject: [PATCH 16/96] Migrate `kedro-airflow` to static metadata (#172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-airflow to static metadata See https://github.com/kedro-org/kedro/issues/2334. Signed-off-by: Juan Luis Cano Rodríguez * Add explicit PEP 518 build requirements for kedro-datasets Signed-off-by: Juan Luis Cano Rodríguez * Typos Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Juan Luis Cano Rodríguez * Remove dangling reference to requirements.txt Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-airflow/MANIFEST.in | 1 - kedro-airflow/RELEASE.md | 1 + kedro-airflow/pyproject.toml | 48 +++++++++++++++++++++++++++++ kedro-airflow/requirements.txt | 3 -- kedro-airflow/setup.cfg | 10 ------ kedro-airflow/setup.py | 41 ------------------------ kedro-airflow/test_requirements.txt | 1 - kedro-datasets/pyproject.toml | 4 +++ 8 files changed, 53 insertions(+), 56 deletions(-) delete mode 100644 kedro-airflow/requirements.txt delete mode 100644 kedro-airflow/setup.cfg delete mode 100644 kedro-airflow/setup.py diff --git a/kedro-airflow/MANIFEST.in b/kedro-airflow/MANIFEST.in index 523166e84..ed984822f 100644 --- a/kedro-airflow/MANIFEST.in +++ b/kedro-airflow/MANIFEST.in @@ -1,4 +1,3 @@ include README.md include LICENSE.md -include requirements.txt include kedro_airflow/airflow_dag_template.j2 diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 75e4654e6..c2e0615b4 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,6 @@ # Upcoming release 0.5.2 * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +* Migrate all project metadata to static `pyproject.toml`. # Release 0.5.1 * Added additional CLI argument `--jinja-file` to provide a path to a custom Jinja2 template. 
diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index 4f3292f55..42fe8974b 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -1,3 +1,51 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-airflow" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Airflow makes it easy to deploy Kedro projects to Airflow" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro>=0.17.5", + "python-slugify>=4.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +airflow = "kedro_airflow.plugin:commands" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_airflow"] +zip-safe = false + +[tool.setuptools.package-data] +kedro_airflow = ["kedro_airflow/airflow_dag_template.j2"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_airflow.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_airflow + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] exclude=".*template.py" diff --git a/kedro-airflow/requirements.txt b/kedro-airflow/requirements.txt deleted file mode 100644 index d1731ba85..000000000 --- a/kedro-airflow/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -kedro>=0.17.5 -python-slugify>=4.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-airflow/setup.cfg b/kedro-airflow/setup.cfg deleted file mode 100644 index 7fa30d2d0..000000000 --- a/kedro-airflow/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_airflow - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-airflow/setup.py b/kedro-airflow/setup.py deleted file mode 100644 index 85bb25b8a..000000000 --- a/kedro-airflow/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-airflow" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Airflow makes it easy to deploy Kedro projects to Airflow", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_airflow"], - 
package_data={"kedro_airflow": ["kedro_airflow/airflow_dag_template.j2"]}, - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.project_commands": ["airflow = kedro_airflow.plugin:commands"] - }, -) diff --git a/kedro-airflow/test_requirements.txt b/kedro-airflow/test_requirements.txt index 4ced2ca4c..cdea520c7 100644 --- a/kedro-airflow/test_requirements.txt +++ b/kedro-airflow/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt apache-airflow<3.0 bandit>=1.6.2, <2.0 behave diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 0f0ad2fc3..a5f494106 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,7 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + [project] name = "kedro-datasets" authors = [ From 35231afe22ae19087d347d45a9f7247515b88ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:26:53 +0200 Subject: [PATCH 17/96] Migrate `kedro-telemetry` to static metadata (#174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-telemetry to static metadata See kedro-org/kedro#2334. Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-telemetry/RELEASE.md | 3 ++ kedro-telemetry/pyproject.toml | 38 +++++++++++++++++++++++++ kedro-telemetry/requirements.txt | 2 -- kedro-telemetry/setup.py | 41 --------------------------- kedro-telemetry/test_requirements.txt | 1 - 5 files changed, 41 insertions(+), 44 deletions(-) delete mode 100644 kedro-telemetry/requirements.txt delete mode 100644 kedro-telemetry/setup.py diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index 7cdb93100..bbd32f424 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,3 +1,6 @@ +# Upcoming release +* Migrate all project metadata to static `pyproject.toml`. + # Release 0.2.4 * Added consent checking for collecting project statistics. 
diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 07449ad97..0cc754854 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -1,3 +1,41 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-telemetry" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Telemetry" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.0", + "requests~=2.20", +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-telemetry/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.cli_hooks"] +kedro-telemetry = "kedro_telemetry.plugin:cli_hooks" + +[project.entry-points."kedro.hooks"] +kedro-telemetry = "kedro_telemetry.plugin:project_hooks" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_telemetry"] +zip-safe = false + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_telemetry.__version__"} + [tool.isort] multi_line_output = 3 include_trailing_comma = true diff --git a/kedro-telemetry/requirements.txt b/kedro-telemetry/requirements.txt deleted file mode 100644 index c59cb8a9c..000000000 --- a/kedro-telemetry/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -kedro~=0.18.0 -requests~=2.20 diff --git a/kedro-telemetry/setup.py b/kedro-telemetry/setup.py deleted file mode 100644 index db6a976d2..000000000 --- a/kedro-telemetry/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-telemetry" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Telemetry", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_telemetry"], - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.cli_hooks": ["kedro-telemetry = kedro_telemetry.plugin:cli_hooks"], - "kedro.hooks": ["kedro-telemetry = kedro_telemetry.plugin:project_hooks"] - }, -) diff --git a/kedro-telemetry/test_requirements.txt b/kedro-telemetry/test_requirements.txt index 4f39e717a..fb187d672 100644 --- a/kedro-telemetry/test_requirements.txt +++ b/kedro-telemetry/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave black~=22.0 From 8c2ea1bafed6554432f2dfcf4e20df2029d945b0 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Wed, 19 Apr 2023 15:21:17 +0100 Subject: [PATCH 18/96] ci: port lint, unit test, 
and e2e tests to Actions (#155) * Add unit test + lint test on GA * trigger GA - will revert Signed-off-by: Ankita Katiyar * Fix lint Signed-off-by: Ankita Katiyar * Add end to end tests * Add cache key Signed-off-by: Ankita Katiyar * Add cache action Signed-off-by: Ankita Katiyar * Rename workflow files Signed-off-by: Ankita Katiyar * Lint + add comment + default bash Signed-off-by: Ankita Katiyar * Add windows test Signed-off-by: Ankita Katiyar * Update workflow name + revert changes to READMEs Signed-off-by: Ankita Katiyar * Add kedro-telemetry/RELEASE.md to trufflehog ignore Signed-off-by: Ankita Katiyar * Add pytables to test_requirements remove from workflow Signed-off-by: Ankita Katiyar * Revert "Add pytables to test_requirements remove from workflow" This reverts commit 8203daa6405d325c74ec2097c9d0c5859bae8257. * Separate pip freeze step Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar Signed-off-by: Tingting_Wan --- .github/workflows/check-plugin.yml | 134 ++++++++++++++++++++++++++ .github/workflows/kedro-airflow.yml | 16 +++ .github/workflows/kedro-datasets.yml | 16 +++ .github/workflows/kedro-docker.yml | 16 +++ .github/workflows/kedro-telemetry.yml | 16 +++ trufflehog-ignore.txt | 2 + 6 files changed, 200 insertions(+) create mode 100644 .github/workflows/check-plugin.yml create mode 100644 .github/workflows/kedro-airflow.yml create mode 100644 .github/workflows/kedro-datasets.yml create mode 100644 .github/workflows/kedro-docker.yml create mode 100644 .github/workflows/kedro-telemetry.yml diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml new file mode 100644 index 000000000..a32c0f651 --- /dev/null +++ b/.github/workflows/check-plugin.yml @@ -0,0 +1,134 @@ +name: Running tests and linter + +on: + workflow_call: + inputs: + plugin: + type: string + +jobs: + unit-tests: + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages for Linux + if: matrix.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Cache python packages for Windows + if: matrix.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install Kedro + run: pip install git+https://github.com/kedro-org/kedro@main + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install -r test_requirements.txt + - name: Install pytables (only for kedro-datasets on windows) + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' + run: pip install tables + - name: pip freeze + run: pip freeze + - name: Run unit tests for Linux / all plugins + if: matrix.os != 'windows-latest' + run: make plugin=${{ inputs.plugin }} test + - name: Run unit tests for Windows / kedro-airflow, kedro-docker, kedro-telemetry + if: matrix.os == 'windows-latest' && inputs.plugin != 'kedro-datasets' + run: | + cd ${{ inputs.plugin }} + pytest tests + - name: Run unit tests for Windows / kedro-datasets / no spark sequential + if: 
matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version == '3.10' + run: | + make test-no-spark-sequential + - name: Run unit tests for Windows / kedro-datasets / no spark parallel + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version != '3.10' + run: | + make test-no-spark + + lint: + defaults: + run: + shell: bash + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v3 + with: + python-version: 3.8 + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + pip freeze + - name: Install pre-commit hooks + run: | + cd ${{ inputs.plugin }} + pre-commit install --install-hooks + pre-commit install --hook-type pre-push + - name: Run linter + run: make plugin=${{ inputs.plugin }} lint + + e2e-tests: + if: inputs.plugin != 'kedro-datasets' + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + - name: pip freeze + run: pip freeze + - name: Run end to end tests + # Custom shell to run kedro-docker e2e-tests because -it flag for `docker run` + # isn't supported on Github Actions. 
See https://github.com/actions/runner/issues/241 + shell: 'script -q -e -c "bash {0}"' + run: make plugin=${{ inputs.plugin }} e2e-tests diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml new file mode 100644 index 000000000..b68fcce30 --- /dev/null +++ b/.github/workflows/kedro-airflow.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-airflow + +on: + push: + paths: + - "kedro-airflow/**" + pull_request: + paths: + - "kedro-airflow/**" + types: [ synchronize ] + +jobs: + airflow-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-airflow diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml new file mode 100644 index 000000000..9ff4802b6 --- /dev/null +++ b/.github/workflows/kedro-datasets.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-datasets + +on: + push: + paths: + - "kedro-datasets/**" + pull_request: + paths: + - "kedro-datasets/**" + types: [ synchronize ] + +jobs: + datasets-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-datasets diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml new file mode 100644 index 000000000..1812a3a93 --- /dev/null +++ b/.github/workflows/kedro-docker.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-docker + +on: + push: + paths: + - "kedro-docker/**" + pull_request: + paths: + - "kedro-docker/**" + types: [ synchronize ] + +jobs: + docker-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-docker diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml new file mode 100644 index 000000000..fd75e8a71 --- /dev/null +++ b/.github/workflows/kedro-telemetry.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-telemetry + +on: + push: + paths: + - "kedro-telemetry/**" + pull_request: + paths: + - "kedro-telemetry/**" + types: [ synchronize ] + +jobs: + telemetry-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-telemetry diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt index 041fc7ffd..1929a2634 100644 --- a/trufflehog-ignore.txt +++ b/trufflehog-ignore.txt @@ -1 +1,3 @@ kedro-telemetry/README.md +kedro-telemetry/RELEASE.md +kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py From a73b216543f0ee726d85f2ffbb578038e75a8b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 19 Apr 2023 17:08:42 +0200 Subject: [PATCH 19/96] Migrate `kedro-docker` to static metadata (#173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-docker to static metadata See https://github.com/kedro-org/kedro/issues/2334. 
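The `[tool.setuptools.dynamic]` table lets setuptools resolve the version from
`kedro_docker.__version__` at build time, which is what the regex lookup in the
deleted `setup.py` did by hand. As a rough sketch only (the `read_version`
helper below is hypothetical and not part of this patch), that old lookup was
equivalent to:

    import re
    from pathlib import Path

    def read_version(package_name: str = "kedro_docker") -> str:
        """Parse __version__ out of <package>/__init__.py, as the removed setup.py did."""
        init_py = Path(package_name) / "__init__.py"
        match = re.search(r'__version__ = ["\']([^"\']+)', init_py.read_text(encoding="utf-8"))
        if match is None:
            raise ValueError(f"__version__ not found in {init_py}")
        return match.group(1)

    # Run from the kedro-docker directory, this prints the same version string that
    # version = {attr = "kedro_docker.__version__"} now resolves during the build.
    print(read_version())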
Signed-off-by: Juan Luis Cano Rodríguez * Address packaging warning Signed-off-by: Juan Luis Cano Rodríguez * Fix tests Signed-off-by: Juan Luis Cano Rodríguez * Actually install current plugin with dependencies Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- .circleci/continue_config.yml | 1 + kedro-docker/MANIFEST.in | 1 + kedro-docker/RELEASE.md | 1 + kedro-docker/features/environment.py | 2 +- kedro-docker/pyproject.toml | 55 ++++++++++++++++++++++++++++ kedro-docker/requirements.txt | 3 -- kedro-docker/setup.cfg | 10 ----- kedro-docker/setup.py | 44 ---------------------- kedro-docker/test_requirements.txt | 1 - 9 files changed, 59 insertions(+), 59 deletions(-) create mode 100644 kedro-docker/MANIFEST.in delete mode 100644 kedro-docker/requirements.txt delete mode 100644 kedro-docker/setup.cfg delete mode 100644 kedro-docker/setup.py diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 5a1d78015..82653758e 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -69,6 +69,7 @@ commands: command: | cd <> pip install git+https://github.com/kedro-org/kedro@main + pip install . pip install -r test_requirements.txt - run: name: Install pre-commit hooks diff --git a/kedro-docker/MANIFEST.in b/kedro-docker/MANIFEST.in new file mode 100644 index 000000000..451642d6f --- /dev/null +++ b/kedro-docker/MANIFEST.in @@ -0,0 +1 @@ +recursive-include kedro_docker/template * diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index eeb2f0e41..4bd5b8bbd 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming release +* Migrate all project metadata to static `pyproject.toml`. ## Major features and improvements diff --git a/kedro-docker/features/environment.py b/kedro-docker/features/environment.py index 04a5f25cf..930f97a7d 100644 --- a/kedro-docker/features/environment.py +++ b/kedro-docker/features/environment.py @@ -51,7 +51,7 @@ def before_all(context): ) # install the plugin - call([context.python, "setup.py", "install"], env=context.env) + call([context.python, "-m", "pip", "install", "."], env=context.env) def _setup_context_with_venv(context, venv_dir): diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index 0b54e6e31..cdd273509 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -1,3 +1,58 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-docker" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Docker makes it easy to package Kedro projects with Docker." 
+requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "anyconfig~=0.10.0", # not directly required, pinned by Snyk to avoid a vulnerability + "kedro>=0.16.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-docker/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +docker = "kedro_docker.plugin:commands" + +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.packages.find] +include = ["kedro_docker*"] +namespaces = true # To include the template files + +[tool.setuptools.package-data] +kedro_docker = [ + "template/Dockerfile.*", + "template/.dockerignore", + "template/.dive-ci", +] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_docker.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_docker + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] [tool.isort] diff --git a/kedro-docker/requirements.txt b/kedro-docker/requirements.txt deleted file mode 100644 index 86c576113..000000000 --- a/kedro-docker/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -anyconfig~=0.10.0 # not directly required, pinned by Snyk to avoid a vulnerability -kedro>=0.16.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-docker/setup.cfg b/kedro-docker/setup.cfg deleted file mode 100644 index 9ba92fe11..000000000 --- a/kedro-docker/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_docker - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-docker/setup.py b/kedro-docker/setup.py deleted file mode 100644 index b2ef23ca3..000000000 --- a/kedro-docker/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-docker" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Docker makes it easy to package Kedro projects with Docker.", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker", - license="Apache Software License (Apache 2.0)", - python_requires=">=3.7, <3.11", - install_requires=requires, - author="Kedro", - packages=["kedro_docker"], - package_data={ - "kedro_docker": [ - "template/Dockerfile.*", - "template/.dockerignore", - "template/.dive-ci", - ] - }, - zip_safe=False, - entry_points={"kedro.project_commands": ["docker = kedro_docker.plugin:commands"]}, -) diff --git 
a/kedro-docker/test_requirements.txt b/kedro-docker/test_requirements.txt index 771ee88a6..01af755ac 100644 --- a/kedro-docker/test_requirements.txt +++ b/kedro-docker/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave>=1.2.6, <2.0 black~=22.0 From 7f4527dc3fbd4cb0922a6757b1f7c1dc80ce98b2 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Fri, 21 Apr 2023 16:32:23 +0100 Subject: [PATCH 20/96] Introdcuing .gitpod.yml to kedro-plugins (#185) Currently opening gitpod will installed a Python 3.11 which breaks everything because we don't support it set. This PR introduce a simple .gitpod.yml to get it started. Signed-off-by: Tingting_Wan --- .gitpod.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .gitpod.yml diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 000000000..70738f4c0 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,33 @@ +# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart +image: gitpod/workspace-python-3.10:2023-04-20-16-32-37 + + +tasks: + # We want packages installed during the pre-build init steps to go to /workspace + # rather than ~ so that they are persisted. Gitpod sets PIP_USER=yes to ensure this, + # but pre-commit requires PIP_USER=no. Hence we set PIP_USER=no and use + # pip install --user to install to /workspace. + - name: kedro-plugins + before: | + echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no + init: | + make sign-off + command: | + pre-commit install --install-hooks + clear + + +github: + prebuilds: + # enable for the master/default branch (defaults to true) + master: true + # enable for all branches in this repo (defaults to false) + branches: true + # enable for pull requests coming from this repo (defaults to true) + pullRequests: true + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: true + # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) + addComment: false + # add a "Review in Gitpod" button to pull requests (defaults to false) + addBadge: true From 57a11d61801ff9ce66f8b9a842fd58031e552b81 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 24 Apr 2023 13:32:52 +0100 Subject: [PATCH 21/96] sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan * Sync ParquetDataSet Signed-off-by: Nok Chan * Sync Test Signed-off-by: Nok Chan * Linting Signed-off-by: Nok Chan * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan * Sync release notes Signed-off-by: Nok Chan --------- Signed-off-by: Nok Chan Signed-off-by: Tingting_Wan --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/api/api_dataset.py | 111 +++---- kedro-datasets/tests/api/test_api_dataset.py | 273 ++++++++++++------ .../matplotlib/test_matplotlib_writer.py | 2 - .../tests/polars/test_csv_dataset.py | 1 - 5 files changed, 242 insertions(+), 147 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 01a3b92dc..e1185b54d 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,7 +3,7 @@ ## Major features and improvements: * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). - +* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. 
This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 4f0ffb4cc..cb8f80d37 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -1,12 +1,17 @@ """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ """ -from typing import Any, Dict, Iterable, List, NoReturn, Union +from typing import Any, Dict, List, NoReturn, Tuple, Union import requests from kedro.io.core import AbstractDataSet, DataSetError +from requests import Session, sessions from requests.auth import AuthBase +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads the data from HTTP(S) APIs. @@ -34,88 +39,89 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro_datasets.api import APIDataSet + >>> from kedro.extras.datasets.api import APIDataSet >>> >>> >>> data_set = APIDataSet( >>> url="https://quickstats.nass.usda.gov", - >>> params={ - >>> "key": "SOME_TOKEN", - >>> "format": "JSON", - >>> "commodity_desc": "CORN", - >>> "statisticcat_des": "YIELD", - >>> "agg_level_desc": "STATE", - >>> "year": 2000 - >>> } + >>> load_args={ + >>> "params": { + >>> "key": "SOME_TOKEN", + >>> "format": "JSON", + >>> "commodity_desc": "CORN", + >>> "statisticcat_des": "YIELD", + >>> "agg_level_desc": "STATE", + >>> "year": 2000 + >>> } + >>> }, + >>> credentials=("username", "password") >>> ) >>> data = data_set.load() """ - # pylint: disable=too-many-arguments def __init__( self, url: str, method: str = "GET", - data: Any = None, - params: Dict[str, Any] = None, - headers: Dict[str, Any] = None, - auth: Union[Iterable[str], AuthBase] = None, - json: Union[List, Dict[str, Any]] = None, - timeout: int = 60, - credentials: Union[Iterable[str], AuthBase] = None, + load_args: Dict[str, Any] = None, + credentials: Union[Tuple[str, str], List[str], AuthBase] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. Args: url: The API URL endpoint. method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... - data: The request payload, used for POST, PUT, etc requests - https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - params: The url parameters of the API. - https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls - headers: The HTTP headers. - https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers - auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``, - or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any - iterable will be cast to a tuple. - json: The request payload, used for POST, PUT, etc requests, passed in - to the json kwarg in the requests object. 
- https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - timeout: The wait time in seconds for a response, defaults to 1 minute. - https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts - credentials: same as ``auth``. Allows specifying ``auth`` secrets in - credentials.yml. - + load_args: Additional parameters to be fed to requests.request. + https://requests.readthedocs.io/en/latest/api/#requests.request + credentials: Allows specifying secrets in credentials.yml. + Expected format is ``('login', 'password')`` if given as a tuple or list. + An ``AuthBase`` instance can be provided for more complex cases. Raises: - ValueError: if both ``credentials`` and ``auth`` are specified. + ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. """ super().__init__() - if credentials is not None and auth is not None: + self._load_args = load_args or {} + self._load_args_auth = self._load_args.pop("auth", None) + + if credentials is not None and self._load_args_auth is not None: raise ValueError("Cannot specify both auth and credentials.") - auth = credentials or auth + self._auth = credentials or self._load_args_auth + + if "cert" in self._load_args: + self._load_args["cert"] = self._convert_type(self._load_args["cert"]) - if isinstance(auth, Iterable): - auth = tuple(auth) + if "timeout" in self._load_args: + self._load_args["timeout"] = self._convert_type(self._load_args["timeout"]) self._request_args: Dict[str, Any] = { "url": url, "method": method, - "data": data, - "params": params, - "headers": headers, - "auth": auth, - "json": json, - "timeout": timeout, + "auth": self._convert_type(self._auth), + **self._load_args, } + @staticmethod + def _convert_type(value: Any): + """ + From the Data Catalog, iterables are provided as Lists. + However, for some parameters in the Python requests library, + only Tuples are allowed. 
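+        For example, a ``cert`` given in the catalog as
+        ``["cert.pem", "privkey.pem"]`` is converted to the tuple
+        ``("cert.pem", "privkey.pem")`` before being handed to requests.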
+ """ + if isinstance(value, List): + return tuple(value) + return value + def _describe(self) -> Dict[str, Any]: - return {**self._request_args} + # prevent auth from logging + request_args_cp = self._request_args.copy() + request_args_cp.pop("auth", None) + return request_args_cp - def _execute_request(self) -> requests.Response: + def _execute_request(self, session: Session) -> requests.Response: try: - response = requests.request(**self._request_args) + response = session.request(**self._request_args) response.raise_for_status() except requests.exceptions.HTTPError as exc: raise DataSetError("Failed to fetch data", exc) from exc @@ -125,12 +131,13 @@ def _execute_request(self) -> requests.Response: return response def _load(self) -> requests.Response: - return self._execute_request() + with sessions.Session() as session: + return self._execute_request(session) def _save(self, data: None) -> NoReturn: raise DataSetError(f"{self.__class__.__name__} is a read only data set type") def _exists(self) -> bool: - response = self._execute_request() - + with sessions.Session() as session: + response = self._execute_request(session) return response.ok diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index c84290750..848020041 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,11 +1,11 @@ # pylint: disable=no-member -import json +import base64 import socket import pytest import requests -import requests_mock from kedro.io.core import DataSetError +from requests.auth import HTTPBasicAuth from kedro_datasets.api import APIDataSet @@ -13,96 +13,190 @@ TEST_URL = "http://example.com/api/test" TEST_TEXT_RESPONSE_DATA = "This is a response." -TEST_JSON_RESPONSE_DATA = [{"key": "value"}] +TEST_JSON_REQUEST_DATA = [{"key": "value"}] TEST_PARAMS = {"param": "value"} TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" - +TEST_METHOD = "GET" TEST_HEADERS = {"key": "value"} -@pytest.mark.parametrize("method", POSSIBLE_METHODS) class TestAPIDataSet: - @pytest.fixture - def requests_mocker(self): - with requests_mock.Mocker() as mock: - yield mock + @pytest.mark.parametrize("method", POSSIBLE_METHODS) + def test_request_method(self, requests_mock, method): + api_data_set = APIDataSet(url=TEST_URL, method=method) + requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) + + response = api_data_set.load() + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successfully_load_with_response(self, requests_mocker, method): + @pytest.mark.parametrize( + "parameters_in, url_postfix", + [ + ({"param": "value"}, "?param=value"), + (bytes("a=1", "latin-1"), "?a=1"), + ], + ) + def test_params_in_request(self, requests_mock, parameters_in, url_postfix): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"params": parameters_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + requests_mock.register_uri( + TEST_METHOD, TEST_URL + url_postfix, text=TEST_TEXT_RESPONSE_DATA ) response = api_data_set.load() assert isinstance(response, requests.Response) assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successful_json_load_with_response(self, requests_mocker, method): + def test_json_in_request(self, requests_mock): api_data_set = APIDataSet( url=TEST_URL, - method=method, - json=TEST_JSON_RESPONSE_DATA, - 
headers=TEST_HEADERS, + method=TEST_METHOD, + load_args={"json": TEST_JSON_REQUEST_DATA}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + + response = api_data_set.load() + assert response.request.json() == TEST_JSON_REQUEST_DATA + + def test_headers_in_request(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"headers": TEST_HEADERS} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, headers={"pan": "cake"}) + + response = api_data_set.load() + + assert response.request.headers["key"] == "value" + assert response.headers["pan"] == "cake" + + def test_api_cookies(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cookies": {"pan": "cake"}} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, text="text") + + response = api_data_set.load() + assert response.request.headers["Cookie"] == "pan=cake" + + def test_credentials_auth_error(self): + """ + If ``auth`` in ``load_args`` and ``credentials`` are both provided, + the constructor should raise a ValueError. + """ + with pytest.raises(ValueError, match="both auth and credentials"): + APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"auth": []}, credentials={} + ) + + @staticmethod + def _basic_auth(username, password): + encoded = base64.b64encode(f"{username}:{password}".encode("latin-1")) + return f"Basic {encoded.decode('latin-1')}" + + @pytest.mark.parametrize( + "auth_kwarg", + [ + {"load_args": {"auth": ("john", "doe")}}, + {"load_args": {"auth": ["john", "doe"]}}, + {"load_args": {"auth": HTTPBasicAuth("john", "doe")}}, + {"credentials": ("john", "doe")}, + {"credentials": ["john", "doe"]}, + {"credentials": HTTPBasicAuth("john", "doe")}, + ], + ) + def test_auth_sequence(self, requests_mock, auth_kwarg): + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD, **auth_kwarg) + requests_mock.register_uri( + TEST_METHOD, TEST_URL, - headers=TEST_HEADERS, - text=json.dumps(TEST_JSON_RESPONSE_DATA), + text=TEST_TEXT_RESPONSE_DATA, ) response = api_data_set.load() assert isinstance(response, requests.Response) - assert response.json() == TEST_JSON_RESPONSE_DATA + assert response.request.headers["Authorization"] == TestAPIDataSet._basic_auth( + "john", "doe" + ) + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_http_error(self, requests_mocker, method): + @pytest.mark.parametrize( + "timeout_in, timeout_out", + [ + (1, 1), + ((1, 2), (1, 2)), + ([1, 2], (1, 2)), + ], + ) + def test_api_timeout(self, requests_mock, timeout_in, timeout_out): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"timeout": timeout_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text="Nope, not found", - status_code=requests.codes.FORBIDDEN, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + response = api_data_set.load() + assert response.request.timeout == timeout_out + + def test_stream(self, requests_mock): + text = "I am being streamed." 
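+        # ``stream`` is forwarded through ``load_args`` straight to requests, so the
+        # body is only consumed when ``iter_content`` is called further down.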
+ + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"stream": True} ) - with pytest.raises(DataSetError, match="Failed to fetch data"): - api_data_set.load() + requests_mock.register_uri(TEST_METHOD, TEST_URL, text=text) + + response = api_data_set.load() + assert isinstance(response, requests.Response) + assert response.request.stream + + chunks = list(response.iter_content(chunk_size=2, decode_unicode=True)) + assert chunks == ["I ", "am", " b", "ei", "ng", " s", "tr", "ea", "me", "d."] - def test_socket_error(self, requests_mocker, method): + def test_proxy(self, requests_mock): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url="ftp://example.com/api/test", + method=TEST_METHOD, + load_args={"proxies": {"ftp": "ftp://127.0.0.1:3000"}}, + ) + requests_mock.register_uri( + TEST_METHOD, + "ftp://example.com/api/test", ) - requests_mocker.register_uri(method, TEST_URL_WITH_PARAMS, exc=socket.error) - with pytest.raises(DataSetError, match="Failed to connect"): - api_data_set.load() + response = api_data_set.load() + assert response.request.proxies.get("ftp") == "ftp://127.0.0.1:3000" - def test_read_only_mode(self, method): - """ - Saving is disabled on the data set. - """ - api_data_set = APIDataSet(url=TEST_URL, method=method) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) + @pytest.mark.parametrize( + "cert_in, cert_out", + [ + (("cert.pem", "privkey.pem"), ("cert.pem", "privkey.pem")), + (["cert.pem", "privkey.pem"], ("cert.pem", "privkey.pem")), + ("some/path/to/file.pem", "some/path/to/file.pem"), + (None, None), + ], + ) + def test_certs(self, requests_mock, cert_in, cert_out): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL) - def test_exists_http_error(self, requests_mocker, method): + response = api_data_set.load() + assert response.request.cert == cert_out + + def test_exists_http_error(self, requests_mock): """ In case of an unexpected HTTP error, ``exists()`` should not silently catch it. """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text="Nope, not found", @@ -111,16 +205,18 @@ def test_exists_http_error(self, requests_mocker, method): with pytest.raises(DataSetError, match="Failed to fetch data"): api_data_set.exists() - def test_exists_ok(self, requests_mocker, method): + def test_exists_ok(self, requests_mock): """ If the file actually exists and server responds 200, ``exists()`` should return True """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text=TEST_TEXT_RESPONSE_DATA, @@ -128,43 +224,38 @@ def test_exists_ok(self, requests_mocker, method): assert api_data_set.exists() - def test_credentials_auth_error(self, method): - """ - If ``auth`` and ``credentials`` are both provided, - the constructor should raise a ValueError. 
- """ - with pytest.raises(ValueError, match="both auth and credentials"): - APIDataSet(url=TEST_URL, method=method, auth=[], credentials=[]) - - @pytest.mark.parametrize("auth_kwarg", ["auth", "credentials"]) - @pytest.mark.parametrize( - "auth_seq", - [ - ("username", "password"), - ["username", "password"], - (e for e in ["username", "password"]), # Generator. - ], - ) - def test_auth_sequence(self, requests_mocker, method, auth_seq, auth_kwarg): - """ - ``auth`` and ``credentials`` should be able to be any Iterable. - """ - kwargs = { - "url": TEST_URL, - "method": method, - "params": TEST_PARAMS, - "headers": TEST_HEADERS, - auth_kwarg: auth_seq, - } - - api_data_set = APIDataSet(**kwargs) - requests_mocker.register_uri( - method, + def test_http_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + text="Nope, not found", + status_code=requests.codes.FORBIDDEN, ) - response = api_data_set.load() - assert isinstance(response, requests.Response) - assert response.text == TEST_TEXT_RESPONSE_DATA + with pytest.raises(DataSetError, match="Failed to fetch data"): + api_data_set.load() + + def test_socket_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL_WITH_PARAMS, exc=socket.error) + + with pytest.raises(DataSetError, match="Failed to connect"): + api_data_set.load() + + def test_read_only_mode(self): + """ + Saving is disabled on the data set. + """ + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD) + with pytest.raises(DataSetError, match="is a read only data set type"): + api_data_set.save({}) diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 0745452c6..4086e127e 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -170,7 +170,6 @@ def test_dict_save(self, tmp_path, mock_dict_plot, plot_writer, mocked_s3_bucket plot_writer.save(mock_dict_plot) for colour in COLOUR_LIST: - download_path = tmp_path / "downloaded_image.png" actual_filepath = tmp_path / "locally_saved.png" @@ -361,7 +360,6 @@ def test_list_save(self, tmp_path, mock_list_plot, versioned_plot_writer): versioned_plot_writer.save(mock_list_plot) for index in range(5): - test_path = tmp_path / "test_image.png" versioned_filepath = str(versioned_plot_writer._get_load_path()) diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 8b05a2025..d79183539 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -77,7 +77,6 @@ def mocked_dataframe(): @pytest.fixture def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): - binarycsv = mocked_dataframe.write_csv()[:-1] mocked_s3_bucket.put_object( From 11c3888a9930b1d10795dc0d82e240975382c7ab Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:45:45 +0100 Subject: [PATCH 22/96] formatting Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 23 ++++++++---- .../spark/test_spark_streaming_dataset.py | 36 ++++++++++--------- 2 files changed, 36 
insertions(+), 23 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index a508a3903..4cb19e6e5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -7,7 +7,12 @@ import yaml import fsspec -from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path +from kedro.io.core import ( + AbstractDataSet, + DataSetError, + get_filepath_str, + get_protocol_and_path, +) from pyspark import SparkConf from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame @@ -117,7 +122,7 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: load_path = get_filepath_str(pure_posix_path, protocol) # Open schema file - with file_system.open(load_path, encoding='utf-8') as fs_file: + with file_system.open(load_path, encoding="utf-8") as fs_file: try: return StructType.fromJson(json.loads(fs_file.read())) except Exception as exc: @@ -159,7 +164,8 @@ def _load(self) -> DataFrame: if self._schema: input_constructor = ( self._get_spark() - .readStream.schema(self._schema).format(self._file_format) + .readStream.schema(self._schema) + .format(self._file_format) .options(**self._load_args) ) else: @@ -197,17 +203,20 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self, schema_path:str) -> bool: + + def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().readStream.schema(schema).load(load_path, self._file_format) + self._get_spark().readStream.schema(schema).load( + load_path, self._file_format + ) except AnalysisException as exception: if ( exception.desc.startswith("Path does not exist:") @@ -215,4 +224,4 @@ def _exists(self, schema_path:str) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fa3b0fec8..f2fd3bb3d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,14 @@ import json import pytest -import time from pyspark.sql import SparkSession -from kedro_datasets.pandas import ParquetDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - def sample_schema(schema_path): - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) except Exception as exc: @@ -20,6 +17,7 @@ def sample_schema(schema_path): f"Schema is required for streaming data load, Please provide a valid schema_path." 
) from exc + @pytest.fixture def sample_spark_streaming_df(tmp_path): schema = StructType( @@ -41,12 +39,15 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / "test.json").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) - streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() assert streaming_ds.isStreaming schema = sample_schema(schema_path) assert streaming_ds.schema == schema @@ -58,20 +59,23 @@ def test_save(self, tmp_path, sample_spark_streaming_df): checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, + file_format="json", + save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) - loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() - + loaded_with_streaming = SparkStreamingDataSet( + filepath=filepath_json, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() streaming_ds = SparkStreamingDataSet( - filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, + file_format="json", + save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) - - - From 634d884576cb71609ca7a2d8746871727e3181f0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 17:29:02 +0100 Subject: [PATCH 23/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 8 +++++++- .../kedro_datasets/spark/deltatable_dataset.py | 3 +-- .../kedro_datasets/spark/spark_streaming_dataset.py | 10 +++++----- .../kedro_datasets/tracking/json_dataset.py | 1 - .../kedro_datasets/tracking/metrics_dataset.py | 1 - kedro-datasets/setup.py | 13 ++++++++----- kedro-datasets/tests/api/test_api_dataset.py | 3 +-- .../bioinformatics/test_biosequence_dataset.py | 3 +-- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/email/test_message_dataset.py | 3 +-- .../tests/geojson/test_geojson_dataset.py | 3 +-- .../tests/holoviews/test_holoviews_writer.py | 3 +-- kedro-datasets/tests/json/test_json_dataset.py | 3 +-- .../tests/libsvm/test_svmlight_dataset.py | 3 +-- .../tests/matplotlib/test_matplotlib_writer.py | 3 +-- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 +-- .../tests/networkx/test_graphml_dataset.py | 3 +-- kedro-datasets/tests/networkx/test_json_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_json_dataset.py | 3 +-- 
kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 - kedro-datasets/tests/pandas/test_xml_dataset.py | 3 +-- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 +-- kedro-datasets/tests/pillow/test_image_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_json_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 +-- kedro-datasets/tests/polars/test_csv_dataset.py | 3 +-- kedro-datasets/tests/redis/test_redis_dataset.py | 3 +-- .../tests/snowflake/test_snowpark_dataset.py | 1 - .../tests/spark/test_deltatable_dataset.py | 3 +-- kedro-datasets/tests/spark/test_spark_dataset.py | 9 ++++----- .../tests/spark/test_spark_hive_dataset.py | 3 +-- .../tests/spark/test_spark_jdbc_dataset.py | 1 - .../tests/spark/test_spark_streaming_dataset.py | 5 +++-- kedro-datasets/tests/text/test_text_dataset.py | 3 +-- kedro-datasets/tests/tracking/test_json_dataset.py | 3 +-- .../tests/tracking/test_metrics_dataset.py | 3 +-- kedro-datasets/tests/video/conftest.py | 3 +-- kedro-datasets/tests/video/test_video_dataset.py | 5 ++--- kedro-datasets/tests/video/test_video_objects.py | 3 +-- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 +-- kedro-docker/features/steps/cli_steps.py | 1 - kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 - kedro-telemetry/kedro_telemetry/plugin.py | 1 - kedro-telemetry/tests/test_masking.py | 1 - kedro-telemetry/tests/test_plugin.py | 3 +-- tools/circleci/circleci_release.py | 1 - 54 files changed, 68 insertions(+), 107 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 0c46a7fc3..bd649f5c7 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,12 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] +__all__ = [ + "SparkDataSet", + "SparkHiveDataSet", + "SparkJDBCDataSet", + "DeltaTableDataSet", + "SparkStreamingDataSet", +] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..9454a47f7 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,11 +6,10 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix - class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. 
diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4cb19e6e5..203539a11 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,24 +1,24 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json import os -from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath -import yaml +from typing import Any, Dict import fsspec +import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf -from pyspark.sql.utils import AnalysisException -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 4235df999..994236d3d 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,7 +5,6 @@ from typing import NoReturn from kedro.io.core import DataSetError - from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 7c7546a85..2e4e2d970 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,7 +7,6 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str - from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 99c30938e..63f41baf7 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -46,10 +46,15 @@ def _collect_requirements(requires): "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } -polars_require = {"polars.CSVDataSet": [POLARS],} +polars_require = { + "polars.CSVDataSet": [POLARS], +} redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { - "snowflake.SnowparkTableDataSet": ["snowflake-snowpark-python~=1.0.0", "pyarrow~=8.0"] + "snowflake.SnowparkTableDataSet": [ + "snowflake-snowpark-python~=1.0.0", + "pyarrow~=8.0", + ] } spark_require = { "spark.SparkDataSet": [SPARK, HDFS, S3FS], @@ -67,9 +72,7 @@ def _collect_requirements(requires): "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } -video_require = { - "video.VideoDataSet": ["opencv-python~=4.5.5.64"] -} +video_require = {"video.VideoDataSet": ["opencv-python~=4.5.5.64"]} yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} extras_require = { diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..51279c71c 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,8 @@ import pytest import requests from kedro.io.core import DataSetError -from requests.auth import 
HTTPBasicAuth - from kedro_datasets.api import APIDataSet +from requests.auth import HTTPBasicAuth POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 24666baaf..42b3e252f 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.biosequence import BioSequenceDataSet +from s3fs.core import S3FileSystem LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 8475dbf47..3824d6c0f 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,12 +5,11 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError +from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.dask import ParquetDataSet - FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 100daba52..6f97b6c89 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.email import EmailMessageDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py b/kedro-datasets/tests/geojson/test_geojson_dataset.py index b5f3ec4cb..cd6c07c7c 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point -from kedro_datasets.geopandas import GeoJSONDataSet - @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index f4f91383e..53ca795f2 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.holoviews import HoloviewsWriter +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 621e51fcd..dafdd8e3e 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ 
b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.json import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 8fff3edd2..9fcf09c0c 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.svmlight import SVMLightDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 4086e127e..ed4dec348 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,11 +6,10 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version +from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem -from kedro_datasets.matplotlib import MatplotlibWriter - BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index a3a89eca7..dd589019d 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 4e0dcf40d..9ff22883e 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GraphMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 4d6e582a8..ed437f69a 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import JSONDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 5cc1ee36b..53a1e7c52 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError 
from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index 1080cc9b6..bae8c5147 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ExcelDataSet - @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 80c1ce678..ec995d657 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import FeatherDataSet - @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index e239dbaba..d970db36e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,8 @@ import pytest from google.cloud.exceptions import NotFound from kedro.io.core import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet +from pandas.testing import assert_frame_equal DATASET = "dataset" TABLE_NAME = "table_name" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 6f40bb0d4..2526c1ed6 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,11 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp +from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.pandas import GenericDataSet - @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 563ba63d9..c59e7a104 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import HDFDataSet - HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index df2e856d5..7da50165e 100644 --- 
a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import JSONDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 2d7ce2996..cc62ed203 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ParquetDataSet - FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 308582859..b810748c2 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,7 +6,6 @@ import pytest import sqlalchemy from kedro.io import DataSetError - from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index bd62ea586..65be88174 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import XMLDataSet - @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index fb95681a3..2846201cf 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pickle import PickleDataSet - @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ea500b20d..ed27e3cb9 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,11 +6,10 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem -from kedro_datasets.pillow import ImageDataSet - @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py 
b/kedro-datasets/tests/plotly/test_json_dataset.py index ab6e17d9c..0115a72dd 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.plotly import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index a422060e8..9b33492bf 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,12 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem -from kedro_datasets.plotly import PlotlyDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index d79183539..4c0807d91 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.polars import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index eaa8abbd2..ddda22c17 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,8 @@ import pytest import redis from kedro.io import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.redis import PickleDataSet +from pandas.testing import assert_frame_equal @pytest.fixture(params=["pickle"]) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 2133953b5..d73731df2 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,7 +6,6 @@ try: import snowflake.snowpark as sp - from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 5cbbe62b7..430c78ea2 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,12 +4,11 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - @pytest.fixture def sample_spark_df(): diff 
--git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9452b007d..9a3e58035 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,6 +12,10 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -24,11 +28,6 @@ ) from pyspark.sql.utils import AnalysisException -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils - FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e0b8fc333..88c18aee6 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,13 +5,12 @@ import pytest from kedro.io import DataSetError +from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from kedro_datasets.spark import SparkHiveDataSet - TESTSPARKDIR = "test_spark_dir" diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 0f3d0e66b..73e091ef9 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,7 +2,6 @@ import pytest from kedro.io import DataSetError - from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index f2fd3bb3d..fe59c5810 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,10 +1,11 @@ import json + import pytest -from pyspark.sql import SparkSession -from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType def sample_schema(schema_path): diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index 733cc6c1f..a4bee6896 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.text import TextDataSet +from s3fs.core import S3FileSystem STRING = "Write to text file." 
diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 62172b1a4..2529868c4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2c1157de9..ad9f4a1cb 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import MetricsDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 7a0a4c87b..0dd5576dc 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,11 +1,10 @@ from pathlib import Path import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 1ac3d1ce4..b4428c4df 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,10 @@ import boto3 import pytest from kedro.io import DataSetError -from moto import mock_s3 -from utils import TEST_FPS, assert_videos_equal - from kedro_datasets.video import VideoDataSet from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo +from moto import mock_s3 +from utils import TEST_FPS, assert_videos_equal S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" diff --git a/kedro-datasets/tests/video/test_video_objects.py b/kedro-datasets/tests/video/test_video_objects.py index 1cb7cca75..3adb701d2 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -21,8 +22,6 @@ assert_images_equal, ) -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 653606c17..2cadeee7d 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.yaml import 
YAMLDataSet - @pytest.fixture def filepath_yaml(tmp_path): diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..2c680fd70 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,7 +8,6 @@ import behave import yaml from behave import given, then, when - from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 27af7db96..cc8dda1c4 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + from kedro.framework.cli.cli import ( _VERBOSE as verbose, - ) + ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index 40b5d9306..f205c9efe 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,7 +3,6 @@ import pytest from click import ClickException - from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 5eeb4d489..1027d541d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,7 +22,6 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 74773e2f4..1e674096b 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,7 +9,6 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata - from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 222bcc914..9b1a6460b 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,8 +9,6 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline -from pytest import fixture - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -18,6 +16,7 @@ _check_for_telemetry_consent, _confirm_consent, ) +from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index dd05d4c5a..e8f5d8449 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,7 +8,6 @@ import requests from requests.structures import CaseInsensitiveDict - from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 
9e8f55cea58eeb484a05ec70a724feacddb52ecb Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:07 +0100 Subject: [PATCH 24/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fe59c5810..82b90481c 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -2,11 +2,12 @@ import pytest from kedro.io.core import DataSetError -from kedro_datasets.spark.spark_dataset import SparkDataSet -from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +30,7 @@ def sample_spark_streaming_df(tmp_path): ) data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / "test.json").as_posix() - with open(schema_path, "w") as f: + with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) From dbdf19c61acb021506a07e57bd3ae504d2c04a84 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:44 +0100 Subject: [PATCH 25/96] formatting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 203539a11..79a044c6d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -13,13 +13,14 @@ get_filepath_str, get_protocol_and_path, ) -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -47,6 +48,7 @@ class SparkStreamingDataSet(AbstractDataSet): """ + # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -156,7 +158,8 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. - If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args + If the connector type is kafka then no file_path is required, schema needs to be + seperated from load_args. Returns: Data from filepath as pyspark dataframe. 
@@ -211,8 +214,8 @@ def _exists(self, schema_path: str) -> bool: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as f: - schema = StructType.fromJson(json.loads(f.read())) + with open(schema_path, encoding="utf-8") as schema_file: + schema = StructType.fromJson(json.loads(schema_file.read())) try: self._get_spark().readStream.schema(schema).load( load_path, self._file_format From 1a7a4776c86f8835dd2ee3e95562555d3e1ecbe2 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Wed, 12 Apr 2023 13:41:49 +0100 Subject: [PATCH 26/96] add spark_stream_dataset.py Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py new file mode 100644 index 000000000..6844e04cf --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -0,0 +1,128 @@ +"""SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +from typing import Any, Dict + +import pyspark +import yaml +from kedro.io import AbstractDataSet +from pyspark import SparkConf +from pyspark.sql import SparkSession +from yaml.loader import SafeLoader + + +class SparkStreamingDataSet(AbstractDataSet): + """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. + + Example usage for the + `YAML API `_: + .. code-block:: yaml + + raw.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + + int.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/02_intermediate/inventory/ + file_format: csv + save_args: + output_mode: append + checkpoint: data/04_checkpoint/int_new_inventory + header: True + load_args: + header: True + + """ + + def __init__( + self, + filepath: str = "", + file_format: str = "", + save_args: Dict[str, str] = {}, + load_args: Dict[str, str] = {}, + ): + """Creates a new instance of SparkStreamingDataSet. + + Args: + filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks + specify ``filepath``s starting with ``/dbfs/``. For message brokers such as + Kafka and all filepath is not required. + file_format: File format used during load and save + operations. These are formats supported by the running + SparkContext include parquet, csv, delta. For a list of supported + formats please refer to Apache Spark documentation at + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + load_args: Load args passed to Spark DataFrameReader load method. + It is dependent on the selected file format. You can find + a list of read options for each supported format + in Spark DataFrame read documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + save_args: Save args passed to Spark DataFrame write options. + Similar to load_args this is dependent on the selected file + format. You can pass ``mode`` and ``partitionBy`` to specify + your overwrite mode and partitioning respectively. 
You can find + a list of options for each format in Spark DataFrame + write documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + """ + self._filepath_ = filepath + self.file_format = file_format + self._save_args = save_args + self._load_args = load_args + self.output_format = [ + "kafka" + ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + + # read spark configuration from spark yml file and create a spark context + with open("conf/base/spark.yml") as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + + # Initialise the spark session + self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) + self.spark = self.spark_session_conf.getOrCreate() + + def _load(self) -> pyspark.sql.DataFrame: + """Loads data from filepath. + If the connector type is kafka then no file_path is required + + Returns: + Data from filepath as pyspark dataframe. + """ + input_constructor = self.spark.readStream.format(self.file_format).options( + **self._load_args + ) + return ( + input_constructor.load() + if self.file_format + in self.output_format # if the connector type is message broker + else input_constructor.load(self._filepath_) + ) + + def _save(self, data: pyspark.sql.DataFrame) -> None: + """Saves pyspark dataframe. + + Args: + data: PySpark streaming dataframe for saving + + """ + + output_constructor = data.writeStream.format(self.file_format) + + # for message brokers path is not needed + if self.file_format not in self.output_format: + output_constructor = output_constructor.option("path", self._filepath_) + + ( + output_constructor.option( + "checkpointLocation", self._save_args.pop("checkpoint") + ) + .outputMode(self._save_args.pop("output_mode")) + .options(**self._save_args) + .start() + ) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return None From e8779442f48083430453ffdc4606c9c1f3a0a3a3 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 11:48:57 +0100 Subject: [PATCH 27/96] restructure the strean dataset to align with the other spark dataset Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 57 ++++++++++++++----- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 6844e04cf..0992ab5ce 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,13 +1,13 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" from typing import Any, Dict - -import pyspark +from copy import deepcopy import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pyspark.sql import SparkSession +from pathlib import PurePosixPath +from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader - +from kedro_datasets.spark.spark_dataset import _split_filepath class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. 
@@ -35,13 +35,16 @@ class SparkStreamingDataSet(AbstractDataSet): """ + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + def __init__( self, filepath: str = "", file_format: str = "", - save_args: Dict[str, str] = {}, - load_args: Dict[str, str] = {}, - ): + save_args: Dict[str, Any] = None, + load_args: Dict[str, Any] = None, + ) -> None: """Creates a new instance of SparkStreamingDataSet. Args: @@ -74,23 +77,46 @@ def __init__( "kafka" ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + fs_prefix, filepath = _split_filepath(filepath) + + self._fs_prefix = fs_prefix + self._filepath = PurePosixPath(filepath) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + } + + @staticmethod + def _get_spark(self): # read spark configuration from spark yml file and create a spark context with open("conf/base/spark.yml") as f: self.parameters = yaml.load(f, Loader=SafeLoader) self.spark_conf = SparkConf().setAll(self.parameters.items()) # Initialise the spark session - self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) - self.spark = self.spark_session_conf.getOrCreate() + return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() - def _load(self) -> pyspark.sql.DataFrame: + def _load(self) -> DataFrame: """Loads data from filepath. If the connector type is kafka then no file_path is required Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self.spark.readStream.format(self.file_format).options( + input_constructor = self._get_spark().readStream.format(self.file_format).options( **self._load_args ) return ( @@ -100,7 +126,7 @@ def _load(self) -> pyspark.sql.DataFrame: else input_constructor.load(self._filepath_) ) - def _save(self, data: pyspark.sql.DataFrame) -> None: + def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: @@ -123,6 +149,7 @@ def _save(self, data: pyspark.sql.DataFrame) -> None: .start() ) - def _describe(self) -> Dict[str, Any]: - """Returns a dict that describes attributes of the dataset.""" - return None + + + + From 09e9cf2649175495bec87e3ea0fb7383eee00b4a Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 17:43:56 +0100 Subject: [PATCH 28/96] adding README.md for specification Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 38 +++++++++++++++++++ .../spark/spark_stream_dataset.py | 17 +++++---- 2 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/spark/README.md diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md new file mode 100644 index 000000000..bded31532 --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -0,0 +1,38 @@ +# Spark Streaming + +``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. 
+See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. + +To work with multiple streaming nodes, 2 hook are required for: + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Running streaming query without termination unless exception + +#### Example SparkStreamsHook: + +```python +from kedro.framework.hooks import hook_impl +from pyspark.sql import SparkSession + +class SparkStreamsHook: + @hook_impl + def after_pipeline_run(self) -> None: + """Starts a spark streaming await session + once the pipeline reaches the last node + """ + + spark = SparkSession.builder.getOrCreate() + spark.streams.awaitAnyTermination() +``` +To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. + +#### Example spark.yml: + +```yaml +spark.driver.maxResultSize: 3g +spark.scheduler.mode: FAIR +spark.sql.streaming.schemaInference: True +spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context +spark.sql.streaming.stateStore.stateSchemaCheck: false # since schema is not mentioned explicitly +spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) + +``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 0992ab5ce..382c45286 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,4 +1,5 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import os from typing import Any, Dict from copy import deepcopy import yaml @@ -101,13 +102,15 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(self): - # read spark configuration from spark yml file and create a spark context - with open("conf/base/spark.yml") as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - - # Initialise the spark session - return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + spark_conf_path = "conf/base/spark.yml" + if os.path.exists(spark_conf_path): + with open(spark_conf_path) as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + else: + spark = SparkSession.builder.getOrCreate() + return spark def _load(self) -> DataFrame: """Loads data from filepath. 
From 2e30ec07941be9f1d5c4e4866d2c0025381ed068 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Fri, 14 Apr 2023 01:40:10 +0800 Subject: [PATCH 29/96] Update kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py Co-authored-by: Nok Lam Chan Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 382c45286..77bf62f40 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -19,7 +19,7 @@ class SparkStreamingDataSet(AbstractDataSet): .. code-block:: yaml raw.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json From 6147636c7be7f7131b2534ee81c7a397ec8277ea Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 18:41:12 +0100 Subject: [PATCH 30/96] rename the dataset Signed-off-by: Tingting_Wan --- .../spark/{spark_stream_dataset.py => spark_streaming_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kedro-datasets/kedro_datasets/spark/{spark_stream_dataset.py => spark_streaming_dataset.py} (100%) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py similarity index 100% rename from kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py rename to kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py From 29376e94d5edb736569e59594edf22c224aa1cf6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 10:19:43 +0100 Subject: [PATCH 31/96] resolve comments Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 2 ++ kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 3dede09aa..c93d3f0df 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -12,3 +12,5 @@ from .spark_jdbc_dataset import SparkJDBCDataSet with suppress(ImportError): from .deltatable_dataset import DeltaTableDataSet +with suppress(ImportError): + from .spark_streaming_dataset import SparkStreamingDataSet diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 77bf62f40..1ee271e87 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,7 +24,7 @@ class SparkStreamingDataSet(AbstractDataSet): file_format: json int.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ file_format: csv save_args: From 42ed37a38537d01b35c6e615a2f3d71493984382 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 11:42:05 +0100 Subject: [PATCH 32/96] fix format and pylint Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git 
a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 1ee271e87..fa6fc9c7e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -2,14 +2,15 @@ import os from typing import Any, Dict from copy import deepcopy +from pathlib import PurePosixPath import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pathlib import PurePosixPath from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -71,12 +72,10 @@ def __init__( https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ self._filepath_ = filepath - self.file_format = file_format + self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = [ - "kafka" - ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -101,13 +100,15 @@ def _describe(self) -> Dict[str, Any]: } @staticmethod - def _get_spark(self): + def _get_spark(): spark_conf_path = "conf/base/spark.yml" if os.path.exists(spark_conf_path): - with open(spark_conf_path) as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + with open( + spark_conf_path, encoding="utf-8" + ) as File: # pylint: disable=invalid-name + parameters = yaml.load(File, Loader=SafeLoader) + spark_conf = SparkConf().setAll(parameters.items()) + spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() else: spark = SparkSession.builder.getOrCreate() return spark @@ -119,12 +120,14 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - input_constructor = self._get_spark().readStream.format(self.file_format).options( - **self._load_args + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) ) return ( input_constructor.load() - if self.file_format + if self._file_format in self.output_format # if the connector type is message broker else input_constructor.load(self._filepath_) ) @@ -137,10 +140,10 @@ def _save(self, data: DataFrame) -> None: """ - output_constructor = data.writeStream.format(self.file_format) + output_constructor = data.writeStream.format(self._file_format) # for message brokers path is not needed - if self.file_format not in self.output_format: + if self._file_format not in self.output_format: output_constructor = output_constructor.option("path", self._filepath_) ( @@ -151,8 +154,3 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - - - - - From d93d9b9d41a0cb9c29243e1950369fea77e3d5ed Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:21:08 +0800 Subject: [PATCH 33/96] Update kedro-datasets/kedro_datasets/spark/README.md Co-authored-by: Deepyaman Datta Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bded31532..f222df00a 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -1,6 +1,6 @@ # Spark Streaming -``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
To work with multiple streaming nodes, 2 hook are required for: From 5b83444ebd0af2d28f51bd75121ee867b968f76f Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 21 Apr 2023 23:02:52 +0100 Subject: [PATCH 34/96] add unit tests and SparkStreamingDataset in init.py Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/__init__.py | 2 +- .../spark/spark_streaming_dataset.py | 14 +++++- .../spark/test_spark_streaming_dataset.py | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 kedro-datasets/tests/spark/test_spark_streaming_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index c93d3f0df..0c46a7fc3 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,6 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet"] +__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index fa6fc9c7e..10680d661 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -6,9 +6,10 @@ import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf +from pyspark.errors.exceptions.captured import AnalysisException from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): @@ -154,3 +155,14 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) + def _exists(self) -> bool: + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + + try: + self._get_spark().read.load(path=load_path, format="delta") + except AnalysisException as exception: + if "is not a Delta table" in exception.desc: + return False + raise + + return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py new file mode 100644 index 000000000..47a427742 --- /dev/null +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -0,0 +1,47 @@ +import pytest +import time +from pyspark.sql import SparkSession +from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + +@pytest.fixture +def sample_spark_streaming_df_one(): + schema = StructType( + [ + StructField("sku", StringType(), True), + StructField("new_stock", IntegerType(), True), + ] + ) + data = [("0001", 2), ("0001", 7), ("0002", 4)] + + return SparkSession.builder.getOrCreate() \ + .createDataFrame(data, schema) + + +class TestStreamingDataSet: + def test_load(self,tmp_path, sample_spark_streaming_df_one): + filepath = (tmp_path / "test_streams").as_posix() + spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) + spark_json_ds.save(sample_spark_streaming_df_one) + loaded_with_spark = spark_json_ds.load() + + stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() + 
assert stream_df.isStreaming + + stream_query = stream_df.writeStream.format("memory").queryName("test").start() + assert stream_query.isActive + time.sleep(3) + stream_query.stop() + loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + + assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + + + def test_save(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_streams").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) + assert not streaming_ds.exists() + + From 5b0630e11306643ae0d4c1706fe18a501a6b1179 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:20:02 +0100 Subject: [PATCH 35/96] add unit tests Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 76 +++++++++++++++---- .../spark/test_spark_streaming_dataset.py | 66 ++++++++++------ 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 10680d661..a508a3903 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,13 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import json import os from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath import yaml -from kedro.io import AbstractDataSet + +import fsspec +from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path from pyspark import SparkConf -from pyspark.errors.exceptions.captured import AnalysisException +from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -91,6 +95,37 @@ def __init__( if save_args is not None: self._save_args.update(save_args) + # Handle schema load argument + self._schema = self._load_args.pop("schema", None) + if self._schema is not None: + if isinstance(self._schema, dict): + self._schema = self._load_schema_from_file(self._schema) + + @staticmethod + def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: + filepath = schema.get("filepath") + if not filepath: + raise DataSetError( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + credentials = deepcopy(schema.get("credentials")) or {} + protocol, schema_path = get_protocol_and_path(filepath) + file_system = fsspec.filesystem(protocol, **credentials) + pure_posix_path = PurePosixPath(schema_path) + load_path = get_filepath_str(pure_posix_path, protocol) + + # Open schema file + with file_system.open(load_path, encoding='utf-8') as fs_file: + try: + return StructType.fromJson(json.loads(fs_file.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) from exc + def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { @@ -116,16 +151,23 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. 
- If the connector type is kafka then no file_path is required + If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args Returns: Data from filepath as pyspark dataframe. """ - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + if self._schema: + input_constructor = ( + self._get_spark() + .readStream.schema(self._schema).format(self._file_format) + .options(**self._load_args) + ) + else: + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format @@ -155,14 +197,22 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self) -> bool: - load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + def _exists(self, schema_path:str) -> bool: + """Check the existence of pyspark dataframe. + Args: + schema_path: schema of saved streaming dataframe + """ + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + with open(schema_path, encoding='utf-8') as f: + schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().read.load(path=load_path, format="delta") + self._get_spark().readStream.schema(schema).load(load_path, self._file_format) except AnalysisException as exception: - if "is not a Delta table" in exception.desc: + if ( + exception.desc.startswith("Path does not exist:") + or "is not a Streaming data" in exception.desc + ): return False raise - return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 47a427742..2d936b1ce 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,27 @@ +import json import pytest import time from pyspark.sql import SparkSession -from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from kedro_datasets.pandas import ParquetDataSet +from kedro.io.core import DataSetError +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +def sample_schema(schema_path): + with open(schema_path, encoding='utf-8') as f: + try: + return StructType.fromJson(json.loads(f.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. " + f"Schema is required for streaming data load, Please provide a valid schema_path." 
+ ) from exc + @pytest.fixture -def sample_spark_streaming_df_one(): +def sample_spark_streaming_df(tmp_path): schema = StructType( [ StructField("sku", StringType(), True), @@ -14,34 +29,41 @@ def sample_spark_streaming_df_one(): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - - return SparkSession.builder.getOrCreate() \ - .createDataFrame(data, schema) + schema_path = (tmp_path / "test.json").as_posix() + with open(schema_path, "w") as f: + json.dump(schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame(data, schema) class TestStreamingDataSet: - def test_load(self,tmp_path, sample_spark_streaming_df_one): + def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) - spark_json_ds.save(sample_spark_streaming_df_one) - loaded_with_spark = spark_json_ds.load() + schema_path = (tmp_path / "test.json").as_posix() - stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() - assert stream_df.isStreaming + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - stream_query = stream_df.writeStream.format("memory").queryName("test").start() - assert stream_query.isActive - time.sleep(3) - stream_query.stop() - loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema - assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + def test_save(self, tmp_path, sample_spark_streaming_df): + filepath = (tmp_path / "test_streams_input").as_posix() + schema_path = (tmp_path / "test.json").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - def test_save(self, tmp_path, sample_spark_df): - filepath = (tmp_path / "test_streams").as_posix() - checkpoint_path = (tmp_path / "checkpoint").as_posix() - streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) - assert not streaming_ds.exists() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + ) + assert streaming_ds._exists(schema_path) From 1433808e2d1940f4be6287f67c73abf2a60c76d0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:41:59 +0100 Subject: [PATCH 36/96] update test_save Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 2d936b1ce..fa3b0fec8 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -52,18 +52,26 @@ def test_load(self, tmp_path, sample_spark_streaming_df): assert streaming_ds.schema == schema def test_save(self, tmp_path, sample_spark_streaming_df): - filepath = (tmp_path / 
"test_streams_input").as_posix() + filepath_json = (tmp_path / "test_streams").as_posix() + filepath_output = (tmp_path / "test_streams_output").as_posix() schema_path = (tmp_path / "test.json").as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) + loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( - filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} ) + assert not streaming_ds._exists(schema_path) + + streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) + From c7778b57932f47db2335acb99c3bd0cbad6655b8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:45:45 +0100 Subject: [PATCH 37/96] formatting Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 23 ++++++++---- .../spark/test_spark_streaming_dataset.py | 36 ++++++++++--------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index a508a3903..4cb19e6e5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -7,7 +7,12 @@ import yaml import fsspec -from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path +from kedro.io.core import ( + AbstractDataSet, + DataSetError, + get_filepath_str, + get_protocol_and_path, +) from pyspark import SparkConf from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame @@ -117,7 +122,7 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: load_path = get_filepath_str(pure_posix_path, protocol) # Open schema file - with file_system.open(load_path, encoding='utf-8') as fs_file: + with file_system.open(load_path, encoding="utf-8") as fs_file: try: return StructType.fromJson(json.loads(fs_file.read())) except Exception as exc: @@ -159,7 +164,8 @@ def _load(self) -> DataFrame: if self._schema: input_constructor = ( self._get_spark() - .readStream.schema(self._schema).format(self._file_format) + .readStream.schema(self._schema) + .format(self._file_format) .options(**self._load_args) ) else: @@ -197,17 +203,20 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self, schema_path:str) -> bool: + + def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().readStream.schema(schema).load(load_path, self._file_format) + self._get_spark().readStream.schema(schema).load( + load_path, self._file_format + ) except AnalysisException as exception: if ( exception.desc.startswith("Path does not exist:") @@ -215,4 +224,4 @@ def _exists(self, schema_path:str) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fa3b0fec8..f2fd3bb3d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,14 @@ import json import pytest -import time from pyspark.sql import SparkSession -from kedro_datasets.pandas import ParquetDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - def sample_schema(schema_path): - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) except Exception as exc: @@ -20,6 +17,7 @@ def sample_schema(schema_path): f"Schema is required for streaming data load, Please provide a valid schema_path." 
) from exc + @pytest.fixture def sample_spark_streaming_df(tmp_path): schema = StructType( @@ -41,12 +39,15 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / "test.json").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) - streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() assert streaming_ds.isStreaming schema = sample_schema(schema_path) assert streaming_ds.schema == schema @@ -58,20 +59,23 @@ def test_save(self, tmp_path, sample_spark_streaming_df): checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, + file_format="json", + save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) - loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() - + loaded_with_streaming = SparkStreamingDataSet( + filepath=filepath_json, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() streaming_ds = SparkStreamingDataSet( - filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, + file_format="json", + save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) - - - From 7341429eaf2ad4af8acf707bef3d96a3e06fea3d Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 17:29:02 +0100 Subject: [PATCH 38/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 8 +++++++- .../kedro_datasets/spark/deltatable_dataset.py | 3 +-- .../kedro_datasets/spark/spark_streaming_dataset.py | 10 +++++----- .../kedro_datasets/tracking/json_dataset.py | 1 - .../kedro_datasets/tracking/metrics_dataset.py | 1 - kedro-datasets/setup.py | 13 ++++++++----- kedro-datasets/tests/api/test_api_dataset.py | 3 +-- .../bioinformatics/test_biosequence_dataset.py | 3 +-- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/email/test_message_dataset.py | 3 +-- .../tests/geojson/test_geojson_dataset.py | 3 +-- .../tests/holoviews/test_holoviews_writer.py | 3 +-- kedro-datasets/tests/json/test_json_dataset.py | 3 +-- .../tests/libsvm/test_svmlight_dataset.py | 3 +-- .../tests/matplotlib/test_matplotlib_writer.py | 3 +-- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 +-- .../tests/networkx/test_graphml_dataset.py | 3 +-- kedro-datasets/tests/networkx/test_json_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_json_dataset.py | 3 +-- 
kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 - kedro-datasets/tests/pandas/test_xml_dataset.py | 3 +-- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 +-- kedro-datasets/tests/pillow/test_image_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_json_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 +-- kedro-datasets/tests/polars/test_csv_dataset.py | 3 +-- kedro-datasets/tests/redis/test_redis_dataset.py | 3 +-- .../tests/snowflake/test_snowpark_dataset.py | 1 - .../tests/spark/test_deltatable_dataset.py | 3 +-- kedro-datasets/tests/spark/test_spark_dataset.py | 9 ++++----- .../tests/spark/test_spark_hive_dataset.py | 3 +-- .../tests/spark/test_spark_jdbc_dataset.py | 1 - .../tests/spark/test_spark_streaming_dataset.py | 5 +++-- kedro-datasets/tests/text/test_text_dataset.py | 3 +-- kedro-datasets/tests/tracking/test_json_dataset.py | 3 +-- .../tests/tracking/test_metrics_dataset.py | 3 +-- kedro-datasets/tests/video/conftest.py | 3 +-- kedro-datasets/tests/video/test_video_dataset.py | 5 ++--- kedro-datasets/tests/video/test_video_objects.py | 3 +-- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 +-- kedro-docker/features/steps/cli_steps.py | 1 - kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 - kedro-telemetry/kedro_telemetry/plugin.py | 1 - kedro-telemetry/tests/test_masking.py | 1 - kedro-telemetry/tests/test_plugin.py | 3 +-- tools/circleci/circleci_release.py | 1 - 54 files changed, 68 insertions(+), 107 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 0c46a7fc3..bd649f5c7 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,12 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] +__all__ = [ + "SparkDataSet", + "SparkHiveDataSet", + "SparkJDBCDataSet", + "DeltaTableDataSet", + "SparkStreamingDataSet", +] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..9454a47f7 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,11 +6,10 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix - class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. 
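Everything in this commit is mechanical: import re-ordering plus black-style layout changes, with no behaviour change. For readers skimming the diffs, the ordering applied here sorts first-party ``kedro_datasets`` imports together with the third-party packages, roughly as in the sketch below (module names are illustrative, taken from the affected test files); the later ``lint`` commit in this series moves the ``kedro_datasets`` imports back into their own block after the third-party ones.

.. code-block:: python

    # Ordering applied by this "formatting" commit: first-party imports are
    # sorted alphabetically alongside third-party packages (names illustrative).
    import pytest
    from kedro.io import DataSetError
    from kedro_datasets.json import JSONDataSet
    from s3fs.core import S3FileSystem

    # Ordering restored by the later "lint" commit: third-party imports first,
    # then a blank line, then a separate first-party block.
    # import pytest
    # from kedro.io import DataSetError
    # from s3fs.core import S3FileSystem
    #
    # from kedro_datasets.json import JSONDataSet
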
diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4cb19e6e5..203539a11 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,24 +1,24 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json import os -from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath -import yaml +from typing import Any, Dict import fsspec +import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf -from pyspark.sql.utils import AnalysisException -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 4235df999..994236d3d 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,7 +5,6 @@ from typing import NoReturn from kedro.io.core import DataSetError - from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 7c7546a85..2e4e2d970 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,7 +7,6 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str - from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index f2f4921a5..be99f9912 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -46,10 +46,15 @@ def _collect_requirements(requires): "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } -polars_require = {"polars.CSVDataSet": [POLARS],} +polars_require = { + "polars.CSVDataSet": [POLARS], +} redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { - "snowflake.SnowparkTableDataSet": ["snowflake-snowpark-python~=1.0.0", "pyarrow~=8.0"] + "snowflake.SnowparkTableDataSet": [ + "snowflake-snowpark-python~=1.0.0", + "pyarrow~=8.0", + ] } spark_require = { "spark.SparkDataSet": [SPARK, HDFS, S3FS], @@ -67,9 +72,7 @@ def _collect_requirements(requires): "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } -video_require = { - "video.VideoDataSet": ["opencv-python~=4.5.5.64"] -} +video_require = {"video.VideoDataSet": ["opencv-python~=4.5.5.64"]} yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} extras_require = { diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..51279c71c 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,8 @@ import pytest import requests from kedro.io.core import DataSetError -from requests.auth import 
HTTPBasicAuth - from kedro_datasets.api import APIDataSet +from requests.auth import HTTPBasicAuth POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 24666baaf..42b3e252f 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.biosequence import BioSequenceDataSet +from s3fs.core import S3FileSystem LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 8475dbf47..3824d6c0f 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,12 +5,11 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError +from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.dask import ParquetDataSet - FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 100daba52..6f97b6c89 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.email import EmailMessageDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py b/kedro-datasets/tests/geojson/test_geojson_dataset.py index b5f3ec4cb..cd6c07c7c 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point -from kedro_datasets.geopandas import GeoJSONDataSet - @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index f4f91383e..53ca795f2 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.holoviews import HoloviewsWriter +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 621e51fcd..dafdd8e3e 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ 
b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.json import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 8fff3edd2..9fcf09c0c 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.svmlight import SVMLightDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 4086e127e..ed4dec348 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,11 +6,10 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version +from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem -from kedro_datasets.matplotlib import MatplotlibWriter - BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index a3a89eca7..dd589019d 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 4e0dcf40d..9ff22883e 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GraphMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 4d6e582a8..ed437f69a 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import JSONDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 5cc1ee36b..53a1e7c52 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError 
from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index 1080cc9b6..bae8c5147 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ExcelDataSet - @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 80c1ce678..ec995d657 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import FeatherDataSet - @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index e239dbaba..d970db36e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,8 @@ import pytest from google.cloud.exceptions import NotFound from kedro.io.core import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet +from pandas.testing import assert_frame_equal DATASET = "dataset" TABLE_NAME = "table_name" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 6f40bb0d4..2526c1ed6 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,11 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp +from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.pandas import GenericDataSet - @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 563ba63d9..c59e7a104 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import HDFDataSet - HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index df2e856d5..7da50165e 100644 --- 
a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import JSONDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 2d7ce2996..cc62ed203 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ParquetDataSet - FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 308582859..b810748c2 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,7 +6,6 @@ import pytest import sqlalchemy from kedro.io import DataSetError - from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index bd62ea586..65be88174 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import XMLDataSet - @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index fb95681a3..2846201cf 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pickle import PickleDataSet - @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ea500b20d..ed27e3cb9 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,11 +6,10 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem -from kedro_datasets.pillow import ImageDataSet - @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py 
b/kedro-datasets/tests/plotly/test_json_dataset.py index ab6e17d9c..0115a72dd 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.plotly import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index a422060e8..9b33492bf 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,12 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem -from kedro_datasets.plotly import PlotlyDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index d79183539..4c0807d91 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.polars import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index eaa8abbd2..ddda22c17 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,8 @@ import pytest import redis from kedro.io import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.redis import PickleDataSet +from pandas.testing import assert_frame_equal @pytest.fixture(params=["pickle"]) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 2133953b5..d73731df2 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,7 +6,6 @@ try: import snowflake.snowpark as sp - from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 5cbbe62b7..430c78ea2 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,12 +4,11 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - @pytest.fixture def sample_spark_df(): diff 
--git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9452b007d..9a3e58035 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,6 +12,10 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -24,11 +28,6 @@ ) from pyspark.sql.utils import AnalysisException -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils - FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e0b8fc333..88c18aee6 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,13 +5,12 @@ import pytest from kedro.io import DataSetError +from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from kedro_datasets.spark import SparkHiveDataSet - TESTSPARKDIR = "test_spark_dir" diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 0f3d0e66b..73e091ef9 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,7 +2,6 @@ import pytest from kedro.io import DataSetError - from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index f2fd3bb3d..fe59c5810 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,10 +1,11 @@ import json + import pytest -from pyspark.sql import SparkSession -from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType def sample_schema(schema_path): diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index 733cc6c1f..a4bee6896 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.text import TextDataSet +from s3fs.core import S3FileSystem STRING = "Write to text file." 
diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 62172b1a4..2529868c4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2c1157de9..ad9f4a1cb 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import MetricsDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 7a0a4c87b..0dd5576dc 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,11 +1,10 @@ from pathlib import Path import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 1ac3d1ce4..b4428c4df 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,10 @@ import boto3 import pytest from kedro.io import DataSetError -from moto import mock_s3 -from utils import TEST_FPS, assert_videos_equal - from kedro_datasets.video import VideoDataSet from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo +from moto import mock_s3 +from utils import TEST_FPS, assert_videos_equal S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" diff --git a/kedro-datasets/tests/video/test_video_objects.py b/kedro-datasets/tests/video/test_video_objects.py index 1cb7cca75..3adb701d2 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -21,8 +22,6 @@ assert_images_equal, ) -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 653606c17..2cadeee7d 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.yaml import 
YAMLDataSet - @pytest.fixture def filepath_yaml(tmp_path): diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..2c680fd70 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,7 +8,6 @@ import behave import yaml from behave import given, then, when - from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 27af7db96..cc8dda1c4 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + from kedro.framework.cli.cli import ( _VERBOSE as verbose, - ) + ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index 40b5d9306..f205c9efe 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,7 +3,6 @@ import pytest from click import ClickException - from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 5eeb4d489..1027d541d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,7 +22,6 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 74773e2f4..1e674096b 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,7 +9,6 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata - from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 222bcc914..9b1a6460b 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,8 +9,6 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline -from pytest import fixture - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -18,6 +16,7 @@ _check_for_telemetry_consent, _confirm_consent, ) +from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index dd05d4c5a..e8f5d8449 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,7 +8,6 @@ import requests from requests.structures import CaseInsensitiveDict - from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 
d8d3bc281b54ec3382d8de954c26025491a7f4a2 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:07 +0100 Subject: [PATCH 39/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fe59c5810..82b90481c 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -2,11 +2,12 @@ import pytest from kedro.io.core import DataSetError -from kedro_datasets.spark.spark_dataset import SparkDataSet -from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +30,7 @@ def sample_spark_streaming_df(tmp_path): ) data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / "test.json").as_posix() - with open(schema_path, "w") as f: + with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) From be4a3e5c3698a456f6c11d1b8041ea7ba2340298 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:44 +0100 Subject: [PATCH 40/96] formatting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 203539a11..79a044c6d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -13,13 +13,14 @@ get_filepath_str, get_protocol_and_path, ) -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -47,6 +48,7 @@ class SparkStreamingDataSet(AbstractDataSet): """ + # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -156,7 +158,8 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. - If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args + If the connector type is kafka then no file_path is required, schema needs to be + seperated from load_args. Returns: Data from filepath as pyspark dataframe. 
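The docstring change above documents an important detail of the load path: file-based streaming sources need an explicit schema, so the schema is supplied through ``load_args`` separately from the other reader options. A minimal usage sketch, mirroring what the tests in this series do (file paths are illustrative):

.. code-block:: python

    # Minimal sketch of a file-based streaming load; paths are illustrative.
    from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet

    streaming_ds = SparkStreamingDataSet(
        filepath="data/01_raw/stream/inventory/",
        file_format="json",
        # The schema is given as a path to a JSON-serialised StructType,
        # kept separate from the remaining reader options.
        load_args={"schema": {"filepath": "data/01_raw/inventory_schema.json"}},
    )
    df = streaming_ds.load()
    assert df.isStreaming
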
@@ -211,8 +214,8 @@ def _exists(self, schema_path: str) -> bool: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as f: - schema = StructType.fromJson(json.loads(f.read())) + with open(schema_path, encoding="utf-8") as schema_file: + schema = StructType.fromJson(json.loads(schema_file.read())) try: self._get_spark().readStream.schema(schema).load( load_path, self._file_format From e39c6397182d163aa13c6ee46be67679357dfcad Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:04:12 +0100 Subject: [PATCH 41/96] lint Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- .../kedro_datasets/spark/deltatable_dataset.py | 3 ++- .../kedro_datasets/spark/spark_streaming_dataset.py | 9 --------- kedro-datasets/kedro_datasets/tracking/json_dataset.py | 1 + .../kedro_datasets/tracking/metrics_dataset.py | 1 + kedro-datasets/tests/api/test_api_dataset.py | 3 ++- .../tests/bioinformatics/test_biosequence_dataset.py | 3 ++- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 ++- kedro-datasets/tests/email/test_message_dataset.py | 3 ++- kedro-datasets/tests/geojson/test_geojson_dataset.py | 3 ++- kedro-datasets/tests/holoviews/test_holoviews_writer.py | 3 ++- kedro-datasets/tests/json/test_json_dataset.py | 3 ++- kedro-datasets/tests/libsvm/test_svmlight_dataset.py | 3 ++- .../tests/matplotlib/test_matplotlib_writer.py | 3 ++- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 ++- kedro-datasets/tests/networkx/test_graphml_dataset.py | 3 ++- kedro-datasets/tests/networkx/test_json_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_json_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 + kedro-datasets/tests/pandas/test_xml_dataset.py | 3 ++- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 ++- kedro-datasets/tests/pillow/test_image_dataset.py | 3 ++- kedro-datasets/tests/plotly/test_json_dataset.py | 3 ++- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 ++- kedro-datasets/tests/polars/test_csv_dataset.py | 3 ++- kedro-datasets/tests/redis/test_redis_dataset.py | 3 ++- kedro-datasets/tests/snowflake/test_snowpark_dataset.py | 1 + kedro-datasets/tests/spark/test_deltatable_dataset.py | 3 ++- kedro-datasets/tests/spark/test_spark_dataset.py | 9 +++++---- kedro-datasets/tests/spark/test_spark_hive_dataset.py | 3 ++- kedro-datasets/tests/spark/test_spark_jdbc_dataset.py | 1 + kedro-datasets/tests/text/test_text_dataset.py | 3 ++- kedro-datasets/tests/tracking/test_json_dataset.py | 3 ++- kedro-datasets/tests/tracking/test_metrics_dataset.py | 3 ++- kedro-datasets/tests/video/conftest.py | 3 ++- kedro-datasets/tests/video/test_video_dataset.py | 5 +++-- kedro-datasets/tests/video/test_video_objects.py | 3 ++- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 ++- 45 files changed, 86 insertions(+), 52 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index f222df00a..910289135 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ 
b/kedro-datasets/kedro_datasets/spark/README.md @@ -3,7 +3,7 @@ ``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. -To work with multiple streaming nodes, 2 hook are required for: +To work with multiple streaming nodes, 2 hook are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 9454a47f7..34ee6f6a5 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,10 +6,11 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 79a044c6d..4d7695e4e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,17 +24,14 @@ class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. - Example usage for the `YAML API `_: .. code-block:: yaml - raw.new_inventory: type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json - int.new_inventory: type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ @@ -45,7 +42,6 @@ class SparkStreamingDataSet(AbstractDataSet): header: True load_args: header: True - """ # pylint: disable=too-many-instance-attributes @@ -60,7 +56,6 @@ def __init__( load_args: Dict[str, Any] = None, ) -> None: """Creates a new instance of SparkStreamingDataSet. - Args: filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks specify ``filepath``s starting with ``/dbfs/``. For message brokers such as @@ -160,7 +155,6 @@ def _load(self) -> DataFrame: """Loads data from filepath. If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args. - Returns: Data from filepath as pyspark dataframe. """ @@ -186,10 +180,8 @@ def _load(self) -> DataFrame: def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. - Args: data: PySpark streaming dataframe for saving - """ output_constructor = data.writeStream.format(self._file_format) @@ -209,7 +201,6 @@ def _save(self, data: DataFrame) -> None: def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
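The README hunk earlier in this commit states that running several streaming nodes needs, besides the PySpark integration hook, a hook that keeps the streaming queries running until one of them raises. That hook is not included in this patch series, so the snippet below is only a sketch of the kind of hook being described, written against the standard PySpark ``StreamingQueryManager`` API; the class name is made up for illustration.

.. code-block:: python

    # Hypothetical sketch of a "keep streaming queries alive" hook;
    # not part of this patch series.
    from kedro.framework.hooks import hook_impl
    from pyspark.sql import SparkSession


    class SparkStreamsHook:
        @hook_impl
        def after_pipeline_run(self) -> None:
            """Block until any active streaming query terminates or fails."""
            spark = SparkSession.builder.getOrCreate()
            spark.streams.awaitAnyTermination()
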
- Args: schema_path: schema of saved streaming dataframe """ diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 994236d3d..4235df999 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,6 +5,7 @@ from typing import NoReturn from kedro.io.core import DataSetError + from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 2e4e2d970..7c7546a85 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,6 +7,7 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str + from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 51279c71c..848020041 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,10 @@ import pytest import requests from kedro.io.core import DataSetError -from kedro_datasets.api import APIDataSet from requests.auth import HTTPBasicAuth +from kedro_datasets.api import APIDataSet + POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] TEST_URL = "http://example.com/api/test" diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 42b3e252f..24666baaf 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.biosequence import BioSequenceDataSet from s3fs.core import S3FileSystem +from kedro_datasets.biosequence import BioSequenceDataSet + LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 3824d6c0f..8475dbf47 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,11 +5,12 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError -from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets.dask import ParquetDataSet + FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 6f97b6c89..100daba52 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.email import EmailMessageDataSet from s3fs.core import S3FileSystem +from kedro_datasets.email import EmailMessageDataSet + @pytest.fixture def filepath_message(tmp_path): diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py 
b/kedro-datasets/tests/geojson/test_geojson_dataset.py index cd6c07c7c..b5f3ec4cb 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,11 +7,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point +from kedro_datasets.geopandas import GeoJSONDataSet + @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index 53ca795f2..f4f91383e 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.holoviews import HoloviewsWriter from s3fs.core import S3FileSystem +from kedro_datasets.holoviews import HoloviewsWriter + @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index dafdd8e3e..621e51fcd 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.json import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.json import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 9fcf09c0c..8fff3edd2 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.svmlight import SVMLightDataSet from s3fs.core import S3FileSystem +from kedro_datasets.svmlight import SVMLightDataSet + @pytest.fixture def filepath_svm(tmp_path): diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index ed4dec348..4086e127e 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,10 +6,11 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version -from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem +from kedro_datasets.matplotlib import MatplotlibWriter + BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index dd589019d..a3a89eca7 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import GMLDataSet from s3fs.core import 
S3FileSystem +from kedro_datasets.networkx import GMLDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 9ff22883e..4e0dcf40d 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import GraphMLDataSet from s3fs.core import S3FileSystem +from kedro_datasets.networkx import GraphMLDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index ed437f69a..4d6e582a8 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.networkx import JSONDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 53a1e7c52..5cc1ee36b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,11 +12,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import CSVDataSet + BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index bae8c5147..1080cc9b6 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import ExcelDataSet + @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index ec995d657..80c1ce678 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import FeatherDataSet + @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index d970db36e..e239dbaba 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,10 @@ import pytest from google.cloud.exceptions import NotFound from 
kedro.io.core import DataSetError -from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet from pandas.testing import assert_frame_equal +from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet + DATASET = "dataset" TABLE_NAME = "table_name" PROJECT = "project" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 2526c1ed6..6f40bb0d4 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,10 +9,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp -from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets.pandas import GenericDataSet + @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index c59e7a104..563ba63d9 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import HDFDataSet + HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 7da50165e..df2e856d5 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index cc62ed203..2d7ce2996 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,11 +7,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem +from kedro_datasets.pandas import ParquetDataSet + FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index b810748c2..308582859 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,6 +6,7 @@ import pytest import sqlalchemy from kedro.io import DataSetError + from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index 65be88174..bd62ea586 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import 
DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import XMLDataSet + @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index 2846201cf..fb95681a3 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pickle import PickleDataSet + @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ed27e3cb9..ea500b20d 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,10 +6,11 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem +from kedro_datasets.pillow import ImageDataSet + @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py b/kedro-datasets/tests/plotly/test_json_dataset.py index 0115a72dd..ab6e17d9c 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.plotly import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.plotly import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index 9b33492bf..a422060e8 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,11 +8,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem +from kedro_datasets.plotly import PlotlyDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 4c0807d91..d79183539 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,11 +12,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.polars import CSVDataSet + BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py 
index ddda22c17..eaa8abbd2 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,10 @@ import pytest import redis from kedro.io import DataSetError -from kedro_datasets.redis import PickleDataSet from pandas.testing import assert_frame_equal +from kedro_datasets.redis import PickleDataSet + @pytest.fixture(params=["pickle"]) def backend(request): diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index d73731df2..2133953b5 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,6 +6,7 @@ try: import snowflake.snowpark as sp + from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 430c78ea2..5cbbe62b7 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,11 +4,12 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet + @pytest.fixture def sample_spark_df(): diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9a3e58035..9452b007d 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,10 +12,6 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -28,6 +24,11 @@ ) from pyspark.sql.utils import AnalysisException +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils + FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index 88c18aee6..e0b8fc333 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,12 +5,13 @@ import pytest from kedro.io import DataSetError -from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark import SparkHiveDataSet + TESTSPARKDIR = "test_spark_dir" diff --git 
a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 73e091ef9..0f3d0e66b 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,6 +2,7 @@ import pytest from kedro.io import DataSetError + from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index a4bee6896..733cc6c1f 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.text import TextDataSet from s3fs.core import S3FileSystem +from kedro_datasets.text import TextDataSet + STRING = "Write to text file." diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 2529868c4..62172b1a4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.tracking import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.tracking import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index ad9f4a1cb..2c1157de9 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.tracking import MetricsDataSet from s3fs.core import S3FileSystem +from kedro_datasets.tracking import MetricsDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 0dd5576dc..7a0a4c87b 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,10 +1,11 @@ from pathlib import Path import pytest -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo + @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index b4428c4df..1ac3d1ce4 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,12 @@ import boto3 import pytest from kedro.io import DataSetError -from kedro_datasets.video import VideoDataSet -from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo from moto import mock_s3 from utils import TEST_FPS, assert_videos_equal +from kedro_datasets.video import VideoDataSet +from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo + S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" S3_FULL_PATH = f"s3://{S3_BUCKET_NAME}/{S3_KEY_PATH}/" diff --git a/kedro-datasets/tests/video/test_video_objects.py 
b/kedro-datasets/tests/video/test_video_objects.py index 3adb701d2..1cb7cca75 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -22,6 +21,8 @@ assert_images_equal, ) +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo + class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 2cadeee7d..653606c17 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.yaml import YAMLDataSet + @pytest.fixture def filepath_yaml(tmp_path): From 66440f4094ea48b0ac6119fb0b284d559e4ad685 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:07:58 +0100 Subject: [PATCH 42/96] lint Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 2 +- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- kedro-docker/features/steps/cli_steps.py | 1 + kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 + 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4d7695e4e..d68db8745 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -199,7 +199,7 @@ def _save(self, data: DataFrame) -> None: .start() ) - def _exists(self, schema_path: str) -> bool: + def custom_exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
Args: schema_path: schema of saved streaming dataframe diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 82b90481c..d782961a2 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -80,4 +80,4 @@ def test_save(self, tmp_path, sample_spark_streaming_df): assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) - assert streaming_ds._exists(schema_path) + assert streaming_ds.custom_exists(schema_path) diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 2c680fd70..0306c1e2f 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,6 +8,7 @@ import behave import yaml from behave import given, then, when + from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index cc8dda1c4..27af7db96 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( + from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module _VERBOSE as verbose, - ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + ) docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index f205c9efe..40b5d9306 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,6 +3,7 @@ import pytest from click import ClickException + from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, From 0ed5b90a8b253860919ac27a4d2eeff73b01d4e3 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:20:37 +0100 Subject: [PATCH 43/96] lint Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d782961a2..a859b7639 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -77,7 +77,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): file_format="json", save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) - assert not streaming_ds._exists(schema_path) + assert not streaming_ds.custom_exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds.custom_exists(schema_path) From 04c623bdfabf9ba4f1ba7b7d022566565d8745a0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 13:31:49 +0100 Subject: [PATCH 44/96] update test cases Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 +++---- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index d68db8745..54b407d84 100644 
--- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -199,16 +199,15 @@ def _save(self, data: DataFrame) -> None: .start() ) - def custom_exists(self, schema_path: str) -> bool: + def _exists(self) -> bool: """Check the existence of pyspark dataframe. Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as schema_file: - schema = StructType.fromJson(json.loads(schema_file.read())) + try: - self._get_spark().readStream.schema(schema).load( + self._get_spark().readStream.schema(self._schema).load( load_path, self._file_format ) except AnalysisException as exception: diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index a859b7639..1794fd54a 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -75,9 +75,10 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds = SparkStreamingDataSet( filepath=filepath_output, file_format="json", + load_args={"schema": {"filepath": schema_path}}, save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) - assert not streaming_ds.custom_exists(schema_path) + assert not streaming_ds.exists() streaming_ds.save(loaded_with_streaming) - assert streaming_ds.custom_exists(schema_path) + assert streaming_ds.exists() From a76f944d5ba9ead0fd7dd0e9a74694fe71f56d24 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 14:26:50 +0100 Subject: [PATCH 45/96] add negative test Signed-off-by: Tingting_Wan --- .../spark/test_spark_streaming_dataset.py | 54 +++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 1794fd54a..d3c72968d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,13 +1,16 @@ +import re import json - +from pathlib import Path import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +SCHEMA_FILE_NAME = "schema.json" def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +32,7 @@ def sample_spark_streaming_df(tmp_path): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) @@ -38,7 +41,7 @@ def sample_spark_streaming_df(tmp_path): class TestStreamingDataSet: def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataSet( filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] @@ -57,7 +60,7 @@ def test_load(self, tmp_path, 
sample_spark_streaming_df): def test_save(self, tmp_path, sample_spark_streaming_df): filepath_json = (tmp_path / "test_streams").as_posix() filepath_output = (tmp_path / "test_streams_output").as_posix() - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( @@ -82,3 +85,46 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + def test_load_options_invalid_schema_file(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() + Path(schemapath).write_text("dummy", encoding="utf-8") + + pattern = ( + f"Contents of 'schema.filepath' ({schemapath}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) + + with pytest.raises(DataSetError, match=re.escape(pattern)): + SparkStreamingDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {"filepath": schemapath}}, + ) + + def test_load_options_invalid_schema(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + + pattern = ( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + with pytest.raises(DataSetError, match=pattern): + SparkStreamingDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {}}, + ) + def test_exists_raises_error(self, mocker): + # exists should raise all errors except for + # AnalysisExceptions clearly indicating a missing file + spark_data_set = SparkStreamingDataSet(filepath="") + mocker.patch.object( + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) + + with pytest.raises(DataSetError, match="Other Exception"): + spark_data_set.exists() \ No newline at end of file From 30b002dd5ffdd6825f834277cd8ca153ac899cb0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 14:44:50 +0100 Subject: [PATCH 46/96] remove code snippets fpr testing Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 31 +++++-------------- .../spark/test_spark_streaming_dataset.py | 10 ++++-- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 54b407d84..98b9cff71 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -139,16 +139,7 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(): - spark_conf_path = "conf/base/spark.yml" - if os.path.exists(spark_conf_path): - with open( - spark_conf_path, encoding="utf-8" - ) as File: # pylint: disable=invalid-name - parameters = yaml.load(File, Loader=SafeLoader) - spark_conf = SparkConf().setAll(parameters.items()) - spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() - else: - spark = SparkSession.builder.getOrCreate() + spark = SparkSession.builder.getOrCreate() return spark def _load(self) -> DataFrame: @@ -158,19 +149,13 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - if self._schema: - input_constructor = ( - self._get_spark() - .readStream.schema(self._schema) - .format(self._file_format) - .options(**self._load_args) - ) - else: - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + input_constructor = ( + self._get_spark() + .readStream + .schema(self._schema) + .format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d3c72968d..5f16dd2f3 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,19 @@ -import re import json +import re from pathlib import Path + import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType - from pyspark.sql.utils import AnalysisException + from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet SCHEMA_FILE_NAME = "schema.json" + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: try: @@ -85,6 +87,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + def test_load_options_invalid_schema_file(self, tmp_path): filepath = (tmp_path / "data").as_posix() schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() @@ -116,6 +119,7 @@ def test_load_options_invalid_schema(self, tmp_path): file_format="csv", load_args={"header": True, "schema": {}}, ) + def test_exists_raises_error(self, mocker): # exists should raise all errors except for # AnalysisExceptions clearly indicating a missing file @@ -127,4 +131,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() \ No newline at end of file + spark_data_set.exists() From 9bef3a2116b17095147c8d8e416e0a20518825c8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 15:06:29 +0100 Subject: [PATCH 47/96] lint Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 98b9cff71..b80a4d6d4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,23 +1,19 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json -import os from copy import deepcopy from pathlib import PurePosixPath from typing import Any, Dict import fsspec -import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) -from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException -from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -151,8 +147,7 @@ def _load(self) -> DataFrame: """ input_constructor = ( self._get_spark() - .readStream - .schema(self._schema) + .readStream.schema(self._schema) 
.format(self._file_format) .options(**self._load_args) ) From 0bb5fe1968bd7dce8f707347e1c3777aabbac0ff Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 17:13:53 +0100 Subject: [PATCH 48/96] update tests Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 4 ---- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b80a4d6d4..5c617809d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -180,10 +180,6 @@ def _save(self, data: DataFrame) -> None: ) def _exists(self) -> bool: - """Check the existence of pyspark dataframe. - Args: - schema_path: schema of saved streaming dataframe - """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) try: diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5f16dd2f3..5c606b676 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -65,18 +65,22 @@ def test_save(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() + # Save the sample json file to temp_path for creating dataframe spark_json_ds = SparkDataSet( filepath=filepath_json, file_format="json", save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) + + # Load the json file as the streaming dataframe loaded_with_streaming = SparkStreamingDataSet( filepath=filepath_json, file_format="json", load_args={"schema": {"filepath": schema_path}}, ).load() + # Append json streams to filepath_output with specified schema path streaming_ds = SparkStreamingDataSet( filepath=filepath_output, file_format="json", From e0ebe2741543c7281bfd354316b5f5558f383df6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:23:39 +0100 Subject: [PATCH 49/96] update test and remove redundacy Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 34 ++----------- .../spark/test_spark_streaming_dataset.py | 48 ------------------- 2 files changed, 3 insertions(+), 79 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 5c617809d..09f01294c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,21 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" -import json from copy import deepcopy from pathlib import PurePosixPath from typing import Any, Dict -import fsspec from kedro.io.core import ( AbstractDataSet, - DataSetError, - get_filepath_str, - get_protocol_and_path, ) from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix +from kedro_datasets.spark.spark_dataset import SparkDataSet + class SparkStreamingDataSet(AbstractDataSet): @@ -97,32 +93,8 @@ def __init__( self._schema = self._load_args.pop("schema", None) if self._schema is not None: if 
isinstance(self._schema, dict): - self._schema = self._load_schema_from_file(self._schema) - - @staticmethod - def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: - filepath = schema.get("filepath") - if not filepath: - raise DataSetError( - "Schema load argument does not specify a 'filepath' attribute. Please" - "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." - ) + self._schema = SparkDataSet._load_schema_from_file(self._schema) - credentials = deepcopy(schema.get("credentials")) or {} - protocol, schema_path = get_protocol_and_path(filepath) - file_system = fsspec.filesystem(protocol, **credentials) - pure_posix_path = PurePosixPath(schema_path) - load_path = get_filepath_str(pure_posix_path, protocol) - - # Open schema file - with file_system.open(load_path, encoding="utf-8") as fs_file: - try: - return StructType.fromJson(json.loads(fs_file.read())) - except Exception as exc: - raise DataSetError( - f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" - f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." - ) from exc def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5c606b676..d8b9a1c77 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,9 @@ import json -import re -from pathlib import Path import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -91,48 +88,3 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() - - def test_load_options_invalid_schema_file(self, tmp_path): - filepath = (tmp_path / "data").as_posix() - schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() - Path(schemapath).write_text("dummy", encoding="utf-8") - - pattern = ( - f"Contents of 'schema.filepath' ({schemapath}) are invalid. Please" - f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." - ) - - with pytest.raises(DataSetError, match=re.escape(pattern)): - SparkStreamingDataSet( - filepath=filepath, - file_format="csv", - load_args={"header": True, "schema": {"filepath": schemapath}}, - ) - - def test_load_options_invalid_schema(self, tmp_path): - filepath = (tmp_path / "data").as_posix() - - pattern = ( - "Schema load argument does not specify a 'filepath' attribute. Please" - "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." 
- ) - - with pytest.raises(DataSetError, match=pattern): - SparkStreamingDataSet( - filepath=filepath, - file_format="csv", - load_args={"header": True, "schema": {}}, - ) - - def test_exists_raises_error(self, mocker): - # exists should raise all errors except for - # AnalysisExceptions clearly indicating a missing file - spark_data_set = SparkStreamingDataSet(filepath="") - mocker.patch.object( - spark_data_set, - "_get_spark", - side_effect=AnalysisException("Other Exception", []), - ) - - with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() From 5bb5766c9425d2019f2a28ecb00ca775927b4752 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:32:28 +0100 Subject: [PATCH 50/96] linting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 09f01294c..84cc17d23 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import ( - AbstractDataSet, -) +from kedro.io.core import AbstractDataSet from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix -from kedro_datasets.spark.spark_dataset import SparkDataSet - +from kedro_datasets.spark.spark_dataset import ( + SparkDataSet, + _split_filepath, + _strip_dbfs_prefix, +) class SparkStreamingDataSet(AbstractDataSet): @@ -95,7 +95,6 @@ def __init__( if isinstance(self._schema, dict): self._schema = SparkDataSet._load_schema_from_file(self._schema) - def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { From 20757812592fcd2af0f063b643ce35e87c779e0f Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 12:08:38 +0100 Subject: [PATCH 51/96] refactor file format Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 13 +++++++++---- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++----------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 910289135..82ca7a041 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -7,6 +7,15 @@ To work with multiple streaming nodes, 2 hook are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception +#### Supported file formats + +Supported file formats are: +- Text +- CSV +- JSON +- ORC +- Parquet + #### Example SparkStreamsHook: ```python @@ -30,9 +39,5 @@ To make the application work with kafka format, respective spark configuration n ```yaml spark.driver.maxResultSize: 3g spark.scheduler.mode: FAIR -spark.sql.streaming.schemaInference: True -spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context -spark.sql.streaming.stateStore.stateSchemaCheck: false # since 
schema is not mentioned explicitly -spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) ``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 84cc17d23..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -74,7 +74,6 @@ def __init__( self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -122,12 +121,7 @@ def _load(self) -> DataFrame: .format(self._file_format) .options(**self._load_args) ) - return ( - input_constructor.load() - if self._file_format - in self.output_format # if the connector type is message broker - else input_constructor.load(self._filepath_) - ) + return input_constructor.load(self._filepath_) def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. @@ -137,14 +131,11 @@ def _save(self, data: DataFrame) -> None: output_constructor = data.writeStream.format(self._file_format) - # for message brokers path is not needed - if self._file_format not in self.output_format: - output_constructor = output_constructor.option("path", self._filepath_) - ( output_constructor.option( "checkpointLocation", self._save_args.pop("checkpoint") ) + .option("path", self._filepath_) .outputMode(self._save_args.pop("output_mode")) .options(**self._save_args) .start() From e8ea0d37a8f0a5e7b248c241d1a18b6cbed45631 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 14:58:04 +0100 Subject: [PATCH 52/96] fix read me file Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 82ca7a041..c134ac2ea 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,6 +10,7 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: + - Text - CSV - JSON From f08dd095845d67d3ce8167a2be88ff4a5b78e93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Thu, 4 May 2023 10:56:14 +0200 Subject: [PATCH 53/96] docs: Add community contributions (#199) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add community contributions Signed-off-by: Juan Luis Cano Rodríguez * Use newer link to docs Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez --- kedro-datasets/RELEASE.md | 5 +++++ kedro-datasets/kedro_datasets/README.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index ddc06407c..bd1d1e73c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -7,6 +7,11 @@ ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. 
+## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [BrianCechmanek](https://github.com/BrianCechmanek) + # Release 1.2.1: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/README.md b/kedro-datasets/kedro_datasets/README.md index 53360c747..8e2344a30 100644 --- a/kedro-datasets/kedro_datasets/README.md +++ b/kedro-datasets/kedro_datasets/README.md @@ -10,7 +10,7 @@ These data descriptions are supported with the APIs of `pandas`, `spark`, `netwo [The Data Catalog](https://kedro.readthedocs.io/en/stable/data/data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. -Here is a full list of [supported data descriptions and APIs](https://kedro.readthedocs.io/en/stable/kedro.datasets.html). +Here is a full list of [supported data descriptions and APIs](https://docs.kedro.org/en/stable/kedro_datasets.html). ## How can I create my own `AbstractDataSet` implementation? From 24bb52741330df4533e60a03daf76c7f6861bc4e Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 18:05:32 +0100 Subject: [PATCH 54/96] adding test for raise error Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d8b9a1c77..67d217d30 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,6 +4,7 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -88,3 +89,16 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + + def test_exists_raises_error(self, mocker): + # exists should raise all errors except for + # AnalysisExceptions clearly indicating a missing file + spark_data_set = SparkStreamingDataSet(filepath="") + mocker.patch.object( + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) + + with pytest.raises(DataSetError, match="Other Exception"): + spark_data_set.exists() From 437e77e7025390338768bc60ce69b9c596a3b2fc Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:23:39 +0100 Subject: [PATCH 55/96] update test and remove redundacy Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 12 ++++++------ .../tests/spark/test_spark_streaming_dataset.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..63632929a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import AbstractDataSet +from kedro.io.core import ( + AbstractDataSet, +) from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils 
import AnalysisException -from kedro_datasets.spark.spark_dataset import ( - SparkDataSet, - _split_filepath, - _strip_dbfs_prefix, -) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix +from kedro_datasets.spark.spark_dataset import SparkDataSet + class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 67d217d30..cc9a5ab4b 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,7 +4,6 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -89,6 +88,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() +<<<<<<< HEAD def test_exists_raises_error(self, mocker): # exists should raise all errors except for @@ -102,3 +102,5 @@ def test_exists_raises_error(self, mocker): with pytest.raises(DataSetError, match="Other Exception"): spark_data_set.exists() +======= +>>>>>>> d1472e2 (update test and remove redundacy) From a3fdbf6fb8880dbaf580493c847cb11d28099322 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:32:28 +0100 Subject: [PATCH 56/96] linting Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 63632929a..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import ( - AbstractDataSet, -) +from kedro.io.core import AbstractDataSet from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix -from kedro_datasets.spark.spark_dataset import SparkDataSet - +from kedro_datasets.spark.spark_dataset import ( + SparkDataSet, + _split_filepath, + _strip_dbfs_prefix, +) class SparkStreamingDataSet(AbstractDataSet): From 9d60f25a55726048e1329d701be32b90c5ef3044 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 12:08:38 +0100 Subject: [PATCH 57/96] refactor file format Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index c134ac2ea..82ca7a041 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,7 +10,6 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: - - Text - CSV - JSON From ced007dbf24641de42a6a24552e9a2c64e594c03 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 14:58:04 +0100 Subject: [PATCH 58/96] fix read me file Signed-off-by: Tom Kurian --- 
kedro-datasets/kedro_datasets/spark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 82ca7a041..c134ac2ea 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,6 +10,7 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: + - Text - CSV - JSON From 0b88324eb0c8bebef9f97505723a76e68d3a68ab Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 18:05:32 +0100 Subject: [PATCH 59/96] adding test for raise error Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index cc9a5ab4b..5abffe3f5 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,6 +4,7 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -88,7 +89,6 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() -<<<<<<< HEAD def test_exists_raises_error(self, mocker): # exists should raise all errors except for @@ -101,6 +101,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() -======= ->>>>>>> d1472e2 (update test and remove redundacy) + spark_data_set.exists() \ No newline at end of file From ed26aadc46f2209360397b6fd3aa67ba98139f98 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 20:20:41 +0100 Subject: [PATCH 60/96] fix readme file Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index c134ac2ea..3979b6935 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,11 +11,11 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -- Text -- CSV -- JSON -- ORC -- Parquet +1. Text +1. CSV +1. JSON +1. ORC +1. Parquet #### Example SparkStreamsHook: From 170b09297b5bfd81f259ce81f24ef816ca121121 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 20:23:09 +0100 Subject: [PATCH 61/96] fix readme Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 3979b6935..a09165f14 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,11 +11,11 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -1. Text -1. CSV -1. JSON -1. ORC -1. 
Parquet +- Text +- CSV +- JSON +- ORC +- Parquet #### Example SparkStreamsHook: From e63a53acd8dd8282c99fbae3ebe1e6837d7c01e1 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 21:33:29 +0100 Subject: [PATCH 62/96] fix conflicts Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..e63893306 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -155,4 +155,4 @@ def _exists(self) -> bool: ): return False raise - return True + return True \ No newline at end of file From d986c7521e833e166c8531747bef7cb44c0888bf Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 21:38:27 +0100 Subject: [PATCH 63/96] fix ci erors Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index e63893306..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -155,4 +155,4 @@ def _exists(self) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5abffe3f5..67d217d30 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -101,4 +101,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() \ No newline at end of file + spark_data_set.exists() From 64232fa609bdf2b0e63d4bc3006dbbd0953d71e8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 5 May 2023 11:07:00 +0100 Subject: [PATCH 64/96] fix lint issue Signed-off-by: Tom Kurian --- kedro-telemetry/kedro_telemetry/plugin.py | 1 + kedro-telemetry/tests/test_masking.py | 1 + kedro-telemetry/tests/test_plugin.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 1027d541d..5eeb4d489 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,6 +22,7 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline + from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 1e674096b..74773e2f4 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,6 +9,7 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata + from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py 
b/kedro-telemetry/tests/test_plugin.py index 9b1a6460b..222bcc914 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,6 +9,8 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline +from pytest import fixture + from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -16,7 +18,6 @@ _check_for_telemetry_consent, _confirm_consent, ) -from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" From 8a61b41798b73ffd4d98b23f82c6b579af9523e1 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Fri, 5 May 2023 14:07:13 +0100 Subject: [PATCH 65/96] update class documentation Signed-off-by: Tom Kurian --- .../spark/spark_streaming_dataset.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..567c4405c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -21,19 +21,16 @@ class SparkStreamingDataSet(AbstractDataSet): data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml raw.new_inventory: - type: spark.SparkStreamingDataSet - filepath: data/01_raw/stream/inventory/ - file_format: json - int.new_inventory: - type: spark.SparkStreamingDataSet - filepath: data/02_intermediate/inventory/ - file_format: csv - save_args: - output_mode: append - checkpoint: data/04_checkpoint/int_new_inventory - header: True - load_args: - header: True + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + save_args: + output_mode: append + checkpoint: data/04_checkpoint/raw_new_inventory + header: True + load_args: + schema: + filepath: data/01_raw/schema/inventory_schema.json """ # pylint: disable=too-many-instance-attributes From 37e66e8c603b48272ec1771181e6162505fc2a53 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:14:44 +0100 Subject: [PATCH 66/96] add additional test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 73 ++++++++++++++++++- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 67d217d30..4d920980f 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,7 +1,9 @@ import json +import boto3 import pytest from kedro.io.core import DataSetError +from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException @@ -10,9 +12,12 @@ from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet SCHEMA_FILE_NAME = "schema.json" +BUCKET_NAME = "test_bucket" +AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} def sample_schema(schema_path): + """read the schema file from json path""" with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) @@ -24,18 +29,51 @@ def sample_schema(schema_path): @pytest.fixture -def 
sample_spark_streaming_df(tmp_path): - schema = StructType( +def sample_spark_df_schema() -> StructType: + """Spark Dataframe schema""" + return StructType( [ StructField("sku", StringType(), True), StructField("new_stock", IntegerType(), True), ] ) + + +@pytest.fixture +def sample_spark_streaming_df(tmp_path, sample_spark_df_schema): + """Create s sample dataframe for streaming""" data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: - json.dump(schema.jsonValue(), f) - return SparkSession.builder.getOrCreate().createDataFrame(data, schema) + json.dump(sample_spark_df_schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame( + data, sample_spark_df_schema + ) + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructType): + """Creates schema file and adds it to mocked S3 bucket.""" + temporary_path = tmp_path / SCHEMA_FILE_NAME + temporary_path.write_text(sample_spark_df_schema.json(), encoding="utf-8") + + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, Key=SCHEMA_FILE_NAME, Body=temporary_path.read_bytes() + ) + return mocked_s3_bucket class TestStreamingDataSet: @@ -57,6 +95,33 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema + @pytest.mark.usefixtures("mocked_s3_schema") + def test_load_options_schema_path_with_credentials( + self, tmp_path, sample_spark_streaming_df + ): + filepath = (tmp_path / "test_streams").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() + + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) + + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={ + "schema": { + "filepath": f"s3://{BUCKET_NAME}/{SCHEMA_FILE_NAME}", + "credentials": AWS_CREDENTIALS, + } + }, + ).load() + + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema + def test_save(self, tmp_path, sample_spark_streaming_df): filepath_json = (tmp_path / "test_streams").as_posix() filepath_output = (tmp_path / "test_streams_output").as_posix() From 07032a8f3c89a8cd51eb59febec403203ed4b3a3 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:36:37 +0100 Subject: [PATCH 67/96] add s3 read test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 4d920980f..38a33337d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -64,6 +64,15 @@ def mocked_s3_bucket(): yield conn +@pytest.fixture +def s3_bucket(): + with mock_s3(): + s3 = boto3.resource("s3", region_name="us-east-1") + bucket_name = "test-bucket" + s3.create_bucket(Bucket=bucket_name) + yield bucket_name + + @pytest.fixture def mocked_s3_schema(tmp_path, mocked_s3_bucket, 
sample_spark_df_schema: StructType): """Creates schema file and adds it to mocked S3 bucket.""" @@ -95,6 +104,28 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema + def test_read_dataframe_from_s3( + self, tmp_path, sample_spark_streaming_df, s3_bucket + ): + + s3_path = f"s3://{s3_bucket}/test-data" + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() + + spark_json_ds = SparkDataSet( + filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) + + streaming_ds = SparkStreamingDataSet( + filepath=s3_path, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() + + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema + @pytest.mark.usefixtures("mocked_s3_schema") def test_load_options_schema_path_with_credentials( self, tmp_path, sample_spark_streaming_df From 2470de19eede465d0de9568850d5dce34c3dbb99 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:49:51 +0100 Subject: [PATCH 68/96] add s3 read test cases Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 38a33337d..cae0742b6 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -108,7 +108,7 @@ def test_read_dataframe_from_s3( self, tmp_path, sample_spark_streaming_df, s3_bucket ): - s3_path = f"s3://{s3_bucket}/test-data" + s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataSet( From c4e0f4e2cc8125ab94463ad74b91a5e22471fa55 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 22:12:44 +0100 Subject: [PATCH 69/96] add s3 read test case Signed-off-by: Tom Kurian --- .../tests/spark/test_spark_streaming_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index cae0742b6..52966deb0 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -111,10 +111,11 @@ def test_read_dataframe_from_s3( s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() - spark_json_ds = SparkDataSet( - filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] - ) - spark_json_ds.save(sample_spark_streaming_df) + # spark_json_ds = SparkDataSet( + # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] + # ) + # spark_json_ds.save(sample_spark_streaming_df) + sample_spark_streaming_df.write.json(s3_path) streaming_ds = SparkStreamingDataSet( filepath=s3_path, From 7e3555e80501076074e0d7eb162058ac34b881e4 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 22:18:58 +0100 Subject: [PATCH 70/96] test s3 read Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 52966deb0..203afcefa 100644 
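The S3 tests above exercise the case where only the schema file lives on S3 (read with the supplied credentials) while the streaming data itself is read from `filepath`. A sketch of the equivalent direct usage outside the test suite is shown below; the bucket, key, local path and credential values are placeholders, not values from the diffs.

```python
# Sketch of the usage covered by test_load_options_schema_path_with_credentials.
# Bucket, key, local path and credential values below are placeholders.
from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet

inventory_stream = SparkStreamingDataSet(
    filepath="data/01_raw/stream/inventory/",        # placeholder local path
    file_format="json",
    load_args={
        "schema": {
            "filepath": "s3://my-bucket/inventory_schema.json",  # placeholder
            "credentials": {"key": "...", "secret": "..."},      # placeholder
        }
    },
)

streaming_df = inventory_stream.load()  # returns a streaming PySpark DataFrame
assert streaming_df.isStreaming
```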
--- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -115,6 +115,7 @@ def test_read_dataframe_from_s3( # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] # ) # spark_json_ds.save(sample_spark_streaming_df) + sample_spark_streaming_df.write.json(s3_path) streaming_ds = SparkStreamingDataSet( From 6a0029dcd20846f5f88e9adaa7b41061c5fc314e Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Wed, 17 May 2023 15:28:18 +0100 Subject: [PATCH 71/96] remove redundant test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 203afcefa..9b91ab56f 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -104,30 +104,6 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema - def test_read_dataframe_from_s3( - self, tmp_path, sample_spark_streaming_df, s3_bucket - ): - - s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" - schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() - - # spark_json_ds = SparkDataSet( - # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] - # ) - # spark_json_ds.save(sample_spark_streaming_df) - - sample_spark_streaming_df.write.json(s3_path) - - streaming_ds = SparkStreamingDataSet( - filepath=s3_path, - file_format="json", - load_args={"schema": {"filepath": schema_path}}, - ).load() - - assert streaming_ds.isStreaming - schema = sample_schema(schema_path) - assert streaming_ds.schema == schema - @pytest.mark.usefixtures("mocked_s3_schema") def test_load_options_schema_path_with_credentials( self, tmp_path, sample_spark_streaming_df From e8f6696efa1f3b015e08d61697b3884b3f97a65b Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 23 May 2023 21:35:56 +0100 Subject: [PATCH 72/96] fix streaming dataset configurations Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++++++------- .../tests/spark/test_spark_streaming_dataset.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 567c4405c..b4c9cb68c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -67,7 +67,6 @@ def __init__( write documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ - self._filepath_ = filepath self._file_format = file_format self._save_args = save_args self._load_args = load_args @@ -102,8 +101,7 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(): - spark = SparkSession.builder.getOrCreate() - return spark + return SparkSession.builder.getOrCreate() def _load(self) -> DataFrame: """Loads data from filepath. @@ -112,27 +110,28 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - input_constructor = ( + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + data_stream_reader = ( self._get_spark() .readStream.schema(self._schema) .format(self._file_format) .options(**self._load_args) ) - return input_constructor.load(self._filepath_) + return data_stream_reader.load(load_path) def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: data: PySpark streaming dataframe for saving """ - + save_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) output_constructor = data.writeStream.format(self._file_format) ( output_constructor.option( "checkpointLocation", self._save_args.pop("checkpoint") ) - .option("path", self._filepath_) + .option("path", save_path) .outputMode(self._save_args.pop("output_mode")) .options(**self._save_args) .start() diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 9b91ab56f..b4e1f0414 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -85,7 +85,7 @@ def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructT return mocked_s3_bucket -class TestStreamingDataSet: +class TestSparkStreamingDataSet: def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() From 9a5ebad1ae4e4d0d64eaf1239f92ac7180b85e08 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 16:51:58 +0100 Subject: [PATCH 73/96] update streaming datasets doc Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b4c9cb68c..0f7e841ed 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -30,10 +30,9 @@ class SparkStreamingDataSet(AbstractDataSet): header: True load_args: schema: - filepath: data/01_raw/schema/inventory_schema.json + filepath: data/01_raw/schema/inventory_schema.json """ - # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -58,7 +57,9 @@ def __init__( It is dependent on the selected file format. You can find a list of read options for each supported format in Spark DataFrame read documentation: - https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, + Please note that a schema is mandatory for a streaming DataFrame if schemaInference + is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. 
You can pass ``mode`` and ``partitionBy`` to specify From eacdd461043beb9bf44342f1d5b237f5fdb86fdb Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 16:58:06 +0100 Subject: [PATCH 74/96] resolve comments re documentation Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index a09165f14..bdc62c9c4 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -3,7 +3,7 @@ ``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. -To work with multiple streaming nodes, 2 hook are required for: +To work with multiple streaming nodes, 2 hooks are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception @@ -11,8 +11,8 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -- Text -- CSV +- Text +- CSV - JSON - ORC - Parquet @@ -33,7 +33,7 @@ class SparkStreamsHook: spark = SparkSession.builder.getOrCreate() spark.streams.awaitAnyTermination() ``` -To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. +To make the application work with Kafka format, the respective spark configuration needs to be added to``conf/base/spark.yml``. #### Example spark.yml: From 68b6e1bfdc17812c143cce2a0b374cc165497a99 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 17:04:10 +0100 Subject: [PATCH 75/96] bugfix lint Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bdc62c9c4..f8df9e94f 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,7 +11,7 @@ To work with multiple streaming nodes, 2 hooks are required for: Supported file formats are: -- Text +- Text - CSV - JSON - ORC From 5b2a479cc5a3fb28ab725cd210bc04a62f7d7dfc Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 17:15:01 +0100 Subject: [PATCH 76/96] update link Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index f8df9e94f..7400c3c47 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -4,7 +4,7 @@ See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
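Since the docstring above notes that a schema is mandatory for a streaming DataFrame unless schema inference is enabled, it may help to see how the JSON file referenced by `load_args.schema.filepath` can be produced. The sketch below mirrors the test fixtures (`StructType.jsonValue()` to write, `StructType.fromJson()` to read back); the field names and the output path are taken from the fixture and the docstring example respectively, not from any one production catalog.

```python
# Sketch: writing and reading back a schema file usable as
# load_args.schema.filepath. Field names mirror the test fixture.
import json

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    [
        StructField("sku", StringType(), True),
        StructField("new_stock", IntegerType(), True),
    ]
)

schema_path = "data/01_raw/schema/inventory_schema.json"  # path from the docstring example

# Serialise the StructType to JSON, as the test fixture does.
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(schema.jsonValue(), f)

# Restore it, as sample_schema() does in the tests.
with open(schema_path, encoding="utf-8") as f:
    restored_schema = StructType.fromJson(json.load(f))

assert restored_schema == schema
```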
To work with multiple streaming nodes, 2 hooks are required for: - - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/integrations/pyspark_integration.html) for details - Running streaming query without termination unless exception #### Supported file formats From b94f2116e312e71161fb5aa12b34b5bf8ea4a79a Mon Sep 17 00:00:00 2001 From: Nok Chan Date: Fri, 26 May 2023 15:43:22 +0100 Subject: [PATCH 77/96] revert the changes on CI Signed-off-by: Nok Chan --- tools/circleci/circleci_release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index e8f5d8449..dd05d4c5a 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,6 +8,7 @@ import requests from requests.structures import CaseInsensitiveDict + from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 9381816f9d7120d7cdc83b60135556199d5a6bef Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Wed, 17 May 2023 16:48:52 +0100 Subject: [PATCH 78/96] test(docker): remove outdated logging-related step (#207) * fixkedro- docker e2e test Signed-off-by: Nok Chan * fix: add timeout to request to satisfy bandit lint --------- Signed-off-by: Nok Chan Co-authored-by: Deepyaman Datta Signed-off-by: Tom Kurian --- kedro-docker/features/docker.feature | 1 - kedro-docker/features/docker_with_spark.feature | 1 - kedro-docker/features/steps/cli_steps.py | 15 --------------- 3 files changed, 17 deletions(-) diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature index 7dffa4541..74580213e 100644 --- a/kedro-docker/features/docker.feature +++ b/kedro-docker/features/docker.feature @@ -3,7 +3,6 @@ Feature: Docker commands in new projects Background: Given I have prepared a config file And I run a non-interactive kedro new using pandas-iris starter - And I have fixed logs write permission And I have installed the project dependencies And I have removed old docker image of test project diff --git a/kedro-docker/features/docker_with_spark.feature b/kedro-docker/features/docker_with_spark.feature index a10116476..012ad2595 100644 --- a/kedro-docker/features/docker_with_spark.feature +++ b/kedro-docker/features/docker_with_spark.feature @@ -3,7 +3,6 @@ Feature: Docker commands in new Spark projects Background: Given I have prepared a config file And I run a non-interactive kedro new using pyspark-iris starter - And I have fixed logs write permission And I have installed the project dependencies And I have removed old docker image of test project diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..30b80f749 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -126,21 +126,6 @@ def create_configuration_file(context): yaml.dump(config, config_file, default_flow_style=False) -@given("I have fixed logs write permission") -def modify_write_permission(context): - """ - Kedro-docker mounts some subdirectories the current directory (like logs, notebooks etc) - into the Docker container. - If you run kedro commands with different users, - they might create files and directories not writable by each other. - So we are fixing the permissions here. 
- """ - (context.root_project_dir / "logs").chmod(0o777) - journal_dir = context.root_project_dir / "logs" / "journals" - journal_dir.mkdir(parents=True, exist_ok=True) - journal_dir.chmod(0o777) - - @given("I run a non-interactive kedro new using {starter_name} starter") def create_project_from_config_file(context, starter_name): """Behave step to run kedro new From 373e166ba7b5eb13c73089f51d0a3b29c3f7f23f Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 18 May 2023 09:12:01 -0400 Subject: [PATCH 79/96] ci: ensure plugin requirements get installed in CI (#208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: install the plugin alongside test requirements * ci: install the plugin alongside test requirements * Update kedro-airflow.yml * Update kedro-datasets.yml * Update kedro-docker.yml * Update kedro-telemetry.yml * Update kedro-airflow.yml * Update kedro-datasets.yml * Update kedro-airflow.yml * Update kedro-docker.yml * Update kedro-telemetry.yml * ci(telemetry): update isort config to correct sort * Don't use profile ¯\_(ツ)_/¯ Signed-off-by: Deepyaman Datta * chore(datasets): remove empty `tool.black` section * chore(docker): remove empty `tool.black` section --------- Signed-off-by: Deepyaman Datta Signed-off-by: Tom Kurian --- .circleci/continue_config.yml | 7 +++---- .github/workflows/check-plugin.yml | 6 +++--- .github/workflows/kedro-airflow.yml | 14 +++++++++----- .github/workflows/kedro-datasets.yml | 14 +++++++++----- .github/workflows/kedro-docker.yml | 14 +++++++++----- .github/workflows/kedro-telemetry.yml | 14 +++++++++----- kedro-datasets/pyproject.toml | 2 -- kedro-docker/pyproject.toml | 2 -- kedro-telemetry/pyproject.toml | 4 +--- 9 files changed, 43 insertions(+), 34 deletions(-) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 82653758e..d339e82c1 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -69,8 +69,7 @@ commands: command: | cd <> pip install git+https://github.com/kedro-org/kedro@main - pip install . - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - run: name: Install pre-commit hooks command: | @@ -177,7 +176,7 @@ commands: command: conda activate kedro_plugins; pip install git+https://github.com/kedro-org/kedro@main - run: name: Install all requirements - command: conda activate kedro_plugins; cd <>; pip install -r test_requirements.txt -U + command: conda activate kedro_plugins; cd <>; pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - run: name: Pip freeze command: conda activate kedro_plugins; pip freeze @@ -323,7 +322,7 @@ jobs: - run: name: Maybe trigger the release workflow command: | - conda activate kedro_plugins; + conda activate kedro_plugins pip install requests ./tools/circleci/circleci_release.py diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml index 4a3cf8827..29266046d 100644 --- a/.github/workflows/check-plugin.yml +++ b/.github/workflows/check-plugin.yml @@ -42,7 +42,7 @@ jobs: - name: Install dependencies run: | cd ${{ inputs.plugin }} - pip install -r test_requirements.txt + pip install . 
-r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - name: pip freeze run: pip freeze - name: Run unit tests for Linux / all plugins @@ -84,7 +84,7 @@ jobs: run: | cd ${{ inputs.plugin }} pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` pip freeze - name: Install pre-commit hooks run: | @@ -121,7 +121,7 @@ jobs: run: | cd ${{ inputs.plugin }} pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - name: pip freeze run: pip freeze - name: Run end to end tests diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index b68fcce30..d4e696061 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-airflow +name: Run checks on Kedro-Airflow on: push: - paths: - - "kedro-airflow/**" + paths-ignore: + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-airflow/**" + paths-ignore: + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index 9ff4802b6..1b25f711b 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-datasets +name: Run checks on Kedro-Datasets on: push: - paths: - - "kedro-datasets/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-datasets/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 1812a3a93..4231ca545 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-docker +name: Run checks on Kedro-Docker on: push: - paths: - - "kedro-docker/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-docker/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index fd75e8a71..ce5b82743 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-telemetry +name: Run checks on Kedro-Telemetry on: push: - paths: - - "kedro-telemetry/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" pull_request: - paths: - - "kedro-telemetry/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" types: [ synchronize ] jobs: diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index a5f494106..11cf1a157 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -27,8 +27,6 @@ include = ["kedro_datasets*"] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro_datasets.__version__"} -[tool.black] - [tool.isort] profile = "black" diff --git 
a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index cdd273509..0a9639956 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -53,8 +53,6 @@ addopts = """ --no-cov-on-fail -ra""" -[tool.black] - [tool.isort] multi_line_output = 3 include_trailing_comma = true diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 0cc754854..81ec2c60b 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -42,6 +42,4 @@ include_trailing_comma = true force_grid_wrap = 0 use_parentheses = true line_length = 88 -known_third_party = "kedro" - -[tool.black] +known_first_party = "kedro_telemetry" From f033b951d0b41d9b60bf7dd0d7ca3c38dcc84745 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Thu, 18 May 2023 14:18:00 +0100 Subject: [PATCH 80/96] ci: Migrate the release workflow from CircleCI to GitHub Actions (#203) * Create check-release.yml * change from test pypi to pypi * split into jobs and move version logic into script * update github actions output * lint * changes based on review * changes based on review * fix script to not append continuously * change pypi api token logic Signed-off-by: Tom Kurian --- .github/workflows/check-release.yml | 93 +++++++++++++++++++ .../github_actions/github_actions_release.py | 54 +++++++++++ 2 files changed, 147 insertions(+) create mode 100644 .github/workflows/check-release.yml create mode 100755 tools/github_actions/github_actions_release.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml new file mode 100644 index 000000000..386810bbd --- /dev/null +++ b/.github/workflows/check-release.yml @@ -0,0 +1,93 @@ +name: Check versions and build-publish + +on: + push: + branches: + - main + +jobs: + check-version: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + - name: Check version + run: python tools/github_actions/github_actions_release.py + - name: Set outputs + id: version_check + run: | + echo "new_release=${{ env.NEW_RELEASE }}" >> $GITHUB_OUTPUT + echo "package_name=${{ env.PACKAGE_NAME }}" >> $GITHUB_OUTPUT + echo "package_version=${{ env.PACKAGE_VERSION }}" >> $GITHUB_OUTPUT + outputs: + new_release: ${{ steps.version_check.outputs.new_release }} + package_name: ${{ steps.version_check.outputs.package_name }} + package_version: ${{ steps.version_check.outputs.package_version }} + + test: + needs: check-version + if: ${{ needs.check-version.outputs.new_release == 'true' }} + uses: ./.github/workflows/check-plugin.yml + with: + plugin: ${{ needs.check-version.outputs.package_name }} + + build-publish: + needs: [check-version, test] + if: ${{ needs.check-version.outputs.new_release == 'true' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: | + export plugin=${{ needs.check-version.outputs.package_name }} + make package + - name: Create GitHub Release + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GH_TAGGING_TOKEN }} + script: | + const package_name = "${{ needs.check-version.outputs.package_name }}" + const package_version = "${{ 
needs.check-version.outputs.package_version }}" + const response = await github.rest.repos.createRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + tag_name: `${package_name}-${package_version}`, + target_commitish: 'main', + name: `${package_name}-${package_version}`, + body: `Release ${package_version}`, + draft: false, + prerelease: false, + }); + return response.data; + - name: Set PyPI token + run: | + if [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-airflow" ]; then + echo 'PYPI_TOKEN=${{ secrets.AIRFLOW_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-datasets" ]; then + echo 'PYPI_TOKEN=${{ secrets.DATASETS_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-docker" ]; then + echo 'PYPI_TOKEN=${{ secrets.DOCKER_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-telemetry" ]; then + echo 'PYPI_TOKEN=${{ secrets.TELEMETRY_PYPI_TOKEN }}' >> $GITHUB_ENV + fi + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: ${{ needs.check-version.outputs.package_name }}/dist + password: ${{ env.PYPI_TOKEN }} + diff --git a/tools/github_actions/github_actions_release.py b/tools/github_actions/github_actions_release.py new file mode 100755 index 000000000..cec1a8b97 --- /dev/null +++ b/tools/github_actions/github_actions_release.py @@ -0,0 +1,54 @@ +import os +import sys +import re +import requests +from pathlib import Path + +VERSION_MATCHSTR = r'\s*__version__\s*=\s*"(\d+\.\d+\.\d+)"' +PACKAGE_PATHS = ( + "kedro-datasets/kedro_datasets", + "kedro-telemetry/kedro_telemetry", + "kedro-airflow/kedro_airflow", + "kedro-docker/kedro_docker", +) + + +def get_package_version(base_path, package_path): + init_file_path = Path(base_path) / package_path / "__init__.py" + match_obj = re.search(VERSION_MATCHSTR, Path(init_file_path).read_text()) + return match_obj.group(1) + + +def check_no_version_pypi(pypi_endpoint, package_name, package_version): + print(f"Check if {package_name} {package_version} is on pypi") + response = requests.get(pypi_endpoint, timeout=10) + if response.status_code == 404: + # Version doesn't exist on Pypi - do release + print(f"Starting the release of {package_name} {package_version}") + return True + else: + print(f"Skipped: {package_name} {package_version} already exists on PyPI") + return False + + +if __name__ == "__main__": + """Check if a package needs to be released""" + base_path = Path() + new_release = "false" + package_name = None + package_version = None + + for package_path in PACKAGE_PATHS: + package_name, _ = package_path.split("/") + package_version = get_package_version(base_path, package_path) + pypi_endpoint = f"https://pypi.org/pypi/{package_name}/{package_version}/json/" + + if check_no_version_pypi(pypi_endpoint, package_name, package_version): + new_release = "true" + break + + env_file = os.getenv('GITHUB_ENV') + with open(env_file, "a") as env_file: + env_file.write(f"NEW_RELEASE={new_release}\n") + if new_release == "true": + env_file.write(f"PACKAGE_NAME={package_name}\nPACKAGE_VERSION={package_version}\n") From 3fdb71c1cfc1d41618fab5c216ac12da785d271e Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Thu, 18 May 2023 14:52:32 +0100 Subject: [PATCH 81/96] build: Relax Kedro bound for `kedro-datasets` (#140) * Less strict pin on Kedro for datasets Signed-off-by: Merel Theisen 
Signed-off-by: Tom Kurian --- kedro-datasets/RELEASE.md | 2 ++ kedro-datasets/pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index bd1d1e73c..2dbee5adc 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,6 +4,8 @@ * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. +* Relaxed Kedro version pin to `>=0.16` + ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 11cf1a157..457c18bc6 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -11,7 +11,7 @@ description = "Kedro-Datasets is where you can find all of Kedro's data connecto requires-python = ">=3.7, <3.11" license = {text = "Apache Software License (Apache 2.0)"} dependencies = [ - "kedro~=0.18.4", + "kedro>=0.16", ] dynamic = ["readme", "version", "optional-dependencies"] From b08aa6f617f783576b75fc84eee3c960e970c7c6 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 18 May 2023 10:37:30 -0400 Subject: [PATCH 82/96] ci: don't run checks on both `push`/`pull_request` (#192) * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` Signed-off-by: Tom Kurian --- .github/workflows/kedro-airflow.yml | 17 ++++++++++------- .github/workflows/kedro-datasets.yml | 17 ++++++++++------- .github/workflows/kedro-docker.yml | 17 ++++++++++------- .github/workflows/kedro-telemetry.yml | 17 ++++++++++------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index d4e696061..ef0c87ef9 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Airflow on: push: + branches: + - main paths-ignore: - - "kedro-datasets/**" - - "kedro-docker/**" - - "kedro-telemetry/**" + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-datasets/**" - - "kedro-docker/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" jobs: airflow-test: diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index 1b25f711b..943453ee7 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Datasets on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-docker/**" - - "kedro-telemetry/**" + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-docker/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" jobs: datasets-test: diff --git a/.github/workflows/kedro-docker.yml 
b/.github/workflows/kedro-docker.yml index 4231ca545..71a77cb24 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Docker on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-telemetry/**" + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" jobs: docker-test: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index ce5b82743..f53841bde 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Telemetry on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-docker/**" + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-docker/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" jobs: telemetry-test: From 148b464ef5df6bb83b5fa69ed14121c49ceb69de Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 19 May 2023 11:54:31 -0400 Subject: [PATCH 83/96] chore: delete extra space ending check-release.yml (#210) Signed-off-by: Tom Kurian --- .github/workflows/check-release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index 386810bbd..916cf70f7 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -90,4 +90,3 @@ jobs: with: packages-dir: ${{ needs.check-version.outputs.package_name }}/dist password: ${{ env.PYPI_TOKEN }} - From be2431c0cec4c316815aec1c9a162a5b38090801 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Fri, 19 May 2023 17:17:41 +0100 Subject: [PATCH 84/96] ci: Create merge-gatekeeper.yml to make sure PR only merged when all tests checked. (#215) * Create merge-gatekeeper.yml * Update .github/workflows/merge-gatekeeper.yml --------- Co-authored-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: Tom Kurian --- .github/workflows/merge-gatekeeper.yml | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/merge-gatekeeper.yml diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml new file mode 100644 index 000000000..be615ecbd --- /dev/null +++ b/.github/workflows/merge-gatekeeper.yml @@ -0,0 +1,27 @@ +name: Merge Gatekeeper + +on: + pull_request: + branches: + - main + - develop + +jobs: + merge-gatekeeper: + runs-on: ubuntu-latest + # Restrict permissions of the GITHUB_TOKEN. + # Docs: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs + permissions: + checks: read + statuses: read + steps: + - name: Run Merge Gatekeeper + # NOTE: v1 is updated to reflect the latest v1.x.y. 
Please use any tag/branch that suits your needs: + # https://github.com/upsidr/merge-gatekeeper/tags + # https://github.com/upsidr/merge-gatekeeper/branches + uses: upsidr/merge-gatekeeper@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + timeout: 1800 + interval: 30 + From 74a211f8b774fb6813579552e2e4e2b280121147 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Fri, 19 May 2023 19:00:51 +0100 Subject: [PATCH 85/96] ci: Remove the CircleCI setup (#209) * remove circleci setup files and utils * remove circleci configs in kedro-telemetry * remove redundant .github in kedro-telemetry * Delete continue_config.yml * Update check-release.yml * lint * increase timeout to 40 mins for docker e2e tests Signed-off-by: Tom Kurian --- .circleci/config.yml | 38 -- .circleci/continue_config.yml | 516 ------------------ .github/workflows/merge-gatekeeper.yml | 3 +- kedro-telemetry/.circleci/config.yml | 131 ----- .../.github/ISSUE_TEMPLATE/bug-report.md | 41 -- .../.github/ISSUE_TEMPLATE/feature-request.md | 20 - .../.github/PULL_REQUEST_TEMPLATE.md | 13 - tools/circleci/circleci_release.py | 78 --- tools/circleci/github_release.py | 52 -- tools/circleci/utils/check_no_version_pypi.py | 13 - tools/circleci/utils/package_version.py | 21 - 11 files changed, 1 insertion(+), 925 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 .circleci/continue_config.yml delete mode 100644 kedro-telemetry/.circleci/config.yml delete mode 100644 kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md delete mode 100644 kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md delete mode 100644 kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md delete mode 100755 tools/circleci/circleci_release.py delete mode 100755 tools/circleci/github_release.py delete mode 100644 tools/circleci/utils/check_no_version_pypi.py delete mode 100644 tools/circleci/utils/package_version.py diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index b8a27e1c3..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,38 +0,0 @@ -version: 2.1 - -parameters: - release_package: - type: string - default: "" - release_version: - type: string - default: "" - -setup: true - -# the path-filtering orb is required to continue a pipeline based on -# the path of an updated fileset -orbs: - path-filtering: circleci/path-filtering@0.1.1 - -workflows: - always-run: - jobs: - # the path-filtering/filter job determines which pipeline - # parameters to update. - - path-filtering/filter: - name: check-updated-files - # 3-column, whitespace-delimited mapping. One mapping per - # line: - # - mapping: | - kedro-telemetry/.* run-build-kedro-telemetry true - kedro-docker/.* run-build-kedro-docker true - kedro-airflow/.* run-build-kedro-airflow true - kedro-datasets/.* run-build-kedro-datasets true - base-revision: main - # this is the path of the configuration we should trigger once - # path filtering and pipeline parameter value updates are - # complete. In this case, we are using the parent dynamic - # configuration itself. 
- config-path: .circleci/continue_config.yml diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml deleted file mode 100644 index d339e82c1..000000000 --- a/.circleci/continue_config.yml +++ /dev/null @@ -1,516 +0,0 @@ -version: 2.1 - -orbs: - win: circleci/windows@2.4.1 - -# the default pipeline parameters, which will be updated according to -# the results of the path-filtering orb -parameters: - run-build-kedro-telemetry: - type: boolean - default: false - run-build-kedro-docker: - type: boolean - default: false - run-build-kedro-airflow: - type: boolean - default: false - run-build-kedro-datasets: - type: boolean - default: false - release_package: - type: string - default: "" - release_version: - type: string - default: "" - -commands: - setup_conda: - parameters: - python_version: - type: string - steps: - - run: - name: Cleanup pyenv - command: sudo rm -rf .pyenv/ /opt/circleci/.pyenv/ - - run: - name: Download and install miniconda - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - - run: - name: Create conda environment with correct python version - command: | - . /home/circleci/miniconda/etc/profile.d/conda.sh - conda create --name kedro_plugins python=<> -y - - run: - name: Setup bash env to run conda activation at each step - command: | - echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV - echo "conda deactivate; conda activate kedro_plugins" >> $BASH_ENV - source $BASH_ENV - - setup_requirements: - parameters: - plugin: - type: string - steps: - - run: - name: Install pip setuptools - command: make install-pip-setuptools - - run: - # pytables does not work properly with python 3.9 to handle our HDFDataSet - # if pip-installed, so we install this dependency via conda - name: Install pytables - command: conda install -c conda-forge pytables -y - - run: - name: Install kedro and test requirements - command: | - cd <> - pip install git+https://github.com/kedro-org/kedro@main - pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - - run: - name: Install pre-commit hooks - command: | - cd <> - pre-commit install --install-hooks - pre-commit install --hook-type pre-push - - run: - # this is needed to fix java cacerts so - # spark can automatically download packages from mvn - # https://stackoverflow.com/a/50103533/1684058 - name: Fix cacerts - command: | - sudo rm /etc/ssl/certs/java/cacerts - sudo update-ca-certificates -f - - run: - # Since recently Spark installation for some reason does not have enough permissions to execute - # /home/circleci/miniconda/envs/kedro_plugins/lib/python3.X/site-packages/pyspark/bin/spark-class. - # So fixing it manually here. 
- name: Fix Spark permissions - command: sudo chmod -R u+x /home/circleci/miniconda/envs/kedro_plugins/lib/ - - run: - name: Pip freeze - command: pip freeze - - setup: - parameters: - python_version: - type: string - plugin: - type: string - steps: - - checkout - - setup_conda: - python_version: <> - - setup_requirements: - plugin: <> - - # Windows specific commands - win_setup_conda: - # Miniconda3 is pre-installed on the machine: - # https://circleci.com/docs/2.0/hello-world-windows - parameters: - python_version: - type: string - steps: - - run: - name: Initialize conda - command: conda init powershell - - run: - name: Create 'kedro_plugins' conda environment - command: conda create --name kedro_plugins python=<> -y - - - win_setup_env: - steps: - - run: - # Required for Tensorflow tests - name: Install Microsoft Visual C++ Redistributable - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://aka.ms/vs/16/release/vc_redist.x64.exe -OutFile vc_redist.x64.exe - .\vc_redist.x64.exe /S /v/qn - - run: - name: Install Java 8 - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u252-b09/OpenJDK8U-jdk_x64_windows_8u252b09.zip -OutFile OpenJDK8U.zip - Expand-Archive .\OpenJDK8U.zip -DestinationPath C:\OpenJDK8U - - run: - name: Create Inbound rules for Java - command: | - New-NetFirewallRule -DisplayName "Allow JDK UDP" -Profile "Public" -Protocol "UDP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - New-NetFirewallRule -DisplayName "Allow JDK TCP" -Profile "Public" -Protocol "TCP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - - run: - name: Set Java environment variables - command: | - [Environment]::SetEnvironmentVariable("Path", [Environment]::GetEnvironmentVariable('Path', 'Machine') + ";C:\OpenJDK8U\openjdk-8u252-b09\bin", "Machine") - setx /m JAVA_HOME "C:\OpenJDK8U\openjdk-8u252-b09" - - run: - name: Setup Hadoop binary - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/steveloughran/winutils/raw/master/hadoop-2.6.3/bin/winutils.exe -OutFile winutils.exe - New-Item -ItemType directory -Path C:\hadoop\bin - mv .\winutils.exe C:\hadoop\bin - setx /m HADOOP_HOME "C:\hadoop\" - - run: - name: Install 'make' command - command: choco install make - - - win_setup_requirements: - parameters: - plugin: - type: string - python_version: - type: string - steps: - - run: - name: Install GDAL, Fiona and pytables - command: conda activate kedro_plugins; conda install gdal fiona pytables -c conda-forge -y - - run: - name: Install Kedro - command: conda activate kedro_plugins; pip install git+https://github.com/kedro-org/kedro@main - - run: - name: Install all requirements - command: conda activate kedro_plugins; cd <>; pip install . 
-r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - - run: - name: Pip freeze - command: conda activate kedro_plugins; pip freeze - -jobs: - unit_tests: - parameters: - python_version: - type: string - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> - plugin: <> - - run: - name: Run unit tests - command: make plugin=<> test - - e2e_tests: - parameters: - python_version: - type: string - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> - plugin: <> - - run: - name: Run e2e tests - command: make plugin=<> e2e-tests - - lint: - parameters: - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: "3.8" - plugin: <> - - run: - name: Run pylint and flake8 - command: make plugin=<> lint - - win_unit_tests: - parameters: - python_version: - type: string - plugin: - type: string - executor: - name: win/default - steps: - - checkout - - win_setup_conda: - python_version: <> - - win_setup_env - - win_setup_requirements: - plugin: <> - python_version: <> - # For anything not `kedro-datasets` - - unless: - condition: - equal: ["kedro-datasets", <>] - - # e2e tests are not currently runnable on CircleCI on Windows as - # those require the ability to run Linux containers: - # "The Windows executor currently only supports Windows containers. - # Running Linux containers on Windows is not possible for now" - # (from https://circleci.com/docs/2.0/hello-world-windows/) - steps: - - run: - name: Run unit tests - command: | - conda activate kedro_plugins - cd <> - pytest tests - - - run: - # geopandas and tensorflow conflicts when imported simultaneously. - # The HDF5 header files used to compile this application do not match - # the version used by the HDF5 library to which this application is linked. - # Data corruption or segmentation faults may occur if the application continues. - # This can happen when an application was compiled by one version of HDF5 but - # linked with a different version of static or shared HDF5 library. - # You should recompile the application or check your shared library related - # settings such as 'LD_LIBRARY_PATH'. - # You can, at your own risk, disable this warning by setting the environment - # variable 'HDF5_DISABLE_VERSION_CHECK' to a value of '1'. - # Setting it to 2 or higher will suppress the warning messages totally. 
- name: Set HDF5_DISABLE_VERSION_CHECK environment variable - command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - when: - condition: - and: - - not: - equal: [ "3.10", <> ] - - equal: [ "kedro-datasets", <> ] - steps: - - run: - name: Run unit tests without spark in parallel - command: conda activate kedro_plugins; make test-no-spark - - when: - condition: - and: - - equal: [ "3.10", <> ] - - equal: [ "kedro-datasets", <> ] - steps: - - run: - name: Run unit tests without spark sequentially - command: conda activate kedro_plugins; make test-no-spark-sequential - - sync: - parameters: - python_version: - type: string - docker: - # https://circleci.com/docs/2.0/circleci-images/#circleci-base-image - - image: cimg/base:2020.01 - steps: - - checkout - - add_ssh_keys - - run: - name: Set git email and name - command: | - git config --global user.email "kedro@kedro.com" - git config --global user.name "Kedro" - # - run: - # name: Trigger Read The Docs build - # command: ./tools/circleci/rtd-build.sh ${RTD_TOKEN} latest - - setup_conda: - python_version: <> - - run: - name: Maybe trigger the release workflow - command: | - conda activate kedro_plugins - pip install requests - ./tools/circleci/circleci_release.py - - - # This is effectively just a combination of the lint, unit_tests and e2e_tests jobs. - # It's used to check that the nightly docker image is working ok and before publishing a release. - build_package: - parameters: - python_version: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> # Just need one Python version here - plugin: <> - - run: - name: Run linters - command: export plugin=<>; make lint - - unless: - condition: - equal: ["3.10", <>] - steps: - - run: - name: Run unit tests in parallel - command: export plugin=<>; make test - - when: - condition: - equal: [ "3.10", <> ] - steps: - - run: - name: Run unit tests sequentially - command: export plugin=<>; make test-sequential - - run: - name: Run e2e tests - command: make plugin=<> e2e-tests - - publish_package: - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - run: - name: Print the release package and version - command: | - echo "Release package: <> <>" - - setup: - python_version: "3.8" # Just need one Python version here - plugin: <> # From circle_release.py - - add_ssh_keys - - run: - name: Tag and publish release on Github - command: ./tools/circleci/github_release.py <> <> - - run: - name: Publish to PyPI - command: | - export plugin=<> - make package - make pypi - - -workflows: - # when pipeline parameter, run-build-kedro-telemetry is true, the - # kedro-telemetry job is triggered. - kedro-telemetry: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-telemetry" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-telemetry" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-telemetry" - # when pipeline parameter, run-build-kedro-docker is true, the - # kedro-docker job is triggered. 
- kedro-docker: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - e2e_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-docker" - # when pipeline parameter, run-build-kedro-airflow is true, the - # kedro-airflow job is triggered. - kedro-airflow: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - pre-steps: - - run: - name: Avoid GPL dependency (unidecode) - command: echo 'export SLUGIFY_USES_TEXT_UNIDECODE=yes' >> $BASH_ENV - - e2e_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-airflow" - # when pipeline parameter, run-build-kedro-datasets is true, the - # kedro-datasets job is triggered. - kedro-datasets: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-datasets" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-datasets" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-datasets" - - # For release - main_updated: - when: - and: - - not: <> - - not: <> - jobs: - - sync: - filters: - branches: - only: main - matrix: - # We just need one Python enviornment to trigger the job - parameters: - python_version: ["3.8"] - - package_release: - when: - and: - - <> - - <> - jobs: - - build_package: - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - publish_package: - requires: - - build_package diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml index be615ecbd..77ad752de 100644 --- a/.github/workflows/merge-gatekeeper.yml +++ b/.github/workflows/merge-gatekeeper.yml @@ -22,6 +22,5 @@ jobs: uses: upsidr/merge-gatekeeper@v1 with: token: ${{ secrets.GITHUB_TOKEN }} - timeout: 1800 + timeout: 2400 interval: 30 - diff --git a/kedro-telemetry/.circleci/config.yml b/kedro-telemetry/.circleci/config.yml deleted file mode 100644 index d0915db48..000000000 --- a/kedro-telemetry/.circleci/config.yml +++ /dev/null @@ -1,131 +0,0 @@ -version: 2.1 - -orbs: - win: circleci/windows@2.4.0 - -commands: - # Windows-related commands - win_setup_conda: - # Miniconda3 is pre-installed on the machine: - # https://circleci.com/docs/2.0/hello-world-windows/ - description: Setup conda - steps: - - run: - name: Initialize conda - command: conda init powershell - - run: - name: Create 'kedro-telemetry' conda environment - command: | - conda create --name kedro-telemetry python=$env:CONDA_ENV_PY_VERSION -y - - win_setup_requirements: - description: Install kedro-telemetry dependencies - steps: - - run: - name: Install kedro-telemetry dependencies - command: | - conda activate kedro-telemetry - python -m pip install -U pip setuptools wheel - pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt -U - - win_build: - description: Run build on Windows - steps: - - checkout - - win_setup_conda - - win_setup_requirements - - run: - name: Run unit tests - 
command: | - conda activate kedro-telemetry - pytest .\tests - -jobs: - build_36: &DEFAULT - machine: - # Don't use 2018 image: https://discuss.circleci.com/t/24639/18 - image: circleci/classic:201711-01 - docker_layer_caching: true - environment: - CONDA_ENV_PY_VERSION: "3.6" - steps: - - checkout - - run: - name: Create virtual env - command: | - # Get rid of pyenv stuff - sudo rm -rf .pyenv/ /opt/circleci/.pyenv/ - # Download and install miniconda - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - # Create an anaconda virtualenv for ${CONDA_ENV_PY_VERSION} and make that the default python interpreter - echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV - echo "conda deactivate; conda activate kedro-telemetry" >> $BASH_ENV - . /home/circleci/miniconda/etc/profile.d/conda.sh - conda create --name kedro-telemetry python=${CONDA_ENV_PY_VERSION} -y - source $BASH_ENV - - run: - name: Pip install dependencies - command: | - make install-pip-setuptools - pip install git+https://github.com/kedro-org/kedro - pip install -r test_requirements.txt - pre-commit install --install-hooks - pre-commit install --hook-type pre-push - - run: - name: Run pylint and flake8 - command: | - make lint - - run: - name: Run tests - command: | - make test - - build_37: - <<: *DEFAULT - environment: - CONDA_ENV_PY_VERSION: 3.7 - - build_38: - <<: *DEFAULT - environment: - CONDA_ENV_PY_VERSION: 3.8 - - # Windows-related jobs - win_build_36: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.6" - steps: [win_build] - - win_build_37: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.7" - steps: - - win_build - - win_build_38: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.8" - steps: - - win_build - -workflows: - version: 2 - regular: - jobs: - - build_36 - - build_37 - - build_38 - - win_build_36 - - win_build_37 - - win_build_38 diff --git a/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md b/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md deleted file mode 100644 index f89bf88bf..000000000 --- a/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -name: Bug report -about: If something isn't working -title: '' -labels: 'Issue: Bug Report' -assignees: '' - ---- - -## Description -Short description of the problem here. - -## Context -How has this bug affected you? What were you trying to accomplish? - -## Steps to Reproduce -1. [First Step] -2. [Second Step] -3. [And so on...] - -## Expected Result -Tell us what should happen. - -## Actual Result -Tell us what happens instead. - -``` --- If you received an error, place it here. -``` - -``` --- Separate them if you have more than one. 
-``` - -## Your Environment -Include as many relevant details about the environment in which you experienced the bug: - -* Kedro-telemetry version used (`pip show kedro-telemetry`): -* Kedro version used (`pip show kedro` or `kedro -V`): -* Python version used (`python -V`): -* Operating system and version: diff --git a/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md b/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md deleted file mode 100644 index a7911c2f1..000000000 --- a/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Feature request -about: Let us know if you have a feature request or enhancement -title: '<Title>' -labels: 'Issue: Feature Request' -assignees: '' - ---- - -## Description -Is your feature request related to a problem? A clear and concise description of what the problem is: "I'm always frustrated when ..." - -## Context -Why is this change important to you? How would you use it? How can it benefit other users? - -## Possible Implementation -(Optional) Suggest an idea for implementing the addition or change. - -## Possible Alternatives -(Optional) Describe any alternative solutions or features you've considered. diff --git a/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md b/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 9efb5746d..000000000 --- a/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,13 +0,0 @@ -## Description -<!-- Why was this PR created? --> - -## Development notes -<!-- What have you changed, and how has this been tested? --> - -## Checklist - -- [ ] Read the [contributing](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md) guidelines -- [ ] Opened this PR as a 'Draft Pull Request' if it is work-in-progress -- [ ] Updated the documentation to reflect the code changes -- [ ] Added a description of this change in the [`RELEASE.md`](https://github.com/kedro-org/kedro/blob/main/RELEASE.md) file -- [ ] Added tests to cover my changes diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py deleted file mode 100755 index dd05d4c5a..000000000 --- a/tools/circleci/circleci_release.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -""" -CircleCI pipeline to check if it needs to trigger a release -""" - -import os -import sys - -import requests -from requests.structures import CaseInsensitiveDict - -from utils.check_no_version_pypi import check_no_version_pypi -from utils.package_version import get_package_version - -PACKAGE_PATHS = ( - "kedro-datasets/kedro_datasets", - "kedro-telemetry/kedro_telemetry", - "kedro-airflow/kedro_airflow", - "kedro-docker/kedro_docker", -) -PROJECT_SLUG = "github/kedro-org/kedro-plugins" -# CIRCLE_BRANCH = "feat/cicd-auto-release" -CIRCLE_BRANCH = os.environ.get("CIRCLE_BRANCH") - - -def circleci_release(project_slug, payload, circle_endpoint, circle_release_token): - """Trigging the CircleCI Release Pipeline""" - # See https://circleci.com/docs/2.0/api-developers-guide - print("Starting the CircleCI Release Pipeline") - CIRCLE_ENDPOINT = f"https://circleci.com/api/v2/project/{project_slug}/pipeline" - - headers = CaseInsensitiveDict() - headers["Content-Type"] = "application/json" - headers["Circle-Token"] = circle_release_token - - resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) - return resp - - -if __name__ == "__main__": - """Trigger the CircleCI Release Process""" - from pathlib import Path - - # Personal API Tokens - 
https://circleci.com/docs/managing-api-tokens - CIRCLE_RELEASE_TOKEN = os.environ.get("CIRCLE_RELEASE_TOKEN") - if not CIRCLE_RELEASE_TOKEN: - raise ValueError("CIRCLE_RELEASE_TOKEN is not defined as envionrmnet variable.") - - base_path = Path() - # Loop for all 4 repositories - for package_path in PACKAGE_PATHS: - package_name, _ = package_path.split("/") - package_version = get_package_version(base_path, package_path) - pypi_endpoint = f"https://pypi.org/pypi/{package_name}/{package_version}/json/" - circleci_endpoint = ( - f"https://circleci.com/api/v2/project/{PROJECT_SLUG}/pipeline" - ) - payload = { - "branch": CIRCLE_BRANCH, - "parameters": { - "release_package": package_name, - "release_version": package_version, - }, - } - - print(package_name, package_version) - if check_no_version_pypi(pypi_endpoint, package_name, package_version): - res = circleci_release( - PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN - ) - print(f"Status Code: {resp.status_code}") - if resp.status_code == 201: - print("Creating CircleCI Pipeline successfully") - else: - print("Failed to create CircleCI Pipeline") - print(resp.content) - if resp.status_code != 201: - sys.exit(1) diff --git a/tools/circleci/github_release.py b/tools/circleci/github_release.py deleted file mode 100755 index d5bc3115c..000000000 --- a/tools/circleci/github_release.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -import requests -from requests.structures import CaseInsensitiveDict - -GITHUB_USER = "kedro-org" -GITHUB_REPO = "kedro-plugins" -# On GitHub select "Settings" > "Developer Setting" -> "Personal access Token"" -GITHUB_TAGGING_TOKEN = os.environ.get("GITHUB_TAGGING_TOKEN") - - -def github_release( - package_name, - version, - github_user=GITHUB_USER, - github_repo=GITHUB_REPO, - github_tagging_token=GITHUB_TAGGING_TOKEN, -): - """Trigger the GitHub Release to create artifacts and tags""" - print("Starting GitHub Release") - - github_endpoint = ( - f"https://api.github.com/repos/{github_user}/{github_repo}/releases" - ) - payload = { - "tag_name": f"{package_name}-{version}", # kedro-datasets 0.0.1 - "target_commitish": "main", - "name": f"{version}", - "body": f"Release {version}", - "draft": False, - "prerelease": False, - } - - headers = CaseInsensitiveDict() - headers["Content-Type"] = "application/json" - headers["Authorization"] = f"token {github_tagging_token}" - resp = requests.post(github_endpoint, headers=headers, json=payload, timeout=10) - if resp.status_code == 200: - print("Create GitHub release successfully") - print(resp.content) - else: - print("Failed to create Github release") - print(resp.content) - return resp - - -if __name__ == "__main__": - package_name = sys.argv[1] - package_version = sys.argv[2] - res = github_release(package_name, package_version) diff --git a/tools/circleci/utils/check_no_version_pypi.py b/tools/circleci/utils/check_no_version_pypi.py deleted file mode 100644 index 777f09c9a..000000000 --- a/tools/circleci/utils/check_no_version_pypi.py +++ /dev/null @@ -1,13 +0,0 @@ -import requests - - -def check_no_version_pypi(pypi_endpoint, package_name, package_version): - print("Check if {package_name} {package_version} is on pypi") - response = requests.get(pypi_endpoint, timeout=10) - if response.status_code == 404: - # Not exist on Pypi - do release - print(f"Starting the release of {package_name} {package_version}") - return True - else: - print(f"Skipped: {package_name} {package_version} already exists on PyPI") - return False 
diff --git a/tools/circleci/utils/package_version.py b/tools/circleci/utils/package_version.py deleted file mode 100644 index 48de594ff..000000000 --- a/tools/circleci/utils/package_version.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 -""" -Get version of Kedro -""" - -import re -from pathlib import Path - -VERSION_MATCHSTR = r'\s*__version__\s*=\s*"(\d+\.\d+\.\d+)"' - - -def get_package_version(base_path, package_path): - init_file_path = Path(base_path) / package_path / "__init__.py" - match_obj = re.search(VERSION_MATCHSTR, Path(init_file_path).read_text()) - return match_obj.group(1) - - -if __name__ == "__main__": - base_path = Path() - package_path = "kedro-datasets/kedro_datasets" - print(get_package_version(base_path, package_path)) From 9d7820a79c0f0d3bee7d746570fad6cf86bfe466 Mon Sep 17 00:00:00 2001 From: McDonnellJoseph <90898184+McDonnellJoseph@users.noreply.github.com> Date: Mon, 22 May 2023 12:01:29 +0200 Subject: [PATCH 86/96] feat: Dataset API add `save` method (#180) * [FEAT] add save method to APIDataset Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] create save_args parameter for api_dataset Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] add tests for socket + http errors Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] check save data is json Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] clean code Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] handle different data types Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] test coverage for exceptions Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] add examples in APIDataSet docstring Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync ParquetDataSet Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync Test Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Linting Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync release notes Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> --------- Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] remove support for delete method Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] lint files Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] fix conflicts Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] remove fail save test Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] review suggestions Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] fix tests Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] reorder arguments Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> --------- Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Co-authored-by: jmcdonnell <jmcdonnell@fieldbox.ai> Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 10 +- .../kedro_datasets/api/api_dataset.py | 138 ++++++++++++++---- .../kedro_datasets/pandas/generic_dataset.py | 2 - .../spark/spark_jdbc_dataset.py | 1 - kedro-datasets/tests/api/test_api_dataset.py | 138 ++++++++++++++++-- 5 files 
changed, 246 insertions(+), 43 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 2dbee5adc..fd2a755ca 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,6 +3,8 @@ ## Major features and improvements: * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). +* Added a save method to the APIDataSet + * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` @@ -42,10 +44,10 @@ Many thanks to the following Kedroids for contributing PRs to this release: * Added the following new datasets: -| Type | Description | Location | -| ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | -| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | -| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | +| Type | Description | Location | +| -------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -------------------------- | +| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | +| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index cb8f80d37..b1b93a7eb 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -1,7 +1,9 @@ """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ """ -from typing import Any, Dict, List, NoReturn, Tuple, Union +import json as json_ # make pylint happy +from copy import deepcopy +from typing import Any, Dict, List, Tuple, Union import requests from kedro.io.core import AbstractDataSet, DataSetError @@ -14,11 +16,10 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): - """``APIDataSet`` loads the data from HTTP(S) APIs. + """``APIDataSet`` loads/saves data from/to HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ - Example usage for the - `YAML API <https://kedro.readthedocs.io/en/stable/data/\ + Example usage for the `YAML API <https://kedro.readthedocs.io/en/stable/data/\ data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml @@ -34,10 +35,8 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): agg_level_desc: STATE, year: 2000 - Example usage for the - `Python API <https://kedro.readthedocs.io/en/stable/data/\ - data_catalog.html#use-the-data-catalog-with-the-code-api>`_: - :: + Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: >>> from kedro.extras.datasets.api import APIDataSet >>> @@ -57,49 +56,101 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> credentials=("username", "password") >>> ) >>> data = data_set.load() + + ``APIDataSet`` can also be used to save output on a remote server using HTTP(S) + methods. + + >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' + + >>> data_set = APIDataSet( + method = "POST" + url = "url_of_remote_server", + save_args = {"chunk_size":1} + ) + >>> data_set.save(example_table) + + On initialisation, we can specify all the necessary parameters in the save args + dictionary. The default HTTP(S) method is POST but PUT is also supported. Two + important parameters to keep in mind are timeout and chunk_size. `timeout` defines + how long our program waits for a response after a request. `chunk_size`, is only + used if the input of save method is a list. It will divide the request into chunks + of size `chunk_size`. For example, here we will send two requests each containing + one row of our example DataFrame. + If the data passed to the save method is not a list, ``APIDataSet`` will check if it + can be loaded as JSON. If true, it will send the data unchanged in a single request. + Otherwise, the ``_save`` method will try to dump the data in JSON format and execute + the request. """ + DEFAULT_SAVE_ARGS = { + "params": None, + "headers": None, + "auth": None, + "json": None, + "timeout": 60, + "chunk_size": 100, + } + # pylint: disable=too-many-arguments + def __init__( self, url: str, method: str = "GET", load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, credentials: Union[Tuple[str, str], List[str], AuthBase] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. Args: url: The API URL endpoint. - method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... + method: The method of the request. GET, POST, PUT are the only supported + methods load_args: Additional parameters to be fed to requests.request. https://requests.readthedocs.io/en/latest/api/#requests.request credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or list. - An ``AuthBase`` instance can be provided for more complex cases. + Expected format is ``('login', 'password')`` if given as a tuple or + list. An ``AuthBase`` instance can be provided for more complex cases. + save_args: Options for saving data on server. Includes all parameters used + during load method. Adds an optional parameter, ``chunk_size`` which + determines the size of the package sent at each request. Raises: - ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. + ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are + specified. 
""" super().__init__() - self._load_args = load_args or {} - self._load_args_auth = self._load_args.pop("auth", None) + # GET method means load + if method == "GET": + self._params = load_args or {} + + # PUT, POST, DELETE means save + elif method in ["PUT", "POST"]: + self._params = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._params.update(save_args) + self._chunk_size = self._params.pop("chunk_size", 1) + else: + raise ValueError("Only GET, POST and PUT methods are supported") + + self._param_auth = self._params.pop("auth", None) - if credentials is not None and self._load_args_auth is not None: + if credentials is not None and self._param_auth is not None: raise ValueError("Cannot specify both auth and credentials.") - self._auth = credentials or self._load_args_auth + self._auth = credentials or self._param_auth - if "cert" in self._load_args: - self._load_args["cert"] = self._convert_type(self._load_args["cert"]) + if "cert" in self._params: + self._params["cert"] = self._convert_type(self._params["cert"]) - if "timeout" in self._load_args: - self._load_args["timeout"] = self._convert_type(self._load_args["timeout"]) + if "timeout" in self._params: + self._params["timeout"] = self._convert_type(self._params["timeout"]) self._request_args: Dict[str, Any] = { "url": url, "method": method, "auth": self._convert_type(self._auth), - **self._load_args, + **self._params, } @staticmethod @@ -131,11 +182,48 @@ def _execute_request(self, session: Session) -> requests.Response: return response def _load(self) -> requests.Response: - with sessions.Session() as session: - return self._execute_request(session) + if self._request_args["method"] == "GET": + with sessions.Session() as session: + return self._execute_request(session) + + raise DataSetError("Only GET method is supported for load") + + def _execute_save_with_chunks( + self, + json_data: List[Dict[str, Any]], + ) -> requests.Response: + chunk_size = self._chunk_size + n_chunks = len(json_data) // chunk_size + 1 + + for i in range(n_chunks): + send_data = json_data[i * chunk_size : (i + 1) * chunk_size] + response = self._execute_save_request(json_data=send_data) + + return response + + def _execute_save_request(self, json_data: Any) -> requests.Response: + try: + json_.loads(json_data) + except TypeError: + self._request_args["json"] = json_.dumps(json_data) + try: + response = requests.request(**self._request_args) + response.raise_for_status() + except requests.exceptions.HTTPError as exc: + raise DataSetError("Failed to send data", exc) from exc + + except OSError as exc: + raise DataSetError("Failed to connect to the remote server") from exc + return response + + def _save(self, data: Any) -> requests.Response: + if self._request_args["method"] in ["PUT", "POST"]: + if isinstance(data, list): + return self._execute_save_with_chunks(json_data=data) + + return self._execute_save_request(json_data=data) - def _save(self, data: None) -> NoReturn: - raise DataSetError(f"{self.__class__.__name__} is a read only data set type") + raise DataSetError("Use PUT or POST methods for save") def _exists(self) -> bool: with sessions.Session() as session: diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index a2bb6b1be..91229edcf 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -181,7 +181,6 @@ def _ensure_file_system_target(self) -> None: ) def _load(self) -> 
pd.DataFrame: - self._ensure_file_system_target() load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -196,7 +195,6 @@ def _load(self) -> pd.DataFrame: ) def _save(self, data: pd.DataFrame) -> None: - self._ensure_file_system_target() save_path = get_filepath_str(self._get_save_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index ca3c7643c..c90c5f958 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -126,7 +126,6 @@ def __init__( # Update properties in load_args and save_args with credentials. if credentials is not None: - # Check credentials for bad inputs. for cred_key, cred_value in credentials.items(): if cred_value is None: diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..c736f90b5 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,5 +1,6 @@ # pylint: disable=no-member import base64 +import json import socket import pytest @@ -10,25 +11,44 @@ from kedro_datasets.api import APIDataSet POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] +SAVE_METHODS = ["POST", "PUT"] TEST_URL = "http://example.com/api/test" TEST_TEXT_RESPONSE_DATA = "This is a response." TEST_JSON_REQUEST_DATA = [{"key": "value"}] +TEST_JSON_RESPONSE_DATA = [{"key": "value"}] TEST_PARAMS = {"param": "value"} TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" TEST_METHOD = "GET" TEST_HEADERS = {"key": "value"} +TEST_SAVE_DATA = [json.dumps({"key1": "info1", "key2": "info2"})] + class TestAPIDataSet: @pytest.mark.parametrize("method", POSSIBLE_METHODS) def test_request_method(self, requests_mock, method): - api_data_set = APIDataSet(url=TEST_URL, method=method) - requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) - - response = api_data_set.load() - assert response.text == TEST_TEXT_RESPONSE_DATA + if method in ["OPTIONS", "HEAD", "PATCH", "DELETE"]: + with pytest.raises( + ValueError, + match="Only GET, POST and PUT methods are supported", + ): + APIDataSet(url=TEST_URL, method=method) + + else: + api_data_set = APIDataSet(url=TEST_URL, method=method) + + requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) + + if method == "GET": + response = api_data_set.load() + assert response.text == TEST_TEXT_RESPONSE_DATA + else: + with pytest.raises( + DataSetError, match="Only GET method is supported for load" + ): + api_data_set.load() @pytest.mark.parametrize( "parameters_in, url_postfix", @@ -181,7 +201,6 @@ def test_certs(self, requests_mock, cert_in, cert_out): url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in} ) requests_mock.register_uri(TEST_METHOD, TEST_URL) - response = api_data_set.load() assert response.request.cert == cert_out @@ -252,10 +271,107 @@ def test_socket_error(self, requests_mock): with pytest.raises(DataSetError, match="Failed to connect"): api_data_set.load() - def test_read_only_mode(self): + @pytest.mark.parametrize("method", POSSIBLE_METHODS) + def test_successful_save(self, requests_mock, method): + """ + When we want to save some data on a server + Given an APIDataSet class + Then check we get a response + """ + if method in ["PUT", "POST"]: + api_data_set = APIDataSet( + url=TEST_URL, + method=method, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + 
requests_mock.register_uri( + method, + TEST_URL_WITH_PARAMS, + headers=TEST_HEADERS, + status_code=requests.codes.ok, + ) + response = api_data_set._save(TEST_SAVE_DATA) + + assert isinstance(response, requests.Response) + elif method == "GET": + api_data_set = APIDataSet( + url=TEST_URL, + method=method, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + with pytest.raises(DataSetError, match="Use PUT or POST methods for save"): + api_data_set._save(TEST_SAVE_DATA) + else: + with pytest.raises( + ValueError, + match="Only GET, POST and PUT methods are supported", + ): + APIDataSet(url=TEST_URL, method=method) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_successful_save_with_json(self, requests_mock, save_methods): """ - Saving is disabled on the data set. + When we want to save with json parameters + Given an APIDataSet class + Then check we get a response """ - api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"json": TEST_JSON_RESPONSE_DATA, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri( + save_methods, + TEST_URL, + headers=TEST_HEADERS, + text=json.dumps(TEST_JSON_RESPONSE_DATA), + ) + response_list = api_data_set._save(TEST_SAVE_DATA) + + assert isinstance(response_list, requests.Response) + + response_dict = api_data_set._save({"item1": "key1"}) + assert isinstance(response_dict, requests.Response) + + response_json = api_data_set._save(TEST_SAVE_DATA[0]) + assert isinstance(response_json, requests.Response) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_save_http_error(self, requests_mock, save_methods): + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS, "chunk_size": 2}, + ) + requests_mock.register_uri( + save_methods, + TEST_URL_WITH_PARAMS, + headers=TEST_HEADERS, + text="Nope, not found", + status_code=requests.codes.FORBIDDEN, + ) + + with pytest.raises(DataSetError, match="Failed to send data"): + api_data_set.save(TEST_SAVE_DATA) + + with pytest.raises(DataSetError, match="Failed to send data"): + api_data_set.save(TEST_SAVE_DATA[0]) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_save_socket_error(self, requests_mock, save_methods): + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri(save_methods, TEST_URL_WITH_PARAMS, exc=socket.error) + + with pytest.raises( + DataSetError, match="Failed to connect to the remote server" + ): + api_data_set.save(TEST_SAVE_DATA) + + with pytest.raises( + DataSetError, match="Failed to connect to the remote server" + ): + api_data_set.save(TEST_SAVE_DATA[0]) From 36de4b9953f5ddd83153ff6f86a7decd901d6f1f Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 22 May 2023 12:31:24 +0100 Subject: [PATCH 87/96] ci: Automatically extract release notes for GitHub Releases (#212) * ci: Automatically extract release notes Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * fix lint Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Raise exceptions Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Lint Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Lint Signed-off-by: Ankita Katiyar 
<ankitakatiyar2401@gmail.com> --------- Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- .github/workflows/check-release.yml | 29 +++++++------- tools/github_actions/extract_release_notes.py | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 16 deletions(-) create mode 100644 tools/github_actions/extract_release_notes.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index 916cf70f7..51036d260 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -56,24 +56,21 @@ jobs: run: | export plugin=${{ needs.check-version.outputs.package_name }} make package + - name: Extract release notes from ${{needs.check-version.outputs.package_name}}/RELEASE.md + id: extract + run: | + python tools/github_actions/extract_release_notes.py \ + "${{needs.check-version.outputs.package_name}}/RELEASE.md" \ + "Release ${{needs.check-version.outputs.package_version}}" - name: Create GitHub Release - uses: actions/github-script@v6 + uses: softprops/action-gh-release@v1 with: - github-token: ${{ secrets.GH_TAGGING_TOKEN }} - script: | - const package_name = "${{ needs.check-version.outputs.package_name }}" - const package_version = "${{ needs.check-version.outputs.package_version }}" - const response = await github.rest.repos.createRelease({ - owner: context.repo.owner, - repo: context.repo.repo, - tag_name: `${package_name}-${package_version}`, - target_commitish: 'main', - name: `${package_name}-${package_version}`, - body: `Release ${package_version}`, - draft: false, - prerelease: false, - }); - return response.data; + tag_name: ${{needs.check-version.outputs.package_name}}-${{needs.check-version.outputs.package_version}} + name: ${{needs.check-version.outputs.package_name}}-${{needs.check-version.outputs.package_version}} + body_path: release_body.txt + draft: false + prerelease: false + token: ${{ secrets.GH_TAGGING_TOKEN }} - name: Set PyPI token run: | if [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-airflow" ]; then diff --git a/tools/github_actions/extract_release_notes.py b/tools/github_actions/extract_release_notes.py new file mode 100644 index 000000000..52a8516cb --- /dev/null +++ b/tools/github_actions/extract_release_notes.py @@ -0,0 +1,39 @@ +import sys + + +def extract_section(filename, heading): + with open(filename, 'r') as file: + lines = file.readlines() + + start_line, end_line = None, None + + for i, line in enumerate(lines): + if line.startswith('# '): + current_heading = line.strip('#').replace(':', '').strip() + if current_heading == heading: + start_line = i + elif start_line is not None: + end_line = i + break + + if start_line is not None: + if end_line is None: + end_line = len(lines) + section_lines = lines[start_line + 1:end_line] + section = ''.join(section_lines).strip() + return section + else: + return None + + +if __name__ == '__main__': + if len(sys.argv) != 3: + raise Exception("Usage: python extract_release_notes.py <filename> <heading>") + + filename = sys.argv[1] + heading = sys.argv[2] + section = extract_section(filename, heading) + if not section: + raise Exception(f"Section not found under the {heading} heading") + with open("release_body.txt", "w") as text_file: + text_file.write(section) From 870e623f4530b6c9428efdefd781fbc258a965d3 Mon Sep 17 00:00:00 2001 From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Date: Mon, 22 May 2023 15:43:07 +0100 Subject: [PATCH 88/96] feat: Add 
metadata attribute to datasets (#189) * Add metadata attribute to all datasets Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/api/api_dataset.py | 18 +++++++------ .../biosequence/biosequence_dataset.py | 9 ++++++- .../kedro_datasets/dask/parquet_dataset.py | 5 ++++ .../kedro_datasets/email/message_dataset.py | 5 ++++ .../geopandas/geojson_dataset.py | 5 ++++ .../holoviews/holoviews_writer.py | 5 ++++ .../kedro_datasets/json/json_dataset.py | 5 ++++ .../matplotlib/matplotlib_writer.py | 5 ++++ .../kedro_datasets/networkx/gml_dataset.py | 5 ++++ .../networkx/graphml_dataset.py | 5 ++++ .../kedro_datasets/networkx/json_dataset.py | 5 ++++ .../kedro_datasets/pandas/csv_dataset.py | 5 ++++ .../kedro_datasets/pandas/excel_dataset.py | 5 ++++ .../kedro_datasets/pandas/feather_dataset.py | 5 ++++ .../kedro_datasets/pandas/gbq_dataset.py | 18 +++++++++++-- .../kedro_datasets/pandas/generic_dataset.py | 9 ++++++- .../kedro_datasets/pandas/hdf_dataset.py | 9 ++++++- .../kedro_datasets/pandas/json_dataset.py | 5 ++++ .../kedro_datasets/pandas/parquet_dataset.py | 5 ++++ .../kedro_datasets/pandas/sql_dataset.py | 11 ++++++++ .../kedro_datasets/pandas/xml_dataset.py | 5 ++++ .../kedro_datasets/pickle/pickle_dataset.py | 9 ++++++- .../kedro_datasets/pillow/image_dataset.py | 5 ++++ .../kedro_datasets/plotly/json_dataset.py | 5 ++++ .../kedro_datasets/plotly/plotly_dataset.py | 5 ++++ .../kedro_datasets/polars/csv_dataset.py | 5 ++++ .../kedro_datasets/redis/redis_dataset.py | 9 ++++++- .../snowflake/snowpark_dataset.py | 9 ++++++- .../spark/deltatable_dataset.py | 7 +++-- .../kedro_datasets/spark/spark_dataset.py | 6 ++++- .../spark/spark_hive_dataset.py | 5 ++++ .../spark/spark_jdbc_dataset.py | 5 ++++ .../svmlight/svmlight_dataset.py | 26 +++++++++++++++++++ .../tensorflow/tensorflow_model_dataset.py | 6 +++++ .../kedro_datasets/text/text_dataset.py | 6 +++++ .../kedro_datasets/video/video_dataset.py | 11 +++++++- .../kedro_datasets/yaml/yaml_dataset.py | 5 ++++ 38 files changed, 254 insertions(+), 21 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index fd2a755ca..76d730159 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,9 +4,9 @@ * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Added a save method to the APIDataSet - * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` +* Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index b1b93a7eb..ad2a6c367 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -10,10 +10,6 @@ from requests import Session, sessions from requests.auth import AuthBase -# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. 
-# Any contribution to datasets should be made in kedro-datasets -# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) - class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads/saves data from/to HTTP(S) APIs. @@ -38,7 +34,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\ data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro.extras.datasets.api import APIDataSet + >>> from kedro_datasets.api import APIDataSet >>> >>> >>> data_set = APIDataSet( @@ -99,6 +95,7 @@ def __init__( load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, credentials: Union[Tuple[str, str], List[str], AuthBase] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. @@ -108,12 +105,15 @@ def __init__( methods load_args: Additional parameters to be fed to requests.request. https://requests.readthedocs.io/en/latest/api/#requests.request - credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or - list. An ``AuthBase`` instance can be provided for more complex cases. save_args: Options for saving data on server. Includes all parameters used during load method. Adds an optional parameter, ``chunk_size`` which determines the size of the package sent at each request. + credentials: Allows specifying secrets in credentials.yml. + Expected format is ``('login', 'password')`` if given as a tuple or list. + An ``AuthBase`` instance can be provided for more complex cases. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + Raises: ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. @@ -153,6 +153,8 @@ def __init__( **self._params, } + self.metadata = metadata + @staticmethod def _convert_type(value: Any): """ diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index e9dd924a6..ed683da48 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -10,7 +10,9 @@ from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path -class BioSequenceDataSet(AbstractDataSet[List, List]): +class BioSequenceDataSet( + AbstractDataSet[List, List] +): # pylint:disable=too-many-instance-attributes r"""``BioSequenceDataSet`` loads and saves data to a sequence file. Example: @@ -47,6 +49,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """ Creates a new instance of ``BioSequenceDataSet`` pointing @@ -69,6 +72,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO """ @@ -100,6 +105,8 @@ def __init__( self._fs_open_args_load = _fs_open_args_load self._fs_open_args_save = _fs_open_args_save + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "filepath": self._filepath, diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index f3c00e265..76344b7f5 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -94,6 +94,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ParquetDataSet`` pointing to concrete parquet files. @@ -109,11 +110,15 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ self._filepath = filepath self._fs_args = deepcopy(fs_args) or {} self._credentials = deepcopy(credentials) or {} + self.metadata = metadata + # Handle default load and save arguments self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 2faf3bb5d..e94735aac 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -64,6 +64,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``EmailMessageDataSet`` pointing to a concrete text file on a specific filesystem. @@ -103,6 +104,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -116,6 +119,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 75a9f8357..4596b2b82 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -56,6 +56,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GeoJSONDataSet`` pointing to a concrete GeoJSON file on a specific filesystem fsspec. 
@@ -85,6 +86,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = copy.deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -97,6 +100,8 @@ def __init__( self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 9a17dbe7b..df38739e9 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -47,6 +47,7 @@ def __init__( credentials: Dict[str, Any] = None, save_args: Dict[str, Any] = None, version: Version = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``HoloviewsWriter``. @@ -70,6 +71,8 @@ def __init__( ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` attribute is None, save version will be autogenerated. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _credentials = deepcopy(credentials) or {} _fs_args = deepcopy(fs_args) or {} @@ -83,6 +86,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 53239ece3..c2df700b3 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -59,6 +59,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -86,6 +87,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -99,6 +102,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index d7aaf6a02..a0c9a049b 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -115,6 +115,7 @@ def __init__( save_args: Dict[str, Any] = None, version: Version = None, overwrite: bool = False, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``MatplotlibWriter``. 
@@ -140,6 +141,8 @@ def __init__( overwrite: If True, any existing image files will be removed. Only relevant when saving multiple Matplotlib objects at once. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _credentials = deepcopy(credentials) or {} _fs_args = deepcopy(fs_args) or {} @@ -153,6 +156,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index 4dd88cb22..25111e639 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -48,6 +48,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GMLDataSet``. @@ -73,6 +74,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -86,6 +89,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index ca12b6bae..c538498a6 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -47,6 +47,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GraphMLDataSet``. @@ -72,6 +73,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -85,6 +88,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 3fdf9f253..8ac0e35a3 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -48,6 +48,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet``. 
@@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving.
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
@@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
+ self.metadata = metadata
+
super().__init__(
filepath=PurePosixPath(path), version=version,
diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
index 336aff406..52ba0c7e6 100644
--- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -77,6 +77,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
+ metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file on a specific filesystem.
@@ -102,6 +103,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_credentials = deepcopy(credentials) or {}
@@ -114,6 +117,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)
+ self.metadata = metadata
+
super().__init__(
filepath=PurePosixPath(path), version=version,
diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
index 45aee3192..0eceb759d 100644
--- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -118,6 +118,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
+ metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ExcelDataSet`` pointing to a concrete Excel file on a specific filesystem.
@@ -150,6 +151,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
Raises:
DataSetError: If versioning is enabled while in append mode.
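The pattern above repeats across every dataset touched by this patch: the constructor gains an optional ``metadata`` argument, which is stored on the instance and never inspected by Kedro itself. A minimal sketch of the resulting behaviour, assuming the patch is applied; the dataset class, file path and metadata keys are arbitrary examples, not taken from this patch:

.. code-block:: python

    from kedro_datasets.pandas import CSVDataSet

    # `metadata` is passed through untouched and stored on the dataset;
    # Kedro itself never reads it.
    cars = CSVDataSet(
        filepath="data/01_raw/cars.csv",
        metadata={"owner": "data-engineering", "layer": "raw"},
    )
    assert cars.metadata == {"owner": "data-engineering", "layer": "raw"}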
@@ -165,6 +168,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index be261d42a..0ca8e1cd8 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -77,6 +77,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``FeatherDataSet`` pointing to a concrete filepath. @@ -102,6 +103,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -114,6 +117,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index ebfadf249..a8001d2ae 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -20,7 +20,9 @@ ) -class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): +class GBQTableDataSet( + AbstractDataSet[None, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. @@ -74,6 +76,7 @@ def __init__( credentials: Union[Dict[str, Any], Credentials] = None, load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GBQTableDataSet``. @@ -96,6 +99,8 @@ def __init__( Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html All defaults are preserved, but "progress_bar", which is set to False. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When ``load_args['location']`` and ``save_args['location']`` @@ -125,6 +130,8 @@ def __init__( location=self._save_args.get("location"), ) + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "dataset": self._dataset, @@ -171,7 +178,9 @@ def _validate_location(self): ) -class GBQQueryDataSet(AbstractDataSet[None, pd.DataFrame]): +class GBQQueryDataSet( + AbstractDataSet[None, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``GBQQueryDataSet`` loads data from a provided SQL query from Google BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq`` internally to read from BigQuery table. 
Therefore it supports all allowed @@ -214,6 +223,7 @@ def __init__( load_args: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, filepath: str = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GBQQueryDataSet``. @@ -235,6 +245,8 @@ def __init__( (e.g. `{"project": "my-project"}` for ``GCSFileSystem``) used for reading the SQL query from filepath. filepath: A path to a file with a sql query statement. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When ``sql`` and ``filepath`` parameters are either both empty @@ -283,6 +295,8 @@ def __init__( self._fs = fsspec.filesystem(self._protocol, **_fs_credentials, **_fs_args) self._filepath = path + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: load_args = copy.deepcopy(self._load_args) desc = {} diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 91229edcf..9388876d7 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -28,7 +28,9 @@ ] -class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): +class GenericDataSet( + AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """`pandas.GenericDataSet` loads/saves data from/to a data file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to dynamically select the appropriate type of read/write target on a best effort basis. @@ -94,6 +96,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ): """Creates a new instance of ``GenericDataSet`` pointing to a concrete data file on a specific filesystem. The appropriate pandas load/save methods are @@ -134,6 +137,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: Will be raised if at least less than one appropriate @@ -154,6 +159,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index b821f17da..0632ad612 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -19,7 +19,9 @@ HDFSTORE_DRIVER = "H5FD_CORE" -class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): +class HDFDataSet( + AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. @@ -69,6 +71,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``HDFDataSet`` pointing to a concrete hdf file on a specific filesystem. 
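Since Kedro ignores the attribute, consumption is left entirely to plugins and user code. A rough sketch of what that could look like; the hook class, its logging behaviour and the use of the private ``_get_dataset`` call are illustrative assumptions rather than part of this patch, while ``after_catalog_created`` is an existing Kedro hook spec:

.. code-block:: python

    from kedro.framework.hooks import hook_impl

    class LogDatasetMetadataHooks:
        """Hypothetical hook that surfaces the new ``metadata`` attribute."""

        @hook_impl
        def after_catalog_created(self, catalog):
            for name in catalog.list():
                # `_get_dataset` is a private API, used here only for illustration.
                dataset = catalog._get_dataset(name)
                meta = getattr(dataset, "metadata", None)
                if meta:
                    print(f"{name}: {meta}")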
@@ -100,6 +103,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -113,6 +118,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 76d1cca0a..f0777ec21 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -72,6 +72,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -97,6 +98,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -108,6 +111,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index b41d468c3..537abe9b0 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -83,6 +83,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ParquetDataSet`` pointing to a concrete Parquet file on a specific filesystem. @@ -111,6 +112,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -123,6 +126,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 029dc6939..a94a36743 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -156,12 +156,14 @@ class SQLTableDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine engines: Dict[str, Any] = {} + # pylint: disable=too-many-arguments def __init__( self, table_name: str, credentials: Dict[str, Any], load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SQLTableDataSet``. @@ -188,6 +190,8 @@ def __init__( To find all supported connection string formats, see here: https://docs.sqlalchemy.org/core/engines.html#database-urls It has ``index=False`` in the default parameters. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``table_name`` or ``con`` is empty. @@ -216,6 +220,8 @@ def __init__( self._connection_str = credentials["con"] self.create_connection(self._connection_str) + self.metadata = metadata + @classmethod def create_connection(cls, connection_str: str) -> None: """Given a connection string, create singleton connection @@ -380,6 +386,7 @@ def __init__( # pylint: disable=too-many-arguments fs_args: Dict[str, Any] = None, filepath: str = None, execution_options: Optional[Dict[str, Any]] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SQLQueryDataSet``. @@ -411,6 +418,8 @@ def __init__( # pylint: disable=too-many-arguments https://docs.sqlalchemy.org/core/connections.html#sqlalchemy.engine.Connection.execution_options Note that this is not a standard argument supported by pandas API, but could be useful for handling large datasets. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``sql`` or ``con`` parameters is empty. @@ -441,6 +450,8 @@ def __init__( # pylint: disable=too-many-arguments else default_load_args ) + self.metadata = metadata + # load sql query from file if sql: self._load_args["sql"] = sql diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 59f96e441..5a73a1536 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -55,6 +55,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``XMLDataSet`` pointing to a concrete XML file on a specific filesystem. @@ -80,6 +81,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -92,6 +95,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 11ee512c1..f381e39d4 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -18,7 +18,9 @@ ) -class PickleDataSet(AbstractVersionedDataSet[Any, Any]): +class PickleDataSet( + AbstractVersionedDataSet[Any, Any] +): # pylint:disable=too-many-instance-attributes """``PickleDataSet`` loads/saves data from/to a Pickle file using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by the specified backend library passed in (defaults to the ``pickle`` library), so it @@ -81,6 +83,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PickleDataSet`` pointing to a concrete Pickle file on a specific filesystem. ``PickleDataSet`` supports custom backends to @@ -132,6 +135,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: ValueError: If ``backend`` does not satisfy the `pickle` interface. @@ -170,6 +175,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 6dd94635e..aaf74fb1b 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -43,6 +43,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ImageDataSet`` pointing to a concrete image file on a specific filesystem. @@ -70,6 +71,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -83,6 +86,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index f99fe8ac4..5a29a06e0 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -61,6 +61,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -92,6 +93,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -105,6 +108,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 1bb0acef6..38638a3d8 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -75,6 +75,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PlotlyDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -109,6 +110,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ super().__init__(filepath, load_args, save_args, version, credentials, fs_args) self._plotly_args = plotly_args @@ -121,6 +124,8 @@ def __init__( self._fs_open_args_load = _fs_open_args_load self._fs_open_args_save = _fs_open_args_save + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return {**super()._describe(), "plotly_args": self._plotly_args} diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 6bbc721c4..fa2332bfa 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -75,6 +75,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file on a specific filesystem. @@ -103,6 +104,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. 
`{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -115,6 +118,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index ce5aa741f..f292ca986 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -11,7 +11,9 @@ from kedro.io.core import AbstractDataSet, DataSetError -class PickleDataSet(AbstractDataSet[Any, Any]): +class PickleDataSet( + AbstractDataSet[Any, Any] +): # pylint:disable=too-many-instance-attributes """``PickleDataSet`` loads/saves data from/to a Redis database. The underlying functionality is supported by the redis library, so it supports all allowed options for instantiating the redis app ``from_url`` and setting @@ -68,6 +70,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, redis_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PickleDataSet``. This loads/saves data from/to a Redis database while deserialising/serialising. Supports custom backends to @@ -109,6 +112,8 @@ def __init__( https://redis-py.readthedocs.io/en/stable/connections.html?highlight=from_url#redis.Redis.from_url All defaults are preserved, except `url`, which is set to `redis://127.0.0.1:6379`. You could also specify the url through the env variable ``REDIS_URL``. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: ValueError: If ``backend`` does not satisfy the `pickle` interface. @@ -134,6 +139,8 @@ def __init__( self._key = key + self.metadata = metadata + _redis_args = deepcopy(redis_args) or {} self._redis_from_url_args = _redis_args.pop("from_url_args", {}) self._redis_from_url_args.setdefault("url", self.DEFAULT_REDIS_URL) diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index e1adc50c0..9cebbf12f 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -10,7 +10,9 @@ logger = logging.getLogger(__name__) -class SnowparkTableDataSet(AbstractDataSet): +class SnowparkTableDataSet( + AbstractDataSet +): # pylint:disable=too-many-instance-attributes """``SnowparkTableDataSet`` loads and saves Snowpark dataframes. As of Mar-2023, the snowpark connector only works with Python 3.8. @@ -108,6 +110,7 @@ def __init__( # pylint: disable=too-many-arguments load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SnowparkTableDataSet``. @@ -128,6 +131,8 @@ def __init__( # pylint: disable=too-many-arguments credentials: A dictionary with a snowpark connection string. To find all supported arguments, see here: https://docs.snowflake.com/en/user-guide/python-connector-api.html#connect + metadata: Any arbitrary metadata. 
+ This is ignored by Kedro, but may be consumed by users or external plugins. """ if not table_name: @@ -168,6 +173,8 @@ def __init__( # pylint: disable=too-many-arguments self._connection_parameters = connection_parameters self._session = self._get_session(self._connection_parameters) + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "table_name": self._table_name, diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..4290a2cfb 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -2,7 +2,7 @@ ``delta-spark`` """ from pathlib import PurePosixPath -from typing import NoReturn +from typing import Any, Dict, NoReturn from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError @@ -62,7 +62,7 @@ class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): # using ``ThreadRunner`` instead _SINGLE_PROCESS = True - def __init__(self, filepath: str) -> None: + def __init__(self, filepath: str, metadata: Dict[str, Any] = None) -> None: """Creates a new instance of ``DeltaTableDataSet``. Args: @@ -70,11 +70,14 @@ def __init__(self, filepath: str) -> None: and working with data written to mount path points, specify ``filepath``s for (versioned) ``SparkDataSet``s starting with ``/dbfs/mnt``. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ fs_prefix, filepath = _split_filepath(filepath) self._fs_prefix = fs_prefix self._filepath = PurePosixPath(filepath) + self.metadata = metadata @staticmethod def _get_spark(): diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index a0d099350..f2da7573e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -236,7 +236,7 @@ class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments + def __init__( # pylint: disable=too-many-arguments disable=too-many-locals self, filepath: str, file_format: str = "parquet", @@ -244,6 +244,7 @@ def __init__( # pylint: disable=too-many-arguments save_args: Dict[str, Any] = None, version: Version = None, credentials: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SparkDataSet``. @@ -275,12 +276,15 @@ def __init__( # pylint: disable=too-many-arguments ``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``. Optional keyword arguments passed to ``hdfs.client.InsecureClient`` if ``filepath`` prefix is ``hdfs://``. Ignored otherwise. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" credentials = deepcopy(credentials) or {} fs_prefix, filepath = _split_filepath(filepath) path = PurePosixPath(filepath) exists_function = None glob_function = None + self.metadata = metadata if not filepath.startswith("/dbfs/") and _deployed_on_databricks(): logger.warning( diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 3ea2fb0a1..75ae4cdf8 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -73,6 +73,7 @@ def __init__( write_mode: str = "errorifexists", table_pk: List[str] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SparkHiveDataSet``. @@ -88,6 +89,8 @@ def __init__( on a list of column names. Other `HiveOptions` can be found here: https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#specifying-storage-format-for-hive-tables + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Note: For users leveraging the `upsert` functionality, @@ -119,6 +122,8 @@ def __init__( self._format = self._save_args.pop("format", None) or "hive" self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "database": self._database, diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index c90c5f958..2ac96e544 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -74,6 +74,7 @@ def __init__( credentials: Dict[str, Any] = None, load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SparkJDBCDataSet``. @@ -93,6 +94,8 @@ def __init__( with the JDBC URL and the name of the table. To find all supported arguments, see here: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.jdbc.html + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``url`` or ``table`` is empty or @@ -116,6 +119,8 @@ def __init__( self._url = url self._table = table + self.metadata = metadata + # Handle default load and save arguments self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index c08555aa1..cc26dd141 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -98,7 +98,31 @@ def __init__( version: Optional[Version] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: + """Creates a new instance of SVMLightDataSet to load/save data from a svmlight/libsvm file. + + Args: + filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + load_args: Arguments passed on to ``load_svmlight_file``. 
+ See the details in + https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html + save_args: Arguments passed on to ``dump_svmlight_file``. + See the details in + https://scikit-learn.org/stable/modules/generated/sklearn.datasets.dump_svmlight_file.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) _fs_open_args_save = _fs_args.pop("open_args_save", {}) @@ -111,6 +135,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 42b550737..a4ce65887 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -72,6 +72,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``TensorFlowModelDataSet``. @@ -96,6 +97,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = copy.deepcopy(fs_args) or {} _credentials = copy.deepcopy(credentials) or {} @@ -105,6 +108,9 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 0bb559e29..40697bc13 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -45,12 +45,14 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): """ + # pylint: disable=too-many-arguments def __init__( self, filepath: str, version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``TextDataSet`` pointing to a concrete text file on a specific filesystem. @@ -74,6 +76,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -87,6 +91,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index 03311146d..37239037f 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -126,7 +126,10 @@ class SequenceVideo(AbstractVideo): """A video object read from an indexable sequence of frames""" def __init__( - self, frames: Sequence[PIL.Image.Image], fps: float, fourcc: str = "mp4v" + self, + frames: Sequence[PIL.Image.Image], + fps: float, + fourcc: str = "mp4v", ) -> None: self._n_frames = len(frames) self._frames = frames @@ -155,6 +158,7 @@ def __getitem__(self, index: Union[int, slice]): class GeneratorVideo(AbstractVideo): """A video object with frames yielded by a generator""" + # pylint: disable=too-many-arguments def __init__( self, frames: Generator[PIL.Image.Image, None, None], @@ -258,12 +262,14 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): """ + # pylint: disable=too-many-arguments def __init__( self, filepath: str, fourcc: Optional[str] = "mp4v", credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of VideoDataSet to load / save video data for given filepath. @@ -276,6 +282,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ # parse the path and protocol (e.g. file, http, s3, etc.) protocol, path = get_protocol_and_path(filepath) @@ -286,6 +294,7 @@ def __init__( _credentials = deepcopy(credentials) or {} self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata def _load(self) -> AbstractVideo: """Loads data from the video file. diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index a576f439a..5ab0fd3dc 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -56,6 +56,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``YAMLDataSet`` pointing to a concrete YAML file on a specific filesystem. @@ -83,6 +84,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -96,6 +99,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, From 9d66cc8b5bc6b3943707763c7477a3fbafe40976 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 22 May 2023 16:47:35 +0100 Subject: [PATCH 89/96] feat: Add ManagedTableDataset for managed Delta Lake tables in Databricks (#206) * committing first version of UnityTableCatalog with unit tests. This datasets allows users to interface with Unity catalog tables in Databricks to both read and write. Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * renaming dataset Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding mlflow connectors Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixing mlflow imports Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * cleaned up mlflow for initial release Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * cleaned up mlflow references from setup.py for initial release Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixed deps in setup.py Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding comments before intiial PR Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * moved validation to dataclass Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * bug fix in type of partition column and cleanup Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * updated docstring for ManagedTableDataSet Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * added backticks to catalog Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixing regex to allow hyphens Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic 
<37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/test_requirements.txt Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding backticks to catalog Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Require pandas < 2.0 for compatibility with spark < 3.4 Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Replace use of walrus operator Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add test coverage for validation methods Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Remove unused versioning functions Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Fix exception catching for invalid schema, add test for invalid schema Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add pylint ignore Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add tests/databricks to ignore for no-spark tests Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> * Remove spurious mlflow test dependency Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add explicit check for database existence Signed-off-by: Jannic Holzer 
<jannic.holzer@quantumblack.com> * Remove character limit for table names Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Refactor validation steps in ManagedTable Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Remove spurious checks for table and schema name existence Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> --------- Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> Co-authored-by: Danny Farah <danny.farah@quantumblack.com> Co-authored-by: Danny Farah <danny_farah@mckinsey.com> Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- Makefile | 4 +- kedro-datasets/.gitignore | 3 + .../kedro_datasets/databricks/__init__.py | 8 + .../databricks/managed_table_dataset.py | 432 ++++++++++++++++ kedro-datasets/setup.py | 8 +- kedro-datasets/test_requirements.txt | 2 +- kedro-datasets/tests/databricks/__init__.py | 0 kedro-datasets/tests/databricks/conftest.py | 25 + .../databricks/test_managed_table_dataset.py | 484 ++++++++++++++++++ 9 files changed, 962 insertions(+), 4 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/databricks/__init__.py create mode 100644 kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py create mode 100644 kedro-datasets/tests/databricks/__init__.py create mode 100644 kedro-datasets/tests/databricks/conftest.py create mode 100644 kedro-datasets/tests/databricks/test_managed_table_dataset.py diff --git a/Makefile b/Makefile index be653ed59..4e0b4e640 100644 --- a/Makefile +++ b/Makefile @@ -52,10 +52,10 @@ sign-off: # kedro-datasets related only test-no-spark: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --numprocesses 4 --dist loadfile + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile test-no-spark-sequential: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only: diff --git a/kedro-datasets/.gitignore b/kedro-datasets/.gitignore index d20ee9733..721e13f70 100644 --- a/kedro-datasets/.gitignore +++ b/kedro-datasets/.gitignore @@ -145,3 +145,6 @@ kedro.db kedro/html docs/tmp-build-artifacts docs/build +spark-warehouse +metastore_db/ +derby.log diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py new file mode 100644 index 000000000..d416ac291 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -0,0 +1,8 @@ +"""Provides interface to Unity Catalog Tables.""" + +__all__ = ["ManagedTableDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .managed_table_dataset import ManagedTableDataSet diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py new file mode 100644 index 000000000..01ec15a6f --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -0,0 +1,432 @@ +"""``ManagedTableDataSet`` implementation to access managed delta tables +in Databricks. 
+""" +import logging +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +from kedro.io.core import ( + AbstractVersionedDataSet, + DataSetError, + Version, + VersionNotFoundError, +) +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException, ParseException + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ManagedTable: # pylint: disable=too-many-instance-attributes + """Stores the definition of a managed table""" + + # regex for tables, catalogs and schemas + _NAMING_REGEX = r"\b[0-9a-zA-Z_-]{1,}\b" + _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] + _VALID_DATAFRAME_TYPES = ["spark", "pandas"] + database: str + catalog: Optional[str] + table: str + write_mode: str + dataframe_type: str + primary_key: Optional[str] + owner_group: str + partition_columns: Union[str, List[str]] + json_schema: StructType + + def __post_init__(self): + """Run validation methods if declared. + The validation method can be a simple check + that raises DataSetError. + The validation is performed by calling a function named: + `validate_<field_name>(self, value) -> raises DataSetError` + """ + for name in self.__dataclass_fields__.keys(): # pylint: disable=no-member + method = getattr(self, f"_validate_{name}", None) + if method: + method() + + def _validate_table(self): + """Validates table name + + Raises: + DataSetError: If the table name does not conform to naming constraints. + """ + if not re.fullmatch(self._NAMING_REGEX, self.table): + raise DataSetError("table does not conform to naming") + + def _validate_database(self): + """Validates database name + + Raises: + DataSetError: If the dataset name does not conform to naming constraints. + """ + if not re.fullmatch(self._NAMING_REGEX, self.database): + raise DataSetError("database does not conform to naming") + + def _validate_catalog(self): + """Validates catalog name + + Raises: + DataSetError: If the catalog name does not conform to naming constraints. + """ + if self.catalog: + if not re.fullmatch(self._NAMING_REGEX, self.catalog): + raise DataSetError("catalog does not conform to naming") + + def _validate_write_mode(self): + """Validates the write mode + + Raises: + DataSetError: If an invalid `write_mode` is passed. + """ + if self.write_mode not in self._VALID_WRITE_MODES: + valid_modes = ", ".join(self._VALID_WRITE_MODES) + raise DataSetError( + f"Invalid `write_mode` provided: {self.write_mode}. " + f"`write_mode` must be one of: {valid_modes}" + ) + + def _validate_dataframe_type(self): + """Validates the dataframe type + + Raises: + DataSetError: If an invalid `dataframe_type` is passed + """ + if self.dataframe_type not in self._VALID_DATAFRAME_TYPES: + valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) + raise DataSetError(f"`dataframe_type` must be one of {valid_types}") + + def _validate_primary_key(self): + """Validates the primary key of the table + + Raises: + DataSetError: If no `primary_key` is specified. 
+ """ + if self.primary_key is None or len(self.primary_key) == 0: + if self.write_mode == "upsert": + raise DataSetError( + f"`primary_key` must be provided for" + f"`write_mode` {self.write_mode}" + ) + + def full_table_location(self) -> str: + """Returns the full table location + + Returns: + str: table location in the format catalog.database.table + """ + full_table_location = None + if self.catalog and self.database and self.table: + full_table_location = f"`{self.catalog}`.`{self.database}`.`{self.table}`" + elif self.database and self.table: + full_table_location = f"`{self.database}`.`{self.table}`" + return full_table_location + + def schema(self) -> StructType: + """Returns the Spark schema of the table if it exists + + Returns: + StructType: + """ + schema = None + try: + if self.json_schema is not None: + schema = StructType.fromJson(self.json_schema) + except (KeyError, ValueError) as exc: + raise DataSetError(exc) from exc + return schema + + +class ManagedTableDataSet(AbstractVersionedDataSet): + """``ManagedTableDataSet`` loads and saves data into managed delta tables on Databricks. + Load and save can be in Spark or Pandas dataframes, specified in dataframe_type. + When saving data, you can specify one of three modes: overwrite(default), append, + or upsert. Upsert requires you to specify the primary_column parameter which + will be used as part of the join condition. This dataset works best with + the databricks kedro starter. That starter comes with hooks that allow this + dataset to function properly. Follow the instructions in that starter to + setup your project for this dataset. + + Example usage for the + `YAML API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: + + .. code-block:: yaml + + names_and_ages@spark: + type: databricks.ManagedTableDataSet + table: names_and_ages + + names_and_ages@pandas: + type: databricks.ManagedTableDataSet + table: names_and_ages + dataframe_type: pandas + + Example usage for the + `Python API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-code-api>`_: + .. 
code-block:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import (StructField, StringType, + IntegerType, StructType) + from kedro_datasets.databricks import ManagedTableDataSet + schema = StructType([StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + data_set = ManagedTableDataSet(table="names_and_ages") + data_set.save(spark_df) + reloaded = data_set.load() + reloaded.take(4) + """ + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a Spark pipeline please consider + # using ``ThreadRunner`` instead + _SINGLE_PROCESS = True + + def __init__( # pylint: disable=R0913 + self, + table: str, + catalog: str = None, + database: str = "default", + write_mode: str = "overwrite", + dataframe_type: str = "spark", + primary_key: Optional[Union[str, List[str]]] = None, + version: Version = None, + *, + # the following parameters are used by project hooks + # to create or update table properties + schema: Dict[str, Any] = None, + partition_columns: List[str] = None, + owner_group: str = None, + ) -> None: + """Creates a new instance of ``ManagedTableDataSet`` + + Args: + table (str): the name of the table + catalog (str, optional): the name of the catalog in Unity. + Defaults to None. + database (str, optional): the name of the database. + (also referred to as schema). Defaults to "default". + write_mode (str, optional): the mode to write the data into the table. + Options are:["overwrite", "append", "upsert"]. + "upsert" mode requires primary_key field to be populated. + Defaults to "overwrite". + dataframe_type (str, optional): "pandas" or "spark" dataframe. + Defaults to "spark". + primary_key (Union[str, List[str]], optional): the primary key of the table. + Can be in the form of a list. Defaults to None. + version (Version, optional): kedro.io.core.Version instance to load the data. + Defaults to None. + schema (Dict[str, Any], optional): the schema of the table in JSON form. + Dataframes will be truncated to match the schema if provided. + Used by the hooks to create the table if the schema is provided + Defaults to None. + partition_columns (List[str], optional): the columns to use for partitioning the table. + Used by the hooks. Defaults to None. + owner_group (str, optional): if table access control is enabled in your workspace, + specifying owner_group will transfer ownership of the table and database to + this owner. All databases should have the same owner_group. Defaults to None. 
+ Raises: + DataSetError: Invalid configuration supplied (through ManagedTable validation) + """ + + self._table = ManagedTable( + database=database, + catalog=catalog, + table=table, + write_mode=write_mode, + dataframe_type=dataframe_type, + primary_key=primary_key, + owner_group=owner_group, + partition_columns=partition_columns, + json_schema=schema, + ) + + self._version = version + + super().__init__( + filepath=None, + version=version, + exists_function=self._exists, + ) + + @staticmethod + def _get_spark() -> SparkSession: + return SparkSession.builder.getOrCreate() + + def _load(self) -> Union[DataFrame, pd.DataFrame]: + """Loads the version of data in the format defined in the init + (spark|pandas dataframe) + + Raises: + VersionNotFoundError: if the version defined in + the init doesn't exist + + Returns: + Union[DataFrame, pd.DataFrame]: Returns a dataframe + in the format defined in the init + """ + if self._version and self._version.load >= 0: + try: + data = ( + self._get_spark() + .read.format("delta") + .option("versionAsOf", self._version.load) + .table(self._table.full_table_location()) + ) + except Exception as exc: + raise VersionNotFoundError(self._version.load) from exc + else: + data = self._get_spark().table(self._table.full_table_location()) + if self._table.dataframe_type == "pandas": + data = data.toPandas() + return data + + def _save_append(self, data: DataFrame) -> None: + """Saves the data to the table by appending it + to the location defined in the init + + Args: + data (DataFrame): the Spark dataframe to append to the table + """ + data.write.format("delta").mode("append").saveAsTable( + self._table.full_table_location() + ) + + def _save_overwrite(self, data: DataFrame) -> None: + """Overwrites the data in the table with the data provided. + (this is the default save mode) + + Args: + data (DataFrame): the Spark dataframe to overwrite the table with. + """ + delta_table = data.write.format("delta") + if self._table.write_mode == "overwrite": + delta_table = delta_table.mode("overwrite").option( + "overwriteSchema", "true" + ) + delta_table.saveAsTable(self._table.full_table_location()) + + def _save_upsert(self, update_data: DataFrame) -> None: + """Upserts the data by joining on primary_key columns or column. + If table doesn't exist at save, the data is inserted to a new table. + + Args: + update_data (DataFrame): the Spark dataframe to upsert + """ + if self._exists(): + base_data = self._get_spark().table(self._table.full_table_location()) + base_columns = base_data.columns + update_columns = update_data.columns + + if set(update_columns) != set(base_columns): + raise DataSetError( + f"Upsert requires tables to have identical columns. 
" + f"Delta table {self._table.full_table_location()} " + f"has columns: {base_columns}, whereas " + f"dataframe has columns {update_columns}" + ) + + where_expr = "" + if isinstance(self._table.primary_key, str): + where_expr = ( + f"base.{self._table.primary_key}=update.{self._table.primary_key}" + ) + elif isinstance(self._table.primary_key, list): + where_expr = " AND ".join( + f"base.{col}=update.{col}" for col in self._table.primary_key + ) + + update_data.createOrReplaceTempView("update") + self._get_spark().conf.set( + "fullTableAddress", self._table.full_table_location() + ) + self._get_spark().conf.set("whereExpr", where_expr) + upsert_sql = """MERGE INTO ${fullTableAddress} base USING update ON ${whereExpr} + WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT *""" + self._get_spark().sql(upsert_sql) + else: + self._save_append(update_data) + + def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None: + """Saves the data based on the write_mode and dataframe_type in the init. + If write_mode is pandas, Spark dataframe is created first. + If schema is provided, data is matched to schema before saving + (columns will be sorted and truncated). + + Args: + data (Any): Spark or pandas dataframe to save to the table location + """ + # filter columns specified in schema and match their ordering + if self._table.schema(): + cols = self._table.schema().fieldNames() + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame( + data.loc[:, cols], schema=self._table.schema() + ) + else: + data = data.select(*cols) + else: + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame(data) + if self._table.write_mode == "overwrite": + self._save_overwrite(data) + elif self._table.write_mode == "upsert": + self._save_upsert(data) + elif self._table.write_mode == "append": + self._save_append(data) + + def _describe(self) -> Dict[str, str]: + """Returns a description of the instance of ManagedTableDataSet + + Returns: + Dict[str, str]: Dict with the details of the dataset + """ + return { + "catalog": self._table.catalog, + "database": self._table.database, + "table": self._table.table, + "write_mode": self._table.write_mode, + "dataframe_type": self._table.dataframe_type, + "primary_key": self._table.primary_key, + "version": str(self._version), + "owner_group": self._table.owner_group, + "partition_columns": self._table.partition_columns, + } + + def _exists(self) -> bool: + """Checks to see if the table exists + + Returns: + bool: boolean of whether the table defined + in the dataset instance exists in the Spark session + """ + if self._table.catalog: + try: + self._get_spark().sql(f"USE CATALOG `{self._table.catalog}`") + except (ParseException, AnalysisException) as exc: + logger.warning( + "catalog %s not found or unity not enabled. 
Error message: %s", + self._table.catalog, + exc, + ) + try: + return ( + self._get_spark() + .sql(f"SHOW TABLES IN `{self._table.database}`") + .filter(f"tableName = '{self._table.table}'") + .count() + > 0 + ) + except (ParseException, AnalysisException) as exc: + logger.warning("error occured while trying to find table: %s", exc) + return False diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index be99f9912..a154d8132 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -8,6 +8,7 @@ HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" POLARS = "polars~=0.17.0" +DELTA = "delta-spark~=1.2.1" def _collect_requirements(requires): @@ -16,7 +17,10 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} -dask_require = {"dask.ParquetDataSet": ["dask[complete]", "triad>=0.6.7, <1.0"]} +dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} +databricks_require = { + "databricks.ManagedTableDataSet": [SPARK, PANDAS, DELTA] +} geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } @@ -79,6 +83,7 @@ def _collect_requirements(requires): "api": _collect_requirements(api_require), "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), + "databricks": _collect_requirements(databricks_require), "docs": [ "docutils==0.16", "sphinx~=3.4.3", @@ -108,6 +113,7 @@ def _collect_requirements(requires): **api_require, **biosequence_require, **dask_require, + **databricks_require, **geopandas_require, **holoviews_require, **matplotlib_require, diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index 4d4954739..fe20fee5f 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -30,7 +30,7 @@ networkx~=2.4 opencv-python~=4.5.5.64 openpyxl>=3.0.3, <4.0 pandas-gbq>=0.12.0, <0.18.0 -pandas>=1.3 # 1.3 for read_xml/to_xml +pandas>=1.3, <2 # 1.3 for read_xml/to_xml, <2 for compatibility with Spark < 3.4 Pillow~=9.0 plotly>=4.8.0, <6.0 polars~=0.15.13 diff --git a/kedro-datasets/tests/databricks/__init__.py b/kedro-datasets/tests/databricks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/tests/databricks/conftest.py b/kedro-datasets/tests/databricks/conftest.py new file mode 100644 index 000000000..26d63b056 --- /dev/null +++ b/kedro-datasets/tests/databricks/conftest.py @@ -0,0 +1,25 @@ +""" +This file contains the fixtures that are reusable by any tests within +this directory. You don't need to import the fixtures as pytest will +discover them automatically. 
More info here: +https://docs.pytest.org/en/latest/fixture.html +""" +import pytest +from pyspark.sql import SparkSession + + +@pytest.fixture(scope="class", autouse=True) +def spark_session(): + spark = ( + SparkSession.builder.appName("test") + .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .getOrCreate() + ) + spark.sql("create database if not exists test") + yield spark + spark.sql("drop database test cascade;") diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py new file mode 100644 index 000000000..9aae08707 --- /dev/null +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -0,0 +1,484 @@ +import pandas as pd +import pytest +from kedro.io.core import DataSetError, Version, VersionNotFoundError +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +from kedro_datasets.databricks import ManagedTableDataSet + + +@pytest.fixture +def sample_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def mismatched_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Evan"], "age": [32, 23], "height": [174, 166]} + ) + + +@pytest.fixture +def subset_expected_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def sample_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Bob", "Clarke", "Dave"], "age": [31, 12, 65, 29]} + ) + + +@pytest.fixture +def append_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Evan", 23), ("Frank", 13)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_append_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", 
IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ("Frank", 13), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +# pylint: disable=too-many-public-methods +class TestManagedTableDataSet: + def test_full_table(self): + unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") + assert unity_ds._table.full_table_location() == "`test`.`test`.`test`" + + unity_ds = ManagedTableDataSet( + catalog="test-test", database="test", table="test" + ) + assert unity_ds._table.full_table_location() == "`test-test`.`test`.`test`" + + unity_ds = ManagedTableDataSet(database="test", table="test") + assert unity_ds._table.full_table_location() == "`test`.`test`" + + unity_ds = ManagedTableDataSet(table="test") + assert unity_ds._table.full_table_location() == "`default`.`test`" + + with pytest.raises(TypeError): + ManagedTableDataSet() # pylint: disable=no-value-for-parameter + + def test_describe(self): + unity_ds = ManagedTableDataSet(table="test") + assert unity_ds._describe() == { + "catalog": None, + "database": "default", + "table": "test", + "write_mode": "overwrite", + "dataframe_type": "spark", + "primary_key": None, + "version": "None", + "owner_group": None, + "partition_columns": None, + } + + def test_invalid_write_mode(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", write_mode="invalid") + + def test_dataframe_type(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", dataframe_type="invalid") + + def test_missing_primary_key_upsert(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", write_mode="upsert") + + def test_invalid_table_name(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="invalid!") + + def test_invalid_database(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", database="invalid!") + + def test_invalid_catalog(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", catalog="invalid!") + + def test_schema(self): + unity_ds = ManagedTableDataSet( + table="test", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + expected_schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + assert unity_ds._table.schema() == expected_schema + + def test_invalid_schema(self): + with pytest.raises(DataSetError): + ManagedTableDataSet( + table="test", + schema={ + "fields": [ + { + "invalid": "schema", + } + ], + "type": "struct", + }, + )._table.schema() + + def 
test_catalog_exists(self): + unity_ds = ManagedTableDataSet( + catalog="test", database="invalid", table="test_not_there" + ) + assert not unity_ds._exists() + + def test_table_does_not_exist(self): + unity_ds = ManagedTableDataSet(database="invalid", table="test_not_there") + assert not unity_ds._exists() + + def test_save_default(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + saved_table = unity_ds.load() + assert ( + unity_ds._exists() and sample_spark_df.exceptAll(saved_table).count() == 0 + ) + + def test_save_schema_spark( + self, subset_spark_df: DataFrame, subset_expected_df: DataFrame + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_spark_schema", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + unity_ds.save(subset_spark_df) + saved_table = unity_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_schema_pandas( + self, subset_pandas_df: pd.DataFrame, subset_expected_df: DataFrame + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_pd_schema", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + dataframe_type="pandas", + ) + unity_ds.save(subset_pandas_df) + saved_ds = ManagedTableDataSet( + database="test", + table="test_save_pd_schema", + ) + saved_table = saved_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_overwrite( + self, sample_spark_df: DataFrame, append_spark_df: DataFrame + ): + unity_ds = ManagedTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + overwritten_table = unity_ds.load() + + assert append_spark_df.exceptAll(overwritten_table).count() == 0 + + def test_save_append( + self, + sample_spark_df: DataFrame, + append_spark_df: DataFrame, + expected_append_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", table="test_save_append", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + appended_table = unity_ds.load() + + assert expected_append_spark_df.exceptAll(appended_table).count() == 0 + + def test_save_upsert( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert expected_upsert_spark_df.exceptAll(upserted_table).count() == 0 + + def test_save_upsert_multiple_primary( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_multiple_primary_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert_multiple", + write_mode="upsert", + primary_key=["name", "age"], + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert ( + expected_upsert_multiple_primary_spark_df.exceptAll(upserted_table).count() + == 0 + ) + + def 
test_save_upsert_mismatched_columns( + self, + sample_spark_df: DataFrame, + mismatched_upsert_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert_mismatch", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + with pytest.raises(DataSetError): + unity_ds.save(mismatched_upsert_spark_df) + + def test_load_spark(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = ManagedTableDataSet(database="test", table="test_load_spark") + delta_table = delta_ds.load() + + assert ( + isinstance(delta_table, DataFrame) + and delta_table.exceptAll(sample_spark_df).count() == 0 + ) + + def test_load_spark_no_version(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = ManagedTableDataSet( + database="test", table="test_load_spark", version=Version(2, None) + ) + with pytest.raises(VersionNotFoundError): + _ = delta_ds.load() + + def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFrame): + unity_ds = ManagedTableDataSet( + database="test", table="test_load_version", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + loaded_ds = ManagedTableDataSet( + database="test", table="test_load_version", version=Version(0, None) + ) + loaded_df = loaded_ds.load() + + assert loaded_df.exceptAll(sample_spark_df).count() == 0 + + def test_load_pandas(self, sample_pandas_df: pd.DataFrame): + unity_ds = ManagedTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + unity_ds.save(sample_pandas_df) + + pandas_ds = ManagedTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + pandas_df = pandas_ds.load().sort_values("name", ignore_index=True) + + assert isinstance(pandas_df, pd.DataFrame) and pandas_df.equals( + sample_pandas_df + ) From 0aaa922b39d2ff7177c05a0e8ccbaba346d6a579 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan <mediumnok@gmail.com> Date: Mon, 22 May 2023 17:49:58 +0100 Subject: [PATCH 90/96] docs: Update APIDataset docs and refactor (#217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update APIDataset docs and refactor * Acknowledge community contributor * Fix more broken doc Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Lint Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Fix release notes of upcoming kedro-datasets --------- Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 9 ++------- kedro-datasets/kedro_datasets/api/api_dataset.py | 12 ++++++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 76d730159..3ff1dbf6f 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -10,18 +10,13 @@ ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. 
+* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [BrianCechmanek](https://github.com/BrianCechmanek) - -# Release 1.2.1: - -## Major features and improvements: - -## Bug fixes and other changes -* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. +* [McDonnellJoseph](https://github.com/McDonnellJoseph) # Release 1.2.0: diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index ad2a6c367..82bba3546 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -59,7 +59,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' >>> data_set = APIDataSet( - method = "POST" + method = "POST", url = "url_of_remote_server", save_args = {"chunk_size":1} ) @@ -109,14 +109,14 @@ def __init__( during load method. Adds an optional parameter, ``chunk_size`` which determines the size of the package sent at each request. credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or list. - An ``AuthBase`` instance can be provided for more complex cases. + Expected format is ``('login', 'password')`` if given as a tuple or + list. An ``AuthBase`` instance can be provided for more complex cases. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. Raises: - ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are - specified. + ValueError: if both ``auth`` and ``credentials`` are specified or used + unsupported RESTful API method. 
""" super().__init__() @@ -124,7 +124,7 @@ def __init__( if method == "GET": self._params = load_args or {} - # PUT, POST, DELETE means save + # PUT, POST means save elif method in ["PUT", "POST"]: self._params = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: From ccec03bb9fbae1ab5d91353b8ac5de5d87de1013 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 22 May 2023 21:23:08 +0100 Subject: [PATCH 91/96] feat: Release `kedro-datasets` version `1.3.0` (#219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Modify release version and RELEASE.md Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add proper name for ManagedTableDataSet Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/RELEASE.md Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Revert lost semicolon for release 1.2.0 Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> --------- Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 17 ++++++++++++++--- kedro-datasets/kedro_datasets/__init__.py | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 3ff1dbf6f..ed347ca60 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,15 +1,25 @@ # Upcoming Release: -## Major features and improvements: +## Major features and improvements + +## Bug fixes and other changes + +## Community contributions + +# Release 1.3.0: + +## Major features and improvements * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Added a save method to the APIDataSet * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` * Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. +* Added `ManagedTableDataSet` for managed delta tables on Databricks. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. +* Upgraded required `polars` version to 0.17. * Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. ## Community contributions @@ -17,10 +27,11 @@ Many thanks to the following Kedroids for contributing PRs to this release: * [BrianCechmanek](https://github.com/BrianCechmanek) * [McDonnellJoseph](https://github.com/McDonnellJoseph) +* [Danny Farah](https://github.com/dannyrfar) # Release 1.2.0: -## Major features and improvements: +## Major features and improvements * Added `fsspec` resolution in `SparkDataSet` to support more filesystems. * Added the `_preview` method to the Pandas `ExcelDataSet` and `CSVDataSet` classes. 
@@ -35,7 +46,7 @@ Many thanks to the following Kedroids for contributing PRs to this release: # Release 1.1.0: -## Major features and improvements: +## Major features and improvements * Added the following new datasets: diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index 5c3838ceb..96aa32f85 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,3 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" -__version__ = "1.2.0" +__version__ = "1.3.0" From c2a712853a10931b22e1cc7dfe0410a45135289a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= <juan_luis_cano@mckinsey.com> Date: Tue, 23 May 2023 23:38:56 +0200 Subject: [PATCH 92/96] docs: Fix APIDataSet docstring (#220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix APIDataSet docstring Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Add release notes Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Separate [docs] extras from [all] in kedro-datasets Fix gh-143. Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> --------- Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 1 + .../kedro_datasets/api/api_dataset.py | 4 ++-- kedro-datasets/setup.py | 22 +++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index ed347ca60..8406a063c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,6 +3,7 @@ ## Major features and improvements ## Bug fixes and other changes +* Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. ## Community contributions diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 82bba3546..0929f56fe 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -54,7 +54,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> data = data_set.load() ``APIDataSet`` can also be used to save output on a remote server using HTTP(S) - methods. + methods. :: >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' @@ -116,7 +116,7 @@ def __init__( Raises: ValueError: if both ``auth`` and ``credentials`` are specified or used - unsupported RESTful API method. + unsupported RESTful API method. 
""" super().__init__() diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index a154d8132..bc4ce794d 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -84,17 +84,6 @@ def _collect_requirements(requires): "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), "databricks": _collect_requirements(databricks_require), - "docs": [ - "docutils==0.16", - "sphinx~=3.4.3", - "sphinx_rtd_theme==0.4.1", - "nbsphinx==0.8.1", - "nbstripout~=0.4", - "sphinx-autodoc-typehints==1.11.1", - "sphinx_copybutton==0.3.1", - "ipykernel>=5.3, <7.0", - "myst-parser~=0.17.2", - ], "geopandas": _collect_requirements(geopandas_require), "holoviews": _collect_requirements(holoviews_require), "matplotlib": _collect_requirements(matplotlib_require), @@ -131,6 +120,17 @@ def _collect_requirements(requires): } extras_require["all"] = _collect_requirements(extras_require) +extras_require["docs"] = [ + "docutils==0.16", + "sphinx~=3.4.3", + "sphinx_rtd_theme==0.4.1", + "nbsphinx==0.8.1", + "nbstripout~=0.4", + "sphinx-autodoc-typehints==1.11.1", + "sphinx_copybutton==0.3.1", + "ipykernel>=5.3, <7.0", + "myst-parser~=0.17.2", +] setup( extras_require=extras_require, From 64446dc56772d117d888a1d650727cc191cbef39 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:40:55 +0100 Subject: [PATCH 93/96] Update kedro-datasets/tests/spark/test_spark_streaming_dataset.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index b4e1f0414..c4fb6c005 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -41,7 +41,7 @@ def sample_spark_df_schema() -> StructType: @pytest.fixture def sample_spark_streaming_df(tmp_path, sample_spark_df_schema): - """Create s sample dataframe for streaming""" + """Create a sample dataframe for streaming""" data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: From 497001d5f3dcd1885814af0b143ded2aa9bef2c2 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:41:52 +0100 Subject: [PATCH 94/96] Update kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 0f7e841ed..b34f277f9 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -58,7 +58,7 @@ def __init__( a list of read options for each supported format in Spark DataFrame read documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, - Please note that a schema is mandatory for a streaming DataFrame if schemaInference + Please note that a schema is 
mandatory for a streaming DataFrame if ``schemaInference`` is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file From 7f25f3c5b101c07165bac3ff42359291421cbfa4 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:42:27 +0100 Subject: [PATCH 95/96] Update kedro-datasets/setup.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index bc4ce794d..210eb6884 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -51,7 +51,7 @@ def _collect_requirements(requires): "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } polars_require = { - "polars.CSVDataSet": [POLARS], + "polars.CSVDataSet": [POLARS] } redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { From c094db1246a4efe8d8be4c43e0c79a3e84b12ac1 Mon Sep 17 00:00:00 2001 From: Tom Kurian <tom_kurian@mckinsey.com> Date: Tue, 30 May 2023 17:34:51 +0100 Subject: [PATCH 96/96] fix linting issue Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- .../kedro_datasets/spark/spark_streaming_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b34f277f9..2f7743e65 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -58,8 +58,8 @@ def __init__( a list of read options for each supported format in Spark DataFrame read documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, - Please note that a schema is mandatory for a streaming DataFrame if ``schemaInference`` - is not True. + Please note that a schema is mandatory for a streaming DataFrame + if ``schemaInference`` is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. You can pass ``mode`` and ``partitionBy`` to specify