From d18fe922efa3c0dbb93c170ad637f132a9f9d406 Mon Sep 17 00:00:00 2001
From: Deepyaman Datta
Date: Thu, 26 Jan 2023 09:43:29 -0500
Subject: [PATCH 1/3] Fix docs formatting and phrasing for some datasets

Signed-off-by: Deepyaman Datta
---
 .../kedro_datasets/api/api_dataset.py         | 35 +++++-----
 .../kedro_datasets/json/json_dataset.py       | 27 ++++----
 .../matplotlib/matplotlib_writer.py           | 26 +++----
 .../kedro_datasets/pandas/csv_dataset.py      | 43 ++++++------
 .../kedro_datasets/pandas/excel_dataset.py    | 60 +++++++++-------
 .../kedro_datasets/pandas/feather_dataset.py  | 37 +++++-----
 .../kedro_datasets/pandas/gbq_dataset.py      | 31 +++++----
 .../kedro_datasets/pandas/generic_dataset.py  | 50 +++++++-------
 .../kedro_datasets/pandas/hdf_dataset.py      | 18 ++---
 .../kedro_datasets/pandas/json_dataset.py     | 28 ++++----
 .../kedro_datasets/pandas/parquet_dataset.py  | 52 +++++++-------
 .../kedro_datasets/pandas/sql_dataset.py      | 69 ++++++++---------
 .../kedro_datasets/pandas/xml_dataset.py      |  4 +-
 .../kedro_datasets/pickle/pickle_dataset.py   | 34 ++++-----
 .../kedro_datasets/pillow/image_dataset.py    |  4 +-
 .../kedro_datasets/plotly/json_dataset.py     | 18 +++--
 .../kedro_datasets/plotly/plotly_dataset.py   | 36 +++++-----
 .../kedro_datasets/redis/redis_dataset.py     | 38 +++++-----
 .../kedro_datasets/spark/spark_dataset.py     | 64 ++++++++---------
 .../spark/spark_hive_dataset.py               | 20 +++---
 .../spark/spark_jdbc_dataset.py               | 33 ++++-----
 .../svmlight/svmlight_dataset.py              | 46 +++++++------
 .../tensorflow/tensorflow_model_dataset.py    | 30 ++++----
 .../kedro_datasets/text/text_dataset.py       | 14 ++--
 .../kedro_datasets/tracking/json_dataset.py   | 17 ++---
 .../kedro_datasets/video/video_dataset.py     | 29 ++++----
 .../kedro_datasets/yaml/yaml_dataset.py       | 17 ++---
 27 files changed, 466 insertions(+), 414 deletions(-)

diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py
index 2d175f26e..93e39fb51 100644
--- a/kedro-datasets/kedro_datasets/api/api_dataset.py
+++ b/kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -12,27 +12,26 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
     """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python
     requests library: https://requests.readthedocs.io/en/latest/
 
-    Example adding a catalog entry with
-    `YAML API
-    `_:
+    Example usage for the
+    `YAML API `_:
 
     .. code-block:: yaml
 
-    >>> usda:
-    >>>   type: api.APIDataSet
-    >>>   url: https://quickstats.nass.usda.gov
-    >>>   params:
-    >>>     key: SOME_TOKEN,
-    >>>     format: JSON,
-    >>>     commodity_desc: CORN,
-    >>>     statisticcat_des: YIELD,
-    >>>     agg_level_desc: STATE,
-    >>>     year: 2000
-    >>>
-
-
-    Example using Python API:
+        usda:
+          type: api.APIDataSet
+          url: https://quickstats.nass.usda.gov
+          params:
+            key: SOME_TOKEN,
+            format: JSON,
+            commodity_desc: CORN,
+            statisticcat_des: YIELD,
+            agg_level_desc: STATE,
+            year: 2000
+
+    Example usage for the
+    `Python API `_:
     ::
 
         >>> from kedro_datasets.api import APIDataSet

diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py
index 89b7081cd..73268b223 100644
--- a/kedro-datasets/kedro_datasets/json/json_dataset.py
+++ b/kedro-datasets/kedro_datasets/json/json_dataset.py
@@ -20,22 +20,21 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]):
     """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying
     filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
- Example adding a catalog entry with the ``YAML API``: - + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> json_dataset: - >>> type: json.JSONDataSet - >>> filepath: data/01_raw/location.json - >>> - >>> cars: - >>> type: json.JSONDataSet - >>> filepath: gcs://your_bucket/cars.json - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - - Example using Python API: + cars: + type: json.JSONDataSet + filepath: gcs://your_bucket/cars.json + fs_args: + project: my-project + credentials: my_gcp_credentials + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 0a6163a23..5757b08ab 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -26,21 +26,21 @@ class MatplotlibWriter( """``MatplotlibWriter`` saves one or more Matplotlib objects as image files to an underlying filesystem (e.g. local, S3, GCS). - Example adding a catalog entry with the `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> output_plot: - >>> type: matplotlib.MatplotlibWriter - >>> filepath: data/08_reporting/output_plot.png - >>> save_args: - >>> format: png - >>> - - Example using the Python API: + output_plot: + type: matplotlib.MatplotlibWriter + filepath: data/08_reporting/output_plot.png + save_args: + format: png + Example usage for the + `Python API `_: :: >>> import matplotlib.pyplot as plt @@ -55,7 +55,6 @@ class MatplotlibWriter( >>> plot_writer.save(fig) Example saving a plot as a PDF file: - :: >>> import matplotlib.pyplot as plt @@ -70,9 +69,7 @@ class MatplotlibWriter( >>> plt.close() >>> pdf_plot_writer.save(fig) - Example saving multiple plots in a folder, using a dictionary: - :: >>> import matplotlib.pyplot as plt @@ -90,7 +87,6 @@ class MatplotlibWriter( >>> dict_plot_writer.save(plots_dict) Example saving multiple plots in a folder, using a list: - :: >>> import matplotlib.pyplot as plt diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 41bc27c9d..2a6366bd0 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -25,30 +25,31 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: pandas.CSVDataSet - >>> filepath: data/01_raw/company/cars.csv - >>> load_args: - >>> sep: "," - >>> na_values: ["#NA", NA] - >>> save_args: - >>> index: False - >>> date_format: "%Y-%m-%d %H:%M" - >>> decimal: . - >>> - >>> motorbikes: - >>> type: pandas.CSVDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv - >>> credentials: dev_s3 - - Example using Python API: + cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d %H:%M" + decimal: . 
+ + motorbikes: + type: pandas.CSVDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import CSVDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index ec072d7c4..aec96c6ed 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -30,26 +30,30 @@ class ExcelDataSet( """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. - Example adding a catalog entry with the ``YAML API``: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> rockets: - >>> type: pandas.ExcelDataSet - >>> filepath: gcs://your_bucket/rockets.xlsx - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - >>> save_args: - >>> sheet_name: Sheet1 - >>> load_args: - >>> sheet_name: Sheet1 - >>> - >>> shuttles: - >>> type: pandas.ExcelDataSet - >>> filepath: data/01_raw/shuttles.xlsx - - Example using Python API: + rockets: + type: pandas.ExcelDataSet + filepath: gcs://your_bucket/rockets.xlsx + fs_args: + project: my-project + credentials: my_gcp_credentials + save_args: + sheet_name: Sheet1 + load_args: + sheet_name: Sheet1 + + shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import ExcelDataSet @@ -63,21 +67,27 @@ class ExcelDataSet( >>> reloaded = data_set.load() >>> assert data.equals(reloaded) - Note: To save a multi-sheet Excel file, no special ``save_args`` are required. + To save a multi-sheet Excel file, no special ``save_args`` are required. Instead, return a dictionary of ``Dict[str, pd.DataFrame]`` where the string keys are your sheet names. - Example adding a catalog entry for multi-sheet Excel file with the ``YAML API``: + Example usage for the + `YAML API `_ + for a multi-sheet Excel file: .. code-block:: yaml - >>> trains: - >>> type: pandas.ExcelDataSet - >>> filepath: data/02_intermediate/company/trains.xlsx - >>> load_args: - >>> sheet_name: [Sheet1, Sheet2, Sheet3] + trains: + type: pandas.ExcelDataSet + filepath: data/02_intermediate/company/trains.xlsx + load_args: + sheet_name: [Sheet1, Sheet2, Sheet3] - Example multi-sheet Excel file using Python API: + Example usage for the + `Python API `_ + for a multi-sheet Excel file: :: >>> from kedro_datasets.pandas import ExcelDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index a6dfa3ca8..9dc56b2b5 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -27,28 +27,27 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): is supported by pandas, so it supports all allowed pandas options for loading and saving csv files. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> cars: - >>> type: pandas.FeatherDataSet - >>> filepath: data/01_raw/company/cars.feather - >>> load_args: - >>> columns: ['col1', 'col2', 'col3'] - >>> use_threads: True - >>> - >>> motorbikes: - >>> type: pandas.FeatherDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather - >>> credentials: dev_s3 - >>> - - - Example using Python API: + cars: + type: pandas.FeatherDataSet + filepath: data/01_raw/company/cars.feather + load_args: + columns: ['col1', 'col2', 'col3'] + use_threads: True + + motorbikes: + type: pandas.FeatherDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import FeatherDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index 4a9464c5c..02dc31002 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -24,25 +24,26 @@ class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> vehicles: - >>> type: pandas.GBQTableDataSet - >>> dataset: big_query_dataset - >>> table_name: big_query_table - >>> project: my-project - >>> credentials: gbq-creds - >>> load_args: - >>> reauth: True - >>> save_args: - >>> chunk_size: 100 - - - Example using Python API: + vehicles: + type: pandas.GBQTableDataSet + dataset: big_query_dataset + table_name: big_query_table + project: my-project + credentials: gbq-creds + load_args: + reauth: True + save_args: + chunk_size: 100 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import GBQTableDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 117d99015..08717fbb3 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -33,37 +33,39 @@ class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): filesystem (e.g.: local, S3, GCS). It uses pandas to dynamically select the appropriate type of read/write target on a best effort basis. - Example using `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: pandas.GenericDataSet - >>> file_format: csv - >>> filepath: s3://data/01_raw/company/cars.csv - >>> load_args: - >>> sep: "," - >>> na_values: ["#NA", NA] - >>> save_args: - >>> index: False - >>> date_format: "%Y-%m-%d" - - This second example is able to load a SAS7BDAT file via the :code:`pd.read_sas` method. - Trying to save this dataset will raise a `DataSetError` since pandas does not provide an - equivalent :code:`pd.DataFrame.to_sas` write method. + cars: + type: pandas.GenericDataSet + file_format: csv + filepath: s3://data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d" + + This second example is able to load a SAS7BDAT file via the ``pd.read_sas`` method. 
+ Trying to save this dataset will raise a ``DataSetError`` since pandas does not provide an + equivalent ``pd.DataFrame.to_sas`` write method. .. code-block:: yaml - >>> flights: - >>> type: pandas.GenericDataSet - >>> file_format: sas - >>> filepath: data/01_raw/airplanes.sas7bdat - >>> load_args: - >>> format: sas7bdat + flights: + type: pandas.GenericDataSet + file_format: sas + filepath: data/01_raw/airplanes.sas7bdat + load_args: + format: sas7bdat - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import GenericDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index b790e6529..bf43a883e 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -23,19 +23,21 @@ class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> hdf_dataset: - >>> type: pandas.HDFDataSet - >>> filepath: s3://my_bucket/raw/sensor_reading.h5 - >>> credentials: aws_s3_creds - >>> key: data + hdf_dataset: + type: pandas.HDFDataSet + filepath: s3://my_bucket/raw/sensor_reading.h5 + credentials: aws_s3_creds + key: data - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import HDFDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 9c44fb502..cea0b985d 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -25,24 +25,26 @@ class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the json file. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> clickstream_dataset: - >>> type: pandas.JSONDataSet - >>> filepath: abfs://landing_area/primary/click_stream.json - >>> credentials: abfs_creds - >>> - >>> json_dataset: - >>> type: pandas.JSONDataSet - >>> filepath: data/01_raw/Video_Games.json - >>> load_args: - >>> lines: True + clickstream_dataset: + type: pandas.JSONDataSet + filepath: abfs://landing_area/primary/click_stream.json + credentials: abfs_creds + + json_dataset: + type: pandas.JSONDataSet + filepath: data/01_raw/Video_Games.json + load_args: + lines: True - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 2352c2fd7..d0acdc5d1 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -25,35 +25,37 @@ class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. 
- Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> boats: - >>> type: pandas.ParquetDataSet - >>> filepath: data/01_raw/boats.parquet - >>> load_args: - >>> engine: pyarrow - >>> use_nullable_dtypes: True - >>> save_args: - >>> file_scheme: hive - >>> has_nulls: False - >>> engine: pyarrow - >>> - >>> trucks: - >>> type: pandas.ParquetDataSet - >>> filepath: abfs://container/02_intermediate/trucks.parquet - >>> credentials: dev_abs - >>> load_args: - >>> columns: [name, gear, disp, wt] - >>> index: name - >>> save_args: - >>> compression: GZIP - >>> partition_on: [name] - - Example using Python API: + boats: + type: pandas.ParquetDataSet + filepath: data/01_raw/boats.parquet + load_args: + engine: pyarrow + use_nullable_dtypes: True + save_args: + file_scheme: hive + has_nulls: False + engine: pyarrow + + trucks: + type: pandas.ParquetDataSet + filepath: abfs://container/02_intermediate/trucks.parquet + credentials: dev_abs + load_args: + columns: [name, gear, disp, wt] + index: name + save_args: + compression: GZIP + partition_on: [name] + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import ParquetDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 4de537812..400195719 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -101,30 +101,32 @@ class SQLTableDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): the data with no index. This is designed to make load and save methods symmetric. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> shuttles_table_dataset: - >>> type: pandas.SQLTableDataSet - >>> credentials: db_credentials - >>> table_name: shuttles - >>> load_args: - >>> schema: dwschema - >>> save_args: - >>> schema: dwschema - >>> if_exists: replace + shuttles_table_dataset: + type: pandas.SQLTableDataSet + credentials: db_credentials + table_name: shuttles + load_args: + schema: dwschema + save_args: + schema: dwschema + if_exists: replace Sample database credentials entry in ``credentials.yml``: .. code-block:: yaml - >>> db_credentials: - >>> con: postgresql://scott:tiger@localhost/test + db_credentials: + con: postgresql://scott:tiger@localhost/test - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import SQLTableDataSet @@ -270,38 +272,40 @@ class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): To save data to a SQL server use ``SQLTableDataSet``. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> shuttle_id_dataset: - >>> type: pandas.SQLQueryDataSet - >>> sql: "select shuttle, shuttle_id from spaceflights.shuttles;" - >>> credentials: db_credentials + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials - Advanced example using the `stream_results` and `chunksize` option to reduce memory usage + Advanced example using the ``stream_results`` and ``chunksize`` options to reduce memory usage: .. 
code-block:: yaml - >>> shuttle_id_dataset: - >>> type: pandas.SQLQueryDataSet - >>> sql: "select shuttle, shuttle_id from spaceflights.shuttles;" - >>> credentials: db_credentials - >>> execution_options: - >>> stream_results: true - >>> load_args: - >>> chunksize: 1000 + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials + execution_options: + stream_results: true + load_args: + chunksize: 1000 Sample database credentials entry in ``credentials.yml``: .. code-block:: yaml - >>> db_credentials: - >>> con: postgresql://scott:tiger@localhost/test + db_credentials: + con: postgresql://scott:tiger@localhost/test - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import SQLQueryDataSet @@ -317,7 +321,6 @@ class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): >>> credentials=credentials) >>> >>> sql_data = data_set.load() - >>> """ diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 64a3f9541..5760268a7 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -25,7 +25,9 @@ class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``XMLDataSet`` loads/saves data from/to a XML file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. - Example: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import XMLDataSet diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 522950308..db5d85137 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -24,26 +24,28 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): the specified backend library passed in (defaults to the ``pickle`` library), so it supports all allowed options for loading and saving pickle files. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> test_model: # simple example without compression - >>> type: pickle.PickleDataSet - >>> filepath: data/07_model_output/test_model.pkl - >>> backend: pickle - >>> - >>> final_model: # example with load and save args - >>> type: pickle.PickleDataSet - >>> filepath: s3://your_bucket/final_model.pkl.lz4 - >>> backend: joblib - >>> credentials: s3_credentials - >>> save_args: - >>> compress: lz4 - - Example using Python API: + test_model: # simple example without compression + type: pickle.PickleDataSet + filepath: data/07_model_output/test_model.pkl + backend: pickle + + final_model: # example with load and save args + type: pickle.PickleDataSet + filepath: s3://your_bucket/final_model.pkl.lz4 + backend: joblib + credentials: s3_credentials + save_args: + compress: lz4 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pickle import PickleDataSet diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 71f6eb974..8c2fdc983 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -20,7 +20,9 @@ class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): """``ImageDataSet`` loads/saves image data as `numpy` from an underlying filesystem (e.g.: local, S3, GCS). It uses Pillow to handle image file. - Example: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pillow import ImageDataSet diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 528751086..7eaae8da9 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -22,17 +22,21 @@ class JSONDataSet( """``JSONDataSet`` loads/saves a plotly figure from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> scatter_plot: - >>> type: plotly.JSONDataSet - >>> filepath: data/08_reporting/scatter_plot.json - >>> save_args: - >>> engine: auto + scatter_plot: + type: plotly.JSONDataSet + filepath: data/08_reporting/scatter_plot.json + save_args: + engine: auto - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.plotly import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 4325d105c..1bb0acef6 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -21,25 +21,29 @@ class PlotlyDataSet(JSONDataSet): ``PlotlyDataSet`` is a convenience wrapper for ``plotly.JSONDataSet``. It generates the JSON file directly from a pandas DataFrame through ``plotly_args``. - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> bar_plot: - >>> type: plotly.PlotlyDataSet - >>> filepath: data/08_reporting/bar_plot.json - >>> plotly_args: - >>> type: bar - >>> fig: - >>> x: features - >>> y: importance - >>> orientation: h - >>> layout: - >>> xaxis_title: x - >>> yaxis_title: y - >>> title: Title - - Example using Python API: + bar_plot: + type: plotly.PlotlyDataSet + filepath: data/08_reporting/bar_plot.json + plotly_args: + type: bar + fig: + x: features + y: importance + orientation: h + layout: + xaxis_title: x + yaxis_title: y + title: Title + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.plotly import PlotlyDataSet diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 49ad02dce..b8b6e06fa 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -17,28 +17,30 @@ class PickleDataSet(AbstractDataSet[Any, Any]): all allowed options for instantiating the redis app ``from_url`` and setting a value. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> my_python_object: # simple example - >>> type: redis.PickleDataSet - >>> key: my_object - >>> from_url_args: - >>> url: redis://127.0.0.1:6379 - >>> - >>> final_python_object: # example with save args - >>> type: redis.PickleDataSet - >>> key: my_final_object - >>> from_url_args: - >>> url: redis://127.0.0.1:6379 - >>> db: 1 - >>> save_args: - >>> ex: 10 - - Example using Python API: + my_python_object: # simple example + type: redis.PickleDataSet + key: my_object + from_url_args: + url: redis://127.0.0.1:6379 + + final_python_object: # example with save args + type: redis.PickleDataSet + key: my_final_object + from_url_args: + url: redis://127.0.0.1:6379 + db: 1 + save_args: + ex: 10 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.redis import PickleDataSet diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 7a2b54eef..2250ae337 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -159,41 +159,43 @@ def hdfs_glob(self, pattern: str) -> List[str]: class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): """``SparkDataSet`` loads and saves Spark dataframes. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> weather: - >>> type: spark.SparkDataSet - >>> filepath: s3a://your_bucket/data/01_raw/weather/* - >>> file_format: csv - >>> load_args: - >>> header: True - >>> inferSchema: True - >>> save_args: - >>> sep: '|' - >>> header: True - >>> - >>> weather_schema: - >>> type: spark.SparkDataSet - >>> filepath: s3a://your_bucket/data/01_raw/weather/* - >>> file_format: csv - >>> load_args: - >>> header: True - >>> schema: - >>> filepath: path/to/schema.json - >>> save_args: - >>> sep: '|' - >>> header: True - >>> - >>> weather_cleaned: - >>> type: spark.SparkDataSet - >>> filepath: data/02_intermediate/data.parquet - >>> file_format: parquet - - Example using Python API: + weather: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + inferSchema: True + save_args: + sep: '|' + header: True + + weather_with_schema: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + schema: + filepath: path/to/schema.json + save_args: + sep: '|' + header: True + + weather_cleaned: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: parquet + + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index c4cb80bf2..613b6af5f 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -18,25 +18,29 @@ class SparkHiveDataSet(AbstractDataSet[DataFrame, DataFrame]): of the existing file/partition. This DataSet has some key assumptions: + - Schemas do not change during the pipeline run (defined PKs must be present for the duration of the pipeline) - Tables are not being externally modified during upserts. The upsert method is NOT ATOMIC + to external changes to the target table while executing. Upsert methodology works by leveraging Spark DataFrame execution plan checkpointing. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> hive_dataset: - >>> type: spark.SparkHiveDataSet - >>> database: hive_database - >>> table: table_name - >>> write_mode: overwrite + hive_dataset: + type: spark.SparkHiveDataSet + database: hive_database + table: table_name + write_mode: overwrite - Example using Python API: + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 9567d3e73..24bb3220a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -16,26 +16,27 @@ class SparkJDBCDataSet(AbstractDataSet[DataFrame, DataFrame]): ``pyspark.sql.DataFrameReader`` and ``pyspark.sql.DataFrameWriter`` internally, so it supports all allowed PySpark options on ``jdbc``. - - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> weather: - >>> type: spark.SparkJDBCDataSet - >>> table: weather_table - >>> url: jdbc:postgresql://localhost/test - >>> credentials: db_credentials - >>> load_args: - >>> properties: - >>> driver: org.postgresql.Driver - >>> save_args: - >>> properties: - >>> driver: org.postgresql.Driver - - Example using Python API: + weather: + type: spark.SparkJDBCDataSet + table: weather_table + url: jdbc:postgresql://localhost/test + credentials: db_credentials + load_args: + properties: + driver: org.postgresql.Driver + save_args: + properties: + driver: org.postgresql.Driver + + Example usage for the + `Python API `_: :: >>> import pandas as pd diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 4763abff4..5c9e0699f 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -42,30 +42,34 @@ class SVMLightDataSet(AbstractVersionedDataSet[_DI, _DO]): This format is used as the default format for both svmlight and the libsvm command line programs. - Example adding a catalog entry with the ``YAML API``: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> svm_dataset: - >>> type: svmlight.SVMLightDataSet - >>> filepath: data/01_raw/location.svm - >>> load_args: - >>> zero_based: False - >>> save_args: - >>> zero_based: False - >>> - >>> cars: - >>> type: svmlight.SVMLightDataSet - >>> filepath: gcs://your_bucket/cars.svm - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - >>> load_args: - >>> zero_based: False - >>> save_args: - >>> zero_based: False - - Example using Python API: + svm_dataset: + type: svmlight.SVMLightDataSet + filepath: data/01_raw/location.svm + load_args: + zero_based: False + save_args: + zero_based: False + + cars: + type: svmlight.SVMLightDataSet + filepath: gcs://your_bucket/cars.svm + fs_args: + project: my-project + credentials: my_gcp_credentials + load_args: + zero_based: False + save_args: + zero_based: False + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.svmlight import SVMLightDataSet diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 97bf4e505..63e53b7b4 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -24,21 +24,25 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M The underlying functionality is supported by, and passes input arguments through to, TensorFlow 2.X load_model and save_model methods. - .. code-block:: yaml - - >>> tensorflow_model: - >>> type: tensorflow.TensorFlowModelDataset - >>> filepath: data/06_models/tensorflow_model.h5 - >>> load_args: - >>> compile: False - >>> save_args: - >>> overwrite: True - >>> include_optimizer: False - >>> credentials: tf_creds - >>> + Example usage for the + `YAML API `_: + .. 
code-block:: yaml - Example using Python API: + tensorflow_model: + type: tensorflow.TensorFlowModelDataset + filepath: data/06_models/tensorflow_model.h5 + load_args: + compile: False + save_args: + overwrite: True + include_optimizer: False + credentials: tf_creds + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.tensorflow import TensorFlowModelDataset diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 80ddbaf55..5ba2ee060 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -19,15 +19,19 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): """``TextDataSet`` loads/saves data from/to a text file using an underlying filesystem (e.g.: local, S3, GCS) - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> alice_book: - >>> type: text.TextDataSet - >>> filepath: data/01_raw/alice.txt + alice_book: + type: text.TextDataSet + filepath: data/01_raw/alice.txt - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.text import TextDataSet diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 9454b2cbc..4235df999 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -15,18 +15,19 @@ class JSONDataSet(JDS): The ``JSONDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only and it is versioned by default. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: tracking.JSONDataSet - >>> filepath: data/09_tracking/cars.json + cars: + type: tracking.JSONDataSet + filepath: data/09_tracking/cars.json - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.tracking import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index ad550d5b3..22bd51bc5 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -195,25 +195,24 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): """``VideoDataSet`` loads / save video data from a given filepath as sequence of PIL.Image.Image using OpenCV. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> cars: - >>> type: video.VideoDataSet - >>> filepath: data/01_raw/cars.mp4 - >>> - >>> motorbikes: - >>> type: video.VideoDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.mp4 - >>> credentials: dev_s3 - >>> + cars: + type: video.VideoDataSet + filepath: data/01_raw/cars.mp4 + motorbikes: + type: video.VideoDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.mp4 + credentials: dev_s3 - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.video import VideoDataSet @@ -241,7 +240,7 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): >>> video.save(SequenceVideo(imgs, fps=25)) - Example creating a video from numpy frames using a generator and Python API: + Example creating a video from numpy frames using a generator and the Python API: :: >>> from kedro_datasets.video.video_dataset import VideoDataSet, GeneratorVideo diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 5e9126e93..1ab2fa43b 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -20,18 +20,19 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): """``YAMLDataSet`` loads/saves data from/to a YAML file using an underlying filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: yaml.YAMLDataSet - >>> filepath: cars.yaml + cars: + type: yaml.YAMLDataSet + filepath: cars.yaml - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.yaml import YAMLDataSet From b8ce448523e67315657bd2228b46c6e3b2fa1871 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 26 Jan 2023 09:51:17 -0500 Subject: [PATCH 2/3] Manually fix files not resolved with patch command Signed-off-by: Deepyaman Datta --- .../kedro_datasets/dask/parquet_dataset.py | 131 +++++++++--------- .../spark/deltatable_dataset.py | 78 ++++++----- .../tracking/metrics_dataset.py | 17 +-- 3 files changed, 114 insertions(+), 112 deletions(-) diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 01624a44b..9161fa4e6 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -15,73 +15,72 @@ class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]): remote data services to handle the corresponding load and save operations: https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html - Example adding a catalog entry with - `YAML API - `_: - - .. code-block:: yaml - - >>> cars: - >>> type: dask.ParquetDataSet - >>> filepath: s3://bucket_name/path/to/folder - >>> save_args: - >>> compression: GZIP - >>> credentials: - >>> client_kwargs: - >>> aws_access_key_id: YOUR_KEY - >>> aws_secret_access_key: YOUR_SECRET + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: dask.ParquetDataSet + filepath: s3://bucket_name/path/to/folder + save_args: + compression: GZIP + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: YOUR_SECRET + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.dask import ParquetDataSet + >>> import pandas as pd + >>> import dask.dataframe as dd >>> - - - Example using Python API (AWS S3): - :: - - >>> from kedro_datasets.dask import ParquetDataSet - >>> import pandas as pd - >>> import dask.dataframe as dd - >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [[5, 6], [7, 8]]}) - >>> ddf = dd.from_pandas(data, npartitions=2) - >>> - >>> data_set = ParquetDataSet( - >>> filepath="s3://bucket_name/path/to/folder", - >>> credentials={ - >>> 'client_kwargs':{ - >>> 'aws_access_key_id': 'YOUR_KEY', - >>> 'aws_secret_access_key': 'YOUR SECRET', - >>> } - >>> }, - >>> save_args={"compression": "GZIP"} - >>> ) - >>> data_set.save(ddf) - >>> reloaded = data_set.load() - >>> - >>> assert ddf.compute().equals(reloaded.compute()) - - The output schema can also be explicitly specified using Triad's grammar. - This is processed to map specific columns into pyarrow field types or schema. - - References: - https://triad.readthedocs.io/en/latest/api/triad.collections.html#module-triad.collections.schema - https://arrow.apache.org/docs/python/api/datatypes.html - - .. code-block:: yaml - - >>> parquet_dataset: - >>> type: dask.ParquetDataSet - >>> filepath: "s3://bucket_name/path/to/folder" - >>> credentials: - >>> client_kwargs: - >>> aws_access_key_id: YOUR_KEY - >>> aws_secret_access_key: "YOUR SECRET" - >>> save_args: - >>> compression: GZIP - >>> schema: - >>> col1: [int32] - >>> col2: [int32] - >>> col3: [[int32]] + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [[5, 6], [7, 8]]}) + >>> ddf = dd.from_pandas(data, npartitions=2) + >>> + >>> data_set = ParquetDataSet( + >>> filepath="s3://bucket_name/path/to/folder", + >>> credentials={ + >>> 'client_kwargs':{ + >>> 'aws_access_key_id': 'YOUR_KEY', + >>> 'aws_secret_access_key': 'YOUR SECRET', + >>> } + >>> }, + >>> save_args={"compression": "GZIP"} + >>> ) + >>> data_set.save(ddf) + >>> reloaded = data_set.load() + >>> + >>> assert ddf.compute().equals(reloaded.compute()) + + The output schema can also be explicitly specified using + `Triad `_. + This is processed to map specific columns to + `PyArrow field types `_ or schema. For instance: + + .. code-block:: yaml + + parquet_dataset: + type: dask.ParquetDataSet + filepath: "s3://bucket_name/path/to/folder" + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: "YOUR SECRET" + save_args: + compression: GZIP + schema: + col1: [int32] + col2: [int32] + col3: [[int32]] """ DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index eaa593e87..db45bc12c 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -15,44 +15,46 @@ class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. - Example adding a catalog entry with - `YAML API `_: - - .. 
code-block:: yaml - - >>> weather@spark: - >>> type: spark.SparkDataSet - >>> filepath: data/02_intermediate/data.parquet - >>> file_format: "delta" - >>> - >>> weather@delta: - >>> type: spark.DeltaTableDataSet - >>> filepath: data/02_intermediate/data.parquet - - Example using Python API: - :: - - >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - >>> IntegerType, StructType) - >>> - >>> from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - >>> - >>> schema = StructType([StructField("name", StringType(), True), - >>> StructField("age", IntegerType(), True)]) - >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] - >>> - >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) - >>> - >>> data_set = SparkDataSet(filepath="test_data", file_format="delta") - >>> data_set.save(spark_df) - >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data") - >>> delta_table = deltatable_dataset.load() - >>> - >>> delta_table.update() - """ + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + weather@spark: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: "delta" + + weather@delta: + type: spark.DeltaTableDataSet + filepath: data/02_intermediate/data.parquet + + Example usage for the + `Python API `_: + :: + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.types import (StructField, StringType, + >>> IntegerType, StructType) + >>> + >>> from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet + >>> + >>> schema = StructType([StructField("name", StringType(), True), + >>> StructField("age", IntegerType(), True)]) + >>> + >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + >>> + >>> data_set = SparkDataSet(filepath="test_data", file_format="delta") + >>> data_set.save(spark_df) + >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data") + >>> delta_table = deltatable_dataset.load() + >>> + >>> delta_table.update() + """ # this dataset cannot be used with ``ParallelRunner``, # therefore it has the attribute ``_SINGLE_PROCESS = True`` diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index f65adc7a3..6c0fc7389 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -17,18 +17,19 @@ class MetricsDataSet(JSONDataSet): ``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only, it is versioned by default and only takes metrics of numeric values. -Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml
 
-    >>> cars:
-    >>>   type: tracking.MetricsDataSet
-    >>>   filepath: data/09_tracking/cars.json
+        cars:
+          type: metrics.MetricsDataSet
+          filepath: data/09_tracking/cars.json
 
-    Example using Python API:
+    Example usage for the
+    `Python API `_:
     ::
 
         >>> from kedro_datasets.tracking import MetricsDataSet

From 4d57823bf0c109942b4d239b56173163cdfff07b Mon Sep 17 00:00:00 2001
From: Deepyaman Datta
Date: Thu, 26 Jan 2023 10:10:15 -0500
Subject: [PATCH 3/3] Apply fix from #98

Signed-off-by: Deepyaman Datta
---
 kedro-datasets/kedro_datasets/tracking/metrics_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py
index 6c0fc7389..7c7546a85 100644
--- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py
+++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py
@@ -24,7 +24,7 @@ class MetricsDataSet(JSONDataSet):
     .. code-block:: yaml
 
         cars:
-          type: metrics.MetricsDataSet
+          type: tracking.MetricsDataSet
           filepath: data/09_tracking/cars.json
 
     Example usage for the