[ENH] add examples in APIDataSet docstring
Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai>
jmcdonnell committed May 11, 2023
1 parent bc528d8 commit 245c63b
Showing 5 changed files with 36 additions and 22 deletions.
52 changes: 36 additions & 16 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -14,27 +14,18 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
Example usage for the `YAML API <https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
usda:
  type: api.APIDataSet
  url: https://quickstats.nass.usda.gov
  params:
    key: SOME_TOKEN,
    format: JSON,
    commodity_desc: CORN,
    statisticcat_des: YIELD,
    agg_level_desc: STATE,
    year: 2000
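For illustration, an equivalent entry can also be built programmatically with ``DataCatalog.from_config``. This is a minimal sketch, assuming the config dictionary simply mirrors the YAML entry above (the token is a placeholder):

>>> from kedro.io import DataCatalog
>>>
>>> catalog = DataCatalog.from_config({
>>>     "usda": {
>>>         "type": "api.APIDataSet",
>>>         "url": "https://quickstats.nass.usda.gov",
>>>         "params": {"key": "SOME_TOKEN", "format": "JSON"},
>>>     }
>>> })
>>> data = catalog.load("usda")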
Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-code-api>`_:
::
>>> from kedro_datasets.api import APIDataSet
>>>
@@ -52,6 +43,35 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
>>> )
>>> data = data_set.load()
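The collapsed hunk above hides the constructor arguments. A minimal sketch of the full call, reusing the query parameters from the YAML example (the hidden lines may differ), could look like:

>>> data_set = APIDataSet(
>>>     url="https://quickstats.nass.usda.gov",
>>>     params={
>>>         "key": "SOME_TOKEN",
>>>         "format": "JSON",
>>>         "commodity_desc": "CORN",
>>>         "statisticcat_des": "YIELD",
>>>         "agg_level_desc": "STATE",
>>>         "year": 2000,
>>>     },
>>> )
>>> data = data_set.load()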
``APIDataSet`` can also be used to save output to a remote server using
HTTP(S) methods.
>>> import pandas as pd
>>> example_table = pd.DataFrame({"col1": ["val1", "val2"], "col2": ["val3", "val4"]})
Here we initialise our ``APIDataSet`` with the parameters needed to make requests
to the configured remote server.
>>> data_set = APIDataSet(
>>>     url="url_of_remote_server",
>>>     save_args={"method": "POST", "chunk_size": 1},
>>> )
On initialisation, we can specify all the necessary parameters in the ``save_args``
dictionary. The default HTTP(S) method is POST, but all other methods are supported.
Two important parameters to keep in mind are ``timeout`` and ``chunk_size``.
``timeout`` defines how long our program waits for a response after a request;
``chunk_size`` is only used if the input to the save method is a list, and divides
the request into chunks of size ``chunk_size``. For example, here we will send two
requests, each containing one row of our example DataFrame.
>>> data_to_save = example_table.to_dict(orient="records")
>>> data_set.save(data_to_save)
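For reference, ``to_dict(orient="records")`` produces one dictionary per row, so
with ``chunk_size=1`` the two records below are sent as two separate requests:

>>> data_to_save
[{'col1': 'val1', 'col2': 'val3'}, {'col1': 'val2', 'col2': 'val4'}]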
If the data passed to the save method is not a list, ``APIDataSet`` will check
whether it can be loaded as JSON. If it can, the data is sent unchanged in a single
request. Otherwise, the ``_save`` method will try to dump the data to JSON format
and execute the request.
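A rough sketch of that non-list branch, as an illustration of the behaviour
described above rather than the library's actual ``_save`` code:

>>> import json
>>>
>>> def prepare_body(data):
>>>     try:
>>>         json.loads(data)   # already valid JSON: send unchanged
>>>         return data
>>>     except (TypeError, ValueError):
>>>         return json.dumps(data)  # otherwise, dump to JSON first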
"""

DEFAULT_SAVE_ARGS = {
2 changes: 0 additions & 2 deletions kedro-datasets/kedro_datasets/pandas/generic_dataset.py
@@ -181,7 +181,6 @@ def _ensure_file_system_target(self) -> None:
)

def _load(self) -> pd.DataFrame:

self._ensure_file_system_target()

load_path = get_filepath_str(self._get_load_path(), self._protocol)
@@ -196,7 +195,6 @@ def _save(self, data: pd.DataFrame) -> None:
)

def _save(self, data: pd.DataFrame) -> None:

self._ensure_file_system_target()

save_path = get_filepath_str(self._get_save_path(), self._protocol)
1 change: 0 additions & 1 deletion kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py
@@ -126,7 +126,6 @@ def __init__(

# Update properties in load_args and save_args with credentials.
if credentials is not None:

# Check credentials for bad inputs.
for cred_key, cred_value in credentials.items():
if cred_value is None:
2 changes: 0 additions & 2 deletions kedro-datasets/tests/matplotlib/test_matplotlib_writer.py
@@ -170,7 +170,6 @@ def test_dict_save(self, tmp_path, mock_dict_plot, plot_writer, mocked_s3_bucket
plot_writer.save(mock_dict_plot)

for colour in COLOUR_LIST:

download_path = tmp_path / "downloaded_image.png"
actual_filepath = tmp_path / "locally_saved.png"

@@ -361,7 +360,6 @@ def test_list_save(self, tmp_path, mock_list_plot, versioned_plot_writer):
versioned_plot_writer.save(mock_list_plot)

for index in range(5):

test_path = tmp_path / "test_image.png"
versioned_filepath = str(versioned_plot_writer._get_load_path())

1 change: 0 additions & 1 deletion kedro-datasets/tests/polars/test_csv_dataset.py
@@ -77,7 +77,6 @@ def mocked_dataframe():

@pytest.fixture
def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame):

binarycsv = mocked_dataframe.write_csv()[:-1]

mocked_s3_bucket.put_object(
