From ddabd345dc805e739838741f3f677e3a00f4cea8 Mon Sep 17 00:00:00 2001
From: Cory Grinstead
Date: Tue, 30 Jul 2024 14:54:01 -0500
Subject: [PATCH] [DOCS]: small update to "scaling-up" (#2577)

---
 docs/source/conf.py                           |  2 +-
 .../migration_guides/coming_from_dask.rst     |  2 +-
 docs/source/user_guide/poweruser.rst          |  2 +-
 .../poweruser/distributed-computing.rst       | 67 +++++++++++++++++++
 .../user_guide/poweruser/scaling-up.rst       | 22 ------
 ...ft_tutorial_embeddings_stackexchange.ipynb |  4 +-
 tutorials/intro.ipynb                         |  2 +-
 .../talks_and_demos/linkedin-03-05-2024.ipynb |  2 +-
 8 files changed, 74 insertions(+), 29 deletions(-)
 create mode 100644 docs/source/user_guide/poweruser/distributed-computing.rst
 delete mode 100644 docs/source/user_guide/poweruser/scaling-up.rst

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ceee5862a5..36e66be49a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -83,7 +83,7 @@
     "learn/install": "../install.html",
     "learn/user_guides/dataframes": "intro-dataframes.html",
     "learn/user_guides/types_and_ops": "intro-dataframes.html",
-    "learn/user_guides/remote_cluster_execution": "scaling-up.html",
+    "learn/user_guides/remote_cluster_execution": "distributed-computing.html",
     "learn/quickstart": "learn/10-min.html",
     "learn/10-min": "../10-min.html",
 }
diff --git a/docs/source/migration_guides/coming_from_dask.rst b/docs/source/migration_guides/coming_from_dask.rst
index 10fcf8f5c3..f1df5b8a21 100644
--- a/docs/source/migration_guides/coming_from_dask.rst
+++ b/docs/source/migration_guides/coming_from_dask.rst
@@ -118,7 +118,7 @@ Dask supports the same data types as pandas. Daft is built to support many more
 Distributed Computing and Remote Clusters
 -----------------------------------------
 
-Both Dask and Daft support distributed computing on remote clusters. In Dask, you create a Dask cluster either locally or remotely and perform computations in parallel there. Currently, Daft supports distributed cluster computing :doc:`with Ray <../user_guide/poweruser/scaling-up>`. Support for running Daft computations on Dask clusters is on the roadmap.
+Both Dask and Daft support distributed computing on remote clusters. In Dask, you create a Dask cluster either locally or remotely and perform computations in parallel there. Currently, Daft supports distributed cluster computing :doc:`with Ray <../user_guide/poweruser/distributed-computing>`. Support for running Daft computations on Dask clusters is on the roadmap.
 
 Cloud support for both Dask and Daft is the same.
 
diff --git a/docs/source/user_guide/poweruser.rst b/docs/source/user_guide/poweruser.rst
index 3ca1df5086..765fbe83ca 100644
--- a/docs/source/user_guide/poweruser.rst
+++ b/docs/source/user_guide/poweruser.rst
@@ -5,4 +5,4 @@ The Daft Poweruser
 
     poweruser/memory
     poweruser/partitioning
-    poweruser/scaling-up
+    poweruser/distributed-computing
diff --git a/docs/source/user_guide/poweruser/distributed-computing.rst b/docs/source/user_guide/poweruser/distributed-computing.rst
new file mode 100644
index 0000000000..f3b93f3a1f
--- /dev/null
+++ b/docs/source/user_guide/poweruser/distributed-computing.rst
@@ -0,0 +1,67 @@
+Distributed Computing
+=====================
+
+By default, Daft runs using your local machine's resources, so your operations are limited by the CPUs, memory, and GPUs available on your single local development machine.
+
+However, Daft has strong integrations with `Ray `_, a distributed computing framework for running computations across a cluster of machines. Here is a snippet showing how you can connect Daft to a Ray cluster:
+
+.. code:: python
+
+    import daft
+
+    daft.context.set_runner_ray()
+
+By default, if no address is specified, Daft will spin up a Ray cluster locally on your machine. If you are running Daft on a powerful machine (such as an AWS P3 machine equipped with multiple GPUs), this is already very useful because Daft can parallelize its execution across your CPUs and GPUs. However, if you already have your own Ray cluster running remotely, you can connect Daft to it by supplying an address:
+
+.. code:: python
+
+    daft.context.set_runner_ray(address="ray://url-to-mycluster")
+
+For more information about the ``address`` keyword argument, please see the `Ray documentation on initialization `_.
+
+
+If you want to start a single-node Ray cluster on your local machine, you can do the following:
+
+.. code:: shell
+
+    > pip install ray[default]
+    > ray start --head --port=6379
+
+This should output something like:
+
+.. code:: shell
+
+    Usage stats collection is enabled. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
+
+    Local node IP: 127.0.0.1
+
+    --------------------
+    Ray runtime started.
+    --------------------
+
+    ...
+
+You can take the IP address and port and pass them to Daft:
+
+.. code:: python
+
+    >>> import daft
+    >>> daft.context.set_runner_ray("127.0.0.1:6379")
+    DaftContext(_daft_execution_config=, _daft_planning_config=, _runner_config=_RayRunnerConfig(address='127.0.0.1:6379', max_task_backlog=None), _disallow_set_runner=True, _runner=None)
+    >>> df = daft.from_pydict({
+    ...     'text': ['hello', 'world']
+    ... })
+    2024-07-29 15:49:26,610 INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
+    2024-07-29 15:49:26,622 INFO worker.py:1752 -- Connected to Ray cluster.
+    >>> print(df)
+    ╭───────╮
+    │ text  │
+    │ ---   │
+    │ Utf8  │
+    ╞═══════╡
+    │ hello │
+    ├╌╌╌╌╌╌╌┤
+    │ world │
+    ╰───────╯
+
+    (Showing first 2 of 2 rows)
diff --git a/docs/source/user_guide/poweruser/scaling-up.rst b/docs/source/user_guide/poweruser/scaling-up.rst
deleted file mode 100644
index 4319f58e29..0000000000
--- a/docs/source/user_guide/poweruser/scaling-up.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-.. _scaling_up:
-
-Distributed Computing
-=====================
-
-By default, Daft runs using your local machine's resources and your operations are thus limited by the CPUs, memory and GPUs available to you in your single local development machine.
-
-However, Daft has strong integrations with `Ray `_ which is a distributed computing framework for distributing computations across a cluster of machines. Here is a snippet showing how you can connect Daft to a Ray cluster:
-
-.. code:: python
-
-    import daft
-
-    daft.context.set_runner_ray()
-
-By default, if no address is specified Daft will spin up a Ray cluster locally on your machine. If you are running Daft on a powerful machine (such as an AWS P3 machine which is equipped with multiple GPUs) this is already very useful because Daft can parallelize its execution of computation across your CPUs and GPUs. However, if instead you already have your own Ray cluster running remotely, you can connect Daft to it by supplying an address:
-
-.. code:: python
-
-    daft.context.set_runner_ray(address="ray://url-to-mycluster")
-
-For more information about the ``address`` keyword argument, please see the `Ray documentation on initialization `_.
diff --git a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
index 5c78dd9258..83e27fee23 100644
--- a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
+++ b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
@@ -67,7 +67,7 @@
     "\n",
     "*EDIT (June 2023): Our hosted version of the full dataset is temporarily unavailable. Please enjoy the demo with the sample dataset for now.*\n",
     "\n",
-    "**Note:** This demo runs best on a cluster with many GPUs available. Information on how to connect Daft to a cluster is available [here](https://www.getdaft.io/projects/docs/en/stable/learn/user_guides/scaling-up.html). \n",
+    "**Note:** This demo runs best on a cluster with many GPUs available. Information on how to connect Daft to a cluster is available [here](https://www.getdaft.io/projects/docs/en/stable/learn/user_guides/poweruser/distributed-computing.html). \n",
     "\n",
     "If running on a single node, you can use the provided subsample of the data, which is 75MB in size. If you like, you can also truncate either dataset to a desired number of rows using `df.limit`."
    ]
@@ -468,7 +468,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/intro.ipynb b/tutorials/intro.ipynb
index 7214cb8dd8..22b5a79323 100644
--- a/tutorials/intro.ipynb
+++ b/tutorials/intro.ipynb
@@ -156,7 +156,7 @@
     "\n",
     "![image.png](attachment:image.png)\n",
     "\n",
-    "See ([Daft Documentation: Distributed Computing](https://www.getdaft.io/projects/docs/en/latest/learn/user_guides/scaling-up.html))"
+    "See ([Daft Documentation: Distributed Computing](https://www.getdaft.io/projects/docs/en/latest/user_guide/poweruser/distributed-computing.html))"
    ]
   },
   {
diff --git a/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb b/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb
index 46d06aae15..3440a7e4e5 100644
--- a/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb
+++ b/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb
@@ -221,7 +221,7 @@
     "\n",
     "![image.png](attachment:image.png)\n",
     "\n",
-    "See ([Daft Documentation: Distributed Computing](https://www.getdaft.io/projects/docs/en/latest/learn/user_guides/scaling-up.html))"
+    "See ([Daft Documentation: Distributed Computing](https://www.getdaft.io/projects/docs/en/stable/learn/user_guides/poweruser/distributed-computing.html))"
    ]
   },
   {