From 83f197d6c521000e1836aa98eafbcc4a0da93f63 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Wed, 25 Sep 2024 14:44:17 -0700 Subject: [PATCH 01/21] Refactor homepage --- docs/source/index.rst | 52 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3a2d3eabb5..0c6663ba94 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,14 +1,50 @@ Daft Documentation ================== -Daft is a distributed query engine for large-scale data processing in Python and is implemented in Rust. - -* **Familiar interactive API:** Lazy Python Dataframe for rapid and interactive iteration -* **Focus on the what:** Powerful Query Optimizer that rewrites queries to be as efficient as possible -* **Data Catalog integrations:** Full integration with data catalogs such as Apache Iceberg -* **Rich multimodal type-system:** Supports multimodal types such as Images, URLs, Tensors and more -* **Seamless Interchange**: Built on the `Apache Arrow `_ In-Memory Format -* **Built for the cloud:** `Record-setting `_ I/O performance for integrations with S3 cloud storage +Daft is a unified distributed data engine for data engineering, analytics and ML/AI. + +Daft exposes both a **SQL and Python DataFrame interface** and is implemented in Rust. + +Daft provides a snappy and delightful local interactive experience, but also seamlessly scales to distributed petabyte-scale data engineering workloads. + +Use-Cases +--------- + +Data Engineering +**************** + +*Provides the local performance and memory stability of DuckDB/Polars with the scalability of Apache Spark* + +* **Extract → Transform → Load (ETL):** Perform data engineering on messy multimodal data at scales ranging from MB to PB, on a single node or a distributed cluster +* **Cloud-native:** Native integrations with modern cloud storage (e.g. S3), open catalogs/table formats (e.g. 
Apache Iceberg, DeltaLake) and open data formats (e.g. Apache Parquet) + +Data Analytics +************** + +*Provides a SQL interface with the snappiness of local engines such as DuckDB and scalability of engines such as Spark/Trino* + +* **Local Analytics:** Snappy interactive data exploration and aggregations from Python notebooks using DataFrames or SQL with the performance/development experience of local engines such as DuckDB/Polars +* **Distributed Analytics:** Powerful capabilities to scale to the cloud when required to process larger datasets, outperforming distributed analytics engines such as Spark and Trino + +ML/AI +***** + +*Replaces opinionated data formats such as Mosaic Data Shard (MDS) or TFRecords with dataloading directly from open formats (Apache Parquet, JPEG) into Pytorch or Numpy while saturating network bandwidth* + +* **Dataloading for training:** Fast and memory efficient dataloaders from open file formats such as Parquet and JPEG +* **Model batch inference on GPUs:** Schedule large-scale model batch inference on a fleet of GPUs on a distributed cluster. 
+ +Technology +---------- + +Daft boasts strong integrations with technologies common across these workloads: + +* **Cloud Object Storage:** Record-setting I/O performance for integrations with S3 cloud storage, `battle-tested at exabyte-scale at Amazon `_ +* **ML/AI Python Ecosystem:** first-class integrations with `PyTorch `_ and `NumPy `_ for efficient interoperability with your ML/AI stack +* **Data Catalogs/Table Formats:** capabilities to effectively query table formats such as `Apache Iceberg `_, `Delta Lake `_ and `Apache Hudi `_ +* **Seamless Data Interchange:** zero-copy integration with `Apache Arrow `_ +* **Multimodal/ML Data:** native functionality for data modalities such as tensors, images, URLs, long-form text and embeddings + Installing Daft --------------- From a5063646f321c43713f9892a938d1ddad7fd1d68 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Wed, 25 Sep 2024 14:46:46 -0700 Subject: [PATCH 02/21] Even stronger wording --- docs/source/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 0c6663ba94..f774281951 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,11 +1,11 @@ Daft Documentation ================== -Daft is a unified distributed data engine for data engineering, analytics and ML/AI. +Daft is a unified data engine for **data engineering, analytics and ML/AI**. Daft exposes both a **SQL and Python DataFrame interface** and is implemented in Rust. -Daft provides a snappy and delightful local interactive experience, but also seamlessly scales to distributed petabyte-scale data engineering workloads. +Daft provides a **snappy and delightful local interactive experience**, but also seamlessly **scales to petabyte-scale distributed workloads**. Use-Cases --------- From 82cf949b1d0f3e526e2e533dfb1addc84ff151e9 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Wed, 25 Sep 2024 14:47:52 -0700 Subject: [PATCH 03/21] Stronger@! 
--- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index f774281951..c29c284fb0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,7 +3,7 @@ Daft Documentation Daft is a unified data engine for **data engineering, analytics and ML/AI**. -Daft exposes both a **SQL and Python DataFrame interface** and is implemented in Rust. +Daft exposes both **SQL and Python DataFrame interfaces** as first-class citizens and is implemented in Rust. Daft provides a **snappy and delightful local interactive experience**, but also seamlessly **scales to petabyte-scale distributed workloads**. From 985268ae9a6f95fa4a7c16a286b9a3d320bec013 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Wed, 25 Sep 2024 14:48:38 -0700 Subject: [PATCH 04/21] ok --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index c29c284fb0..5f5fb318f1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,7 +3,7 @@ Daft Documentation Daft is a unified data engine for **data engineering, analytics and ML/AI**. -Daft exposes both **SQL and Python DataFrame interfaces** as first-class citizens and is implemented in Rust. +Daft exposes both **SQL and Python DataFrame interfaces** as first-class citizens and is written in Rust. Daft provides a **snappy and delightful local interactive experience**, but also seamlessly **scales to petabyte-scale distributed workloads**. 
From c56c5c5518b570a0e2e08f0383b7f2f192e21d6a Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Wed, 25 Sep 2024 15:35:58 -0700 Subject: [PATCH 05/21] Refactor user-guide basic concepts --- docs/source/user_guide/basic_concepts.rst | 604 +++++++++++++++++- .../basic_concepts/dataframe_introduction.rst | 203 ------ .../user_guide/basic_concepts/expressions.rst | 343 ---------- .../basic_concepts/introduction.rst | 92 --- docs/source/user_guide/index.rst | 27 +- .../{basic_concepts => }/read-and-write.rst | 51 +- docs/source/user_guide/sql.rst | 2 + 7 files changed, 657 insertions(+), 665 deletions(-) delete mode 100644 docs/source/user_guide/basic_concepts/dataframe_introduction.rst delete mode 100644 docs/source/user_guide/basic_concepts/expressions.rst delete mode 100644 docs/source/user_guide/basic_concepts/introduction.rst rename docs/source/user_guide/{basic_concepts => }/read-and-write.rst (64%) create mode 100644 docs/source/user_guide/sql.rst diff --git a/docs/source/user_guide/basic_concepts.rst b/docs/source/user_guide/basic_concepts.rst index 3bb3a89023..b40f074971 100644 --- a/docs/source/user_guide/basic_concepts.rst +++ b/docs/source/user_guide/basic_concepts.rst @@ -1,9 +1,603 @@ Basic Concepts ============== -.. toctree:: +Daft is a distributed data engine. The main abstraction in Daft is the :class:`DataFrame `, which conceptually can be thought of as a "table" of data with rows and columns. - basic_concepts/introduction - basic_concepts/dataframe_introduction - basic_concepts/expressions - basic_concepts/read-and-write +Daft also exposes a :doc:`sql` interface which interoperates closely with the DataFrame interface, allowing you to express data transformations and queries on your tables as SQL strings. + +.. 
image:: /_static/daft_illustration.png + :alt: Daft python dataframes make it easy to load any data such as PDF documents, images, protobufs, csv, parquet and audio files into a table dataframe structure for easy querying + :width: 500 + :align: center + +Terminology +----------- + +DataFrames +^^^^^^^^^^ + +The :class:`DataFrame ` is the core concept in Daft. Think of it as a table with rows and columns, similar to a spreadsheet or a database table. It's designed to handle large amounts of data efficiently. + +Daft DataFrames are lazy. This means that calling most methods on a DataFrame will not execute that operation immediately - instead, DataFrames expose explicit methods such as :meth:`daft.DataFrame.show` and :meth:`daft.DataFrame.write_parquet` +which will actually trigger computation of the DataFrame. + +Expressions +^^^^^^^^^^^ + +An :class:`Expression ` is a fundamental concept in Daft that allows you to define computations on DataFrame columns. They are the building blocks for transforming and manipulating data +within your DataFrame and will be your best friend if you are working with Daft primarily using the Python API. + +Query Plan +^^^^^^^^^^ + +As mentioned earlier, Daft DataFrames are lazy. Under the hood, each DataFrame in Daft is represented by a plan of operations that describes how to compute that DataFrame. + +This plan is called the "query plan" and calling methods on the DataFrame actually adds steps to the query plan! + +When your DataFrame is executed, Daft will read this plan, optimize it to make it run faster and then execute it to compute the requested results. + +Structured Query Language (SQL) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +SQL is a common query language for expressing queries over tables of data. Daft exposes a SQL API as an alternative (but often also complementary API) to the Python :class:`DataFrame ` and +:class:`Expression ` APIs for building queries. 
+ +DataFrame +--------- + +If you are coming from other DataFrame libraries such as Pandas or Polars, here are some key differences about Daft DataFrames: + +1. **Distributed:** When running in a distributed cluster, Daft splits your data into smaller "chunks" called *Partitions*. This allows Daft to process your data in parallel across multiple machines, leveraging more resources to work with large datasets. + +2. **Lazy:** When you write operations on a DataFrame, Daft doesn't execute them immediately. Instead, it creates a plan (called a query plan) of what needs to be done. This plan is optimized and only executed when you specifically request the results, which can lead to more efficient computations. + +3. **Multimodal:** Unlike traditional tables that usually contain simple data types like numbers and text, Daft DataFrames can handle complex data types in its columns. This includes things like images, audio files, or even custom Python objects. + +Common data operations that you would perform on DataFrames are: + +1. **Filtering rows:** Use :meth:`df.where(...) ` to keep only the rows that meet certain conditions. +2. **Creating new columns:** Use :meth:`df.with_column(...) ` to add a new column based on calculations from existing ones. +3. **Joining tables:** Use :meth:`df.join(other_df, ...) ` to combine two DataFrames based on common columns. +4. **Sorting:** Use :meth:`df.sort(...) ` to arrange your data based on values in one or more columns. +5. **Grouping and aggregating:** Use :meth:`df.groupby(...).agg(...) ` to summarize your data by groups. + +Creating a Dataframe +^^^^^^^^^^^^^^^^^^^^ + +Let's create our first Dataframe from a Python dictionary of columns. + +.. code:: python + + import daft + + df = daft.from_pydict({ + "A": [1, 2, 3, 4], + "B": [1.5, 2.5, 3.5, 4.5], + "C": [True, True, False, False], + "D": [None, None, None, None], + }) + +Examine your Dataframe by printing it: + +.. code:: python + + df + +.. 
code-block:: text + :caption: Output + + +---------+-----------+-----------+-----------+ + | A | B | C | D | + | Int64 | Float64 | Boolean | Null | + +=========+===========+===========+===========+ + | 1 | 1.5 | true | None | + +---------+-----------+-----------+-----------+ + | 2 | 2.5 | true | None | + +---------+-----------+-----------+-----------+ + | 3 | 3.5 | false | None | + +---------+-----------+-----------+-----------+ + | 4 | 4.5 | false | None | + +---------+-----------+-----------+-----------+ + (Showing first 4 of 4 rows) + + +Congratulations - you just created your first DataFrame! It has 4 columns, "A", "B", "C", and "D". Let's try to select only the "A", "B", and "C" columns: + +.. code:: python + + df.select("A", "B", "C") + +.. code-block:: text + :caption: Output + + +---------+-----------+-----------+ + | A | B | C | + | Int64 | Float64 | Boolean | + +=========+===========+===========+ + +---------+-----------+-----------+ + (No data to display: Dataframe not materialized) + + +But wait - why is it printing the message ``(No data to display: Dataframe not materialized)`` and where are the rows of each column? + +Executing our DataFrame and Viewing Data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The reason that our DataFrame currently does not display its rows is that Daft DataFrames are **lazy**. This just means that Daft DataFrames will defer all its work until you tell it to execute. + +In this case, Daft is just deferring the work required to read the data and select columns, however in practice this laziness can be very useful for helping Daft optimize your queries before execution! + +.. NOTE:: + + When you call methods on a Daft Dataframe, it defers the work by adding to an internal "plan". You can examine the current plan of a DataFrame by calling :meth:`df.explain() `! + + Passing the ``show_all=True`` argument will show you the plan after Daft applies its query optimizations and the physical (lower-level) plan. 
+ +We can tell Daft to execute our DataFrame and cache the results using :meth:`df.collect() `: + +.. code:: python + + df.collect() + df + +.. code-block:: text + :caption: Output + + +---------+-----------+-----------+ + | A | B | C | + | Int64 | Float64 | Boolean | + +=========+===========+===========+ + | 1 | 1.5 | true | + +---------+-----------+-----------+ + | 2 | 2.5 | true | + +---------+-----------+-----------+ + | 3 | 3.5 | false | + +---------+-----------+-----------+ + | 4 | 4.5 | false | + +---------+-----------+-----------+ + (Showing first 4 of 4 rows) + +Now your DataFrame object ``df`` is **materialized** - Daft has executed all the steps required to compute the results, and has cached the results in memory so that it can display this preview. + +Any subsequent operations on ``df`` will avoid recomputations, and just use this materialized result! + +When should I materialize my DataFrame? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you "eagerly" call :meth:`df.collect() ` immediately on every DataFrame, you may run into issues: + +1. If data is too large at any step, materializing all of it may cause memory issues +2. Optimizations are not possible since we cannot "predict future operations" + +However, data science is all about experimentation and trying different things on the same data. This means that materialization is crucial when working interactively with DataFrames, since it speeds up all subsequent experimentation on that DataFrame. + +We suggest materializing DataFrames using :meth:`df.collect() ` when they contain expensive operations (e.g. sorts or expensive function calls) and have to be called multiple times by downstream code: + +.. 
code:: python + + df = df.with_column("A", df["A"].apply(expensive_function)) # expensive function + df = df.sort("A") # expensive sort + df.collect() # materialize the DataFrame + + # All subsequent work on df avoids recomputing previous steps + df.sum().show() + df.mean().show() + df.with_column("try_this", df["A"] + 1).show(5) + +In many other cases however, there are better options than materializing your entire DataFrame with :meth:`df.collect() `: + +1. **Peeking with df.show(N)**: If you only want to "peek" at the first few rows of your data for visualization purposes, you can use :meth:`df.show(N) `, which processes and shows only the first ``N`` rows. +2. **Writing to disk**: The ``df.write_*`` methods will process and write your data to disk per-partition, avoiding materializing it all in memory at once. +3. **Pruning data**: You can materialize your DataFrame after performing a :meth:`df.limit() `, :meth:`df.where() ` or :meth:`df.select() ` operation which processes your data or prune it down to a smaller size. + +Schemas and Types +^^^^^^^^^^^^^^^^^ + +Notice also that when we printed our DataFrame, Daft displayed its **schema**. Each column of your DataFrame has a **name** and a **type**, and all data in that column will adhere to that type! + +Daft can display your DataFrame's schema without materializing it. Under the hood, it performs intelligent sampling of your data to determine the appropriate schema, and if you make any modifications to your DataFrame it can infer the resulting types based on the operation. + +.. NOTE:: + + Under the hood, Daft represents data in the `Apache Arrow `_ format, which allows it to efficiently represent and work on data using high-performance kernels which are written in Rust. + + +Running Computations +^^^^^^^^^^^^^^^^^^^^ + +To run computations on data in our DataFrame, we use Expressions. 
+ +The following statement will :meth:`df.show() ` a DataFrame that has only one column - the column ``A`` from our original DataFrame but with every row incremented by 1. + +.. code:: python + + df.select(df["A"] + 1).show() + +.. code-block:: text + :caption: Output + + +---------+ + | A | + | Int64 | + +=========+ + | 2 | + +---------+ + | 3 | + +---------+ + | 4 | + +---------+ + | 5 | + +---------+ + (Showing first 4 rows) + +.. NOTE:: + + A common pattern is to create a new columns using ``DataFrame.with_column``: + + .. code:: python + + # Creates a new column named "foo" which takes on values + # of column "A" incremented by 1 + df = df.with_column("foo", df["A"] + 1) + +Congratulations, you have just written your first **Expression**: ``df["A"] + 1``! + +Expressions +----------- + +Expressions are how you can express computations that should be run over columns of data. + +Creating Expressions +^^^^^^^^^^^^^^^^^^^^ + +Referring to a column in a DataFrame +#################################### + +Most commonly you will be creating expressions by using the :func:`daft.col` function. + +.. code:: python + + # Refers to column "A" + daft.col("A") + +.. code-block:: text + :caption: Output + + col(A) + +The above code creates an Expression that refers to a column named ``"A"``. + +Using SQL +######### + +Daft can also parse valid SQL as expressions. + +.. code:: python + + from daft import sql_expr + + sql_expr("A + 1") + +.. code-block:: text + :caption: Output + + col(A) + lit(1) + +The above code will create an expression representing "the column named 'x' incremented by 1". + +Literals +######## + +You may find yourself needing to hardcode a "single value" oftentimes as an expression. Daft provides a :func:`~daft.expressions.lit` helper to do so: + +.. code:: python + + from daft import lit + + # Refers to an expression which always evaluates to 42 + lit(42) + +.. 
code-block:: text + :caption: Output + + lit(42) + +This special :func:`~daft.expressions.lit` expression we just created evaluates always to the value ``42``. + +Wildcard Expressions +#################### + +You can create expressions on multiple columns at once using a wildcard. The expression `col("*")` selects every column in a DataFrame, and you can operate on this expression in the same way as a single column: + +.. code:: python + + import daft + from daft import col + + df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.select(col("*") * 3).show() + +.. code-block:: text + :caption: Output + + ╭───────┬───────╮ + │ A ┆ B │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 3 ┆ 12 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 6 ┆ 15 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 9 ┆ 18 │ + ╰───────┴───────╯ + +Composing Expressions +^^^^^^^^^^^^^^^^^^^^^ + +.. _userguide-numeric-expressions: + +Numeric Expressions +################### + +Since column "A" is an integer, we can run numeric computation such as addition, division and checking its value. Here are some examples where we create new columns using the results of such computations: + +.. code:: python + + # Add 1 to each element in column "A" + df = df.with_column("A_add_one", df["A"] + 1) + + # Divide each element in column A by 2 + df = df.with_column("A_divide_two", df["A"] / 2.) + + # Check if each element in column A is more than 1 + df = df.with_column("A_gt_1", df["A"] > 1) + + df.collect() + +.. 
code-block:: text + :caption: Output + + +---------+-------------+----------------+-----------+ + | A | A_add_one | A_divide_two | A_gt_1 | + | Int64 | Int64 | Float64 | Boolean | + +=========+=============+================+===========+ + | 1 | 2 | 0.5 | false | + +---------+-------------+----------------+-----------+ + | 2 | 3 | 1 | true | + +---------+-------------+----------------+-----------+ + | 3 | 4 | 1.5 | true | + +---------+-------------+----------------+-----------+ + (Showing first 3 of 3 rows) + +Notice that the returned types of these operations are also well-typed according to their input types. For example, calling ``df["A"] > 1`` returns a column of type :meth:`Boolean `. + +Both the :meth:`Float ` and :meth:`Int ` types are numeric types, and inherit many of the same arithmetic Expression operations. You may find the full list of numeric operations in the :ref:`Expressions API reference `. + +.. _userguide-string-expressions: + +String Expressions +################## + +Daft also lets you have columns of strings in a DataFrame. Let's take a look! + +.. code:: python + + df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) + df.show() + +.. code-block:: text + :caption: Output + + +--------+ + | B | + | Utf8 | + +========+ + | foo | + +--------+ + | bar | + +--------+ + | baz | + +--------+ + (Showing first 3 rows) + +Unlike the numeric types, the string type does not support arithmetic operations such as ``*`` and ``/``. The one exception to this is the ``+`` operator, which is overridden to concatenate two string expressions as is commonly done in Python. Let's try that! + +.. code:: python + + df = df.with_column("B2", df["B"] + "foo") + df.show() + +.. 
code-block:: text + :caption: Output + + +--------+--------+ + | B | B2 | + | Utf8 | Utf8 | + +========+========+ + | foo | foofoo | + +--------+--------+ + | bar | barfoo | + +--------+--------+ + | baz | bazfoo | + +--------+--------+ + (Showing first 3 rows) + +There are also many string operators that are accessed through a separate :meth:`.str.* ` "method namespace". + +For example, to check if each element in column "B" contains the substring "a", we can use the :meth:`.str.contains ` method: + +.. code:: python + + df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) + df.show() + +.. code-block:: text + :caption: Output + + +--------+--------+-----------------+ + | B | B2 | B2_contains_B | + | Utf8 | Utf8 | Boolean | + +========+========+=================+ + | foo | foofoo | true | + +--------+--------+-----------------+ + | bar | barfoo | true | + +--------+--------+-----------------+ + | baz | bazfoo | true | + +--------+--------+-----------------+ + (Showing first 3 rows) + +You may find a full list of string operations in the :ref:`Expressions API reference `. + +URL Expressions +############### + +One special case of a String column you may find yourself working with is a column of URL strings. + +Daft provides the :meth:`.url.* ` method namespace with functionality for working with URL strings. For example, to download data from URLs: + +.. code:: python + + df = daft.from_pydict({ + "urls": [ + "https://www.google.com", + "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", + ], + }) + df = df.with_column("data", df["urls"].url.download()) + df.collect() + +.. code-block:: text + :caption: Output + + +----------------------+----------------------+ + | urls | data | + | Utf8 | Binary | + +======================+======================+ + | https://www.google.c | b'`_ as the underlying executor, so you can find the full list of supported filters in the `jaq documentation `_. + +.. 
_userguide-logical-expressions: + +Logical Expressions +################### + +Logical Expressions are an expression that refers to a column of type :meth:`Boolean `, and can only take on the values True or False. + +.. code:: python + + df = daft.from_pydict({"C": [True, False, True]}) + df["C"] + +Daft supports logical operations such as ``&`` (and) and ``|`` (or) between logical expressions. + +Comparisons +########### + +Many of the types in Daft support comparisons between expressions that returns a Logical Expression. + +For example, here we can compare if each element in column "A" is equal to elements in column "B": + +.. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + + df = df.with_column("A_eq_B", df["A"] == df["B"]) + + df.collect() + +.. code-block:: text + :caption: Output + + +---------+---------+-----------+ + | A | B | A_eq_B | + | Int64 | Int64 | Boolean | + +=========+=========+===========+ + | 1 | 1 | true | + +---------+---------+-----------+ + | 2 | 2 | true | + +---------+---------+-----------+ + | 3 | 4 | false | + +---------+---------+-----------+ + (Showing first 3 of 3 rows) + +Other useful comparisons can be found in the :ref:`Expressions API reference `. + +If Else Pattern +############### + +The :meth:`.if_else() ` method is a useful expression to have up your sleeve for choosing values between two other expressions based on a logical expression: + +.. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + + # Pick values from column A if the value in column A is bigger + # than the value in column B. Otherwise, pick values from column B. + df = df.with_column( + "A_if_bigger_else_B", + (df["A"] > df["B"]).if_else(df["A"], df["B"]), + ) + + df.collect() + +.. 
code-block:: text + :caption: Output + + +---------+---------+----------------------+ + | A | B | A_if_bigger_else_B | + | Int64 | Int64 | Int64 | + +=========+=========+======================+ + | 1 | 0 | 1 | + +---------+---------+----------------------+ + | 2 | 2 | 2 | + +---------+---------+----------------------+ + | 3 | 4 | 4 | + +---------+---------+----------------------+ + (Showing first 3 of 3 rows) + +This is a useful expression for cleaning your data! diff --git a/docs/source/user_guide/basic_concepts/dataframe_introduction.rst b/docs/source/user_guide/basic_concepts/dataframe_introduction.rst deleted file mode 100644 index 7e1075b34b..0000000000 --- a/docs/source/user_guide/basic_concepts/dataframe_introduction.rst +++ /dev/null @@ -1,203 +0,0 @@ -Dataframe -========= - -Data in Daft is represented as a DataFrame, which is a collection of data organized as a **table** with **rows** and **columns**. - -.. image:: /_static/daft_illustration.png - :alt: Daft python dataframes make it easy to load any data such as PDF documents, images, protobufs, csv, parquet and audio files into a table dataframe structure for easy querying - :width: 500 - :align: center - -This document provides an introduction to the Daft Dataframe. - -Creating a Dataframe --------------------- - -Let's create our first Dataframe from a Python dictionary of columns. - -.. code:: python - - import daft - - df = daft.from_pydict({ - "A": [1, 2, 3, 4], - "B": [1.5, 2.5, 3.5, 4.5], - "C": [True, True, False, False], - "D": [None, None, None, None], - }) - -Examine your Dataframe by printing it: - -.. code:: python - - df - -.. 
code:: none - - +---------+-----------+-----------+-----------+ - | A | B | C | D | - | Int64 | Float64 | Boolean | Null | - +=========+===========+===========+===========+ - | 1 | 1.5 | true | None | - +---------+-----------+-----------+-----------+ - | 2 | 2.5 | true | None | - +---------+-----------+-----------+-----------+ - | 3 | 3.5 | false | None | - +---------+-----------+-----------+-----------+ - | 4 | 4.5 | false | None | - +---------+-----------+-----------+-----------+ - (Showing first 4 of 4 rows) - - -Congratulations - you just created your first DataFrame! It has 4 columns, "A", "B", "C", and "D". Let's try to select only the "A", "B", and "C" columns: - -.. code:: python - - df.select("A", "B", "C") - -.. code:: none - - +---------+-----------+-----------+ - | A | B | C | - | Int64 | Float64 | Boolean | - +=========+===========+===========+ - +---------+-----------+-----------+ - (No data to display: Dataframe not materialized) - - -But wait - why is it printing the message ``(No data to display: Dataframe not materialized)`` and where are the rows of each column? - -Executing our DataFrame and Viewing Data ----------------------------------------- - -The reason that our DataFrame currently does not display its rows is that Daft DataFrames are **lazy**. This just means that Daft DataFrames will defer all its work until you tell it to execute. - -In this case, Daft is just deferring the work required to read the data and select columns, however in practice this laziness can be very useful for helping Daft optimize your queries before execution! - -.. NOTE:: - - When you call methods on a Daft Dataframe, it defers the work by adding to an internal "plan". You can examine the current plan of a DataFrame by calling :meth:`df.explain() `! - - Passing the ``show_all=True`` argument will show you the plan after Daft applies its query optimizations and the physical (lower-level) plan. 
- -We can tell Daft to execute our DataFrame and cache the results using :meth:`df.collect() `: - -.. code:: python - - df.collect() - df - -.. code:: none - - +---------+-----------+-----------+ - | A | B | C | - | Int64 | Float64 | Boolean | - +=========+===========+===========+ - | 1 | 1.5 | true | - +---------+-----------+-----------+ - | 2 | 2.5 | true | - +---------+-----------+-----------+ - | 3 | 3.5 | false | - +---------+-----------+-----------+ - | 4 | 4.5 | false | - +---------+-----------+-----------+ - (Showing first 4 of 4 rows) - -Now your DataFrame object ``df`` is **materialized** - Daft has executed all the steps required to compute the results, and has cached the results in memory so that it can display this preview. - -Any subsequent operations on ``df`` will avoid recomputations, and just use this materialized result! - -When should I materialize my DataFrame? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you "eagerly" call :meth:`df.collect() ` immediately on every DataFrame, you may run into issues: - -1. If data is too large at any step, materializing all of it may cause memory issues -2. Optimizations are not possible since we cannot "predict future operations" - -However, data science is all about experimentation and trying different things on the same data. This means that materialization is crucial when working interactively with DataFrames, since it speeds up all subsequent experimentation on that DataFrame. - -We suggest materializing DataFrames using :meth:`df.collect() ` when they contain expensive operations (e.g. sorts or expensive function calls) and have to be called multiple times by downstream code: - -.. 
code:: python - - df = df.with_column("A", df["A"].apply(expensive_function)) # expensive function - df = df.sort("A") # expensive sort - df.collect() # materialize the DataFrame - - # All subsequent work on df avoids recomputing previous steps - df.sum().show() - df.mean().show() - df.with_column("try_this", df["A"] + 1).show(5) - -In many other cases however, there are better options than materializing your entire DataFrame with :meth:`df.collect() `: - -1. **Peeking with df.show(N)**: If you only want to "peek" at the first few rows of your data for visualization purposes, you can use :meth:`df.show(N) `, which processes and shows only the first ``N`` rows. -2. **Writing to disk**: The ``df.write_*`` methods will process and write your data to disk per-partition, avoiding materializing it all in memory at once. -3. **Pruning data**: You can materialize your DataFrame after performing a :meth:`df.limit() `, :meth:`df.where() ` or :meth:`df.select() ` operation which processes your data or prune it down to a smaller size. - -Schemas and Types ------------------ - -Notice also that when we printed our DataFrame, Daft displayed its **schema**. Each column of your DataFrame has a **name** and a **type**, and all data in that column will adhere to that type! - -Daft can display your DataFrame's schema without materializing it. Under the hood, it performs intelligent sampling of your data to determine the appropriate schema, and if you make any modifications to your DataFrame it can infer the resulting types based on the operation. - -.. NOTE:: - - Under the hood, Daft represents data in the `Apache Arrow `_ format, which allows it to efficiently represent and work on data using high-performance kernels which are written in Rust. - - -Running Computations --------------------- - -To run computations on data in our DataFrame, we use Expressions. 
- -The following statement will :meth:`df.show() ` a DataFrame that has only one column - the column ``A`` from our original DataFrame but with every row incremented by 1. - -.. code:: python - - df.select(df["A"] + 1).show() - -.. code:: none - - +---------+ - | A | - | Int64 | - +=========+ - | 2 | - +---------+ - | 3 | - +---------+ - | 4 | - +---------+ - | 5 | - +---------+ - (Showing first 4 rows) - -.. NOTE:: - - A common pattern is to create a new columns using ``DataFrame.with_column``: - - .. code:: python - - # Creates a new column named "foo" which takes on values - # of column "A" incremented by 1 - df = df.with_column("foo", df["A"] + 1) - -Congratulations, you have just written your first **Expression**: ``df["A"] + 1``! - -Expressions -^^^^^^^^^^^ - -Expressions are how you define computations on your columns in Daft. - -The world of Daft contains much more than just numbers, and you can do much more than just add numbers together. Daft's rich Expressions API allows you to do things such as: - -1. Convert between different types with :meth:`df["numbers"].cast(float) ` -2. Download Bytes from a column containing String URLs using :meth:`df["urls"].url.download() ` -3. Run arbitrary Python functions on your data using :meth:`df["objects"].apply(my_python_function) ` - -We are also constantly looking to improve Daft and add more Expression functionality. Please contribute to the project with your ideas and code if you have an Expression in mind! - -The next section on :doc:`expressions` will provide a much deeper look at the Expressions that Daft provides. diff --git a/docs/source/user_guide/basic_concepts/expressions.rst b/docs/source/user_guide/basic_concepts/expressions.rst deleted file mode 100644 index db62ddb2fb..0000000000 --- a/docs/source/user_guide/basic_concepts/expressions.rst +++ /dev/null @@ -1,343 +0,0 @@ -Expressions -=========== - -Expressions are how you can express computations that should be run over columns of data. 
- -Creating Expressions --------------------- - -Referring to a column in a DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Most commonly you will be creating expressions by referring to a column from an existing DataFrame. - -To do so, simply index a DataFrame with the string name of the column: - -.. code:: python - - import daft - - df = daft.from_pydict({"A": [1, 2, 3]}) - - # Refers to column "A" in `df` - df["A"] - -.. code:: none - - col(A) - -When we evaluate this ``df["A"]`` Expression, it will evaluate to the column from the ``df`` DataFrame with name "A"! - -Refer to a column with a certain name -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You may also find it necessary in certain situations to create an Expression with just the name of a column, without having an existing DataFrame to refer to. You can do this with the :func:`~daft.expressions.col` helper: - -.. code:: python - - from daft import col - - # Refers to a column named "A" - col("A") - -When this Expression is evaluated, it will resolve to "the column named A" in whatever evaluation context it is used within! - -Refer to multiple columns using a wildcard -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can create expressions on multiple columns at once using a wildcard. The expression `col("*")` selects every column in a DataFrame, and you can operate on this expression in the same way as a single column: - -.. code:: python - - import daft - from daft import col - - df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) - df.select(col("*") * 3).show() - -.. code:: none - - ╭───────┬───────╮ - │ A ┆ B │ - │ --- ┆ --- │ - │ Int64 ┆ Int64 │ - ╞═══════╪═══════╡ - │ 3 ┆ 12 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 6 ┆ 15 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 9 ┆ 18 │ - ╰───────┴───────╯ - -Literals -^^^^^^^^ - -You may find yourself needing to hardcode a "single value" oftentimes as an expression. Daft provides a :func:`~daft.expressions.lit` helper to do so: - -.. 
code:: python - - from daft import lit - - # Refers to an expression which always evaluates to 42 - lit(42) - -This special :func:`~daft.expressions.lit` expression we just created evaluates always to the value ``42``. - -.. _userguide-numeric-expressions: - -Numeric Expressions -------------------- - -Since column "A" is an integer, we can run numeric computation such as addition, division and checking its value. Here are some examples where we create new columns using the results of such computations: - -.. code:: python - - # Add 1 to each element in column "A" - df = df.with_column("A_add_one", df["A"] + 1) - - # Divide each element in column A by 2 - df = df.with_column("A_divide_two", df["A"] / 2.) - - # Check if each element in column A is more than 1 - df = df.with_column("A_gt_1", df["A"] > 1) - - df.collect() - -.. code:: none - - +---------+-------------+----------------+-----------+ - | A | A_add_one | A_divide_two | A_gt_1 | - | Int64 | Int64 | Float64 | Boolean | - +=========+=============+================+===========+ - | 1 | 2 | 0.5 | false | - +---------+-------------+----------------+-----------+ - | 2 | 3 | 1 | true | - +---------+-------------+----------------+-----------+ - | 3 | 4 | 1.5 | true | - +---------+-------------+----------------+-----------+ - (Showing first 3 of 3 rows) - -Notice that the returned types of these operations are also well-typed according to their input types. For example, calling ``df["A"] > 1`` returns a column of type :meth:`Boolean `. - -Both the :meth:`Float ` and :meth:`Int ` types are numeric types, and inherit many of the same arithmetic Expression operations. You may find the full list of numeric operations in the :ref:`Expressions API reference `. - -.. _userguide-string-expressions: - -String Expressions ------------------- - -Daft also lets you have columns of strings in a DataFrame. Let's take a look! - -.. code:: python - - df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) - df.show() - -.. 
code:: none - - +--------+ - | B | - | Utf8 | - +========+ - | foo | - +--------+ - | bar | - +--------+ - | baz | - +--------+ - (Showing first 3 rows) - -Unlike the numeric types, the string type does not support arithmetic operations such as ``*`` and ``/``. The one exception to this is the ``+`` operator, which is overridden to concatenate two string expressions as is commonly done in Python. Let's try that! - -.. code:: python - - df = df.with_column("B2", df["B"] + "foo") - df.show() - -.. code:: none - - +--------+--------+ - | B | B2 | - | Utf8 | Utf8 | - +========+========+ - | foo | foofoo | - +--------+--------+ - | bar | barfoo | - +--------+--------+ - | baz | bazfoo | - +--------+--------+ - (Showing first 3 rows) - -There are also many string operators that are accessed through a separate :meth:`.str.* ` "method namespace". - -For example, to check if each element in column "B" contains the substring "a", we can use the :meth:`.str.contains ` method: - -.. code:: python - - df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) - df.show() - -.. code:: none - - +--------+--------+-----------------+ - | B | B2 | B2_contains_B | - | Utf8 | Utf8 | Boolean | - +========+========+=================+ - | foo | foofoo | true | - +--------+--------+-----------------+ - | bar | barfoo | true | - +--------+--------+-----------------+ - | baz | bazfoo | true | - +--------+--------+-----------------+ - (Showing first 3 rows) - -You may find a full list of string operations in the :ref:`Expressions API reference `. - -URL Expressions -^^^^^^^^^^^^^^^ - -One special case of a String column you may find yourself working with is a column of URL strings. - -Daft provides the :meth:`.url.* ` method namespace with functionality for working with URL strings. For example, to download data from URLs: - -.. 
code:: python - - df = daft.from_pydict({ - "urls": [ - "https://www.google.com", - "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", - ], - }) - df = df.with_column("data", df["urls"].url.download()) - df.collect() - -.. code:: none - - +----------------------+----------------------+ - | urls | data | - | Utf8 | Binary | - +======================+======================+ - | https://www.google.c | b'`_ as the underlying executor, so you can find the full list of supported filters in the `jaq documentation `_. - -.. _userguide-logical-expressions: - -Logical Expressions -------------------- - -Logical Expressions are an expression that refers to a column of type :meth:`Boolean `, and can only take on the values True or False. - -.. code:: python - - df = daft.from_pydict({"C": [True, False, True]}) - df["C"] - -Daft supports logical operations such as ``&`` (and) and ``|`` (or) between logical expressions. - -Comparisons -^^^^^^^^^^^ - -Many of the types in Daft support comparisons between expressions that returns a Logical Expression. - -For example, here we can compare if each element in column "A" is equal to elements in column "B": - -.. code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) - - df = df.with_column("A_eq_B", df["A"] == df["B"]) - - df.collect() - -.. code:: none - - +---------+---------+-----------+ - | A | B | A_eq_B | - | Int64 | Int64 | Boolean | - +=========+=========+===========+ - | 1 | 1 | true | - +---------+---------+-----------+ - | 2 | 2 | true | - +---------+---------+-----------+ - | 3 | 4 | false | - +---------+---------+-----------+ - (Showing first 3 of 3 rows) - -Other useful comparisons can be found in the :ref:`Expressions API reference `. - -If Else Pattern -^^^^^^^^^^^^^^^ - -The :meth:`.if_else() ` method is a useful expression to have up your sleeve for choosing values between two other expressions based on a logical expression: - -.. 
code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) - - # Pick values from column A if the value in column A is bigger - # than the value in column B. Otherwise, pick values from column B. - df = df.with_column( - "A_if_bigger_else_B", - (df["A"] > df["B"]).if_else(df["A"], df["B"]), - ) - - df.collect() - -.. code:: none - - +---------+---------+----------------------+ - | A | B | A_if_bigger_else_B | - | Int64 | Int64 | Int64 | - +=========+=========+======================+ - | 1 | 0 | 1 | - +---------+---------+----------------------+ - | 2 | 2 | 2 | - +---------+---------+----------------------+ - | 3 | 4 | 4 | - +---------+---------+----------------------+ - (Showing first 3 of 3 rows) - -This is a useful expression for cleaning your data! diff --git a/docs/source/user_guide/basic_concepts/introduction.rst b/docs/source/user_guide/basic_concepts/introduction.rst deleted file mode 100644 index 2fa1c8fa94..0000000000 --- a/docs/source/user_guide/basic_concepts/introduction.rst +++ /dev/null @@ -1,92 +0,0 @@ -Introduction -============ - -Daft is a distributed query engine with a DataFrame API. The two key concepts to Daft are: - -1. :class:`DataFrame `: a Table-like structure that represents rows and columns of data -2. :class:`Expression `: a symbolic representation of computation that transforms columns of the DataFrame to a new one. - -With Daft, you create :class:`DataFrame ` from a variety of sources (e.g. reading data from files, data catalogs or from Python dictionaries) and use :class:`Expression ` to manipulate data in that DataFrame. Let's take a closer look at these two abstractions! - -DataFrame ---------- - -Conceptually, a DataFrame is a "table" of data, with rows and columns. - -.. 
image:: /_static/daft_illustration.png - :alt: Daft python dataframes make it easy to load any data such as PDF documents, images, protobufs, csv, parquet and audio files into a table dataframe structure for easy querying - :width: 500 - :align: center - -Using this abstraction of a DataFrame, you can run common tabular operations such as: - -1. Filtering rows: :meth:`df.where(...) ` -2. Creating new columns as a computation of existing columns: :meth:`df.with_column(...) ` -3. Joining two tables together: :meth:`df.join(...) ` -4. Sorting a table by the values in specified column(s): :meth:`df.sort(...) ` -5. Grouping and aggregations: :meth:`df.groupby(...).agg(...) ` - -Daft DataFrames are: - -1. **Distributed:** your data is split into *Partitions* and can be processed in parallel/on different machines -2. **Lazy:** computations are enqueued in a query plan which is then optimized and executed only when requested -3. **Multimodal:** columns can contain complex datatypes such as tensors, images and Python objects - -Since Daft is lazy, it can actually execute the query plan on a variety of different backends. By default, it will run computations locally using Python multithreading. However if you need to scale to large amounts of data that cannot be processed on a single machine, using the Ray runner allows Daft to run computations on a `Ray `_ cluster instead. - -Expressions ------------ - -The other important concept to understand when working with Daft are **expressions**. - -Because Daft is "lazy", it needs a way to represent computations that need to be performed on its data so that it can execute these computations at some later time. The answer to this is an :class:`~daft.expressions.Expression`! - -The simplest Expressions are: - -1. The column expression: :func:`col("a") ` which is used to refer to "some column named 'a'" -2. 
Or, if you already have an existing DataFrame ``df`` with a column named "a", you can refer to its column with Python's square bracket indexing syntax: ``df["a"]`` -3. The literal expression: :func:`lit(100) ` which represents a column that always takes on the provided value - -Daft then provides an extremely rich Expressions library to allow you to compose different computations that need to happen. For example: - -.. code:: python - - from daft import col, DataType - - # Take the column named "a" and add 1 to each element - col("a") + 1 - - # Take the column named "a", cast it to a string and check each element, returning True if it starts with "1" - col("a").cast(DataType.string()).str.startswith("1") - -Expressions are used in DataFrame operations, and the names of these Expressions are resolved to column names on the DataFrame that they are running on. Here is an example: - -.. code:: python - - import daft - - # Create a dataframe with a column "a" that has values [1, 2, 3] - df = daft.from_pydict({"a": [1, 2, 3]}) - - # Create new columns called "a_plus_1" and "a_startswith_1" using Expressions - df = df.select( - col("a"), - (col("a") + 1).alias("a_plus_1"), - col("a").cast(DataType.string()).str.startswith("1").alias("a_startswith_1"), - ) - - df.show() - -.. 
code:: none
-
-    +---------+------------+------------------+
-    | a       | a_plus_1   | a_startswith_1   |
-    | Int64   | Int64      | Boolean          |
-    +=========+============+==================+
-    | 1       | 2          | true             |
-    +---------+------------+------------------+
-    | 2       | 3          | false            |
-    +---------+------------+------------------+
-    | 3       | 4          | false            |
-    +---------+------------+------------------+
-    (Showing first 3 rows)
diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst
index e79607a84d..653240bed9 100644
--- a/docs/source/user_guide/index.rst
+++ b/docs/source/user_guide/index.rst
@@ -6,6 +6,8 @@ Daft User Guide
    :maxdepth: 1
 
    basic_concepts
+   read-and-write
+   sql
    daft_in_depth
    poweruser
    integrations
@@ -14,22 +16,7 @@ Daft User Guide
 
 Welcome to **Daft**!
 
-Daft is a Python dataframe library that enables Pythonic data processing at large scale.
-
-* **Fast** - Daft kernels are written and accelerated using Rust on Apache Arrow arrays.
-
-* **Flexible** - you can work with any Python object in a Daft Dataframe.
-
-* **Interactive** - Daft provides a first-class notebook experience.
-
-* **Scalable** - Daft uses out-of-core algorithms to work with datasets that cannot fit in memory.
-
-* **Distributed** - Daft scales to a cluster of machines using Ray to crunch terabytes of data.
-
-* **Intelligent** - Daft performs query optimizations to speed up your work.
-
-The core interface provided by Daft is the *DataFrame*, which is a table of data consisting of rows and columns. This user guide
-aims to help Daft users master the usage of the Daft *DataFrame* for all your data processing needs!
+This user guide aims to help Daft users master the usage of Daft for all your data needs.
 
 .. NOTE::
 
@@ -39,8 +26,7 @@ aims to help Daft users master the usage of the Daft *DataFrame* for all your da
    code you may wish to take a look at these resources:
 
    1. :doc:`../10-min`: Itching to run some Daft code? Hit the ground running with our 10 minute quickstart notebook.
-   2. 
(Coming soon!) Cheatsheet: Quick reference to commonly-used Daft APIs and usage patterns - useful to keep next to your laptop as you code! - 3. :doc:`../api_docs/index`: Searchable documentation and reference material to Daft's public Python API. + 2. :doc:`../api_docs/index`: Searchable documentation and reference material to Daft's public API. Table of Contents ----------------- @@ -52,6 +38,11 @@ The Daft User Guide is laid out as follows: High-level overview of Daft interfaces and usage to give you a better understanding of how Daft will fit into your day-to-day workflow. +:doc:`Structured Query Language (SQL) ` +******************************************** + +A look into Daft's SQL interface and how it complements Daft's Pythonic DataFrame APIs. + :doc:`Daft in Depth ` ************************************ diff --git a/docs/source/user_guide/basic_concepts/read-and-write.rst b/docs/source/user_guide/read-and-write.rst similarity index 64% rename from docs/source/user_guide/basic_concepts/read-and-write.rst rename to docs/source/user_guide/read-and-write.rst index 1d1a481fea..f8585111d9 100644 --- a/docs/source/user_guide/basic_concepts/read-and-write.rst +++ b/docs/source/user_guide/read-and-write.rst @@ -1,5 +1,5 @@ -Reading/Writing -=============== +Reading/Writing Data +==================== Daft can read data from a variety of sources, and write data to many destinations. @@ -37,7 +37,7 @@ To learn more about each of these constructors, as well as the options that they From Data Catalogs ^^^^^^^^^^^^^^^^^^ -If you use catalogs such as Apache Iceberg or Hive, you may wish to consult our user guide on integrations with Data Catalogs: :doc:`Daft integration with Data Catalogs <../integrations/>`. +If you use catalogs such as Apache Iceberg or Hive, you may wish to consult our user guide on integrations with Data Catalogs: :doc:`Daft integration with Data Catalogs `. 
From File Paths ^^^^^^^^^^^^^^^ @@ -87,7 +87,50 @@ In order to partition the data, you can specify a partition column, which will a # Read with a partition column df = daft.read_sql("SELECT * FROM my_table", partition_col="date", uri) -To learn more, consult the :doc:`SQL User Guide <../integrations/sql>` or the API documentation on :func:`daft.read_sql`. +To learn more, consult the :doc:`SQL User Guide ` or the API documentation on :func:`daft.read_sql`. + + +Reading a column of URLs +------------------------ + +Daft provides a convenient way to read data from a column of URLs using the :meth:`.url.download() ` method. This is particularly useful when you have a DataFrame with a column containing URLs pointing to external resources that you want to fetch and incorporate into your DataFrame. + +Here's an example of how to use this feature: + +.. code:: python + + # Assume we have a DataFrame with a column named 'image_urls' + df = daft.from_pydict({ + "image_urls": [ + "https://example.com/image1.jpg", + "https://example.com/image2.jpg", + "https://example.com/image3.jpg" + ] + }) + + # Download the content from the URLs and create a new column 'image_data' + df = df.with_column("image_data", df["image_urls"].url.download()) + df.show() + +.. code-block:: text + :caption: Output + + +------------------------------------+------------------------------------+ + | image_urls | image_data | + | Utf8 | Binary | + +====================================+====================================+ + | https://example.com/image1.jpg | b'\xff\xd8\xff\xe0\x00\x10JFIF...' | + +------------------------------------+------------------------------------+ + | https://example.com/image2.jpg | b'\xff\xd8\xff\xe0\x00\x10JFIF...' | + +------------------------------------+------------------------------------+ + | https://example.com/image3.jpg | b'\xff\xd8\xff\xe0\x00\x10JFIF...' 
| + +------------------------------------+------------------------------------+ + + (Showing first 3 of 3 rows) + + +This approach allows you to efficiently download and process data from a large number of URLs in parallel, leveraging Daft's distributed computing capabilities. + Writing Data diff --git a/docs/source/user_guide/sql.rst b/docs/source/user_guide/sql.rst new file mode 100644 index 0000000000..6e99c2fc34 --- /dev/null +++ b/docs/source/user_guide/sql.rst @@ -0,0 +1,2 @@ +SQL +=== From 03f54817a984af07367cab422f1798a5ebcbdae8 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Thu, 26 Sep 2024 02:49:44 -0700 Subject: [PATCH 06/21] Working my way through basic concepts --- docs/source/conf.py | 1 + docs/source/index.rst | 21 +- docs/source/user_guide/basic_concepts.rst | 381 ++++++++++++++++------ requirements-dev.txt | 1 + 4 files changed, 304 insertions(+), 100 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 36e66be49a..a52c35c635 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -45,6 +45,7 @@ "myst_nb", "sphinx_copybutton", "sphinx_autosummary_accessors", + "sphinx_tabs.tabs", ] templates_path = ["_templates", sphinx_autosummary_accessors.templates_path] diff --git a/docs/source/index.rst b/docs/source/index.rst index 5f5fb318f1..6ee5c431b7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,26 +13,29 @@ Use-Cases Data Engineering **************** -*Provides the local performance and memory stability of DuckDB/Polars with the scalability of Apache Spark* +*Combine the performance of DuckDB, Pythonic UX of Polars and scalability of Apache Spark for data engineering from MB to PB scale* -* **Extract → Transform → Load (ETL):** Perform data engineering on messy multimodal data at scales ranging from MB to PB, on a single node or a distributed cluster -* **Cloud-native:** Native integrations with modern cloud storage (e.g. S3), open catalogs/table formats (e.g. 
Apache Iceberg, DeltaLake) and open data formats (e.g. Apache Parquet) +* Scale ETL workflows effortlessly from local to distributed environments +* Enjoy a Python-first experience without JVM dependency hell +* Leverage native integrations with cloud storage, open catalogs, and data formats Data Analytics ************** -*Provides a SQL interface with the snappiness of local engines such as DuckDB and scalability of engines such as Spark/Trino* +*Blend the snappiness of DuckDB with the scalability of Spark/Trino for unified local and distributed analytics* -* **Local Analytics:** Snappy interactive data exploration and aggregations from Python notebooks using DataFrames or SQL with the performance/development experience of local engines such as DuckDB/Polars -* **Distributed Analytics:** Powerful capabilities to scale to the cloud when required to process larger datasets, outperforming distributed analytics engines such as Spark and Trino +* Utilize complementary SQL and Python interfaces for versatile analytics +* Perform snappy local exploration with DuckDB-like performance +* Seamlessly scale to the cloud, outperforming distributed engines like Spark and Trino ML/AI ***** -*Replaces opinionated data formats such as Mosaic Data Shard (MDS) or TFRecords with dataloading directly from open formats (Apache Parquet, JPEG) into Pytorch or Numpy while saturating network bandwidth* +*Streamline ML/AI workflows with efficient dataloading from open formats like Parquet and JPEG* -* **Dataloading for training:** Fast and memory efficient dataloaders from open file formats such as Parquet and JPEG -* **Model batch inference on GPUs:** Schedule large-scale model batch inference on a fleet of GPUs on a distributed cluster. 
+* Load data efficiently from open formats directly into PyTorch or NumPy +* Schedule large-scale model batch inference on distributed GPU clusters +* Optimize data curation with advanced clustering, deduplication, and filtering Technology ---------- diff --git a/docs/source/user_guide/basic_concepts.rst b/docs/source/user_guide/basic_concepts.rst index b40f074971..224e7e48a4 100644 --- a/docs/source/user_guide/basic_concepts.rst +++ b/docs/source/user_guide/basic_concepts.rst @@ -66,16 +66,20 @@ Creating a Dataframe Let's create our first Dataframe from a Python dictionary of columns. -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python - import daft + .. code:: python - df = daft.from_pydict({ - "A": [1, 2, 3, 4], - "B": [1.5, 2.5, 3.5, 4.5], - "C": [True, True, False, False], - "D": [None, None, None, None], - }) + import daft + + df = daft.from_pydict({ + "A": [1, 2, 3, 4], + "B": [1.5, 2.5, 3.5, 4.5], + "C": [True, True, False, False], + "D": [None, None, None, None], + }) Examine your Dataframe by printing it: @@ -103,9 +107,21 @@ Examine your Dataframe by printing it: Congratulations - you just created your first DataFrame! It has 4 columns, "A", "B", "C", and "D". Let's try to select only the "A", "B", and "C" columns: -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.select("A", "B", "C") + df - df.select("A", "B", "C") + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.sql("SELECT A, B, C FROM df") + df .. code-block:: text :caption: Output @@ -135,26 +151,32 @@ In this case, Daft is just deferring the work required to read the data and sele We can tell Daft to execute our DataFrame and cache the results using :meth:`df.collect() `: -.. code:: python +.. tabs:: - df.collect() - df + .. group-tab:: 🐍 Python + + .. code:: python + + df.collect() + df .. 
code-block:: text :caption: Output - +---------+-----------+-----------+ - | A | B | C | - | Int64 | Float64 | Boolean | - +=========+===========+===========+ - | 1 | 1.5 | true | - +---------+-----------+-----------+ - | 2 | 2.5 | true | - +---------+-----------+-----------+ - | 3 | 3.5 | false | - +---------+-----------+-----------+ - | 4 | 4.5 | false | - +---------+-----------+-----------+ + ╭───────┬─────────┬─────────┬──────╮ + │ A ┆ B ┆ C ┆ D │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ Int64 ┆ Float64 ┆ Boolean ┆ Null │ + ╞═══════╪═════════╪═════════╪══════╡ + │ 1 ┆ 1.5 ┆ true ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 2 ┆ 2.5 ┆ true ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 3 ┆ 3.5 ┆ false ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 4 ┆ 4.5 ┆ false ┆ None │ + ╰───────┴─────────┴─────────┴──────╯ + (Showing first 4 of 4 rows) Now your DataFrame object ``df`` is **materialized** - Daft has executed all the steps required to compute the results, and has cached the results in memory so that it can display this preview. @@ -173,16 +195,55 @@ However, data science is all about experimentation and trying different things o We suggest materializing DataFrames using :meth:`df.collect() ` when they contain expensive operations (e.g. sorts or expensive function calls) and have to be called multiple times by downstream code: -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.sort("A") # expensive sort + df.collect() # materialize the DataFrame - df = df.with_column("A", df["A"].apply(expensive_function)) # expensive function - df = df.sort("A") # expensive sort - df.collect() # materialize the DataFrame + # All subsequent work on df avoids recomputing previous steps + df.sum("B").show() + df.mean("B").show() + df.with_column("try_this", df["A"] + 1).show(5) + + .. group-tab:: ⚙️ SQL + + .. 
code:: python + + df = daft.sql("SELECT * FROM df ORDER BY A") + df.collect() + + # All subsequent work on df avoids recomputing previous steps + daft.sql("SELECT sum(B) FROM df").show() + daft.sql("SELECT mean(B) FROM df").show() + daft.sql("SELECT *, (A + 1) AS try_this FROM df").show(5) + +.. code-block:: text + :caption: Output + + ╭─────────╮ + │ B │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 12 │ + ╰─────────╯ + + (Showing first 1 of 1 rows) + + ╭─────────╮ + │ B │ + │ --- │ + │ Float64 │ + ╞═════════╡ + │ 3 │ + ╰─────────╯ + + (Showing first 1 of 1 rows) - # All subsequent work on df avoids recomputing previous steps - df.sum().show() - df.mean().show() - df.with_column("try_this", df["A"] + 1).show(5) In many other cases however, there are better options than materializing your entire DataFrame with :meth:`df.collect() `: @@ -209,9 +270,19 @@ To run computations on data in our DataFrame, we use Expressions. The following statement will :meth:`df.show() ` a DataFrame that has only one column - the column ``A`` from our original DataFrame but with every row incremented by 1. -.. code:: python +.. tabs:: - df.select(df["A"] + 1).show() + .. group-tab:: 🐍 Python + + .. code:: python + + df.select(df["A"] + 1).show() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + daft.sql("SELECT A + 1 FROM df") .. code-block:: text :caption: Output @@ -234,11 +305,23 @@ The following statement will :meth:`df.show() ` a DataFrame A common pattern is to create a new columns using ``DataFrame.with_column``: - .. code:: python + .. tabs:: + + .. group-tab:: 🐍 Python - # Creates a new column named "foo" which takes on values - # of column "A" incremented by 1 - df = df.with_column("foo", df["A"] + 1) + .. code:: python + + # Creates a new column named "foo" which takes on values + # of column "A" incremented by 1 + df = df.with_column("foo", df["A"] + 1) + + .. group-tab:: ⚙️ SQL + + .. 
code:: python
+
+            # Creates a new column named "foo" which takes on values
+            # of column "A" incremented by 1
+            df = daft.sql("SELECT *, A + 1 AS foo FROM df")
 
 Congratulations, you have just written your first **Expression**: ``df["A"] + 1``!
 
@@ -255,10 +338,20 @@ Referring to a column in a DataFrame
 
 Most commonly you will be creating expressions by using the :func:`daft.col` function.
 
-.. code:: python
+.. tabs::
+
+    .. group-tab:: 🐍 Python
 
-    # Refers to column "A"
-    daft.col("A")
+        .. code:: python
+
+            # Refers to column "A"
+            daft.col("A")
+
+    .. group-tab:: ⚙️ SQL
+
+        .. code:: python
+
+            daft.sql_expr("A")
 
 .. code-block:: text
     :caption: Output
 
     col(A)
 
@@ -272,30 +365,43 @@ Using SQL
 
 Daft can also parse valid SQL as expressions.
 
-.. code:: python
+.. tabs::
+
+    .. group-tab:: ⚙️ SQL
 
-    from daft import sql_expr
+        .. code:: python
 
-    sql_expr("A + 1")
+            daft.sql_expr("A + 1")
 
 .. code-block:: text
     :caption: Output
 
     col(A) + lit(1)
 
-The above code will create an expression representing "the column named 'x' incremented by 1".
+The above code will create an expression representing "the column named 'A' incremented by 1". For many APIs, sql_expr will actually be applied for you as syntactic sugar!
 
 Literals
 ########
 
 You may find yourself needing to hardcode a "single value" oftentimes as an expression. Daft provides a :func:`~daft.expressions.lit` helper to do so:
 
-.. code:: python
+.. tabs::
 
-    from daft import lit
+    .. group-tab:: 🐍 Python
 
-    # Refers to an expression which always evaluates to 42
-    lit(42)
+        .. code:: python
+
+            from daft import lit
+
+            # Refers to an expression which always evaluates to 42
+            lit(42)
+
+    .. group-tab:: ⚙️ SQL
+
+        .. code:: python
+
+            # Refers to an expression which always evaluates to 42
+            daft.sql_expr("42")
 
 .. code-block:: text
     :caption: Output
 
     lit(42)
 
@@ -309,13 +415,17 @@ Wildcard Expressions
 
 You can create expressions on multiple columns at once using a wildcard. 
The expression `col("*")` selects every column in a DataFrame, and you can operate on this expression in the same way as a single column: -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python - import daft - from daft import col + import daft + from daft import col - df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) - df.select(col("*") * 3).show() + df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.select(col("*") * 3).show() .. code-block:: text :caption: Output @@ -342,18 +452,36 @@ Numeric Expressions Since column "A" is an integer, we can run numeric computation such as addition, division and checking its value. Here are some examples where we create new columns using the results of such computations: -.. code:: python +.. tabs:: - # Add 1 to each element in column "A" - df = df.with_column("A_add_one", df["A"] + 1) + .. group-tab:: 🐍 Python - # Divide each element in column A by 2 - df = df.with_column("A_divide_two", df["A"] / 2.) + .. code:: python - # Check if each element in column A is more than 1 - df = df.with_column("A_gt_1", df["A"] > 1) + # Add 1 to each element in column "A" + df = df.with_column("A_add_one", df["A"] + 1) - df.collect() + # Divide each element in column A by 2 + df = df.with_column("A_divide_two", df["A"] / 2.) + + # Check if each element in column A is more than 1 + df = df.with_column("A_gt_1", df["A"] > 1) + + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.sql(""" + SELECT + *, + A + 1 AS A_add_one, + A / 2.0 AS A_divide_two, + A > 1 AS A_gt_1 + FROM df + """) + df.collect() .. code-block:: text :caption: Output @@ -381,10 +509,14 @@ String Expressions Daft also lets you have columns of strings in a DataFrame. Let's take a look! -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python - df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) - df.show() + .. code:: python + + df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) + df.show() .. 
code-block:: text :caption: Output @@ -403,10 +535,21 @@ Daft also lets you have columns of strings in a DataFrame. Let's take a look! Unlike the numeric types, the string type does not support arithmetic operations such as ``*`` and ``/``. The one exception to this is the ``+`` operator, which is overridden to concatenate two string expressions as is commonly done in Python. Let's try that! -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.with_column("B2", df["B"] + "foo") + df.show() + + .. group-tab:: ⚙️ SQL - df = df.with_column("B2", df["B"] + "foo") - df.show() + .. code:: python + + df = daft.sql("SELECT *, B + 'foo' AS B2 FROM df") + df.show() .. code-block:: text :caption: Output @@ -427,10 +570,21 @@ There are also many string operators that are accessed through a separate :meth: For example, to check if each element in column "B" contains the substring "a", we can use the :meth:`.str.contains ` method: -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) + df.show() + + .. group-tab:: ⚙️ SQL - df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) - df.show() + .. code:: python + + df = daft.sql("SELECT *, contains(B2, B) AS B2_contains_B FROM df") + df.show() .. code-block:: text :caption: Output @@ -456,16 +610,39 @@ One special case of a String column you may find yourself working with is a colu Daft provides the :meth:`.url.* ` method namespace with functionality for working with URL strings. For example, to download data from URLs: -.. code:: python +.. tabs:: - df = daft.from_pydict({ - "urls": [ - "https://www.google.com", - "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", - ], - }) - df = df.with_column("data", df["urls"].url.download()) - df.collect() + .. group-tab:: 🐍 Python + + .. 
code:: python + + df = daft.from_pydict({ + "urls": [ + "https://www.google.com", + "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", + ], + }) + df = df.with_column("data", df["urls"].url.download()) + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + + df = daft.from_pydict({ + "urls": [ + "https://www.google.com", + "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", + ], + }) + df = daft.sql(""" + SELECT + urls, + url_download(urls) AS data + FROM df + """) + df.collect() .. code-block:: text :caption: Output @@ -493,16 +670,38 @@ JSON Expressions If you have a column of JSON strings, Daft provides the :meth:`.json.* ` method namespace to run `JQ-style filters `_ on them. For example, to extract a value from a JSON object: -.. code:: python - - df = daft.from_pydict({ - "json": [ - '{"a": 1, "b": 2}', - '{"a": 3, "b": 4}', - ], - }) - df = df.with_column("a", df["json"].json.query(".a")) - df.collect() +.. tab-set:: + + .. tab-item:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({ + "json": [ + '{"a": 1, "b": 2}', + '{"a": 3, "b": 4}', + ], + }) + df = df.with_column("a", df["json"].json.query(".a")) + df.collect() + + .. tab-item:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({ + "json": [ + '{"a": 1, "b": 2}', + '{"a": 3, "b": 4}', + ], + }) + df = daft.sql(""" + SELECT + json, + json_query(json, '$.a') AS a + FROM df + """) + df.collect() .. 
code-block:: text :caption: Output diff --git a/requirements-dev.txt b/requirements-dev.txt index a67574df90..9c7809ac80 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -88,3 +88,4 @@ sphinx-book-theme==1.1.0; python_version >= "3.9" sphinx-reredirects>=0.1.1 sphinx-copybutton>=0.5.2 sphinx-autosummary-accessors==2023.4.0; python_version >= "3.9" +sphinx-tabs==3.4.5 From 87f8478a642b256b1e956f32a33e0da3bedad900 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Thu, 26 Sep 2024 03:08:41 -0700 Subject: [PATCH 07/21] First pass through basic_concepts --- docs/source/user_guide/basic_concepts.rst | 130 +++++++++++++++------- 1 file changed, 90 insertions(+), 40 deletions(-) diff --git a/docs/source/user_guide/basic_concepts.rst b/docs/source/user_guide/basic_concepts.rst index 224e7e48a4..64ef5ccd5f 100644 --- a/docs/source/user_guide/basic_concepts.rst +++ b/docs/source/user_guide/basic_concepts.rst @@ -670,9 +670,9 @@ JSON Expressions If you have a column of JSON strings, Daft provides the :meth:`.json.* ` method namespace to run `JQ-style filters `_ on them. For example, to extract a value from a JSON object: -.. tab-set:: +.. tabs:: - .. tab-item:: 🐍 Python + .. group-tab:: 🐍 Python .. code:: python @@ -685,7 +685,7 @@ If you have a column of JSON strings, Daft provides the :meth:`.json.* `, and can only take on the values True or False. -.. code:: python +.. tabs:: - df = daft.from_pydict({"C": [True, False, True]}) - df["C"] + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"C": [True, False, True]}) Daft supports logical operations such as ``&`` (and) and ``|`` (or) between logical expressions. @@ -741,27 +744,49 @@ Many of the types in Daft support comparisons between expressions that returns a For example, here we can compare if each element in column "A" is equal to elements in column "B": -.. code:: python +.. tabs:: - df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + .. 
group-tab:: 🐍 Python - df = df.with_column("A_eq_B", df["A"] == df["B"]) + .. code:: python - df.collect() + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + + df = df.with_column("A_eq_B", df["A"] == df["B"]) + + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + + df = daft.sql(""" + SELECT + A, + B, + A = B AS A_eq_B + FROM df + """) + + df.collect() .. code-block:: text :caption: Output - +---------+---------+-----------+ - | A | B | A_eq_B | - | Int64 | Int64 | Boolean | - +=========+=========+===========+ - | 1 | 1 | true | - +---------+---------+-----------+ - | 2 | 2 | true | - +---------+---------+-----------+ - | 3 | 4 | false | - +---------+---------+-----------+ + ╭───────┬───────┬─────────╮ + │ A ┆ B ┆ A_eq_B │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Boolean │ + ╞═══════╪═══════╪═════════╡ + │ 1 ┆ 1 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 ┆ false │ + ╰───────┴───────┴─────────╯ + (Showing first 3 of 3 rows) Other useful comparisons can be found in the :ref:`Expressions API reference `. @@ -771,32 +796,57 @@ If Else Pattern The :meth:`.if_else() ` method is a useful expression to have up your sleeve for choosing values between two other expressions based on a logical expression: -.. code:: python +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + + # Pick values from column A if the value in column A is bigger + # than the value in column B. Otherwise, pick values from column B. + df = df.with_column( + "A_if_bigger_else_B", + (df["A"] > df["B"]).if_else(df["A"], df["B"]), + ) - df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + df.collect() + + .. group-tab:: ⚙️ SQL + + .. 
code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) - # Pick values from column A if the value in column A is bigger - # than the value in column B. Otherwise, pick values from column B. - df = df.with_column( - "A_if_bigger_else_B", - (df["A"] > df["B"]).if_else(df["A"], df["B"]), - ) + df = daft.sql(""" + SELECT + A, + B, + CASE + WHEN A > B THEN A + ELSE B + END AS A_if_bigger_else_B + FROM df + """) - df.collect() + df.collect() .. code-block:: text :caption: Output - +---------+---------+----------------------+ - | A | B | A_if_bigger_else_B | - | Int64 | Int64 | Int64 | - +=========+=========+======================+ - | 1 | 0 | 1 | - +---------+---------+----------------------+ - | 2 | 2 | 2 | - +---------+---------+----------------------+ - | 3 | 4 | 4 | - +---------+---------+----------------------+ + ╭───────┬───────┬────────────────────╮ + │ A ┆ B ┆ A_if_bigger_else_B │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪════════════════════╡ + │ 1 ┆ 0 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 ┆ 4 │ + ╰───────┴───────┴────────────────────╯ + (Showing first 3 of 3 rows) This is a useful expression for cleaning your data! 
From fa38e034bf32080a0ef3300ea980bf97eb518fd1 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Thu, 26 Sep 2024 11:43:10 -0700 Subject: [PATCH 08/21] Add SQL docs --- daft/sql/sql.py | 69 +++++++++-- docs/source/api_docs/index.rst | 1 + docs/source/api_docs/sql.rst | 21 ++++ docs/source/user_guide/basic_concepts.rst | 143 +++++++++++++++++----- 4 files changed, 188 insertions(+), 46 deletions(-) create mode 100644 docs/source/api_docs/sql.rst diff --git a/daft/sql/sql.py b/daft/sql/sql.py index 987a9baeb0..cb4ccd7114 100644 --- a/daft/sql/sql.py +++ b/daft/sql/sql.py @@ -1,7 +1,7 @@ # isort: dont-add-import: from __future__ import annotations import inspect -from typing import Optional, overload +from typing import Optional from daft.api_annotations import PublicAPI from daft.context import get_context @@ -41,19 +41,64 @@ def sql_expr(sql: str) -> Expression: return Expression._from_pyexpr(_sql_expr(sql)) -@overload -def sql(sql: str) -> DataFrame: ... - - -@overload -def sql(sql: str, catalog: SQLCatalog, register_globals: bool = ...) -> DataFrame: ... - - @PublicAPI def sql(sql: str, catalog: Optional[SQLCatalog] = None, register_globals: bool = True) -> DataFrame: - """Create a DataFrame from an SQL query. - - EXPERIMENTAL: This features is early in development and will change. + """Run a SQL query, returning the results as a DataFrame + + .. WARNING:: + This features is early in development and will likely experience API changes. + + Examples: + + A simple example joining 2 dataframes together using a SQL statement, relying on Daft to detect the names of + SQL tables using their corresponding Python variable names. 
+ + >>> import daft + >>> + >>> df1 = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]}) + >>> df2 = daft.from_pydict({"a": [1, 2, 3], "c": ["daft", None, None]}) + >>> + >>> # Daft automatically detects `df1` and `df2` from your Python global namespace + >>> result_df = daft.sql("SELECT * FROM df1 JOIN df2 ON df1.a = df2.a") + >>> result_df.show() + ╭───────┬──────┬──────╮ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Utf8 ┆ Utf8 │ + ╞═══════╪══════╪══════╡ + │ 1 ┆ foo ┆ daft │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 2 ┆ bar ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 3 ┆ baz ┆ None │ + ╰───────┴──────┴──────╯ + + (Showing first 3 of 3 rows) + + A more complex example using a SQLCatalog to create a named table called `"my_table"`, which can then be referenced from inside your SQL statement. + + >>> import daft + >>> from daft.sql import SQLCatalog + >>> + >>> df = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]}) + >>> + >>> # Register dataframes as tables in SQL explicitly with names + >>> catalog = SQLCatalog({"my_table": df}) + >>> + >>> daft.sql("SELECT a FROM my_table", catalog=catalog).show() + ╭───────╮ + │ a │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ├╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ╰───────╯ + + (Showing first 3 of 3 rows) Args: sql (str): SQL query to execute diff --git a/docs/source/api_docs/index.rst b/docs/source/api_docs/index.rst index 3079870df6..411d697449 100644 --- a/docs/source/api_docs/index.rst +++ b/docs/source/api_docs/index.rst @@ -8,6 +8,7 @@ API Documentation creation dataframe expressions + sql schema datatype groupby diff --git a/docs/source/api_docs/sql.rst b/docs/source/api_docs/sql.rst new file mode 100644 index 0000000000..34ecf7ed3f --- /dev/null +++ b/docs/source/api_docs/sql.rst @@ -0,0 +1,21 @@ +SQL +=== + +SQL Functions +------------- + +.. autofunction:: daft.sql + +.. autofunction:: daft.sql_expr + +SQL Functions +------------- + +This is a full list of functions that can be used from within SQL. 
+ +.. TODO! +.. .. autosummary:: +.. :recursive: +.. :toctree: doc_gen/sql_funcs + +.. daft.sql_func_module diff --git a/docs/source/user_guide/basic_concepts.rst b/docs/source/user_guide/basic_concepts.rst index 64ef5ccd5f..aa6edc4e22 100644 --- a/docs/source/user_guide/basic_concepts.rst +++ b/docs/source/user_guide/basic_concepts.rst @@ -42,6 +42,8 @@ Structured Query Language (SQL) SQL is a common query language for expressing queries over tables of data. Daft exposes a SQL API as an alternative (but often also complementary API) to the Python :class:`DataFrame ` and :class:`Expression ` APIs for building queries. +You can use SQL in Daft via the :func:`daft.sql` function, and Daft will also convert many SQL-compatible strings into Expressions via :func:`daft.sql_expr` for easy interoperability with DataFrames. + DataFrame --------- @@ -90,18 +92,20 @@ Examine your Dataframe by printing it: .. code-block:: text :caption: Output - +---------+-----------+-----------+-----------+ - | A | B | C | D | - | Int64 | Float64 | Boolean | Null | - +=========+===========+===========+===========+ - | 1 | 1.5 | true | None | - +---------+-----------+-----------+-----------+ - | 2 | 2.5 | true | None | - +---------+-----------+-----------+-----------+ - | 3 | 3.5 | false | None | - +---------+-----------+-----------+-----------+ - | 4 | 4.5 | false | None | - +---------+-----------+-----------+-----------+ + ╭───────┬─────────┬─────────┬──────╮ + │ A ┆ B ┆ C ┆ D │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ Int64 ┆ Float64 ┆ Boolean ┆ Null │ + ╞═══════╪═════════╪═════════╪══════╡ + │ 1 ┆ 1.5 ┆ true ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 2 ┆ 2.5 ┆ true ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 3 ┆ 3.5 ┆ false ┆ None │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + │ 4 ┆ 4.5 ┆ false ┆ None │ + ╰───────┴─────────┴─────────┴──────╯ + (Showing first 4 of 4 rows) @@ -126,11 +130,12 @@ Congratulations - you just created your first DataFrame! It has 4 columns, "A", .. 
code-block:: text :caption: Output - +---------+-----------+-----------+ - | A | B | C | - | Int64 | Float64 | Boolean | - +=========+===========+===========+ - +---------+-----------+-----------+ + ╭───────┬─────────┬─────────╮ + │ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Float64 ┆ Boolean │ + ╰───────┴─────────┴─────────╯ + (No data to display: Dataframe not materialized) @@ -149,7 +154,38 @@ In this case, Daft is just deferring the work required to read the data and sele Passing the ``show_all=True`` argument will show you the plan after Daft applies its query optimizations and the physical (lower-level) plan. -We can tell Daft to execute our DataFrame and cache the results using :meth:`df.collect() `: + .. code-block:: text + :caption: Plan Output + + == Unoptimized Logical Plan == + + * Project: col(A), col(B), col(C) + | + * Source: + | Number of partitions = 1 + | Output schema = A#Int64, B#Float64, C#Boolean, D#Null + + + == Optimized Logical Plan == + + * Project: col(A), col(B), col(C) + | + * Source: + | Number of partitions = 1 + | Output schema = A#Int64, B#Float64, C#Boolean, D#Null + + + == Physical Plan == + + * Project: col(A), col(B), col(C) + | Clustering spec = { Num partitions = 1 } + | + * InMemoryScan: + | Schema = A#Int64, B#Float64, C#Boolean, D#Null, + | Size bytes = 65, + | Clustering spec = { Num partitions = 1 } + +We can tell Daft to execute our DataFrame and store the results in-memory using :meth:`df.collect() `: .. tabs:: @@ -244,6 +280,22 @@ We suggest materializing DataFrames using :meth:`df.collect() `: @@ -263,8 +315,8 @@ Daft can display your DataFrame's schema without materializing it. Under the hoo Under the hood, Daft represents data in the `Apache Arrow `_ format, which allows it to efficiently represent and work on data using high-performance kernels which are written in Rust. 
-Running Computations -^^^^^^^^^^^^^^^^^^^^ +Running Computation with Expressions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To run computations on data in our DataFrame, we use Expressions. @@ -282,24 +334,26 @@ The following statement will :meth:`df.show() ` a DataFrame .. code:: python - daft.sql("SELECT A + 1 FROM df") + daft.sql("SELECT A + 1 FROM df").show() .. code-block:: text :caption: Output - +---------+ - | A | - | Int64 | - +=========+ - | 2 | - +---------+ - | 3 | - +---------+ - | 4 | - +---------+ - | 5 | - +---------+ - (Showing first 4 rows) + ╭───────╮ + │ A │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 3 │ + ├╌╌╌╌╌╌╌┤ + │ 4 │ + ├╌╌╌╌╌╌╌┤ + │ 5 │ + ╰───────╯ + + (Showing first 4 of 4 rows) .. NOTE:: @@ -314,6 +368,7 @@ The following statement will :meth:`df.show() ` a DataFrame # Creates a new column named "foo" which takes on values # of column "A" incremented by 1 df = df.with_column("foo", df["A"] + 1) + df.show() .. group-tab:: ⚙️ SQL @@ -322,6 +377,26 @@ The following statement will :meth:`df.show() ` a DataFrame # Creates a new column named "foo" which takes on values # of column "A" incremented by 1 df = daft.sql("SELECT *, A + 1 AS foo FROM df") + df.show() + +.. code-block:: text + :caption: Output + + ╭───────┬─────────┬─────────┬───────╮ + │ A ┆ B ┆ C ┆ foo │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ Int64 ┆ Float64 ┆ Boolean ┆ Int64 │ + ╞═══════╪═════════╪═════════╪═══════╡ + │ 1 ┆ 1.5 ┆ true ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 2.5 ┆ true ┆ 3 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 3.5 ┆ false ┆ 4 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 4 ┆ 4.5 ┆ false ┆ 5 │ + ╰───────┴─────────┴─────────┴───────╯ + + (Showing first 4 of 4 rows) Congratulations, you have just written your first **Expression**: ``df["A"] + 1``! 
From 9d3df3d07a81ad78a895321180b50e80b52d1c7a Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Fri, 27 Sep 2024 09:33:53 -0700 Subject: [PATCH 09/21] Restructure --- docs/source/conf.py | 5 + .../{daft_in_depth => }/aggregations.rst | 0 docs/source/user_guide/basic_concepts.rst | 530 +--------------- docs/source/user_guide/daft_in_depth.rst | 9 - .../dataframe-operations.rst | 0 .../{daft_in_depth => }/datatypes.rst | 0 docs/source/user_guide/expressions.rst | 584 ++++++++++++++++++ docs/source/user_guide/index.rst | 5 +- .../user_guide/{daft_in_depth => }/udf.rst | 0 9 files changed, 598 insertions(+), 535 deletions(-) rename docs/source/user_guide/{daft_in_depth => }/aggregations.rst (100%) delete mode 100644 docs/source/user_guide/daft_in_depth.rst rename docs/source/user_guide/{daft_in_depth => }/dataframe-operations.rst (100%) rename docs/source/user_guide/{daft_in_depth => }/datatypes.rst (100%) create mode 100644 docs/source/user_guide/expressions.rst rename docs/source/user_guide/{daft_in_depth => }/udf.rst (100%) diff --git a/docs/source/conf.py b/docs/source/conf.py index a52c35c635..108666a328 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -87,6 +87,11 @@ "learn/user_guides/remote_cluster_execution": "distributed-computing.html", "learn/quickstart": "learn/10-min.html", "learn/10-min": "../10-min.html", + "user_guide/basic_concepts/*": "user_guide/basic_concepts.html", + "user_guide/daft_in_depth/aggregations": "user_guide/aggregations", + "user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations", + "user_guide/daft_in_depth/datatypes": "user_guide/datatypes", + "user_guide/daft_in_depth/udf": "user_guide/udf", } # Resolving code links to github diff --git a/docs/source/user_guide/daft_in_depth/aggregations.rst b/docs/source/user_guide/aggregations.rst similarity index 100% rename from docs/source/user_guide/daft_in_depth/aggregations.rst rename to docs/source/user_guide/aggregations.rst diff --git 
a/docs/source/user_guide/basic_concepts.rst b/docs/source/user_guide/basic_concepts.rst index aa6edc4e22..50fb8641cc 100644 --- a/docs/source/user_guide/basic_concepts.rst +++ b/docs/source/user_guide/basic_concepts.rst @@ -66,6 +66,10 @@ Common data operations that you would perform on DataFrames are: Creating a Dataframe ^^^^^^^^^^^^^^^^^^^^ +.. seealso:: + + :doc:`read-and-write` - a more in-depth guide on various options for reading/writing data to/from Daft DataFrames from in-memory data (Python, Arrow), files (Parquet, CSV, JSON), SQL Databases and Data Catalogs + Let's create our first Dataframe from a Python dictionary of columns. .. tabs:: @@ -400,528 +404,4 @@ The following statement will :meth:`df.show() ` a DataFrame Congratulations, you have just written your first **Expression**: ``df["A"] + 1``! -Expressions ------------ - -Expressions are how you can express computations that should be run over columns of data. - -Creating Expressions -^^^^^^^^^^^^^^^^^^^^ - -Referring to a column in a DataFrame -#################################### - -Most commonly you will be creating expressions by using the :func:`daft.col` function. - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - # Refers to column "A" - daft.col("A") - - .. group-tab:: ⚙️ SQL - - .. code:: python - - daft.sql_expr("A") - -.. code-block:: text - :caption: Output - - col(A) - -The above code creates an Expression that refers to a column named ``"A"``. - -Using SQL -######### - -Daft can also parse valid SQL as expressions. - -.. tabs:: - - .. group-tab:: ⚙️ SQL - - .. code:: python - - daft.sql_expr("A + 1") - -.. code-block:: text - :caption: Output - - col(A) + lit(1) - -The above code will create an expression representing "the column named 'x' incremented by 1". For many APIs, sql_expr will actually be applied for you as syntactic sugar! - -Literals -######## - -You may find yourself needing to hardcode a "single value" oftentimes as an expression. 
Daft provides a :func:`~daft.expressions.lit` helper to do so: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - from daft import lit - - # Refers to an expression which always evaluates to 42 - lit(42) - - .. group-tab:: ⚙️ SQL - - .. code:: python - - # Refers to an expression which always evaluates to 42 - daft.sql_expr("42") - -.. code-block:: text - :caption: Output - - lit(42) - -This special :func:`~daft.expressions.lit` expression we just created evaluates always to the value ``42``. - -Wildcard Expressions -#################### - -You can create expressions on multiple columns at once using a wildcard. The expression `col("*")` selects every column in a DataFrame, and you can operate on this expression in the same way as a single column: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - import daft - from daft import col - - df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) - df.select(col("*") * 3).show() - -.. code-block:: text - :caption: Output - - ╭───────┬───────╮ - │ A ┆ B │ - │ --- ┆ --- │ - │ Int64 ┆ Int64 │ - ╞═══════╪═══════╡ - │ 3 ┆ 12 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 6 ┆ 15 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ - │ 9 ┆ 18 │ - ╰───────┴───────╯ - -Composing Expressions -^^^^^^^^^^^^^^^^^^^^^ - -.. _userguide-numeric-expressions: - -Numeric Expressions -################### - -Since column "A" is an integer, we can run numeric computation such as addition, division and checking its value. Here are some examples where we create new columns using the results of such computations: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - # Add 1 to each element in column "A" - df = df.with_column("A_add_one", df["A"] + 1) - - # Divide each element in column A by 2 - df = df.with_column("A_divide_two", df["A"] / 2.) - - # Check if each element in column A is more than 1 - df = df.with_column("A_gt_1", df["A"] > 1) - - df.collect() - - .. group-tab:: ⚙️ SQL - - .. 
code:: python - - df = daft.sql(""" - SELECT - *, - A + 1 AS A_add_one, - A / 2.0 AS A_divide_two, - A > 1 AS A_gt_1 - FROM df - """) - df.collect() - -.. code-block:: text - :caption: Output - - +---------+-------------+----------------+-----------+ - | A | A_add_one | A_divide_two | A_gt_1 | - | Int64 | Int64 | Float64 | Boolean | - +=========+=============+================+===========+ - | 1 | 2 | 0.5 | false | - +---------+-------------+----------------+-----------+ - | 2 | 3 | 1 | true | - +---------+-------------+----------------+-----------+ - | 3 | 4 | 1.5 | true | - +---------+-------------+----------------+-----------+ - (Showing first 3 of 3 rows) - -Notice that the returned types of these operations are also well-typed according to their input types. For example, calling ``df["A"] > 1`` returns a column of type :meth:`Boolean `. - -Both the :meth:`Float ` and :meth:`Int ` types are numeric types, and inherit many of the same arithmetic Expression operations. You may find the full list of numeric operations in the :ref:`Expressions API reference `. - -.. _userguide-string-expressions: - -String Expressions -################## - -Daft also lets you have columns of strings in a DataFrame. Let's take a look! - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) - df.show() - -.. code-block:: text - :caption: Output - - +--------+ - | B | - | Utf8 | - +========+ - | foo | - +--------+ - | bar | - +--------+ - | baz | - +--------+ - (Showing first 3 rows) - -Unlike the numeric types, the string type does not support arithmetic operations such as ``*`` and ``/``. The one exception to this is the ``+`` operator, which is overridden to concatenate two string expressions as is commonly done in Python. Let's try that! - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = df.with_column("B2", df["B"] + "foo") - df.show() - - .. group-tab:: ⚙️ SQL - - .. 
code:: python - - df = daft.sql("SELECT *, B + 'foo' AS B2 FROM df") - df.show() - -.. code-block:: text - :caption: Output - - +--------+--------+ - | B | B2 | - | Utf8 | Utf8 | - +========+========+ - | foo | foofoo | - +--------+--------+ - | bar | barfoo | - +--------+--------+ - | baz | bazfoo | - +--------+--------+ - (Showing first 3 rows) - -There are also many string operators that are accessed through a separate :meth:`.str.* ` "method namespace". - -For example, to check if each element in column "B" contains the substring "a", we can use the :meth:`.str.contains ` method: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) - df.show() - - .. group-tab:: ⚙️ SQL - - .. code:: python - - df = daft.sql("SELECT *, contains(B2, B) AS B2_contains_B FROM df") - df.show() - -.. code-block:: text - :caption: Output - - +--------+--------+-----------------+ - | B | B2 | B2_contains_B | - | Utf8 | Utf8 | Boolean | - +========+========+=================+ - | foo | foofoo | true | - +--------+--------+-----------------+ - | bar | barfoo | true | - +--------+--------+-----------------+ - | baz | bazfoo | true | - +--------+--------+-----------------+ - (Showing first 3 rows) - -You may find a full list of string operations in the :ref:`Expressions API reference `. - -URL Expressions -############### - -One special case of a String column you may find yourself working with is a column of URL strings. - -Daft provides the :meth:`.url.* ` method namespace with functionality for working with URL strings. For example, to download data from URLs: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = daft.from_pydict({ - "urls": [ - "https://www.google.com", - "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", - ], - }) - df = df.with_column("data", df["urls"].url.download()) - df.collect() - - .. group-tab:: ⚙️ SQL - - .. 
code:: python - - - df = daft.from_pydict({ - "urls": [ - "https://www.google.com", - "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", - ], - }) - df = daft.sql(""" - SELECT - urls, - url_download(urls) AS data - FROM df - """) - df.collect() - -.. code-block:: text - :caption: Output - - +----------------------+----------------------+ - | urls | data | - | Utf8 | Binary | - +======================+======================+ - | https://www.google.c | b'`_ as the underlying executor, so you can find the full list of supported filters in the `jaq documentation `_. - -.. _userguide-logical-expressions: - -Logical Expressions -################### - -Logical Expressions are an expression that refers to a column of type :meth:`Boolean `, and can only take on the values True or False. - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = daft.from_pydict({"C": [True, False, True]}) - -Daft supports logical operations such as ``&`` (and) and ``|`` (or) between logical expressions. - -Comparisons -########### - -Many of the types in Daft support comparisons between expressions that returns a Logical Expression. - -For example, here we can compare if each element in column "A" is equal to elements in column "B": - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) - - df = df.with_column("A_eq_B", df["A"] == df["B"]) - - df.collect() - - .. group-tab:: ⚙️ SQL - - .. code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) - - df = daft.sql(""" - SELECT - A, - B, - A = B AS A_eq_B - FROM df - """) - - df.collect() - -.. 
code-block:: text - :caption: Output - - ╭───────┬───────┬─────────╮ - │ A ┆ B ┆ A_eq_B │ - │ --- ┆ --- ┆ --- │ - │ Int64 ┆ Int64 ┆ Boolean │ - ╞═══════╪═══════╪═════════╡ - │ 1 ┆ 1 ┆ true │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 2 ┆ 2 ┆ true │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ - │ 3 ┆ 4 ┆ false │ - ╰───────┴───────┴─────────╯ - - (Showing first 3 of 3 rows) - -Other useful comparisons can be found in the :ref:`Expressions API reference `. - -If Else Pattern -############### - -The :meth:`.if_else() ` method is a useful expression to have up your sleeve for choosing values between two other expressions based on a logical expression: - -.. tabs:: - - .. group-tab:: 🐍 Python - - .. code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) - - # Pick values from column A if the value in column A is bigger - # than the value in column B. Otherwise, pick values from column B. - df = df.with_column( - "A_if_bigger_else_B", - (df["A"] > df["B"]).if_else(df["A"], df["B"]), - ) - - df.collect() - - .. group-tab:: ⚙️ SQL - - .. code:: python - - df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) - - df = daft.sql(""" - SELECT - A, - B, - CASE - WHEN A > B THEN A - ELSE B - END AS A_if_bigger_else_B - FROM df - """) - - df.collect() - -.. code-block:: text - :caption: Output - - ╭───────┬───────┬────────────────────╮ - │ A ┆ B ┆ A_if_bigger_else_B │ - │ --- ┆ --- ┆ --- │ - │ Int64 ┆ Int64 ┆ Int64 │ - ╞═══════╪═══════╪════════════════════╡ - │ 1 ┆ 0 ┆ 1 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 2 ┆ 2 ┆ 2 │ - ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 3 ┆ 4 ┆ 4 │ - ╰───────┴───────┴────────────────────╯ - - (Showing first 3 of 3 rows) - -This is a useful expression for cleaning your data! +Expressions are a powerful way of describing computation on columns. 
For more details, check out the next section on :doc:`expressions` diff --git a/docs/source/user_guide/daft_in_depth.rst b/docs/source/user_guide/daft_in_depth.rst deleted file mode 100644 index 9b9702daca..0000000000 --- a/docs/source/user_guide/daft_in_depth.rst +++ /dev/null @@ -1,9 +0,0 @@ -Daft in Depth -============= - -.. toctree:: - - daft_in_depth/datatypes - daft_in_depth/dataframe-operations - daft_in_depth/aggregations - daft_in_depth/udf diff --git a/docs/source/user_guide/daft_in_depth/dataframe-operations.rst b/docs/source/user_guide/dataframe-operations.rst similarity index 100% rename from docs/source/user_guide/daft_in_depth/dataframe-operations.rst rename to docs/source/user_guide/dataframe-operations.rst diff --git a/docs/source/user_guide/daft_in_depth/datatypes.rst b/docs/source/user_guide/datatypes.rst similarity index 100% rename from docs/source/user_guide/daft_in_depth/datatypes.rst rename to docs/source/user_guide/datatypes.rst diff --git a/docs/source/user_guide/expressions.rst b/docs/source/user_guide/expressions.rst new file mode 100644 index 0000000000..54147a9401 --- /dev/null +++ b/docs/source/user_guide/expressions.rst @@ -0,0 +1,584 @@ +Expressions +=========== + +Expressions are how you can express computations that should be run over columns of data. + +Creating Expressions +^^^^^^^^^^^^^^^^^^^^ + +Referring to a column in a DataFrame +#################################### + +Most commonly you will be creating expressions by using the :func:`daft.col` function. + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + # Refers to column "A" + daft.col("A") + + .. group-tab:: ⚙️ SQL + + .. code:: python + + daft.sql_expr("A") + +.. code-block:: text + :caption: Output + + col(A) + +The above code creates an Expression that refers to a column named ``"A"``. + +Using SQL +######### + +Daft can also parse valid SQL as expressions. + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. code:: python + + daft.sql_expr("A + 1") + +.. 
code-block:: text + :caption: Output + + col(A) + lit(1) + +The above code will create an expression representing "the column named 'x' incremented by 1". For many APIs, sql_expr will actually be applied for you as syntactic sugar! + +Literals +######## + +You may find yourself needing to hardcode a "single value" oftentimes as an expression. Daft provides a :func:`~daft.expressions.lit` helper to do so: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + from daft import lit + + # Refers to an expression which always evaluates to 42 + lit(42) + + .. group-tab:: ⚙️ SQL + + .. code:: python + + # Refers to an expression which always evaluates to 42 + daft.sql_expr("42") + +.. code-block:: text + :caption: Output + + lit(42) + +This special :func:`~daft.expressions.lit` expression we just created evaluates always to the value ``42``. + +Wildcard Expressions +#################### + +You can create expressions on multiple columns at once using a wildcard. The expression `col("*")` selects every column in a DataFrame, and you can operate on this expression in the same way as a single column: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + import daft + from daft import col + + df = daft.from_pydict({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.select(col("*") * 3).show() + +.. code-block:: text + :caption: Output + + ╭───────┬───────╮ + │ A ┆ B │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 3 ┆ 12 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 6 ┆ 15 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 9 ┆ 18 │ + ╰───────┴───────╯ + +Wildcards also work very well for accessing all members of a struct column: + + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + import daft + from daft import col + + df = daft.from_pydict({ + "person": [ + {"name": "Alice", "age": 30}, + {"name": "Bob", "age": 25}, + {"name": "Charlie", "age": 35} + ] + }) + + # Access all fields of the 'person' struct + df.select(col("person.*")).show() + + .. group-tab:: ⚙️ SQL + + .. 
code:: python + + import daft + + df = daft.from_pydict({ + "person": [ + {"name": "Alice", "age": 30}, + {"name": "Bob", "age": 25}, + {"name": "Charlie", "age": 35} + ] + }) + + # Access all fields of the 'person' struct using SQL + daft.sql("SELECT person.* FROM df").show() + +.. code-block:: text + :caption: Output + + ╭──────────┬───────╮ + │ name ┆ age │ + │ --- ┆ --- │ + │ String ┆ Int64 │ + ╞══════════╪═══════╡ + │ Alice ┆ 30 │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ Bob ┆ 25 │ + ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ Charlie ┆ 35 │ + ╰──────────┴───────╯ + +In this example, we use the wildcard `*` to access all fields of the `person` struct column. This is equivalent to selecting each field individually (`person.name`, `person.age`), but is more concise and flexible, especially when dealing with structs that have many fields. + + + +Composing Expressions +^^^^^^^^^^^^^^^^^^^^^ + +.. _userguide-numeric-expressions: + +Numeric Expressions +################### + +Since column "A" is an integer, we can run numeric computation such as addition, division and checking its value. Here are some examples where we create new columns using the results of such computations: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + # Add 1 to each element in column "A" + df = df.with_column("A_add_one", df["A"] + 1) + + # Divide each element in column A by 2 + df = df.with_column("A_divide_two", df["A"] / 2.) + + # Check if each element in column A is more than 1 + df = df.with_column("A_gt_1", df["A"] > 1) + + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.sql(""" + SELECT + *, + A + 1 AS A_add_one, + A / 2.0 AS A_divide_two, + A > 1 AS A_gt_1 + FROM df + """) + df.collect() + +.. 
code-block:: text + :caption: Output + + +---------+-------------+----------------+-----------+ + | A | A_add_one | A_divide_two | A_gt_1 | + | Int64 | Int64 | Float64 | Boolean | + +=========+=============+================+===========+ + | 1 | 2 | 0.5 | false | + +---------+-------------+----------------+-----------+ + | 2 | 3 | 1 | true | + +---------+-------------+----------------+-----------+ + | 3 | 4 | 1.5 | true | + +---------+-------------+----------------+-----------+ + (Showing first 3 of 3 rows) + +Notice that the returned types of these operations are also well-typed according to their input types. For example, calling ``df["A"] > 1`` returns a column of type :meth:`Boolean `. + +Both the :meth:`Float ` and :meth:`Int ` types are numeric types, and inherit many of the same arithmetic Expression operations. You may find the full list of numeric operations in the :ref:`Expressions API reference `. + +.. _userguide-string-expressions: + +String Expressions +################## + +Daft also lets you have columns of strings in a DataFrame. Let's take a look! + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"B": ["foo", "bar", "baz"]}) + df.show() + +.. code-block:: text + :caption: Output + + +--------+ + | B | + | Utf8 | + +========+ + | foo | + +--------+ + | bar | + +--------+ + | baz | + +--------+ + (Showing first 3 rows) + +Unlike the numeric types, the string type does not support arithmetic operations such as ``*`` and ``/``. The one exception to this is the ``+`` operator, which is overridden to concatenate two string expressions as is commonly done in Python. Let's try that! + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.with_column("B2", df["B"] + "foo") + df.show() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.sql("SELECT *, B + 'foo' AS B2 FROM df") + df.show() + +.. 
code-block:: text + :caption: Output + + +--------+--------+ + | B | B2 | + | Utf8 | Utf8 | + +========+========+ + | foo | foofoo | + +--------+--------+ + | bar | barfoo | + +--------+--------+ + | baz | bazfoo | + +--------+--------+ + (Showing first 3 rows) + +There are also many string operators that are accessed through a separate :meth:`.str.* ` "method namespace". + +For example, to check if each element in column "B" contains the substring "a", we can use the :meth:`.str.contains ` method: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = df.with_column("B2_contains_B", df["B2"].str.contains(df["B"])) + df.show() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.sql("SELECT *, contains(B2, B) AS B2_contains_B FROM df") + df.show() + +.. code-block:: text + :caption: Output + + +--------+--------+-----------------+ + | B | B2 | B2_contains_B | + | Utf8 | Utf8 | Boolean | + +========+========+=================+ + | foo | foofoo | true | + +--------+--------+-----------------+ + | bar | barfoo | true | + +--------+--------+-----------------+ + | baz | bazfoo | true | + +--------+--------+-----------------+ + (Showing first 3 rows) + +You may find a full list of string operations in the :ref:`Expressions API reference `. + +URL Expressions +############### + +One special case of a String column you may find yourself working with is a column of URL strings. + +Daft provides the :meth:`.url.* ` method namespace with functionality for working with URL strings. For example, to download data from URLs: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({ + "urls": [ + "https://www.google.com", + "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", + ], + }) + df = df.with_column("data", df["urls"].url.download()) + df.collect() + + .. group-tab:: ⚙️ SQL + + .. 
code:: python + + + df = daft.from_pydict({ + "urls": [ + "https://www.google.com", + "s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg", + ], + }) + df = daft.sql(""" + SELECT + urls, + url_download(urls) AS data + FROM df + """) + df.collect() + +.. code-block:: text + :caption: Output + + +----------------------+----------------------+ + | urls | data | + | Utf8 | Binary | + +======================+======================+ + | https://www.google.c | b'`_ as the underlying executor, so you can find the full list of supported filters in the `jaq documentation `_. + +.. _userguide-logical-expressions: + +Logical Expressions +################### + +Logical Expressions are an expression that refers to a column of type :meth:`Boolean `, and can only take on the values True or False. + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"C": [True, False, True]}) + +Daft supports logical operations such as ``&`` (and) and ``|`` (or) between logical expressions. + +Comparisons +########### + +Many of the types in Daft support comparisons between expressions that returns a Logical Expression. + +For example, here we can compare if each element in column "A" is equal to elements in column "B": + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + + df = df.with_column("A_eq_B", df["A"] == df["B"]) + + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 4]}) + + df = daft.sql(""" + SELECT + A, + B, + A = B AS A_eq_B + FROM df + """) + + df.collect() + +.. 
code-block:: text + :caption: Output + + ╭───────┬───────┬─────────╮ + │ A ┆ B ┆ A_eq_B │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Boolean │ + ╞═══════╪═══════╪═════════╡ + │ 1 ┆ 1 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 ┆ false │ + ╰───────┴───────┴─────────╯ + + (Showing first 3 of 3 rows) + +Other useful comparisons can be found in the :ref:`Expressions API reference `. + +If Else Pattern +############### + +The :meth:`.if_else() ` method is a useful expression to have up your sleeve for choosing values between two other expressions based on a logical expression: + +.. tabs:: + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + + # Pick values from column A if the value in column A is bigger + # than the value in column B. Otherwise, pick values from column B. + df = df.with_column( + "A_if_bigger_else_B", + (df["A"] > df["B"]).if_else(df["A"], df["B"]), + ) + + df.collect() + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) + + df = daft.sql(""" + SELECT + A, + B, + CASE + WHEN A > B THEN A + ELSE B + END AS A_if_bigger_else_B + FROM df + """) + + df.collect() + +.. code-block:: text + :caption: Output + + ╭───────┬───────┬────────────────────╮ + │ A ┆ B ┆ A_if_bigger_else_B │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪════════════════════╡ + │ 1 ┆ 0 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ 4 ┆ 4 │ + ╰───────┴───────┴────────────────────╯ + + (Showing first 3 of 3 rows) + +This is a useful expression for cleaning your data! 
diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst index 653240bed9..4971a43c33 100644 --- a/docs/source/user_guide/index.rst +++ b/docs/source/user_guide/index.rst @@ -7,8 +7,11 @@ Daft User Guide basic_concepts read-and-write + expressions + datatypes + dataframe-operations sql - daft_in_depth + aggregations poweruser integrations tutorials diff --git a/docs/source/user_guide/daft_in_depth/udf.rst b/docs/source/user_guide/udf.rst similarity index 100% rename from docs/source/user_guide/daft_in_depth/udf.rst rename to docs/source/user_guide/udf.rst From 762c8eea9d8f9b2e35b1b2e1d172091263f2727b Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Fri, 27 Sep 2024 09:42:45 -0700 Subject: [PATCH 10/21] Restructure --- docs/source/10-min.ipynb | 4 ++-- docs/source/conf.py | 8 +++++--- .../migration_guides/coming_from_dask.rst | 6 +++--- .../user_guide/fotw/fotw-001-images.ipynb | 4 ++-- docs/source/user_guide/index.rst | 18 +++++++++++++----- 5 files changed, 25 insertions(+), 15 deletions(-) diff --git a/docs/source/10-min.ipynb b/docs/source/10-min.ipynb index cbda803752..d4444c2cd5 100644 --- a/docs/source/10-min.ipynb +++ b/docs/source/10-min.ipynb @@ -569,7 +569,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "See: [Expressions](user_guide/basic_concepts/expressions.rst)\n", + "See: [Expressions](user_guide/expressions.rst)\n", "\n", "Expressions are an API for defining computation that needs to happen over your columns.\n", "\n", @@ -1516,7 +1516,7 @@ "source": [ "### User-Defined Functions\n", "\n", - "See: [UDF User Guide](user_guide/daft_in_depth/udf)" + "See: [UDF User Guide](user_guide/udf)" ] }, { diff --git a/docs/source/conf.py b/docs/source/conf.py index 108666a328..7dbe36f417 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -87,11 +87,13 @@ "learn/user_guides/remote_cluster_execution": "distributed-computing.html", "learn/quickstart": "learn/10-min.html", "learn/10-min": "../10-min.html", - 
"user_guide/basic_concepts/*": "user_guide/basic_concepts.html", + "user_guide/basic_concepts/expressions": "user_guide/expressions", + "user_guide/basic_concepts/dataframe_introduction": "user_guide/basic_concepts", + "user_guide/basic_concepts/introduction": "user_guide/basic_concepts", "user_guide/daft_in_depth/aggregations": "user_guide/aggregations", "user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations", - "user_guide/daft_in_depth/datatypes": "user_guide/datatypes", - "user_guide/daft_in_depth/udf": "user_guide/udf", + "user_guide/datatypes": "user_guide/datatypes", + "user_guide/udf": "user_guide/udf", } # Resolving code links to github diff --git a/docs/source/migration_guides/coming_from_dask.rst b/docs/source/migration_guides/coming_from_dask.rst index 4e649ec8d3..99606c3ff9 100644 --- a/docs/source/migration_guides/coming_from_dask.rst +++ b/docs/source/migration_guides/coming_from_dask.rst @@ -30,7 +30,7 @@ Daft does not use an index Dask aims for as much feature-parity with pandas as possible, including maintaining the presence of an Index in the DataFrame. But keeping an Index is difficult when moving to a distributed computing environment. Dask doesn’t support row-based positional indexing (with .iloc) because it does not track the length of its partitions. It also does not support pandas MultiIndex. The argument for keeping the Index is that it makes some operations against the sorted index column very fast. In reality, resetting the Index forces a data shuffle and is an expensive operation. -Daft drops the need for an Index to make queries more readable and consistent. How you write a query should not change because of the state of an index or a reset_index call. In our opinion, eliminating the index makes things simpler, more explicit, more readable and therefore less error-prone. Daft achieves this by using the [Expressions API](../user_guide/basic_concepts/expressions). 
+Daft drops the need for an Index to make queries more readable and consistent. How you write a query should not change because of the state of an index or a reset_index call. In our opinion, eliminating the index makes things simpler, more explicit, more readable and therefore less error-prone. Daft achieves this by using the [Expressions API](../user_guide/expressions). In Dask you would index your DataFrame to return row ``b`` as follows: @@ -80,7 +80,7 @@ For example: res = ddf.map_partitions(my_function, **kwargs) -Daft implements two APIs for mapping computations over the data in your DataFrame in parallel: :doc:`Expressions <../user_guide/basic_concepts/expressions>` and :doc:`UDFs <../user_guide/daft_in_depth/udf>`. Expressions are most useful when you need to define computation over your columns. +Daft implements two APIs for mapping computations over the data in your DataFrame in parallel: :doc:`Expressions <../user_guide/expressions>` and :doc:`UDFs <../user_guide/udf>`. Expressions are most useful when you need to define computation over your columns. .. code:: python @@ -113,7 +113,7 @@ Daft is built as a DataFrame API for distributed Machine learning. You can use D Daft supports Multimodal Data Types ----------------------------------- -Dask supports the same data types as pandas. Daft is built to support many more data types, including Images, nested JSON, tensors, etc. See :doc:`the documentation <../user_guide/daft_in_depth/datatypes>` for a list of all supported data types. +Dask supports the same data types as pandas. Daft is built to support many more data types, including Images, nested JSON, tensors, etc. See :doc:`the documentation <../user_guide/datatypes>` for a list of all supported data types. 
Distributed Computing and Remote Clusters ----------------------------------------- diff --git a/docs/source/user_guide/fotw/fotw-001-images.ipynb b/docs/source/user_guide/fotw/fotw-001-images.ipynb index 827f98dd57..37d1f796d2 100644 --- a/docs/source/user_guide/fotw/fotw-001-images.ipynb +++ b/docs/source/user_guide/fotw/fotw-001-images.ipynb @@ -447,7 +447,7 @@ "metadata": {}, "source": [ "### Create Thumbnails\n", - "[Expressions](../basic_concepts/expressions) are a Daft API for defining computation that needs to happen over your columns. There are dedicated `image.(...)` Expressions for working with images.\n", + "[Expressions](../expressions) are a Daft API for defining computation that needs to happen over your columns. There are dedicated `image.(...)` Expressions for working with images.\n", "\n", "You can use the `image.resize` Expression to create a thumbnail of each image:" ] @@ -527,7 +527,7 @@ "\n", "We'll define a function that uses a pre-trained PyTorch model [ResNet50](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html) to classify the dog pictures. We'll then pass the `image` column to this PyTorch model and send the classification predictions to a new column `classify_breed`. \n", "\n", - "You will use Daft [User-Defined Functions (UDFs)](../daft_in_depth/udf) to do this. Daft UDFs which are the best way to run computations over multiple rows or columns.\n", + "You will use Daft [User-Defined Functions (UDFs)](../udf) to do this. 
Daft UDFs are the best way to run computations over multiple rows or columns.\n",
- :doc:`The Daft Poweruser ` ************************************* From a82589d869dfa70da1d52d4a5ee5db95e2e932fa Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Sun, 29 Sep 2024 14:13:53 -0700 Subject: [PATCH 11/21] Fixes --- daft/sql/sql.py | 49 +++++++++++++++++++++++++++++++++++++++++++++ docs/source/conf.py | 4 ++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/daft/sql/sql.py b/daft/sql/sql.py index cb4ccd7114..b60552ffbe 100644 --- a/daft/sql/sql.py +++ b/daft/sql/sql.py @@ -38,6 +38,55 @@ def _copy_from(self, other: "SQLCatalog") -> None: @PublicAPI def sql_expr(sql: str) -> Expression: + """Parses a SQL string into a Daft Expression + + This function allows you to create Daft Expressions from SQL snippets, which can then be used + in Daft operations or combined with other Daft Expressions. + + Args: + sql (str): A SQL string to be parsed into a Daft Expression. + + Returns: + Expression: A Daft Expression representing the parsed SQL. + + Examples: + Create a simple SQL expression: + + >>> import daft + >>> expr = daft.sql_expr("1 + 2") + >>> print(expr) + lit(1) + lit(2) + + Use SQL expression in a Daft DataFrame operation: + + >>> df = daft.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df = df.with_column("c", daft.sql_expr("a + b")) + >>> df.show() + ╭───────┬───────┬───────╮ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ Int64 ┆ Int64 ┆ Int64 │ + ╞═══════╪═══════╪═══════╡ + │ 1 ┆ 4 ┆ 5 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 5 ┆ 7 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 6 ┆ 9 │ + ╰───────┴───────┴───────╯ + + `daft.sql_expr` is also called automatically for you in some DataFrame operations such as filters: + + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}) + >>> result = df.where("x < 3 AND y > 4") + >>> result.show() + ╭───────┬───────╮ + │ x ┆ y │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 2 ┆ 5 │ + ╰───────┴───────╯ + """ return Expression._from_pyexpr(_sql_expr(sql)) diff --git a/docs/source/conf.py 
b/docs/source/conf.py index 7dbe36f417..c5ba1fbe0a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -92,8 +92,8 @@ "user_guide/basic_concepts/introduction": "user_guide/basic_concepts", "user_guide/daft_in_depth/aggregations": "user_guide/aggregations", "user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations", - "user_guide/datatypes": "user_guide/datatypes", - "user_guide/udf": "user_guide/udf", + "user_guide/daft_in_depth/datatypes": "user_guide/datatypes", + "user_guide/daft_in_depth/udf": "user_guide/udf", } # Resolving code links to github From 914d22f96883e5474d98680d9921c6eaac6227e7 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Sun, 29 Sep 2024 14:15:05 -0700 Subject: [PATCH 12/21] Fix docstrings --- daft/sql/sql.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/daft/sql/sql.py b/daft/sql/sql.py index b60552ffbe..2c9bb78554 100644 --- a/daft/sql/sql.py +++ b/daft/sql/sql.py @@ -73,6 +73,8 @@ def sql_expr(sql: str) -> Expression: ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ │ 3 ┆ 6 ┆ 9 │ ╰───────┴───────┴───────╯ + + (Showing first 3 of 3 rows) `daft.sql_expr` is also called automatically for you in some DataFrame operations such as filters: @@ -86,6 +88,8 @@ def sql_expr(sql: str) -> Expression: ╞═══════╪═══════╡ │ 2 ┆ 5 │ ╰───────┴───────╯ + + (Showing first 1 of 1 rows) """ return Expression._from_pyexpr(_sql_expr(sql)) From ce86a0b9b39eb1c5233abd5f63d4e18eda02dfd0 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Sun, 29 Sep 2024 14:49:48 -0700 Subject: [PATCH 13/21] Fleshed out SQL user guide --- docs/source/user_guide/sql.rst | 242 +++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) diff --git a/docs/source/user_guide/sql.rst b/docs/source/user_guide/sql.rst index 6e99c2fc34..790891325b 100644 --- a/docs/source/user_guide/sql.rst +++ b/docs/source/user_guide/sql.rst @@ -1,2 +1,244 @@ SQL === + +Daft supports Structured Query Language (SQL) as a way of constructing query plans (represented in Python as a 
:class:`daft.DataFrame`) and expressions (:class:`daft.Expression`). + +SQL is a human-readable way of constructing these query plans, and can often be more ergonomic than using DataFrames for writing queries. + +.. NOTE:: + Daft's SQL support is new and is constantly being improved on! Please give us feedback and we'd love to hear more about what you would like. + +Running SQL on DataFrames +------------------------- + +Daft's :func:`daft.sql` function will automatically detect any :class:`daft.DataFrame` objects in your current Python environment to let you query them easily by name. + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. code:: python + + # Note the variable name `my_special_df` + my_special_df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 3]}) + + # Use the SQL table name "my_special_df" to refer to the above DataFrame! + sql_df = daft.sql("SELECT A, B FROM my_special_df") + + sql_df.show() + +.. code-block:: text + :caption: Output + + ╭───────┬───────╮ + │ A ┆ B │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 1 ┆ 1 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 2 ┆ 2 │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + │ 3 ┆ 3 │ + ╰───────┴───────╯ + + (Showing first 3 of 3 rows) + +In the above example, we query the DataFrame called `"my_special_df"` by simply referring to it in the SQL command. This produces a new DataFrame `sql_df` which can +natively integrate with the rest of your Daft query. + +Reading data from SQL +--------------------- + +.. WARNING:: + + This feature is a WIP and will be coming soon! We will support reading common datasources directly from SQL: + + .. code-block:: python + + daft.sql("SELECT * FROM read_parquet('s3://...')") + daft.sql("SELECT * FROM read_delta_lake('s3://...')") + + Today, a workaround for this is to construct your dataframe in Python first and use it from SQL instead: + + .. 
code-block:: python + + df = daft.read_parquet("s3://...") + daft.sql("SELECT * FROM df") + + We appreciate your patience with us and hope to deliver this crucial feature soon! + +SQL Expressions +--------------- + +SQL has the concept of expressions as well. Here is an example of a simple addition expression, adding columns "a" and "b" in SQL to produce a new column C. + +We also present here the equivalent query for SQL and DataFrame. Notice how similar the concepts are! + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 3]}) + df = daft.sql("SELECT A + B as C FROM df") + df.show() + + .. group-tab:: 🐍 Python + + .. code:: python + + expr = (daft.col("A") + daft.col("B")).alias("C") + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 3]}) + df = df.select(expr) + df.show() + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ C │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 2 │ + ├╌╌╌╌╌╌╌┤ + │ 4 │ + ├╌╌╌╌╌╌╌┤ + │ 6 │ + ╰───────╯ + + (Showing first 3 of 3 rows) + +In the above query, both the SQL version of the query and the DataFrame version of the query produce the same result. + +Under the hood, they run the same Expression ``col("A") + col("B")``! + +One really cool trick you can do is to use the :func:`daft.sql_expr` function as a helper to easily create Expressions. The following are equivalent: + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. code:: python + + sql_expr = daft.sql_expr("A + B as C") + print("SQL expression:", sql_expr) + + .. group-tab:: 🐍 Python + + .. code:: python + + py_expr = (daft.col("A") + daft.col("B")).alias("C") + print("Python expression:", py_expr) + + +.. code-block:: text + :caption: Output + + SQL expression: col(A) + col(B) as C + Python expression: col(A) + col(B) as C + +This means that you can pretty much use SQL anywhere you use Python expressions, making Daft extremely versatile at mixing workflows which leverage both SQL and Python. 
+ +As an example, consider the filter query below and compare the two equivalent Python and SQL queries: + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 3]}) + + # Daft automatically converts this string using `daft.sql_expr` + df = df.where("A < 2") + + df.show() + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"A": [1, 2, 3], "B": [1, 2, 3]}) + + # Using Daft's Python Expression API + df = df.where(df["A"] < 2) + + df.show() + +.. code-block:: text + :caption: Output + + ╭───────┬───────╮ + │ A ┆ B │ + │ --- ┆ --- │ + │ Int64 ┆ Int64 │ + ╞═══════╪═══════╡ + │ 1 ┆ 1 │ + ╰───────┴───────╯ + + (Showing first 1 of 1 rows) + +Pretty sweet! Of course, this support for running Expressions on your columns extends well beyond arithmetic as we'll see in the next section on SQL Functions. + +SQL Functions +------------- + +SQL also has access to all of Daft's powerful :class:`daft.Expression` functionality through SQL functions. + +However, unlike the Python Expression API which encourages method-chaining (e.g. ``col("a").url.download().image.decode()``), in SQL you have to do function nesting instead (e.g. ``"image_decode(url_download(a))""``). + +.. NOTE:: + + A full catalog of the available SQL Functions in Daft is available in the :doc:`API Reference - SQL Functions`. + + Note that it closely mirrors the Python API, with some function naming differences vs the available Python methods. + We also have some aliased functions for ANSI SQL-compliance or familiarity to users coming from other common SQL dialects such as PostgreSQL and SparkSQL to easily find their functionality. + +Here is an example of an equivalent function call in SQL vs Python: + +.. tabs:: + + .. group-tab:: ⚙️ SQL + + .. 
code:: python + + df = daft.from_pydict({"urls": [ + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + ]}) + df = daft.sql("SELECT image_decode(url_download(urls)) FROM df") + df.show() + + .. group-tab:: 🐍 Python + + .. code:: python + + df = daft.from_pydict({"urls": [ + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + "https://user-images.githubusercontent.com/17691182/190476440-28f29e87-8e3b-41c4-9c28-e112e595f558.png", + ]}) + df = df.select(daft.col("urls").url.download().image.decode()) + df.show() + +.. code-block:: text + :caption: Output + + ╭──────────────╮ + │ urls │ + │ --- │ + │ Image[MIXED] │ + ╞══════════════╡ + │ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ │ + ╰──────────────╯ + + (Showing first 3 of 3 rows) From 932497c06cc5582b2c2d1b6afadc3e4f218bf61e Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Sun, 29 Sep 2024 23:05:49 -0700 Subject: [PATCH 14/21] Add generation of SQL function stubs --- daft/daft/__init__.pyi | 2 +- daft/sql/_sql_funcs.py | 30 +++++++++++ docs/source/api_docs/index.rst | 2 +- docs/source/api_docs/sql.rst | 7 +-- docs/source/conf.py | 6 +++ docs/source/ext/__init__.py | 0 docs/source/ext/sql_autosummary.py | 80 ++++++++++++++++++++++++++++++ docs/source/user_guide/sql.rst | 2 +- src/daft-sql/src/python.rs | 9 +++- 9 files changed, 128 insertions(+), 10 deletions(-) create mode 100644 daft/sql/_sql_funcs.py create mode 100644 docs/source/ext/__init__.py create mode 100644 docs/source/ext/sql_autosummary.py diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index 08ec0860ba..5c9740c98e 
100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -1203,7 +1203,7 @@ def minhash( # ----- def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... def sql_expr(sql: str) -> PyExpr: ... -def list_sql_functions() -> list[str]: ... +def list_sql_functions() -> list[tuple[str, str, list[str]]]: ... def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ... def to_struct(inputs: list[PyExpr]) -> PyExpr: ... diff --git a/daft/sql/_sql_funcs.py b/daft/sql/_sql_funcs.py new file mode 100644 index 0000000000..5a47d8e469 --- /dev/null +++ b/daft/sql/_sql_funcs.py @@ -0,0 +1,30 @@ +"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow +Sphinx to generate documentation pages for every SQL function. +""" + +from __future__ import annotations + +from inspect import Parameter as _Parameter +from inspect import Signature as _Signature + +from daft.daft import list_sql_functions as _list_sql_funcstions + + +def _create_sql_function(func_name: str, docstring: str, arg_names: list[str]): + def sql_function(*args, **kwargs): + raise NotImplementedError("This function is for documentation purposes only and should not be called.") + + sql_function.__name__ = func_name + sql_function.__qualname__ = func_name + sql_function.__doc__ = docstring + sql_function.__signature__ = _Signature([_Parameter(name, _Parameter.POSITIONAL_OR_KEYWORD) for name in arg_names]) # type: ignore[attr-defined] + + # Register the function in the current module + globals()[func_name] = sql_function + + +__all__ = [] + +for sql_func_name, docstring, arg_names in _list_sql_funcstions(): + _create_sql_function(sql_func_name, docstring, arg_names) + __all__.append(sql_func_name) diff --git a/docs/source/api_docs/index.rst b/docs/source/api_docs/index.rst index 411d697449..6bee44ad95 100644 --- a/docs/source/api_docs/index.rst +++ 
b/docs/source/api_docs/index.rst @@ -7,8 +7,8 @@ API Documentation Table of Contents creation dataframe - expressions sql + expressions schema datatype groupby diff --git a/docs/source/api_docs/sql.rst b/docs/source/api_docs/sql.rst index 34ecf7ed3f..bfdb9477a4 100644 --- a/docs/source/api_docs/sql.rst +++ b/docs/source/api_docs/sql.rst @@ -13,9 +13,6 @@ SQL Functions This is a full list of functions that can be used from within SQL. -.. TODO! -.. .. autosummary:: -.. :recursive: -.. :toctree: doc_gen/sql_funcs -.. daft.sql_func_module +.. sql-autosummary:: + :toctree: doc_gen/sql_funcs diff --git a/docs/source/conf.py b/docs/source/conf.py index c5ba1fbe0a..d4b91be226 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,12 +9,16 @@ import inspect import os import subprocess +import sys import sphinx_autosummary_accessors # Set environment variable to help code determine whether or not we are running a Sphinx doc build process os.environ["DAFT_SPHINX_BUILD"] = "1" +# Help Sphinx find local custom extensions/directives that we build +sys.path.insert(0, os.path.abspath("ext")) + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "Daft" @@ -46,6 +50,8 @@ "sphinx_copybutton", "sphinx_autosummary_accessors", "sphinx_tabs.tabs", + # Local extensions + "sql_autosummary", ] templates_path = ["_templates", sphinx_autosummary_accessors.templates_path] diff --git a/docs/source/ext/__init__.py b/docs/source/ext/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/source/ext/sql_autosummary.py b/docs/source/ext/sql_autosummary.py new file mode 100644 index 0000000000..1e1c6f8bfb --- /dev/null +++ b/docs/source/ext/sql_autosummary.py @@ -0,0 +1,80 @@ +import inspect +import os + +from sphinx.ext.autosummary import Autosummary +from sphinx.util import logging + +logger = logging.getLogger(__name__) + + +TOCTREE = 
"doc_gen/sql_funcs" +SQL_MODULE_NAME = "daft.sql._sql_funcs" + +STUB_TEMPLATE = """ +.. currentmodule:: {module_name} + +.. autofunction:: {name} +""" + + +class SQLAutosummary(Autosummary): + def run(self): + func_names = get_sql_func_names() + # Run the normal autosummary stuff, override self.content + self.content = [f"~{SQL_MODULE_NAME}.{f}" for f in func_names] + nodes = super().run() + return nodes + + def get_sql_module_name(self): + return self.arguments[0] + + +def get_sql_func_names(): + # Import the SQL functions module + module = __import__(SQL_MODULE_NAME, fromlist=[""]) + + names = [] + for name, obj in inspect.getmembers(module): + if inspect.isfunction(obj) and not name.startswith("_"): + names.append(name) + + return names + + +def generate_stub(name: str): + """Generates a stub string for a SQL function""" + stub = name + "\n" + stub += "=" * len(name) + "\n\n" + stub += STUB_TEMPLATE.format(module_name=SQL_MODULE_NAME, name=name) + return stub + + +def generate_files(app): + # Determine where to write .rst files to + output_dir = os.path.join(app.srcdir, "api_docs", TOCTREE) + os.makedirs(output_dir, exist_ok=True) + + # Write stubfiles + func_names = get_sql_func_names() + for name in func_names: + stub_content = generate_stub(name) + filename = f"{SQL_MODULE_NAME}.{name}.rst" + filepath = os.path.join(output_dir, filename) + with open(filepath, "w") as f: + f.write(stub_content) + + # HACK: Not sure if this is ok? 
+ app.env.found_docs.add(filepath) + + +def setup(app): + app.add_directive("sql-autosummary", SQLAutosummary) + + # Generate and register files when the builder is initialized + app.connect("builder-inited", generate_files) + + return { + "version": "0.1", + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/source/user_guide/sql.rst b/docs/source/user_guide/sql.rst index 790891325b..fec2761e05 100644 --- a/docs/source/user_guide/sql.rst +++ b/docs/source/user_guide/sql.rst @@ -193,7 +193,7 @@ However, unlike the Python Expression API which encourages method-chaining (e.g. .. NOTE:: - A full catalog of the available SQL Functions in Daft is available in the :doc:`API Reference - SQL Functions`. + A full catalog of the available SQL Functions in Daft is available in the :doc:`../api_docs/sql`. Note that it closely mirrors the Python API, with some function naming differences vs the available Python methods. We also have some aliased functions for ANSI SQL-compliance or familiarity to users coming from other common SQL dialects such as PostgreSQL and SparkSQL to easily find their functionality. diff --git a/src/daft-sql/src/python.rs b/src/daft-sql/src/python.rs index 283184f014..6c885bf192 100644 --- a/src/daft-sql/src/python.rs +++ b/src/daft-sql/src/python.rs @@ -23,8 +23,13 @@ pub fn sql_expr(sql: &str) -> PyResult { } #[pyfunction] -pub fn list_sql_functions() -> Vec { - SQL_FUNCTIONS.map.keys().cloned().collect() +pub fn list_sql_functions() -> Vec<(String, &'static str, Vec<&'static str>)> { + SQL_FUNCTIONS + .map + .keys() + .cloned() + .map(|name| (name, "TODO: docstrings", vec!["foo", "bar"])) + .collect() } /// PyCatalog is the Python interface to the Catalog. 
From 18fcb76ebd4131d0c957100eafd56f9d6aa125b2 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 01:12:41 -0700 Subject: [PATCH 15/21] Add static str mechanism to populate docstrings from rust --- src/daft-sql/src/functions.rs | 11 +++- src/daft-sql/src/modules/aggs.rs | 50 +++++++++++++-- src/daft-sql/src/modules/float.rs | 8 +-- src/daft-sql/src/modules/image/mod.rs | 30 +++++++-- src/daft-sql/src/modules/json.rs | 2 +- src/daft-sql/src/modules/list.rs | 24 ++++---- src/daft-sql/src/modules/map.rs | 4 +- src/daft-sql/src/modules/numeric.rs | 78 ++++++++++++++++-------- src/daft-sql/src/modules/partitioning.rs | 32 ++++++++-- src/daft-sql/src/modules/structs.rs | 14 ++++- src/daft-sql/src/modules/temporal.rs | 18 +++--- src/daft-sql/src/modules/utf8.rs | 62 +++++++++++-------- src/daft-sql/src/python.rs | 5 +- 13 files changed, 242 insertions(+), 96 deletions(-) diff --git a/src/daft-sql/src/functions.rs b/src/daft-sql/src/functions.rs index 6172d2d382..f492fe3ad5 100644 --- a/src/daft-sql/src/functions.rs +++ b/src/daft-sql/src/functions.rs @@ -89,6 +89,7 @@ pub trait SQLFunction: Send + Sync { /// - Add more functions.. pub struct SQLFunctions { pub(crate) map: HashMap>, + pub(crate) docsmap: HashMap, } pub(crate) struct SQLFunctionArguments { @@ -110,6 +111,7 @@ impl SQLFunctions { pub fn new() -> Self { Self { map: HashMap::new(), + docsmap: HashMap::new(), } } @@ -119,8 +121,15 @@ impl SQLFunctions { } /// Add a [FunctionExpr] to the [SQLFunctions] instance. - pub fn add_fn(&mut self, name: &str, func: F) { + pub fn add_fn( + &mut self, + name: &str, + func: F, + docstring: &'static str, + args: &'static [&'static str], + ) { self.map.insert(name.to_string(), Arc::new(func)); + self.docsmap.insert(name.to_string(), (docstring, args)); } /// Get a function by name from the [SQLFunctions] instance. 
diff --git a/src/daft-sql/src/modules/aggs.rs b/src/daft-sql/src/modules/aggs.rs index 695d3c9c79..63f9cc5162 100644 --- a/src/daft-sql/src/modules/aggs.rs +++ b/src/daft-sql/src/modules/aggs.rs @@ -22,12 +22,52 @@ impl SQLModule for SQLModuleAggs { parent.add_fn( "count", Count(nil.clone(), daft_core::count_mode::CountMode::Valid), + " +Aggregates the number of items in the input expression. Only counts non-null items. + +Example: + +For a table like this: + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 2 of 2 rows) + +```sql +SELECT count(x) FROM tbl +``` + +Result: + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 1 │ + ╰───────╯ + (Showing first 1 of 1 rows) +", + &["input"], ); - parent.add_fn("sum", Sum(nil.clone())); - parent.add_fn("avg", Mean(nil.clone())); - parent.add_fn("mean", Mean(nil.clone())); - parent.add_fn("min", Min(nil.clone())); - parent.add_fn("max", Max(nil.clone())); + parent.add_fn("sum", Sum(nil.clone()), "TODO: Docstring", &[]); + parent.add_fn("avg", Mean(nil.clone()), "TODO: Docstring", &[]); + parent.add_fn("mean", Mean(nil.clone()), "TODO: Docstring", &[]); + parent.add_fn("min", Min(nil.clone()), "TODO: Docstring", &[]); + parent.add_fn("max", Max(nil.clone()), "TODO: Docstring", &[]); } } diff --git a/src/daft-sql/src/modules/float.rs b/src/daft-sql/src/modules/float.rs index 4cfffe34b4..f2a283868c 100644 --- a/src/daft-sql/src/modules/float.rs +++ b/src/daft-sql/src/modules/float.rs @@ -13,10 +13,10 @@ pub struct SQLModuleFloat; impl SQLModule for SQLModuleFloat { fn register(parent: &mut SQLFunctions) { - parent.add_fn("fill_nan", SQLFillNan {}); - parent.add_fn("is_inf", SQLIsInf {}); - parent.add_fn("is_nan", SQLIsNan {}); - parent.add_fn("not_nan", SQLNotNan {}); + parent.add_fn("fill_nan", SQLFillNan {}, "TODO: Docstring", &["TODO"]); + parent.add_fn("is_inf", 
SQLIsInf {}, "TODO: Docstring", &["TODO"]); + parent.add_fn("is_nan", SQLIsNan {}, "TODO: Docstring", &["TODO"]); + parent.add_fn("not_nan", SQLNotNan {}, "TODO: Docstring", &["TODO"]); } } diff --git a/src/daft-sql/src/modules/image/mod.rs b/src/daft-sql/src/modules/image/mod.rs index 89a0527ef0..06afcf9df4 100644 --- a/src/daft-sql/src/modules/image/mod.rs +++ b/src/daft-sql/src/modules/image/mod.rs @@ -10,10 +10,30 @@ pub struct SQLModuleImage; impl SQLModule for SQLModuleImage { fn register(parent: &mut SQLFunctions) { - parent.add_fn("image_crop", crop::SQLImageCrop {}); - parent.add_fn("image_decode", decode::SQLImageDecode {}); - parent.add_fn("image_encode", encode::SQLImageEncode {}); - parent.add_fn("image_resize", resize::SQLImageResize {}); - parent.add_fn("image_to_mode", to_mode::SQLImageToMode {}); + parent.add_fn("image_crop", crop::SQLImageCrop {}, "TODO: Docstring", &[]); + parent.add_fn( + "image_decode", + decode::SQLImageDecode {}, + "TODO: Docstring", + &[], + ); + parent.add_fn( + "image_encode", + encode::SQLImageEncode {}, + "TODO: Docstring", + &[], + ); + parent.add_fn( + "image_resize", + resize::SQLImageResize {}, + "TODO: Docstring", + &[], + ); + parent.add_fn( + "image_to_mode", + to_mode::SQLImageToMode {}, + "TODO: Docstring", + &[], + ); } } diff --git a/src/daft-sql/src/modules/json.rs b/src/daft-sql/src/modules/json.rs index f0d600daea..51f1b10c63 100644 --- a/src/daft-sql/src/modules/json.rs +++ b/src/daft-sql/src/modules/json.rs @@ -8,7 +8,7 @@ pub struct SQLModuleJson; impl SQLModule for SQLModuleJson { fn register(parent: &mut SQLFunctions) { - parent.add_fn("json_query", JsonQuery); + parent.add_fn("json_query", JsonQuery, "TODO: Docstring", &["TODO"]); } } diff --git a/src/daft-sql/src/modules/list.rs b/src/daft-sql/src/modules/list.rs index b9e52d9748..2b8ac22fee 100644 --- a/src/daft-sql/src/modules/list.rs +++ b/src/daft-sql/src/modules/list.rs @@ -12,20 +12,20 @@ pub struct SQLModuleList; impl SQLModule for 
SQLModuleList { fn register(parent: &mut SQLFunctions) { - parent.add_fn("list_chunk", SQLListChunk); - parent.add_fn("list_count", SQLListCount); - parent.add_fn("explode", SQLExplode); - parent.add_fn("unnest", SQLExplode); + parent.add_fn("list_chunk", SQLListChunk, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_count", SQLListCount, "TODO: Docstring", &["TODO"]); + parent.add_fn("explode", SQLExplode, "TODO: Docstring", &["TODO"]); + parent.add_fn("unnest", SQLExplode, "TODO: Docstring", &["TODO"]); // this is commonly called `array_to_string` in other SQL dialects - parent.add_fn("array_to_string", SQLListJoin); + parent.add_fn("array_to_string", SQLListJoin, "TODO: Docstring", &["TODO"]); // but we also want to support our `list_join` alias as well - parent.add_fn("list_join", SQLListJoin); - parent.add_fn("list_max", SQLListMax); - parent.add_fn("list_min", SQLListMin); - parent.add_fn("list_sum", SQLListSum); - parent.add_fn("list_mean", SQLListMean); - parent.add_fn("list_slice", SQLListSlice); - parent.add_fn("list_sort", SQLListSort); + parent.add_fn("list_join", SQLListJoin, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_max", SQLListMax, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_min", SQLListMin, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_sum", SQLListSum, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_mean", SQLListMean, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_slice", SQLListSlice, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_sort", SQLListSort, "TODO: Docstring", &["TODO"]); // TODO } diff --git a/src/daft-sql/src/modules/map.rs b/src/daft-sql/src/modules/map.rs index d3a328f3a4..171495660d 100644 --- a/src/daft-sql/src/modules/map.rs +++ b/src/daft-sql/src/modules/map.rs @@ -8,8 +8,8 @@ pub struct SQLModuleMap; impl SQLModule for SQLModuleMap { fn register(parent: &mut SQLFunctions) { - parent.add_fn("map_get", MapGet); - parent.add_fn("map_extract", MapGet); + parent.add_fn("map_get", 
MapGet, "TODO: Docstring", &["input", "key"]); + parent.add_fn("map_extract", MapGet, "Alias of map_get", &["input", "key"]); } } diff --git a/src/daft-sql/src/modules/numeric.rs b/src/daft-sql/src/modules/numeric.rs index 197d958860..6bc2864df5 100644 --- a/src/daft-sql/src/modules/numeric.rs +++ b/src/daft-sql/src/modules/numeric.rs @@ -26,30 +26,60 @@ pub struct SQLModuleNumeric; impl SQLModule for SQLModuleNumeric { fn register(parent: &mut SQLFunctions) { - parent.add_fn("abs", SQLNumericExpr::Abs); - parent.add_fn("ceil", SQLNumericExpr::Ceil); - parent.add_fn("floor", SQLNumericExpr::Floor); - parent.add_fn("sign", SQLNumericExpr::Sign); - parent.add_fn("round", SQLNumericExpr::Round); - parent.add_fn("sqrt", SQLNumericExpr::Sqrt); - parent.add_fn("sin", SQLNumericExpr::Sin); - parent.add_fn("cos", SQLNumericExpr::Cos); - parent.add_fn("tan", SQLNumericExpr::Tan); - parent.add_fn("cot", SQLNumericExpr::Cot); - parent.add_fn("asin", SQLNumericExpr::ArcSin); - parent.add_fn("acos", SQLNumericExpr::ArcCos); - parent.add_fn("atan", SQLNumericExpr::ArcTan); - parent.add_fn("atan2", SQLNumericExpr::ArcTan2); - parent.add_fn("radians", SQLNumericExpr::Radians); - parent.add_fn("degrees", SQLNumericExpr::Degrees); - parent.add_fn("log2", SQLNumericExpr::Log2); - parent.add_fn("log10", SQLNumericExpr::Log10); - parent.add_fn("log", SQLNumericExpr::Log); - parent.add_fn("ln", SQLNumericExpr::Ln); - parent.add_fn("exp", SQLNumericExpr::Exp); - parent.add_fn("atanh", SQLNumericExpr::ArcTanh); - parent.add_fn("acosh", SQLNumericExpr::ArcCosh); - parent.add_fn("asinh", SQLNumericExpr::ArcSinh); + parent.add_fn("abs", SQLNumericExpr::Abs, "TODO: Docstring", &["TODO"]); + parent.add_fn("ceil", SQLNumericExpr::Ceil, "TODO: Docstring", &["TODO"]); + parent.add_fn("floor", SQLNumericExpr::Floor, "TODO: Docstring", &["TODO"]); + parent.add_fn("sign", SQLNumericExpr::Sign, "TODO: Docstring", &["TODO"]); + parent.add_fn("round", SQLNumericExpr::Round, "TODO: Docstring", 
&["TODO"]); + parent.add_fn("sqrt", SQLNumericExpr::Sqrt, "TODO: Docstring", &["TODO"]); + parent.add_fn("sin", SQLNumericExpr::Sin, "TODO: Docstring", &["TODO"]); + parent.add_fn("cos", SQLNumericExpr::Cos, "TODO: Docstring", &["TODO"]); + parent.add_fn("tan", SQLNumericExpr::Tan, "TODO: Docstring", &["TODO"]); + parent.add_fn("cot", SQLNumericExpr::Cot, "TODO: Docstring", &["TODO"]); + parent.add_fn("asin", SQLNumericExpr::ArcSin, "TODO: Docstring", &["TODO"]); + parent.add_fn("acos", SQLNumericExpr::ArcCos, "TODO: Docstring", &["TODO"]); + parent.add_fn("atan", SQLNumericExpr::ArcTan, "TODO: Docstring", &["TODO"]); + parent.add_fn( + "atan2", + SQLNumericExpr::ArcTan2, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "radians", + SQLNumericExpr::Radians, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "degrees", + SQLNumericExpr::Degrees, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn("log2", SQLNumericExpr::Log2, "TODO: Docstring", &["TODO"]); + parent.add_fn("log10", SQLNumericExpr::Log10, "TODO: Docstring", &["TODO"]); + parent.add_fn("log", SQLNumericExpr::Log, "TODO: Docstring", &["TODO"]); + parent.add_fn("ln", SQLNumericExpr::Ln, "TODO: Docstring", &["TODO"]); + parent.add_fn("exp", SQLNumericExpr::Exp, "TODO: Docstring", &["TODO"]); + parent.add_fn( + "atanh", + SQLNumericExpr::ArcTanh, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "acosh", + SQLNumericExpr::ArcCosh, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "asinh", + SQLNumericExpr::ArcSinh, + "TODO: Docstring", + &["TODO"], + ); } } enum SQLNumericExpr { diff --git a/src/daft-sql/src/modules/partitioning.rs b/src/daft-sql/src/modules/partitioning.rs index e833edd51d..c1e62eb193 100644 --- a/src/daft-sql/src/modules/partitioning.rs +++ b/src/daft-sql/src/modules/partitioning.rs @@ -10,17 +10,41 @@ pub struct SQLModulePartitioning; impl SQLModule for SQLModulePartitioning { fn register(parent: &mut SQLFunctions) { - 
parent.add_fn("partitioning_years", PartitioningExpr::Years); - parent.add_fn("partitioning_months", PartitioningExpr::Months); - parent.add_fn("partitioning_days", PartitioningExpr::Days); - parent.add_fn("partitioning_hours", PartitioningExpr::Hours); + parent.add_fn( + "partitioning_years", + PartitioningExpr::Years, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "partitioning_months", + PartitioningExpr::Months, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "partitioning_days", + PartitioningExpr::Days, + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "partitioning_hours", + PartitioningExpr::Hours, + "TODO: Docstring", + &["TODO"], + ); parent.add_fn( "partitioning_iceberg_bucket", PartitioningExpr::IcebergBucket(0), + "TODO: Docstring", + &["TODO"], ); parent.add_fn( "partitioning_iceberg_truncate", PartitioningExpr::IcebergTruncate(0), + "TODO: Docstring", + &["TODO"], ); } } diff --git a/src/daft-sql/src/modules/structs.rs b/src/daft-sql/src/modules/structs.rs index 66be42d8e3..72d94f13da 100644 --- a/src/daft-sql/src/modules/structs.rs +++ b/src/daft-sql/src/modules/structs.rs @@ -8,8 +8,18 @@ pub struct SQLModuleStructs; impl SQLModule for SQLModuleStructs { fn register(parent: &mut SQLFunctions) { - parent.add_fn("struct_get", StructGet); - parent.add_fn("struct_extract", StructGet); + parent.add_fn( + "struct_get", + StructGet, + "TODO: Docstring", + &["input", "key"], + ); + parent.add_fn( + "struct_extract", + StructGet, + "Alias of struct_get", + &["input", "key"], + ); } } diff --git a/src/daft-sql/src/modules/temporal.rs b/src/daft-sql/src/modules/temporal.rs index 58687724fa..46ebee1fd0 100644 --- a/src/daft-sql/src/modules/temporal.rs +++ b/src/daft-sql/src/modules/temporal.rs @@ -13,15 +13,15 @@ pub struct SQLModuleTemporal; impl SQLModule for SQLModuleTemporal { fn register(parent: &mut SQLFunctions) { - parent.add_fn("date", SQLDate); - parent.add_fn("day", SQLDay); - parent.add_fn("dayofweek", SQLDayOfWeek); - 
parent.add_fn("hour", SQLHour); - parent.add_fn("minute", SQLMinute); - parent.add_fn("month", SQLMonth); - parent.add_fn("second", SQLSecond); - parent.add_fn("year", SQLYear); - parent.add_fn("time", SQLTime); + parent.add_fn("date", SQLDate, "TODO: Docstring", &["TODO"]); + parent.add_fn("day", SQLDay, "TODO: Docstring", &["TODO"]); + parent.add_fn("dayofweek", SQLDayOfWeek, "TODO: Docstring", &["TODO"]); + parent.add_fn("hour", SQLHour, "TODO: Docstring", &["TODO"]); + parent.add_fn("minute", SQLMinute, "TODO: Docstring", &["TODO"]); + parent.add_fn("month", SQLMonth, "TODO: Docstring", &["TODO"]); + parent.add_fn("second", SQLSecond, "TODO: Docstring", &["TODO"]); + parent.add_fn("year", SQLYear, "TODO: Docstring", &["TODO"]); + parent.add_fn("time", SQLTime, "TODO: Docstring", &["TODO"]); // TODO: Add truncate // Our `dt_truncate` function has vastly different semantics than SQL `DATE_TRUNCATE` function. diff --git a/src/daft-sql/src/modules/utf8.rs b/src/daft-sql/src/modules/utf8.rs index 263a8bd9e7..fc4ca9af30 100644 --- a/src/daft-sql/src/modules/utf8.rs +++ b/src/daft-sql/src/modules/utf8.rs @@ -14,36 +14,46 @@ pub struct SQLModuleUtf8; impl SQLModule for SQLModuleUtf8 { fn register(parent: &mut crate::functions::SQLFunctions) { use Utf8Expr::*; - parent.add_fn("ends_with", EndsWith); - parent.add_fn("starts_with", StartsWith); - parent.add_fn("contains", Contains); - parent.add_fn("split", Split(true)); + parent.add_fn("ends_with", EndsWith, "TODO: Docstring", &["TODO"]); + parent.add_fn("starts_with", StartsWith, "TODO: Docstring", &["TODO"]); + parent.add_fn("contains", Contains, "TODO: Docstring", &["TODO"]); + parent.add_fn("split", Split(true), "TODO: Docstring", &["TODO"]); // TODO add split variants // parent.add("split", f(Split(false))); - parent.add_fn("match", Match); - parent.add_fn("extract", Extract(0)); - parent.add_fn("extract_all", ExtractAll(0)); - parent.add_fn("replace", Replace(true)); + parent.add_fn("match", Match, "TODO: 
Docstring", &["TODO"]); + parent.add_fn("extract", Extract(0), "TODO: Docstring", &["TODO"]); + parent.add_fn("extract_all", ExtractAll(0), "TODO: Docstring", &["TODO"]); + parent.add_fn("replace", Replace(true), "TODO: Docstring", &["TODO"]); // TODO add replace variants // parent.add("replace", f(Replace(false))); - parent.add_fn("length", Length); - parent.add_fn("lower", Lower); - parent.add_fn("upper", Upper); - parent.add_fn("lstrip", Lstrip); - parent.add_fn("rstrip", Rstrip); - parent.add_fn("reverse", Reverse); - parent.add_fn("capitalize", Capitalize); - parent.add_fn("left", Left); - parent.add_fn("right", Right); - parent.add_fn("find", Find); - parent.add_fn("rpad", Rpad); - parent.add_fn("lpad", Lpad); - parent.add_fn("repeat", Repeat); - parent.add_fn("like", Like); - parent.add_fn("ilike", Ilike); - parent.add_fn("substr", Substr); - parent.add_fn("to_date", ToDate("".to_string())); - parent.add_fn("to_datetime", ToDatetime("".to_string(), None)); + parent.add_fn("length", Length, "TODO: Docstring", &["TODO"]); + parent.add_fn("lower", Lower, "TODO: Docstring", &["TODO"]); + parent.add_fn("upper", Upper, "TODO: Docstring", &["TODO"]); + parent.add_fn("lstrip", Lstrip, "TODO: Docstring", &["TODO"]); + parent.add_fn("rstrip", Rstrip, "TODO: Docstring", &["TODO"]); + parent.add_fn("reverse", Reverse, "TODO: Docstring", &["TODO"]); + parent.add_fn("capitalize", Capitalize, "TODO: Docstring", &["TODO"]); + parent.add_fn("left", Left, "TODO: Docstring", &["TODO"]); + parent.add_fn("right", Right, "TODO: Docstring", &["TODO"]); + parent.add_fn("find", Find, "TODO: Docstring", &["TODO"]); + parent.add_fn("rpad", Rpad, "TODO: Docstring", &["TODO"]); + parent.add_fn("lpad", Lpad, "TODO: Docstring", &["TODO"]); + parent.add_fn("repeat", Repeat, "TODO: Docstring", &["TODO"]); + parent.add_fn("like", Like, "TODO: Docstring", &["TODO"]); + parent.add_fn("ilike", Ilike, "TODO: Docstring", &["TODO"]); + parent.add_fn("substr", Substr, "TODO: Docstring", &["TODO"]); 
+ parent.add_fn( + "to_date", + ToDate("".to_string()), + "TODO: Docstring", + &["TODO"], + ); + parent.add_fn( + "to_datetime", + ToDatetime("".to_string(), None), + "TODO: Docstring", + &["TODO"], + ); // TODO add normalization variants. // parent.add("normalize", f(Normalize(Default::default()))); } diff --git a/src/daft-sql/src/python.rs b/src/daft-sql/src/python.rs index 6c885bf192..6fe9c72270 100644 --- a/src/daft-sql/src/python.rs +++ b/src/daft-sql/src/python.rs @@ -28,7 +28,10 @@ pub fn list_sql_functions() -> Vec<(String, &'static str, Vec<&'static str>)> { .map .keys() .cloned() - .map(|name| (name, "TODO: docstrings", vec!["foo", "bar"])) + .map(|name| { + let (docstring, args) = SQL_FUNCTIONS.docsmap.get(&name).unwrap(); + (name, *docstring, args.to_vec()) + }) .collect() } From 911e6563e18827c975d80c2980110d59403b798f Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 19:51:11 -0700 Subject: [PATCH 16/21] Use SQLFunction trait to define docstrings --- src/daft-sql/src/functions.rs | 23 ++- src/daft-sql/src/modules/aggs.rs | 236 ++++++++++++++++++++--- src/daft-sql/src/modules/float.rs | 8 +- src/daft-sql/src/modules/image/mod.rs | 30 +-- src/daft-sql/src/modules/json.rs | 2 +- src/daft-sql/src/modules/list.rs | 24 +-- src/daft-sql/src/modules/map.rs | 4 +- src/daft-sql/src/modules/numeric.rs | 78 +++----- src/daft-sql/src/modules/partitioning.rs | 32 +-- src/daft-sql/src/modules/structs.rs | 14 +- src/daft-sql/src/modules/temporal.rs | 18 +- src/daft-sql/src/modules/utf8.rs | 62 +++--- src/daft-sql/src/python.rs | 4 +- 13 files changed, 317 insertions(+), 218 deletions(-) diff --git a/src/daft-sql/src/functions.rs b/src/daft-sql/src/functions.rs index f492fe3ad5..12372fb8db 100644 --- a/src/daft-sql/src/functions.rs +++ b/src/daft-sql/src/functions.rs @@ -82,6 +82,16 @@ pub trait SQLFunction: Send + Sync { } fn to_expr(&self, inputs: &[FunctionArg], planner: &SQLPlanner) -> SQLPlannerResult; + + /// Produce the docstrings for this SQL 
function, parametrized by an alias which is the function name to invoke this in SQL + fn docstrings(&self, alias: &str) -> String { + format!("{alias}: No docstring available") + } + + /// Produce the list of argument names for this SQL function, used when generating documentation stubs + fn arg_names(&self) -> &'static [&'static str] { + &["todo"] + } } /// TODOs @@ -89,7 +99,7 @@ pub trait SQLFunction: Send + Sync { /// - Add more functions.. pub struct SQLFunctions { pub(crate) map: HashMap>, - pub(crate) docsmap: HashMap, + pub(crate) docsmap: HashMap, } pub(crate) struct SQLFunctionArguments { @@ -121,15 +131,10 @@ impl SQLFunctions { } /// Add a [FunctionExpr] to the [SQLFunctions] instance. - pub fn add_fn( - &mut self, - name: &str, - func: F, - docstring: &'static str, - args: &'static [&'static str], - ) { + pub fn add_fn(&mut self, name: &str, func: F) { + self.docsmap + .insert(name.to_string(), (func.docstrings(name), func.arg_names())); self.map.insert(name.to_string(), Arc::new(func)); - self.docsmap.insert(name.to_string(), (docstring, args)); } /// Get a function by name from the [SQLFunctions] instance. diff --git a/src/daft-sql/src/modules/aggs.rs b/src/daft-sql/src/modules/aggs.rs index 63f9cc5162..2ea044dde0 100644 --- a/src/daft-sql/src/modules/aggs.rs +++ b/src/daft-sql/src/modules/aggs.rs @@ -14,20 +14,16 @@ use crate::{ pub struct SQLModuleAggs; -impl SQLModule for SQLModuleAggs { - fn register(parent: &mut SQLFunctions) { - use AggExpr::*; - // HACK TO USE AggExpr as an enum rather than a - let nil = Arc::new(Expr::Literal(LiteralValue::Null)); - parent.add_fn( - "count", - Count(nil.clone(), daft_core::count_mode::CountMode::Valid), - " -Aggregates the number of items in the input expression. Only counts non-null items. +mod static_docs { + pub(crate) const COUNT_DOCSTRING: &str = + "Counts the number of non-null elements in the input expression. Example: -For a table like this: +.. 
code-block:: sql + :caption: SQL + + SELECT count(x) FROM tbl .. code-block:: text :caption: Input @@ -39,13 +35,11 @@ For a table like this: ╞═══════╡ │ 100 │ ├╌╌╌╌╌╌╌┤ + │ 200 │ + ├╌╌╌╌╌╌╌┤ │ null │ ╰───────╯ - (Showing first 2 of 2 rows) - -```sql -SELECT count(x) FROM tbl -``` + (Showing first 3 of 3 rows) Result: @@ -59,15 +53,189 @@ Result: ╞═══════╡ │ 1 │ ╰───────╯ - (Showing first 1 of 1 rows) -", - &["input"], + (Showing first 1 of 1 rows)"; + + pub(crate) const SUM_DOCSTRING: &str = + "Calculates the sum of non-null elements in the input expression. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT sum(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ├╌╌╌╌╌╌╌┤ + │ 200 │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +Result: + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 300 │ + ╰───────╯ + (Showing first 1 of 1 rows)"; + + pub(crate) const AVG_DOCSTRING: &str = + "Calculates the average (mean) of non-null elements in the input expression. + +.. seealso:: + This SQL Function has aliases. + + :function:`daft.sql._sql_funcs.mean` + :function:`daft.sql._sql_funcs.avg` + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT {}(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ├╌╌╌╌╌╌╌┤ + │ 200 │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +Result: + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ x │ + │ --- │ + │ Float64 │ + ╞═══════════╡ + │ 150.0 │ + ╰───────────╯ + (Showing first 1 of 1 rows)"; + + pub(crate) const MIN_DOCSTRING: &str = + "Finds the minimum value among non-null elements in the input expression. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT min(x) FROM tbl + +.. 
code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ├╌╌╌╌╌╌╌┤ + │ 200 │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +Result: + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ╰───────╯ + (Showing first 1 of 1 rows)"; + + pub(crate) const MAX_DOCSTRING: &str = + "Finds the maximum value among non-null elements in the input expression. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT max(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 100 │ + ├╌╌╌╌╌╌╌┤ + │ 200 │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +Result: + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 200 │ + ╰───────╯ + (Showing first 1 of 1 rows)"; +} + +impl SQLModule for SQLModuleAggs { + fn register(parent: &mut SQLFunctions) { + use AggExpr::*; + // HACK TO USE AggExpr as an enum rather than a + let nil = Arc::new(Expr::Literal(LiteralValue::Null)); + parent.add_fn( + "count", + Count(nil.clone(), daft_core::count_mode::CountMode::Valid), ); - parent.add_fn("sum", Sum(nil.clone()), "TODO: Docstring", &[]); - parent.add_fn("avg", Mean(nil.clone()), "TODO: Docstring", &[]); - parent.add_fn("mean", Mean(nil.clone()), "TODO: Docstring", &[]); - parent.add_fn("min", Min(nil.clone()), "TODO: Docstring", &[]); - parent.add_fn("max", Max(nil.clone()), "TODO: Docstring", &[]); + parent.add_fn("sum", Sum(nil.clone())); + parent.add_fn("avg", Mean(nil.clone())); + parent.add_fn("mean", Mean(nil.clone())); + parent.add_fn("min", Min(nil.clone())); + parent.add_fn("max", Max(nil.clone())); } } @@ -81,6 +249,26 @@ impl SQLFunction for AggExpr { to_expr(self, inputs.as_slice()) } } + + fn docstrings(&self, alias: &str) -> String { + match self { + Self::Count(_, _) => static_docs::COUNT_DOCSTRING.to_string(), + Self::Sum(_) => 
static_docs::SUM_DOCSTRING.to_string(), + Self::Mean(_) => static_docs::AVG_DOCSTRING.replace("{}", alias), + Self::Min(_) => static_docs::MIN_DOCSTRING.to_string(), + Self::Max(_) => static_docs::MAX_DOCSTRING.to_string(), + e => unimplemented!("Need to implement docstrings for {e}"), + } + } + + fn arg_names(&self) -> &'static [&'static str] { + match self { + Self::Count(_, _) | Self::Sum(_) | Self::Mean(_) | Self::Min(_) | Self::Max(_) => { + &["input"] + } + e => unimplemented!("Need to implement arg names for {e}"), + } + } } fn handle_count(inputs: &[FunctionArg], planner: &SQLPlanner) -> SQLPlannerResult { diff --git a/src/daft-sql/src/modules/float.rs b/src/daft-sql/src/modules/float.rs index f2a283868c..4cfffe34b4 100644 --- a/src/daft-sql/src/modules/float.rs +++ b/src/daft-sql/src/modules/float.rs @@ -13,10 +13,10 @@ pub struct SQLModuleFloat; impl SQLModule for SQLModuleFloat { fn register(parent: &mut SQLFunctions) { - parent.add_fn("fill_nan", SQLFillNan {}, "TODO: Docstring", &["TODO"]); - parent.add_fn("is_inf", SQLIsInf {}, "TODO: Docstring", &["TODO"]); - parent.add_fn("is_nan", SQLIsNan {}, "TODO: Docstring", &["TODO"]); - parent.add_fn("not_nan", SQLNotNan {}, "TODO: Docstring", &["TODO"]); + parent.add_fn("fill_nan", SQLFillNan {}); + parent.add_fn("is_inf", SQLIsInf {}); + parent.add_fn("is_nan", SQLIsNan {}); + parent.add_fn("not_nan", SQLNotNan {}); } } diff --git a/src/daft-sql/src/modules/image/mod.rs b/src/daft-sql/src/modules/image/mod.rs index 06afcf9df4..89a0527ef0 100644 --- a/src/daft-sql/src/modules/image/mod.rs +++ b/src/daft-sql/src/modules/image/mod.rs @@ -10,30 +10,10 @@ pub struct SQLModuleImage; impl SQLModule for SQLModuleImage { fn register(parent: &mut SQLFunctions) { - parent.add_fn("image_crop", crop::SQLImageCrop {}, "TODO: Docstring", &[]); - parent.add_fn( - "image_decode", - decode::SQLImageDecode {}, - "TODO: Docstring", - &[], - ); - parent.add_fn( - "image_encode", - encode::SQLImageEncode {}, - "TODO: Docstring", 
- &[], - ); - parent.add_fn( - "image_resize", - resize::SQLImageResize {}, - "TODO: Docstring", - &[], - ); - parent.add_fn( - "image_to_mode", - to_mode::SQLImageToMode {}, - "TODO: Docstring", - &[], - ); + parent.add_fn("image_crop", crop::SQLImageCrop {}); + parent.add_fn("image_decode", decode::SQLImageDecode {}); + parent.add_fn("image_encode", encode::SQLImageEncode {}); + parent.add_fn("image_resize", resize::SQLImageResize {}); + parent.add_fn("image_to_mode", to_mode::SQLImageToMode {}); } } diff --git a/src/daft-sql/src/modules/json.rs b/src/daft-sql/src/modules/json.rs index 51f1b10c63..f0d600daea 100644 --- a/src/daft-sql/src/modules/json.rs +++ b/src/daft-sql/src/modules/json.rs @@ -8,7 +8,7 @@ pub struct SQLModuleJson; impl SQLModule for SQLModuleJson { fn register(parent: &mut SQLFunctions) { - parent.add_fn("json_query", JsonQuery, "TODO: Docstring", &["TODO"]); + parent.add_fn("json_query", JsonQuery); } } diff --git a/src/daft-sql/src/modules/list.rs b/src/daft-sql/src/modules/list.rs index 2b8ac22fee..b9e52d9748 100644 --- a/src/daft-sql/src/modules/list.rs +++ b/src/daft-sql/src/modules/list.rs @@ -12,20 +12,20 @@ pub struct SQLModuleList; impl SQLModule for SQLModuleList { fn register(parent: &mut SQLFunctions) { - parent.add_fn("list_chunk", SQLListChunk, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_count", SQLListCount, "TODO: Docstring", &["TODO"]); - parent.add_fn("explode", SQLExplode, "TODO: Docstring", &["TODO"]); - parent.add_fn("unnest", SQLExplode, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_chunk", SQLListChunk); + parent.add_fn("list_count", SQLListCount); + parent.add_fn("explode", SQLExplode); + parent.add_fn("unnest", SQLExplode); // this is commonly called `array_to_string` in other SQL dialects - parent.add_fn("array_to_string", SQLListJoin, "TODO: Docstring", &["TODO"]); + parent.add_fn("array_to_string", SQLListJoin); // but we also want to support our `list_join` alias as well - parent.add_fn("list_join", 
SQLListJoin, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_max", SQLListMax, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_min", SQLListMin, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_sum", SQLListSum, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_mean", SQLListMean, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_slice", SQLListSlice, "TODO: Docstring", &["TODO"]); - parent.add_fn("list_sort", SQLListSort, "TODO: Docstring", &["TODO"]); + parent.add_fn("list_join", SQLListJoin); + parent.add_fn("list_max", SQLListMax); + parent.add_fn("list_min", SQLListMin); + parent.add_fn("list_sum", SQLListSum); + parent.add_fn("list_mean", SQLListMean); + parent.add_fn("list_slice", SQLListSlice); + parent.add_fn("list_sort", SQLListSort); // TODO } diff --git a/src/daft-sql/src/modules/map.rs b/src/daft-sql/src/modules/map.rs index 171495660d..d3a328f3a4 100644 --- a/src/daft-sql/src/modules/map.rs +++ b/src/daft-sql/src/modules/map.rs @@ -8,8 +8,8 @@ pub struct SQLModuleMap; impl SQLModule for SQLModuleMap { fn register(parent: &mut SQLFunctions) { - parent.add_fn("map_get", MapGet, "TODO: Docstring", &["input", "key"]); - parent.add_fn("map_extract", MapGet, "Alias of map_get", &["input", "key"]); + parent.add_fn("map_get", MapGet); + parent.add_fn("map_extract", MapGet); } } diff --git a/src/daft-sql/src/modules/numeric.rs b/src/daft-sql/src/modules/numeric.rs index 6bc2864df5..197d958860 100644 --- a/src/daft-sql/src/modules/numeric.rs +++ b/src/daft-sql/src/modules/numeric.rs @@ -26,60 +26,30 @@ pub struct SQLModuleNumeric; impl SQLModule for SQLModuleNumeric { fn register(parent: &mut SQLFunctions) { - parent.add_fn("abs", SQLNumericExpr::Abs, "TODO: Docstring", &["TODO"]); - parent.add_fn("ceil", SQLNumericExpr::Ceil, "TODO: Docstring", &["TODO"]); - parent.add_fn("floor", SQLNumericExpr::Floor, "TODO: Docstring", &["TODO"]); - parent.add_fn("sign", SQLNumericExpr::Sign, "TODO: Docstring", &["TODO"]); - parent.add_fn("round", 
SQLNumericExpr::Round, "TODO: Docstring", &["TODO"]); - parent.add_fn("sqrt", SQLNumericExpr::Sqrt, "TODO: Docstring", &["TODO"]); - parent.add_fn("sin", SQLNumericExpr::Sin, "TODO: Docstring", &["TODO"]); - parent.add_fn("cos", SQLNumericExpr::Cos, "TODO: Docstring", &["TODO"]); - parent.add_fn("tan", SQLNumericExpr::Tan, "TODO: Docstring", &["TODO"]); - parent.add_fn("cot", SQLNumericExpr::Cot, "TODO: Docstring", &["TODO"]); - parent.add_fn("asin", SQLNumericExpr::ArcSin, "TODO: Docstring", &["TODO"]); - parent.add_fn("acos", SQLNumericExpr::ArcCos, "TODO: Docstring", &["TODO"]); - parent.add_fn("atan", SQLNumericExpr::ArcTan, "TODO: Docstring", &["TODO"]); - parent.add_fn( - "atan2", - SQLNumericExpr::ArcTan2, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "radians", - SQLNumericExpr::Radians, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "degrees", - SQLNumericExpr::Degrees, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn("log2", SQLNumericExpr::Log2, "TODO: Docstring", &["TODO"]); - parent.add_fn("log10", SQLNumericExpr::Log10, "TODO: Docstring", &["TODO"]); - parent.add_fn("log", SQLNumericExpr::Log, "TODO: Docstring", &["TODO"]); - parent.add_fn("ln", SQLNumericExpr::Ln, "TODO: Docstring", &["TODO"]); - parent.add_fn("exp", SQLNumericExpr::Exp, "TODO: Docstring", &["TODO"]); - parent.add_fn( - "atanh", - SQLNumericExpr::ArcTanh, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "acosh", - SQLNumericExpr::ArcCosh, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "asinh", - SQLNumericExpr::ArcSinh, - "TODO: Docstring", - &["TODO"], - ); + parent.add_fn("abs", SQLNumericExpr::Abs); + parent.add_fn("ceil", SQLNumericExpr::Ceil); + parent.add_fn("floor", SQLNumericExpr::Floor); + parent.add_fn("sign", SQLNumericExpr::Sign); + parent.add_fn("round", SQLNumericExpr::Round); + parent.add_fn("sqrt", SQLNumericExpr::Sqrt); + parent.add_fn("sin", SQLNumericExpr::Sin); + parent.add_fn("cos", SQLNumericExpr::Cos); + 
parent.add_fn("tan", SQLNumericExpr::Tan); + parent.add_fn("cot", SQLNumericExpr::Cot); + parent.add_fn("asin", SQLNumericExpr::ArcSin); + parent.add_fn("acos", SQLNumericExpr::ArcCos); + parent.add_fn("atan", SQLNumericExpr::ArcTan); + parent.add_fn("atan2", SQLNumericExpr::ArcTan2); + parent.add_fn("radians", SQLNumericExpr::Radians); + parent.add_fn("degrees", SQLNumericExpr::Degrees); + parent.add_fn("log2", SQLNumericExpr::Log2); + parent.add_fn("log10", SQLNumericExpr::Log10); + parent.add_fn("log", SQLNumericExpr::Log); + parent.add_fn("ln", SQLNumericExpr::Ln); + parent.add_fn("exp", SQLNumericExpr::Exp); + parent.add_fn("atanh", SQLNumericExpr::ArcTanh); + parent.add_fn("acosh", SQLNumericExpr::ArcCosh); + parent.add_fn("asinh", SQLNumericExpr::ArcSinh); } } enum SQLNumericExpr { diff --git a/src/daft-sql/src/modules/partitioning.rs b/src/daft-sql/src/modules/partitioning.rs index c1e62eb193..e833edd51d 100644 --- a/src/daft-sql/src/modules/partitioning.rs +++ b/src/daft-sql/src/modules/partitioning.rs @@ -10,41 +10,17 @@ pub struct SQLModulePartitioning; impl SQLModule for SQLModulePartitioning { fn register(parent: &mut SQLFunctions) { - parent.add_fn( - "partitioning_years", - PartitioningExpr::Years, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "partitioning_months", - PartitioningExpr::Months, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "partitioning_days", - PartitioningExpr::Days, - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "partitioning_hours", - PartitioningExpr::Hours, - "TODO: Docstring", - &["TODO"], - ); + parent.add_fn("partitioning_years", PartitioningExpr::Years); + parent.add_fn("partitioning_months", PartitioningExpr::Months); + parent.add_fn("partitioning_days", PartitioningExpr::Days); + parent.add_fn("partitioning_hours", PartitioningExpr::Hours); parent.add_fn( "partitioning_iceberg_bucket", PartitioningExpr::IcebergBucket(0), - "TODO: Docstring", - &["TODO"], ); parent.add_fn( 
"partitioning_iceberg_truncate", PartitioningExpr::IcebergTruncate(0), - "TODO: Docstring", - &["TODO"], ); } } diff --git a/src/daft-sql/src/modules/structs.rs b/src/daft-sql/src/modules/structs.rs index 72d94f13da..66be42d8e3 100644 --- a/src/daft-sql/src/modules/structs.rs +++ b/src/daft-sql/src/modules/structs.rs @@ -8,18 +8,8 @@ pub struct SQLModuleStructs; impl SQLModule for SQLModuleStructs { fn register(parent: &mut SQLFunctions) { - parent.add_fn( - "struct_get", - StructGet, - "TODO: Docstring", - &["input", "key"], - ); - parent.add_fn( - "struct_extract", - StructGet, - "Alias of struct_get", - &["input", "key"], - ); + parent.add_fn("struct_get", StructGet); + parent.add_fn("struct_extract", StructGet); } } diff --git a/src/daft-sql/src/modules/temporal.rs b/src/daft-sql/src/modules/temporal.rs index 46ebee1fd0..58687724fa 100644 --- a/src/daft-sql/src/modules/temporal.rs +++ b/src/daft-sql/src/modules/temporal.rs @@ -13,15 +13,15 @@ pub struct SQLModuleTemporal; impl SQLModule for SQLModuleTemporal { fn register(parent: &mut SQLFunctions) { - parent.add_fn("date", SQLDate, "TODO: Docstring", &["TODO"]); - parent.add_fn("day", SQLDay, "TODO: Docstring", &["TODO"]); - parent.add_fn("dayofweek", SQLDayOfWeek, "TODO: Docstring", &["TODO"]); - parent.add_fn("hour", SQLHour, "TODO: Docstring", &["TODO"]); - parent.add_fn("minute", SQLMinute, "TODO: Docstring", &["TODO"]); - parent.add_fn("month", SQLMonth, "TODO: Docstring", &["TODO"]); - parent.add_fn("second", SQLSecond, "TODO: Docstring", &["TODO"]); - parent.add_fn("year", SQLYear, "TODO: Docstring", &["TODO"]); - parent.add_fn("time", SQLTime, "TODO: Docstring", &["TODO"]); + parent.add_fn("date", SQLDate); + parent.add_fn("day", SQLDay); + parent.add_fn("dayofweek", SQLDayOfWeek); + parent.add_fn("hour", SQLHour); + parent.add_fn("minute", SQLMinute); + parent.add_fn("month", SQLMonth); + parent.add_fn("second", SQLSecond); + parent.add_fn("year", SQLYear); + parent.add_fn("time", SQLTime); // TODO: 
Add truncate // Our `dt_truncate` function has vastly different semantics than SQL `DATE_TRUNCATE` function. diff --git a/src/daft-sql/src/modules/utf8.rs b/src/daft-sql/src/modules/utf8.rs index fc4ca9af30..263a8bd9e7 100644 --- a/src/daft-sql/src/modules/utf8.rs +++ b/src/daft-sql/src/modules/utf8.rs @@ -14,46 +14,36 @@ pub struct SQLModuleUtf8; impl SQLModule for SQLModuleUtf8 { fn register(parent: &mut crate::functions::SQLFunctions) { use Utf8Expr::*; - parent.add_fn("ends_with", EndsWith, "TODO: Docstring", &["TODO"]); - parent.add_fn("starts_with", StartsWith, "TODO: Docstring", &["TODO"]); - parent.add_fn("contains", Contains, "TODO: Docstring", &["TODO"]); - parent.add_fn("split", Split(true), "TODO: Docstring", &["TODO"]); + parent.add_fn("ends_with", EndsWith); + parent.add_fn("starts_with", StartsWith); + parent.add_fn("contains", Contains); + parent.add_fn("split", Split(true)); // TODO add split variants // parent.add("split", f(Split(false))); - parent.add_fn("match", Match, "TODO: Docstring", &["TODO"]); - parent.add_fn("extract", Extract(0), "TODO: Docstring", &["TODO"]); - parent.add_fn("extract_all", ExtractAll(0), "TODO: Docstring", &["TODO"]); - parent.add_fn("replace", Replace(true), "TODO: Docstring", &["TODO"]); + parent.add_fn("match", Match); + parent.add_fn("extract", Extract(0)); + parent.add_fn("extract_all", ExtractAll(0)); + parent.add_fn("replace", Replace(true)); // TODO add replace variants // parent.add("replace", f(Replace(false))); - parent.add_fn("length", Length, "TODO: Docstring", &["TODO"]); - parent.add_fn("lower", Lower, "TODO: Docstring", &["TODO"]); - parent.add_fn("upper", Upper, "TODO: Docstring", &["TODO"]); - parent.add_fn("lstrip", Lstrip, "TODO: Docstring", &["TODO"]); - parent.add_fn("rstrip", Rstrip, "TODO: Docstring", &["TODO"]); - parent.add_fn("reverse", Reverse, "TODO: Docstring", &["TODO"]); - parent.add_fn("capitalize", Capitalize, "TODO: Docstring", &["TODO"]); - parent.add_fn("left", Left, "TODO: 
Docstring", &["TODO"]); - parent.add_fn("right", Right, "TODO: Docstring", &["TODO"]); - parent.add_fn("find", Find, "TODO: Docstring", &["TODO"]); - parent.add_fn("rpad", Rpad, "TODO: Docstring", &["TODO"]); - parent.add_fn("lpad", Lpad, "TODO: Docstring", &["TODO"]); - parent.add_fn("repeat", Repeat, "TODO: Docstring", &["TODO"]); - parent.add_fn("like", Like, "TODO: Docstring", &["TODO"]); - parent.add_fn("ilike", Ilike, "TODO: Docstring", &["TODO"]); - parent.add_fn("substr", Substr, "TODO: Docstring", &["TODO"]); - parent.add_fn( - "to_date", - ToDate("".to_string()), - "TODO: Docstring", - &["TODO"], - ); - parent.add_fn( - "to_datetime", - ToDatetime("".to_string(), None), - "TODO: Docstring", - &["TODO"], - ); + parent.add_fn("length", Length); + parent.add_fn("lower", Lower); + parent.add_fn("upper", Upper); + parent.add_fn("lstrip", Lstrip); + parent.add_fn("rstrip", Rstrip); + parent.add_fn("reverse", Reverse); + parent.add_fn("capitalize", Capitalize); + parent.add_fn("left", Left); + parent.add_fn("right", Right); + parent.add_fn("find", Find); + parent.add_fn("rpad", Rpad); + parent.add_fn("lpad", Lpad); + parent.add_fn("repeat", Repeat); + parent.add_fn("like", Like); + parent.add_fn("ilike", Ilike); + parent.add_fn("substr", Substr); + parent.add_fn("to_date", ToDate("".to_string())); + parent.add_fn("to_datetime", ToDatetime("".to_string(), None)); // TODO add normalization variants. 
// parent.add("normalize", f(Normalize(Default::default()))); } diff --git a/src/daft-sql/src/python.rs b/src/daft-sql/src/python.rs index 6fe9c72270..84d75d6c59 100644 --- a/src/daft-sql/src/python.rs +++ b/src/daft-sql/src/python.rs @@ -23,14 +23,14 @@ pub fn sql_expr(sql: &str) -> PyResult { } #[pyfunction] -pub fn list_sql_functions() -> Vec<(String, &'static str, Vec<&'static str>)> { +pub fn list_sql_functions() -> Vec<(String, String, Vec<&'static str>)> { SQL_FUNCTIONS .map .keys() .cloned() .map(|name| { let (docstring, args) = SQL_FUNCTIONS.docsmap.get(&name).unwrap(); - (name, *docstring, args.to_vec()) + (name, docstring.to_string(), args.to_vec()) }) .collect() } From 3c35bd90a9f97be067f8f61a8919af0273e2af04 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 20:02:21 -0700 Subject: [PATCH 17/21] Add SQLFunctionStub pyclass --- daft/daft/__init__.pyi | 10 +++++++++- daft/sql/_sql_funcs.py | 8 ++++---- src/daft-sql/src/python.rs | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index 5c9740c98e..c90817dfc2 100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -1201,9 +1201,17 @@ def minhash( # ----- # SQL functions # ----- +class SQLFunctionStub: + @property + def name(self) -> str: ... + @property + def docstring(self) -> str: ... + @property + def arg_names(self) -> list[str]: ... + def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... def sql_expr(sql: str) -> PyExpr: ... -def list_sql_functions() -> list[tuple[str, str, list[str]]]: ... +def list_sql_functions() -> list[SQLFunctionStub]: ... def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ... def to_struct(inputs: list[PyExpr]) -> PyExpr: ... 
diff --git a/daft/sql/_sql_funcs.py b/daft/sql/_sql_funcs.py index 5a47d8e469..030cd3b53f 100644 --- a/daft/sql/_sql_funcs.py +++ b/daft/sql/_sql_funcs.py @@ -7,7 +7,7 @@ from inspect import Parameter as _Parameter from inspect import Signature as _Signature -from daft.daft import list_sql_functions as _list_sql_funcstions +from daft.daft import list_sql_functions as _list_sql_functions def _create_sql_function(func_name: str, docstring: str, arg_names: list[str]): @@ -25,6 +25,6 @@ def sql_function(*args, **kwargs): __all__ = [] -for sql_func_name, docstring, arg_names in _list_sql_funcstions(): - _create_sql_function(sql_func_name, docstring, arg_names) - __all__.append(sql_func_name) +for sql_function_stub in _list_sql_functions(): + _create_sql_function(sql_function_stub.name, sql_function_stub.docstring, sql_function_stub.arg_names) + __all__.append(sql_function_stub.name) diff --git a/src/daft-sql/src/python.rs b/src/daft-sql/src/python.rs index 84d75d6c59..b61d3fedd2 100644 --- a/src/daft-sql/src/python.rs +++ b/src/daft-sql/src/python.rs @@ -5,6 +5,31 @@ use pyo3::prelude::*; use crate::{catalog::SQLCatalog, functions::SQL_FUNCTIONS, planner::SQLPlanner}; +#[pyclass] +pub struct SQLFunctionStub { + name: String, + docstring: String, + arg_names: Vec<&'static str>, +} + +#[pymethods] +impl SQLFunctionStub { + #[getter] + fn name(&self) -> PyResult { + Ok(self.name.clone()) + } + + #[getter] + fn docstring(&self) -> PyResult { + Ok(self.docstring.clone()) + } + + #[getter] + fn arg_names(&self) -> PyResult> { + Ok(self.arg_names.clone()) + } +} + #[pyfunction] pub fn sql( sql: &str, @@ -23,14 +48,18 @@ pub fn sql_expr(sql: &str) -> PyResult { } #[pyfunction] -pub fn list_sql_functions() -> Vec<(String, String, Vec<&'static str>)> { +pub fn list_sql_functions() -> Vec { SQL_FUNCTIONS .map .keys() .cloned() .map(|name| { let (docstring, args) = SQL_FUNCTIONS.docsmap.get(&name).unwrap(); - (name, docstring.to_string(), args.to_vec()) + SQLFunctionStub { + name, 
+ docstring: docstring.to_string(), + arg_names: args.to_vec(), + } }) .collect() } From 7d8947c94fe728b26e7c95d12697b1a94399b5f4 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 20:22:58 -0700 Subject: [PATCH 18/21] Remove module name prefix --- docs/source/conf.py | 2 ++ docs/source/ext/sql_autosummary.py | 4 ++-- src/daft-sql/src/modules/aggs.rs | 14 ++------------ 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d4b91be226..fd59d32625 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -56,6 +56,8 @@ templates_path = ["_templates", sphinx_autosummary_accessors.templates_path] +# Removes module names that prefix our classes +add_module_names = False # -- Options for Notebook rendering # https://myst-nb.readthedocs.io/en/latest/configuration.html?highlight=nb_execution_mode#execution diff --git a/docs/source/ext/sql_autosummary.py b/docs/source/ext/sql_autosummary.py index 1e1c6f8bfb..5e37456cbe 100644 --- a/docs/source/ext/sql_autosummary.py +++ b/docs/source/ext/sql_autosummary.py @@ -11,9 +11,9 @@ SQL_MODULE_NAME = "daft.sql._sql_funcs" STUB_TEMPLATE = """ -.. currentmodule:: {module_name} +.. currentmodule:: None -.. autofunction:: {name} +.. autofunction:: {module_name}.{name} """ diff --git a/src/daft-sql/src/modules/aggs.rs b/src/daft-sql/src/modules/aggs.rs index 2ea044dde0..4ea6eda428 100644 --- a/src/daft-sql/src/modules/aggs.rs +++ b/src/daft-sql/src/modules/aggs.rs @@ -41,8 +41,6 @@ Example: ╰───────╯ (Showing first 3 of 3 rows) -Result: - .. code-block:: text :caption: Output @@ -81,8 +79,6 @@ Example: ╰───────╯ (Showing first 3 of 3 rows) -Result: - .. code-block:: text :caption: Output @@ -101,8 +97,8 @@ Result: .. seealso:: This SQL Function has aliases. 
- :function:`daft.sql._sql_funcs.mean` - :function:`daft.sql._sql_funcs.avg` + * :func:`~daft.sql._sql_funcs.mean` + * :func:`~daft.sql._sql_funcs.avg` Example: @@ -127,8 +123,6 @@ Example: ╰───────╯ (Showing first 3 of 3 rows) -Result: - .. code-block:: text :caption: Output @@ -167,8 +161,6 @@ Example: ╰───────╯ (Showing first 3 of 3 rows) -Result: - .. code-block:: text :caption: Output @@ -207,8 +199,6 @@ Example: ╰───────╯ (Showing first 3 of 3 rows) -Result: - .. code-block:: text :caption: Output From f30efcaec0ce221c41c1ce81224a863d6fb89c82 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 20:52:52 -0700 Subject: [PATCH 19/21] Theres a LOT oh my god --- docs/source/api_docs/sql.rst | 3 - src/daft-sql/src/modules/aggs.rs | 220 +++++++-------- src/daft-sql/src/modules/float.rs | 202 ++++++++++++++ src/daft-sql/src/modules/json.rs | 48 ++++ src/daft-sql/src/modules/list.rs | 442 ++++++++++++++++++++++++++++++ src/daft-sql/src/modules/map.rs | 53 ++++ 6 files changed, 855 insertions(+), 113 deletions(-) diff --git a/docs/source/api_docs/sql.rst b/docs/source/api_docs/sql.rst index bfdb9477a4..33cf0c25dd 100644 --- a/docs/source/api_docs/sql.rst +++ b/docs/source/api_docs/sql.rst @@ -1,9 +1,6 @@ SQL === -SQL Functions -------------- - .. autofunction:: daft.sql .. 
autofunction:: daft.sql_expr diff --git a/src/daft-sql/src/modules/aggs.rs b/src/daft-sql/src/modules/aggs.rs index 4ea6eda428..0fbd2f7067 100644 --- a/src/daft-sql/src/modules/aggs.rs +++ b/src/daft-sql/src/modules/aggs.rs @@ -14,6 +14,116 @@ use crate::{ pub struct SQLModuleAggs; +impl SQLModule for SQLModuleAggs { + fn register(parent: &mut SQLFunctions) { + use AggExpr::*; + // HACK TO USE AggExpr as an enum rather than a + let nil = Arc::new(Expr::Literal(LiteralValue::Null)); + parent.add_fn( + "count", + Count(nil.clone(), daft_core::count_mode::CountMode::Valid), + ); + parent.add_fn("sum", Sum(nil.clone())); + parent.add_fn("avg", Mean(nil.clone())); + parent.add_fn("mean", Mean(nil.clone())); + parent.add_fn("min", Min(nil.clone())); + parent.add_fn("max", Max(nil.clone())); + } +} + +impl SQLFunction for AggExpr { + fn to_expr(&self, inputs: &[FunctionArg], planner: &SQLPlanner) -> SQLPlannerResult { + // COUNT(*) needs a bit of extra handling, so we process that outside of `to_expr` + if let Self::Count(_, _) = self { + handle_count(inputs, planner) + } else { + let inputs = self.args_to_expr_unnamed(inputs, planner)?; + to_expr(self, inputs.as_slice()) + } + } + + fn docstrings(&self, alias: &str) -> String { + match self { + Self::Count(_, _) => static_docs::COUNT_DOCSTRING.to_string(), + Self::Sum(_) => static_docs::SUM_DOCSTRING.to_string(), + Self::Mean(_) => static_docs::AVG_DOCSTRING.replace("{}", alias), + Self::Min(_) => static_docs::MIN_DOCSTRING.to_string(), + Self::Max(_) => static_docs::MAX_DOCSTRING.to_string(), + e => unimplemented!("Need to implement docstrings for {e}"), + } + } + + fn arg_names(&self) -> &'static [&'static str] { + match self { + Self::Count(_, _) | Self::Sum(_) | Self::Mean(_) | Self::Min(_) | Self::Max(_) => { + &["input"] + } + e => unimplemented!("Need to implement arg names for {e}"), + } + } +} + +fn handle_count(inputs: &[FunctionArg], planner: &SQLPlanner) -> SQLPlannerResult { + Ok(match inputs { + 
[FunctionArg::Unnamed(FunctionArgExpr::Wildcard)] => match planner.relation_opt() { + Some(rel) => { + let schema = rel.schema(); + col(schema.fields[0].name.clone()) + .count(daft_core::count_mode::CountMode::All) + .alias("count") + } + None => unsupported_sql_err!("Wildcard is not supported in this context"), + }, + [FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(name))] => { + match planner.relation_opt() { + Some(rel) if name.to_string() == rel.name => { + let schema = rel.schema(); + col(schema.fields[0].name.clone()) + .count(daft_core::count_mode::CountMode::All) + .alias("count") + } + _ => unsupported_sql_err!("Wildcard is not supported in this context"), + } + } + [expr] => { + // SQL default COUNT ignores nulls + let input = planner.plan_function_arg(expr)?; + input.count(daft_core::count_mode::CountMode::Valid) + } + _ => unsupported_sql_err!("COUNT takes exactly one argument"), + }) +} + +pub(crate) fn to_expr(expr: &AggExpr, args: &[ExprRef]) -> SQLPlannerResult { + match expr { + AggExpr::Count(_, _) => unreachable!("count should be handled by by this point"), + AggExpr::Sum(_) => { + ensure!(args.len() == 1, "sum takes exactly one argument"); + Ok(args[0].clone().sum()) + } + AggExpr::ApproxCountDistinct(_) => unsupported_sql_err!("approx_percentile"), + AggExpr::ApproxPercentile(_) => unsupported_sql_err!("approx_percentile"), + AggExpr::ApproxSketch(_, _) => unsupported_sql_err!("approx_sketch"), + AggExpr::MergeSketch(_, _) => unsupported_sql_err!("merge_sketch"), + AggExpr::Mean(_) => { + ensure!(args.len() == 1, "mean takes exactly one argument"); + Ok(args[0].clone().mean()) + } + AggExpr::Min(_) => { + ensure!(args.len() == 1, "min takes exactly one argument"); + Ok(args[0].clone().min()) + } + AggExpr::Max(_) => { + ensure!(args.len() == 1, "max takes exactly one argument"); + Ok(args[0].clone().max()) + } + AggExpr::AnyValue(_, _) => unsupported_sql_err!("any_value"), + AggExpr::List(_) => unsupported_sql_err!("list"), + 
AggExpr::Concat(_) => unsupported_sql_err!("concat"), + AggExpr::MapGroups { .. } => unsupported_sql_err!("map_groups"), + } +} + mod static_docs { pub(crate) const COUNT_DOCSTRING: &str = "Counts the number of non-null elements in the input expression. @@ -211,113 +321,3 @@ Example: ╰───────╯ (Showing first 1 of 1 rows)"; } - -impl SQLModule for SQLModuleAggs { - fn register(parent: &mut SQLFunctions) { - use AggExpr::*; - // HACK TO USE AggExpr as an enum rather than a - let nil = Arc::new(Expr::Literal(LiteralValue::Null)); - parent.add_fn( - "count", - Count(nil.clone(), daft_core::count_mode::CountMode::Valid), - ); - parent.add_fn("sum", Sum(nil.clone())); - parent.add_fn("avg", Mean(nil.clone())); - parent.add_fn("mean", Mean(nil.clone())); - parent.add_fn("min", Min(nil.clone())); - parent.add_fn("max", Max(nil.clone())); - } -} - -impl SQLFunction for AggExpr { - fn to_expr(&self, inputs: &[FunctionArg], planner: &SQLPlanner) -> SQLPlannerResult { - // COUNT(*) needs a bit of extra handling, so we process that outside of `to_expr` - if let Self::Count(_, _) = self { - handle_count(inputs, planner) - } else { - let inputs = self.args_to_expr_unnamed(inputs, planner)?; - to_expr(self, inputs.as_slice()) - } - } - - fn docstrings(&self, alias: &str) -> String { - match self { - Self::Count(_, _) => static_docs::COUNT_DOCSTRING.to_string(), - Self::Sum(_) => static_docs::SUM_DOCSTRING.to_string(), - Self::Mean(_) => static_docs::AVG_DOCSTRING.replace("{}", alias), - Self::Min(_) => static_docs::MIN_DOCSTRING.to_string(), - Self::Max(_) => static_docs::MAX_DOCSTRING.to_string(), - e => unimplemented!("Need to implement docstrings for {e}"), - } - } - - fn arg_names(&self) -> &'static [&'static str] { - match self { - Self::Count(_, _) | Self::Sum(_) | Self::Mean(_) | Self::Min(_) | Self::Max(_) => { - &["input"] - } - e => unimplemented!("Need to implement arg names for {e}"), - } - } -} - -fn handle_count(inputs: &[FunctionArg], planner: &SQLPlanner) -> 
SQLPlannerResult { - Ok(match inputs { - [FunctionArg::Unnamed(FunctionArgExpr::Wildcard)] => match planner.relation_opt() { - Some(rel) => { - let schema = rel.schema(); - col(schema.fields[0].name.clone()) - .count(daft_core::count_mode::CountMode::All) - .alias("count") - } - None => unsupported_sql_err!("Wildcard is not supported in this context"), - }, - [FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(name))] => { - match planner.relation_opt() { - Some(rel) if name.to_string() == rel.name => { - let schema = rel.schema(); - col(schema.fields[0].name.clone()) - .count(daft_core::count_mode::CountMode::All) - .alias("count") - } - _ => unsupported_sql_err!("Wildcard is not supported in this context"), - } - } - [expr] => { - // SQL default COUNT ignores nulls - let input = planner.plan_function_arg(expr)?; - input.count(daft_core::count_mode::CountMode::Valid) - } - _ => unsupported_sql_err!("COUNT takes exactly one argument"), - }) -} - -pub(crate) fn to_expr(expr: &AggExpr, args: &[ExprRef]) -> SQLPlannerResult { - match expr { - AggExpr::Count(_, _) => unreachable!("count should be handled by by this point"), - AggExpr::Sum(_) => { - ensure!(args.len() == 1, "sum takes exactly one argument"); - Ok(args[0].clone().sum()) - } - AggExpr::ApproxCountDistinct(_) => unsupported_sql_err!("approx_percentile"), - AggExpr::ApproxPercentile(_) => unsupported_sql_err!("approx_percentile"), - AggExpr::ApproxSketch(_, _) => unsupported_sql_err!("approx_sketch"), - AggExpr::MergeSketch(_, _) => unsupported_sql_err!("merge_sketch"), - AggExpr::Mean(_) => { - ensure!(args.len() == 1, "mean takes exactly one argument"); - Ok(args[0].clone().mean()) - } - AggExpr::Min(_) => { - ensure!(args.len() == 1, "min takes exactly one argument"); - Ok(args[0].clone().min()) - } - AggExpr::Max(_) => { - ensure!(args.len() == 1, "max takes exactly one argument"); - Ok(args[0].clone().max()) - } - AggExpr::AnyValue(_, _) => unsupported_sql_err!("any_value"), - AggExpr::List(_) => 
unsupported_sql_err!("list"), - AggExpr::Concat(_) => unsupported_sql_err!("concat"), - AggExpr::MapGroups { .. } => unsupported_sql_err!("map_groups"), - } -} diff --git a/src/daft-sql/src/modules/float.rs b/src/daft-sql/src/modules/float.rs index 4cfffe34b4..3765a5fb08 100644 --- a/src/daft-sql/src/modules/float.rs +++ b/src/daft-sql/src/modules/float.rs @@ -37,6 +37,14 @@ impl SQLFunction for SQLFillNan { _ => unsupported_sql_err!("Invalid arguments for 'fill_nan': '{inputs:?}'"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::FILL_NAN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "fill_value"] + } } pub struct SQLIsInf {} @@ -52,6 +60,14 @@ impl SQLFunction for SQLIsInf { _ => unsupported_sql_err!("Invalid arguments for 'is_inf': '{inputs:?}'"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::IS_INF_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLIsNan {} @@ -67,6 +83,14 @@ impl SQLFunction for SQLIsNan { _ => unsupported_sql_err!("Invalid arguments for 'is_nan': '{inputs:?}'"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::IS_NAN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLNotNan {} @@ -82,4 +106,182 @@ impl SQLFunction for SQLNotNan { _ => unsupported_sql_err!("Invalid arguments for 'not_nan': '{inputs:?}'"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::NOT_NAN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } +} + +mod static_docs { + pub(crate) const FILL_NAN_DOCSTRING: &str = + "Replaces NaN values in the input expression with a specified fill value. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT fill_nan(x, 0) FROM tbl + +.. 
code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Float │ + ╞═══════╡ + │ 1.0 │ + ├╌╌╌╌╌╌╌┤ + │ NaN │ + ├╌╌╌╌╌╌╌┤ + │ 3.0 │ + ╰───────╯ + (Showing first 3 of 3 rows) + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Float │ + ╞═══════╡ + │ 1.0 │ + ├╌╌╌╌╌╌╌┤ + │ 0.0 │ + ├╌╌╌╌╌╌╌┤ + │ 3.0 │ + ╰───────╯ + (Showing first 3 of 3 rows)"; + + pub(crate) const IS_INF_DOCSTRING: &str = + "Checks if the input expression is infinite (positive or negative infinity). + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT is_inf(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Float │ + ╞═══════╡ + │ 1.0 │ + ├╌╌╌╌╌╌╌┤ + │ inf │ + ├╌╌╌╌╌╌╌┤ + │ -inf │ + ╰───────╯ + (Showing first 3 of 3 rows) + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Bool │ + ╞═══════╡ + │ false │ + ├╌╌╌╌╌╌╌┤ + │ true │ + ├╌╌╌╌╌╌╌┤ + │ true │ + ╰───────╯ + (Showing first 3 of 3 rows)"; + + pub(crate) const IS_NAN_DOCSTRING: &str = + "Checks if the input expression is NaN (Not a Number). + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT is_nan(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Float │ + ╞═══════╡ + │ 1.0 │ + ├╌╌╌╌╌╌╌┤ + │ NaN │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +.. code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Bool │ + ╞═══════╡ + │ false │ + ├╌╌╌╌╌╌╌┤ + │ true │ + ├╌╌╌╌╌╌╌┤ + │ false │ + ╰───────╯ + (Showing first 3 of 3 rows)"; + + pub(crate) const NOT_NAN_DOCSTRING: &str = + "Checks if the input expression is not NaN (Not a Number). + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT not_nan(x) FROM tbl + +.. code-block:: text + :caption: Input + + ╭───────╮ + │ x │ + │ --- │ + │ Float │ + ╞═══════╡ + │ 1.0 │ + ├╌╌╌╌╌╌╌┤ + │ NaN │ + ├╌╌╌╌╌╌╌┤ + │ null │ + ╰───────╯ + (Showing first 3 of 3 rows) + +.. 
code-block:: text + :caption: Output + + ╭───────╮ + │ x │ + │ --- │ + │ Bool │ + ╞═══════╡ + │ true │ + ├╌╌╌╌╌╌╌┤ + │ false │ + ├╌╌╌╌╌╌╌┤ + │ true │ + ╰───────╯ + (Showing first 3 of 3 rows)"; } diff --git a/src/daft-sql/src/modules/json.rs b/src/daft-sql/src/modules/json.rs index f0d600daea..5fa631242e 100644 --- a/src/daft-sql/src/modules/json.rs +++ b/src/daft-sql/src/modules/json.rs @@ -35,4 +35,52 @@ impl SQLFunction for JsonQuery { ), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::JSON_QUERY_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "query"] + } +} + +mod static_docs { + pub(crate) const JSON_QUERY_DOCSTRING: &str = + "Extracts a JSON object from a JSON string using a JSONPath expression. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT json_query(data, '$.store.book[0].title') FROM json_table + +.. code-block:: text + :caption: Input + + ╭────────────────────────────────────────────────────────────────────╮ + │ data │ + │ ---- │ + │ String │ + ╞════════════════════════════════════════════════════════════════════╡ + │ {\"store\": {\"book\": [{\"title\": \"Sayings of the Century\"}]}} │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ {\"store\": {\"book\": [{\"title\": \"Sword of Honour\"}]}} │ + ╰────────────────────────────────────────────────────────────────────╯ + (Showing first 2 of 2 rows) + +.. 
code-block:: text + :caption: Output + + ╭────────────────────────────╮ + │ data │ + │ -------------------------- │ + │ String │ + ╞════════════════════════════╡ + │ Sayings of the Century │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌--╌╌┤ + │ Sword of Honour │ + ╰────────────────────────────╯ + (Showing first 2 of 2 rows)"; } diff --git a/src/daft-sql/src/modules/list.rs b/src/daft-sql/src/modules/list.rs index b9e52d9748..6fb44af455 100644 --- a/src/daft-sql/src/modules/list.rs +++ b/src/daft-sql/src/modules/list.rs @@ -55,6 +55,14 @@ impl SQLFunction for SQLListChunk { ), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_CHUNK_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "chunk_size"] + } } pub struct SQLListCount; @@ -86,6 +94,14 @@ impl SQLFunction for SQLListCount { _ => unsupported_sql_err!("invalid arguments for list_count. Expected either list_count(expr) or list_count(expr, mode)"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_COUNT_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "mode"] + } } pub struct SQLExplode; @@ -104,6 +120,14 @@ impl SQLFunction for SQLExplode { _ => unsupported_sql_err!("Expected 1 argument"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::EXPLODE_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLListJoin; @@ -125,6 +149,14 @@ impl SQLFunction for SQLListJoin { ), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_JOIN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "separator"] + } } pub struct SQLListMax; @@ -143,6 +175,14 @@ impl SQLFunction for SQLListMax { _ => unsupported_sql_err!("invalid arguments for list_max. 
Expected list_max(expr)"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_MAX_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLListMean; @@ -161,6 +201,14 @@ impl SQLFunction for SQLListMean { _ => unsupported_sql_err!("invalid arguments for list_mean. Expected list_mean(expr)"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_MEAN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLListMin; @@ -179,6 +227,14 @@ impl SQLFunction for SQLListMin { _ => unsupported_sql_err!("invalid arguments for list_min. Expected list_min(expr)"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_MIN_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLListSum; @@ -197,6 +253,14 @@ impl SQLFunction for SQLListSum { _ => unsupported_sql_err!("invalid arguments for list_sum. Expected list_sum(expr)"), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_SUM_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input"] + } } pub struct SQLListSlice; @@ -219,6 +283,14 @@ impl SQLFunction for SQLListSlice { ), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_SLICE_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "start", "end"] + } } pub struct SQLListSort; @@ -258,4 +330,374 @@ impl SQLFunction for SQLListSort { ), } } + + fn docstrings(&self, _alias: &str) -> String { + static_docs::LIST_SORT_DOCSTRING.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "order"] + } +} + +mod static_docs { + pub(crate) const LIST_CHUNK_DOCSTRING: &str = "Splits a list into chunks of a specified size. + +Example: + +.. 
code-block:: sql + :caption: SQL + + SELECT list_chunk(numbers, 2) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 2, 3, 4, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 7, 8, 9, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────────────────────╮ + │ list_chunk │ + │ ---------- │ + │ List[List[Int64]] │ + ╞═══════════════════════════╡ + │ [[1, 2], [3, 4], [5]] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [[6, 7], [8, 9], [10]] │ + ╰───────────────────────────╯"; + + pub(crate) const LIST_COUNT_DOCSTRING: &str = "Counts the number of elements in a list. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_count(numbers) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 2, 3, 4, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 7, 8, 9, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ list_count│ + │ ----------│ + │ Int64 │ + ╞═══════════╡ + │ 5 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 5 │ + ╰───────────╯"; + + pub(crate) const EXPLODE_DOCSTRING: &str = "Expands a list column into multiple rows. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT explode(numbers) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 2, 3] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [4, 5] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ explode │ + │ ------- │ + │ Int64 │ + ╞═══════════╡ + │ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 4 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 5 │ + ╰───────────╯"; + + pub(crate) const LIST_JOIN_DOCSTRING: &str = + "Joins elements of a list into a single string using a specified separator. + +Example: + +.. 
code-block:: sql + :caption: SQL + + SELECT list_join(words, ', ') FROM word_table + +.. code-block:: text + :caption: Input + + ╭───────────────────────╮ + │ words │ + │ ----- │ + │ List[String] │ + ╞═══════════════════════╡ + │ ['apple', 'banana'] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ ['cherry', 'date'] │ + ╰───────────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────────────╮ + │ list_join │ + │ --------- │ + │ String │ + ╞═══════════════════╡ + │ apple, banana │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ cherry, date │ + ╰───────────────────╯"; + + pub(crate) const LIST_MAX_DOCSTRING: &str = "Returns the maximum value in a list. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_max(numbers) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 3, 2, 5, 4] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 8, 7, 10, 9] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ list_max │ + │ -------- │ + │ Int64 │ + ╞═══════════╡ + │ 5 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 10 │ + ╰───────────╯"; + + pub(crate) const LIST_MEAN_DOCSTRING: &str = + "Calculates the mean (average) of values in a list. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_mean(numbers) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 2, 3, 4, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 7, 8, 9, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ list_mean │ + │ --------- │ + │ Float64 │ + ╞═══════════╡ + │ 3.0 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 8.0 │ + ╰───────────╯"; + + pub(crate) const LIST_MIN_DOCSTRING: &str = "Returns the minimum value in a list. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_min(numbers) FROM number_table + +.. 
code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [3, 1, 4, 2, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [8, 6, 9, 7, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ list_min │ + │ -------- │ + │ Int64 │ + ╞═══════════╡ + │ 1 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 6 │ + ╰───────────╯"; + + pub(crate) const LIST_SUM_DOCSTRING: &str = "Calculates the sum of values in a list. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_sum(numbers) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [1, 2, 3, 4, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 7, 8, 9, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────╮ + │ list_sum │ + │ -------- │ + │ Int64 │ + ╞═══════════╡ + │ 15 │ + ├╌╌╌╌╌╌╌╌╌╌╌┤ + │ 40 │ + ╰───────────╯"; + + pub(crate) const LIST_SLICE_DOCSTRING: &str = + "Extracts a portion of a list from a start index to an end index. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_slice(numbers, 1, 4) FROM number_table + +.. code-block:: text + :caption: Input + + ╭───────────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════════╡ + │ [1, 2, 3, 4, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [6, 7, 8, 9, 10] │ + ╰───────────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────────╮ + │ list_slice │ + │ ---------- │ + │ List[Int64] │ + ╞═══════════════╡ + │ [2, 3, 4] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [7, 8, 9] │ + ╰───────────────╯"; + + pub(crate) const LIST_SORT_DOCSTRING: &str = + "Sorts the elements of a list in ascending or descending order. + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT list_sort(numbers, 'DESC') FROM number_table + +.. 
code-block:: text + :caption: Input + + ╭───────────────────╮ + │ numbers │ + │ ------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [3, 1, 4, 2, 5] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [8, 6, 9, 7, 10] │ + ╰───────────────────╯ + +.. code-block:: text + :caption: Output + + ╭───────────────────╮ + │ list_sort │ + │ --------- │ + │ List[Int64] │ + ╞═══════════════════╡ + │ [5, 4, 3, 2, 1] │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ [10, 9, 8, 7, 6] │ + ╰───────────────────╯"; } diff --git a/src/daft-sql/src/modules/map.rs b/src/daft-sql/src/modules/map.rs index d3a328f3a4..f0695f94eb 100644 --- a/src/daft-sql/src/modules/map.rs +++ b/src/daft-sql/src/modules/map.rs @@ -30,4 +30,57 @@ impl SQLFunction for MapGet { _ => invalid_operation_err!("Expected 2 input args"), } } + + fn docstrings(&self, alias: &str) -> String { + static_docs::MAP_GET_DOCSTRING.replace("{}", alias) + } + + fn arg_names(&self) -> &'static [&'static str] { + &["input", "key"] + } +} + +// ... existing code ... + +mod static_docs { + pub(crate) const MAP_GET_DOCSTRING: &str = + "Retrieves the value associated with a given key from a map. + +.. seealso:: + + * :func:`~daft.sql._sql_funcs.map_get` + * :func:`~daft.sql._sql_funcs.map_extract` + +Example: + +.. code-block:: sql + :caption: SQL + + SELECT {}(user_data, 'age') FROM users_table + +.. code-block:: text + :caption: Input + + ╭───────────────────────────────╮ + │ user_data │ + │ --------- │ + │ Map[Utf8, Int64] │ + ╞═══════════════════════════════╡ + │ {'name': 'Alice', 'age': 30} │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ {'name': 'Bob', 'age': 25} │ + ╰───────────────────────────────╯ + +.. 
code-block:: text + :caption: Output + + ╭───────╮ + │ age │ + │ --- │ + │ Int64 │ + ╞═══════╡ + │ 30 │ + ├╌╌╌╌╌╌╌┤ + │ 25 │ + ╰───────╯"; } From 71b62902465651bf9c693dd73c2a89e9abf64a90 Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Mon, 30 Sep 2024 21:42:38 -0700 Subject: [PATCH 20/21] Add comments --- src/daft-sql/src/modules/float.rs | 164 +--------- src/daft-sql/src/modules/json.rs | 37 +-- src/daft-sql/src/modules/list.rs | 362 +---------------------- src/daft-sql/src/modules/map.rs | 36 +-- src/daft-sql/src/modules/numeric.rs | 61 ++++ src/daft-sql/src/modules/partitioning.rs | 22 ++ src/daft-sql/src/modules/structs.rs | 8 + src/daft-sql/src/modules/temporal.rs | 10 + 8 files changed, 120 insertions(+), 580 deletions(-) diff --git a/src/daft-sql/src/modules/float.rs b/src/daft-sql/src/modules/float.rs index 3765a5fb08..292a5c4d85 100644 --- a/src/daft-sql/src/modules/float.rs +++ b/src/daft-sql/src/modules/float.rs @@ -118,170 +118,14 @@ impl SQLFunction for SQLNotNan { mod static_docs { pub(crate) const FILL_NAN_DOCSTRING: &str = - "Replaces NaN values in the input expression with a specified fill value. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT fill_nan(x, 0) FROM tbl - -.. code-block:: text - :caption: Input - - ╭───────╮ - │ x │ - │ --- │ - │ Float │ - ╞═══════╡ - │ 1.0 │ - ├╌╌╌╌╌╌╌┤ - │ NaN │ - ├╌╌╌╌╌╌╌┤ - │ 3.0 │ - ╰───────╯ - (Showing first 3 of 3 rows) - -.. code-block:: text - :caption: Output - - ╭───────╮ - │ x │ - │ --- │ - │ Float │ - ╞═══════╡ - │ 1.0 │ - ├╌╌╌╌╌╌╌┤ - │ 0.0 │ - ├╌╌╌╌╌╌╌┤ - │ 3.0 │ - ╰───────╯ - (Showing first 3 of 3 rows)"; + "Replaces NaN values in the input expression with a specified fill value."; pub(crate) const IS_INF_DOCSTRING: &str = - "Checks if the input expression is infinite (positive or negative infinity). - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT is_inf(x) FROM tbl - -.. 
code-block:: text - :caption: Input - - ╭───────╮ - │ x │ - │ --- │ - │ Float │ - ╞═══════╡ - │ 1.0 │ - ├╌╌╌╌╌╌╌┤ - │ inf │ - ├╌╌╌╌╌╌╌┤ - │ -inf │ - ╰───────╯ - (Showing first 3 of 3 rows) - -.. code-block:: text - :caption: Output - - ╭───────╮ - │ x │ - │ --- │ - │ Bool │ - ╞═══════╡ - │ false │ - ├╌╌╌╌╌╌╌┤ - │ true │ - ├╌╌╌╌╌╌╌┤ - │ true │ - ╰───────╯ - (Showing first 3 of 3 rows)"; + "Checks if the input expression is infinite (positive or negative infinity)."; pub(crate) const IS_NAN_DOCSTRING: &str = - "Checks if the input expression is NaN (Not a Number). - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT is_nan(x) FROM tbl - -.. code-block:: text - :caption: Input - - ╭───────╮ - │ x │ - │ --- │ - │ Float │ - ╞═══════╡ - │ 1.0 │ - ├╌╌╌╌╌╌╌┤ - │ NaN │ - ├╌╌╌╌╌╌╌┤ - │ null │ - ╰───────╯ - (Showing first 3 of 3 rows) - -.. code-block:: text - :caption: Output - - ╭───────╮ - │ x │ - │ --- │ - │ Bool │ - ╞═══════╡ - │ false │ - ├╌╌╌╌╌╌╌┤ - │ true │ - ├╌╌╌╌╌╌╌┤ - │ false │ - ╰───────╯ - (Showing first 3 of 3 rows)"; + "Checks if the input expression is NaN (Not a Number)."; pub(crate) const NOT_NAN_DOCSTRING: &str = - "Checks if the input expression is not NaN (Not a Number). - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT not_nan(x) FROM tbl - -.. code-block:: text - :caption: Input - - ╭───────╮ - │ x │ - │ --- │ - │ Float │ - ╞═══════╡ - │ 1.0 │ - ├╌╌╌╌╌╌╌┤ - │ NaN │ - ├╌╌╌╌╌╌╌┤ - │ null │ - ╰───────╯ - (Showing first 3 of 3 rows) - -.. 
code-block:: text - :caption: Output - - ╭───────╮ - │ x │ - │ --- │ - │ Bool │ - ╞═══════╡ - │ true │ - ├╌╌╌╌╌╌╌┤ - │ false │ - ├╌╌╌╌╌╌╌┤ - │ true │ - ╰───────╯ - (Showing first 3 of 3 rows)"; + "Checks if the input expression is not NaN (Not a Number)."; } diff --git a/src/daft-sql/src/modules/json.rs b/src/daft-sql/src/modules/json.rs index 5fa631242e..8dc9e617f5 100644 --- a/src/daft-sql/src/modules/json.rs +++ b/src/daft-sql/src/modules/json.rs @@ -47,40 +47,5 @@ impl SQLFunction for JsonQuery { mod static_docs { pub(crate) const JSON_QUERY_DOCSTRING: &str = - "Extracts a JSON object from a JSON string using a JSONPath expression. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT json_query(data, '$.store.book[0].title') FROM json_table - -.. code-block:: text - :caption: Input - - ╭────────────────────────────────────────────────────────────────────╮ - │ data │ - │ ---- │ - │ String │ - ╞════════════════════════════════════════════════════════════════════╡ - │ {\"store\": {\"book\": [{\"title\": \"Sayings of the Century\"}]}} │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ {\"store\": {\"book\": [{\"title\": \"Sword of Honour\"}]}} │ - ╰────────────────────────────────────────────────────────────────────╯ - (Showing first 2 of 2 rows) - -.. 
code-block:: text - :caption: Output - - ╭────────────────────────────╮ - │ data │ - │ -------------------------- │ - │ String │ - ╞════════════════════════════╡ - │ Sayings of the Century │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌--╌╌┤ - │ Sword of Honour │ - ╰────────────────────────────╯ - (Showing first 2 of 2 rows)"; + "Extracts a JSON object from a JSON string using a JSONPath expression."; } diff --git a/src/daft-sql/src/modules/list.rs b/src/daft-sql/src/modules/list.rs index 6fb44af455..bd6db25990 100644 --- a/src/daft-sql/src/modules/list.rs +++ b/src/daft-sql/src/modules/list.rs @@ -341,363 +341,27 @@ impl SQLFunction for SQLListSort { } mod static_docs { - pub(crate) const LIST_CHUNK_DOCSTRING: &str = "Splits a list into chunks of a specified size. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_chunk(numbers, 2) FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 2, 3, 4, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 7, 8, 9, 10] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────────────────────╮ - │ list_chunk │ - │ ---------- │ - │ List[List[Int64]] │ - ╞═══════════════════════════╡ - │ [[1, 2], [3, 4], [5]] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [[6, 7], [8, 9], [10]] │ - ╰───────────────────────────╯"; - - pub(crate) const LIST_COUNT_DOCSTRING: &str = "Counts the number of elements in a list. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_count(numbers) FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 2, 3, 4, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 7, 8, 9, 10] │ - ╰───────────────────╯ - -.. 
code-block:: text - :caption: Output - - ╭───────────╮ - │ list_count│ - │ ----------│ - │ Int64 │ - ╞═══════════╡ - │ 5 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 5 │ - ╰───────────╯"; - - pub(crate) const EXPLODE_DOCSTRING: &str = "Expands a list column into multiple rows. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT explode(numbers) FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 2, 3] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [4, 5] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────╮ - │ explode │ - │ ------- │ - │ Int64 │ - ╞═══════════╡ - │ 1 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 2 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 3 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 4 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 5 │ - ╰───────────╯"; + pub(crate) const LIST_CHUNK_DOCSTRING: &str = "Splits a list into chunks of a specified size."; - pub(crate) const LIST_JOIN_DOCSTRING: &str = - "Joins elements of a list into a single string using a specified separator. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_join(words, ', ') FROM word_table - -.. code-block:: text - :caption: Input - - ╭───────────────────────╮ - │ words │ - │ ----- │ - │ List[String] │ - ╞═══════════════════════╡ - │ ['apple', 'banana'] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ ['cherry', 'date'] │ - ╰───────────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────────────╮ - │ list_join │ - │ --------- │ - │ String │ - ╞═══════════════════╡ - │ apple, banana │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ cherry, date │ - ╰───────────────────╯"; - - pub(crate) const LIST_MAX_DOCSTRING: &str = "Returns the maximum value in a list. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_max(numbers) FROM number_table - -.. 
code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 3, 2, 5, 4] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 8, 7, 10, 9] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────╮ - │ list_max │ - │ -------- │ - │ Int64 │ - ╞═══════════╡ - │ 5 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 10 │ - ╰───────────╯"; - - pub(crate) const LIST_MEAN_DOCSTRING: &str = - "Calculates the mean (average) of values in a list. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_mean(numbers) FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 2, 3, 4, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 7, 8, 9, 10] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────╮ - │ list_mean │ - │ --------- │ - │ Float64 │ - ╞═══════════╡ - │ 3.0 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 8.0 │ - ╰───────────╯"; - - pub(crate) const LIST_MIN_DOCSTRING: &str = "Returns the minimum value in a list. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_min(numbers) FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [3, 1, 4, 2, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [8, 6, 9, 7, 10] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────╮ - │ list_min │ - │ -------- │ - │ Int64 │ - ╞═══════════╡ - │ 1 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 6 │ - ╰───────────╯"; - - pub(crate) const LIST_SUM_DOCSTRING: &str = "Calculates the sum of values in a list. - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT list_sum(numbers) FROM number_table - -.. 
code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [1, 2, 3, 4, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 7, 8, 9, 10] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────╮ - │ list_sum │ - │ -------- │ - │ Int64 │ - ╞═══════════╡ - │ 15 │ - ├╌╌╌╌╌╌╌╌╌╌╌┤ - │ 40 │ - ╰───────────╯"; - - pub(crate) const LIST_SLICE_DOCSTRING: &str = - "Extracts a portion of a list from a start index to an end index. + pub(crate) const LIST_COUNT_DOCSTRING: &str = "Counts the number of elements in a list."; -Example: + pub(crate) const EXPLODE_DOCSTRING: &str = "Expands a list column into multiple rows."; -.. code-block:: sql - :caption: SQL + pub(crate) const LIST_JOIN_DOCSTRING: &str = + "Joins elements of a list into a single string using a specified separator."; - SELECT list_slice(numbers, 1, 4) FROM number_table + pub(crate) const LIST_MAX_DOCSTRING: &str = "Returns the maximum value in a list."; -.. code-block:: text - :caption: Input + pub(crate) const LIST_MEAN_DOCSTRING: &str = + "Calculates the mean (average) of values in a list."; - ╭───────────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════════╡ - │ [1, 2, 3, 4, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [6, 7, 8, 9, 10] │ - ╰───────────────────────╯ + pub(crate) const LIST_MIN_DOCSTRING: &str = "Returns the minimum value in a list."; -.. code-block:: text - :caption: Output + pub(crate) const LIST_SUM_DOCSTRING: &str = "Calculates the sum of values in a list."; - ╭───────────────╮ - │ list_slice │ - │ ---------- │ - │ List[Int64] │ - ╞═══════════════╡ - │ [2, 3, 4] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [7, 8, 9] │ - ╰───────────────╯"; + pub(crate) const LIST_SLICE_DOCSTRING: &str = + "Extracts a portion of a list from a start index to an end index."; pub(crate) const LIST_SORT_DOCSTRING: &str = - "Sorts the elements of a list in ascending or descending order. - -Example: - -.. 
code-block:: sql - :caption: SQL - - SELECT list_sort(numbers, 'DESC') FROM number_table - -.. code-block:: text - :caption: Input - - ╭───────────────────╮ - │ numbers │ - │ ------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [3, 1, 4, 2, 5] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [8, 6, 9, 7, 10] │ - ╰───────────────────╯ - -.. code-block:: text - :caption: Output - - ╭───────────────────╮ - │ list_sort │ - │ --------- │ - │ List[Int64] │ - ╞═══════════════════╡ - │ [5, 4, 3, 2, 1] │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ [10, 9, 8, 7, 6] │ - ╰───────────────────╯"; + "Sorts the elements of a list in ascending or descending order."; } diff --git a/src/daft-sql/src/modules/map.rs b/src/daft-sql/src/modules/map.rs index f0695f94eb..0ae5aca2be 100644 --- a/src/daft-sql/src/modules/map.rs +++ b/src/daft-sql/src/modules/map.rs @@ -40,8 +40,6 @@ impl SQLFunction for MapGet { } } -// ... existing code ... - mod static_docs { pub(crate) const MAP_GET_DOCSTRING: &str = "Retrieves the value associated with a given key from a map. @@ -50,37 +48,5 @@ mod static_docs { * :func:`~daft.sql._sql_funcs.map_get` * :func:`~daft.sql._sql_funcs.map_extract` - -Example: - -.. code-block:: sql - :caption: SQL - - SELECT {}(user_data, 'age') FROM users_table - -.. code-block:: text - :caption: Input - - ╭───────────────────────────────╮ - │ user_data │ - │ --------- │ - │ Map[Utf8, Int64] │ - ╞═══════════════════════════════╡ - │ {'name': 'Alice', 'age': 30} │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ {'name': 'Bob', 'age': 25} │ - ╰───────────────────────────────╯ - -.. 
code-block:: text - :caption: Output - - ╭───────╮ - │ age │ - │ --- │ - │ Int64 │ - ╞═══════╡ - │ 30 │ - ├╌╌╌╌╌╌╌┤ - │ 25 │ - ╰───────╯"; +"; } diff --git a/src/daft-sql/src/modules/numeric.rs b/src/daft-sql/src/modules/numeric.rs index 197d958860..21ac2a0873 100644 --- a/src/daft-sql/src/modules/numeric.rs +++ b/src/daft-sql/src/modules/numeric.rs @@ -88,6 +88,67 @@ impl SQLFunction for SQLNumericExpr { let inputs = self.args_to_expr_unnamed(inputs, planner)?; to_expr(self, inputs.as_slice()) } + + fn docstrings(&self, _alias: &str) -> String { + let docstring = match self { + Self::Abs => "Gets the absolute value of a number.", + Self::Ceil => "Rounds a number up to the nearest integer.", + Self::Exp => "Calculates the exponential of a number (e^x).", + Self::Floor => "Rounds a number down to the nearest integer.", + Self::Round => "Rounds a number to a specified number of decimal places.", + Self::Sign => "Returns the sign of a number (-1, 0, or 1).", + Self::Sqrt => "Calculates the square root of a number.", + Self::Sin => "Calculates the sine of an angle in radians.", + Self::Cos => "Calculates the cosine of an angle in radians.", + Self::Tan => "Calculates the tangent of an angle in radians.", + Self::Cot => "Calculates the cotangent of an angle in radians.", + Self::ArcSin => "Calculates the inverse sine (arc sine) of a number.", + Self::ArcCos => "Calculates the inverse cosine (arc cosine) of a number.", + Self::ArcTan => "Calculates the inverse tangent (arc tangent) of a number.", + Self::ArcTan2 => { + "Calculates the angle between the positive x-axis and the ray from (0,0) to (x,y)." 
+ } + Self::Radians => "Converts an angle from degrees to radians.", + Self::Degrees => "Converts an angle from radians to degrees.", + Self::Log => "Calculates the natural logarithm of a number.", + Self::Log2 => "Calculates the base-2 logarithm of a number.", + Self::Log10 => "Calculates the base-10 logarithm of a number.", + Self::Ln => "Calculates the natural logarithm of a number.", + Self::ArcTanh => "Calculates the inverse hyperbolic tangent of a number.", + Self::ArcCosh => "Calculates the inverse hyperbolic cosine of a number.", + Self::ArcSinh => "Calculates the inverse hyperbolic sine of a number.", + }; + docstring.to_string() + } + + fn arg_names(&self) -> &'static [&'static str] { + match self { + Self::Abs + | Self::Ceil + | Self::Floor + | Self::Sign + | Self::Sqrt + | Self::Sin + | Self::Cos + | Self::Tan + | Self::Cot + | Self::ArcSin + | Self::ArcCos + | Self::ArcTan + | Self::Radians + | Self::Degrees + | Self::Log2 + | Self::Log10 + | Self::Ln + | Self::ArcTanh + | Self::ArcCosh + | Self::ArcSinh => &["input"], + Self::Log => &["input", "base"], + Self::Round => &["input", "precision"], + Self::Exp => &["input", "exponent"], + Self::ArcTan2 => &["y", "x"], + } + } } fn to_expr(expr: &SQLNumericExpr, args: &[ExprRef]) -> SQLPlannerResult { diff --git a/src/daft-sql/src/modules/partitioning.rs b/src/daft-sql/src/modules/partitioning.rs index e833edd51d..def20b2774 100644 --- a/src/daft-sql/src/modules/partitioning.rs +++ b/src/daft-sql/src/modules/partitioning.rs @@ -80,6 +80,28 @@ impl SQLFunction for PartitioningExpr { } } } + + fn docstrings(&self, _alias: &str) -> String { + match self { + Self::Years => "Extracts the number of years since epoch time from a datetime expression.".to_string(), + Self::Months => "Extracts the number of months since epoch time from a datetime expression.".to_string(), + Self::Days => "Extracts the number of days since epoch time from a datetime expression.".to_string(), + Self::Hours => "Extracts the number of 
hours since epoch time from a datetime expression.".to_string(),
+            Self::IcebergBucket(_) => "Computes a bucket number for the input expression based the specified number of buckets using an Iceberg-specific hash.".to_string(),
+            Self::IcebergTruncate(_) => "Truncates the input expression to a specified width.".to_string(),
+        }
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        match self {
+            Self::Years => &["input"],
+            Self::Months => &["input"],
+            Self::Days => &["input"],
+            Self::Hours => &["input"],
+            Self::IcebergBucket(_) => &["input", "num_buckets"],
+            Self::IcebergTruncate(_) => &["input", "width"],
+        }
+    }
 }
 
 fn partitioning_helper daft_dsl::ExprRef>(
diff --git a/src/daft-sql/src/modules/structs.rs b/src/daft-sql/src/modules/structs.rs
index 66be42d8e3..17fae85c9e 100644
--- a/src/daft-sql/src/modules/structs.rs
+++ b/src/daft-sql/src/modules/structs.rs
@@ -34,4 +34,12 @@ impl SQLFunction for StructGet {
             _ => invalid_operation_err!("Expected 2 input args"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Extracts a field from a struct expression by name.".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input", "field"]
+    }
 }
diff --git a/src/daft-sql/src/modules/temporal.rs b/src/daft-sql/src/modules/temporal.rs
index 58687724fa..840c278765 100644
--- a/src/daft-sql/src/modules/temporal.rs
+++ b/src/daft-sql/src/modules/temporal.rs
@@ -50,6 +50,16 @@ macro_rules! temporal {
                 ),
             }
         }
+        fn docstrings(&self, _alias: &str) -> String {
+            format!(
+                "Extracts the {} component from a datetime expression.",
+                stringify!($fn_name).replace("dt_", "")
+            )
+        }
+
+        fn arg_names(&self) -> &'static [&'static str] {
+            &["input"]
+        }
     }
 };
 }

From 75b7acbc82727989a9584430c824ebd2a30f8886 Mon Sep 17 00:00:00 2001
From: Jay Chia
Date: Mon, 30 Sep 2024 21:54:18 -0700
Subject: [PATCH 21/21] Add more docstrings for utf8 and image

---
 src/daft-sql/src/modules/image/crop.rs    | 8 +++
 src/daft-sql/src/modules/image/decode.rs  | 8 +++
 src/daft-sql/src/modules/image/encode.rs  | 8 +++
 src/daft-sql/src/modules/image/resize.rs  | 8 +++
 src/daft-sql/src/modules/image/to_mode.rs | 8 +++
 src/daft-sql/src/modules/utf8.rs          | 66 +++++++++++++++++++++++
 6 files changed, 106 insertions(+)

diff --git a/src/daft-sql/src/modules/image/crop.rs b/src/daft-sql/src/modules/image/crop.rs
index 36c72fcca3..286208889c 100644
--- a/src/daft-sql/src/modules/image/crop.rs
+++ b/src/daft-sql/src/modules/image/crop.rs
@@ -21,4 +21,12 @@ impl SQLFunction for SQLImageCrop {
             _ => unsupported_sql_err!("Invalid arguments for image_crop: '{inputs:?}'"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Crops an image to a specified bounding box. The bounding box is specified as [x, y, width, height].".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input_image", "bounding_box"]
+    }
 }
diff --git a/src/daft-sql/src/modules/image/decode.rs b/src/daft-sql/src/modules/image/decode.rs
index a6b95d538d..a896c67a05 100644
--- a/src/daft-sql/src/modules/image/decode.rs
+++ b/src/daft-sql/src/modules/image/decode.rs
@@ -61,4 +61,12 @@ impl SQLFunction for SQLImageDecode {
             _ => unsupported_sql_err!("Invalid arguments for image_decode: '{inputs:?}'"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Decodes an image from binary data. Optionally, you can specify the image mode and error handling behavior.".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input", "mode", "on_error"]
+    }
 }
diff --git a/src/daft-sql/src/modules/image/encode.rs b/src/daft-sql/src/modules/image/encode.rs
index a902179f88..acf489c807 100644
--- a/src/daft-sql/src/modules/image/encode.rs
+++ b/src/daft-sql/src/modules/image/encode.rs
@@ -46,4 +46,12 @@ impl SQLFunction for SQLImageEncode {
             _ => unsupported_sql_err!("Invalid arguments for image_encode: '{inputs:?}'"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Encodes an image into the specified image file format, returning a binary column of encoded bytes.".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input_image", "image_format"]
+    }
 }
diff --git a/src/daft-sql/src/modules/image/resize.rs b/src/daft-sql/src/modules/image/resize.rs
index 8ce37eb7f8..e4c9804d39 100644
--- a/src/daft-sql/src/modules/image/resize.rs
+++ b/src/daft-sql/src/modules/image/resize.rs
@@ -64,4 +64,12 @@ impl SQLFunction for SQLImageResize {
             _ => unsupported_sql_err!("Invalid arguments for image_resize: '{inputs:?}'"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Resizes an image to the specified width and height.".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input_image", "width", "height"]
+    }
 }
diff --git a/src/daft-sql/src/modules/image/to_mode.rs b/src/daft-sql/src/modules/image/to_mode.rs
index a02efb2d36..b5b9202d1f 100644
--- a/src/daft-sql/src/modules/image/to_mode.rs
+++ b/src/daft-sql/src/modules/image/to_mode.rs
@@ -41,4 +41,12 @@ impl SQLFunction for SQLImageToMode {
             _ => unsupported_sql_err!("Invalid arguments for image_encode: '{inputs:?}'"),
         }
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        "Converts an image to the specified mode (e.g. RGB, RGBA, Grayscale).".to_string()
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        &["input_image", "mode"]
+    }
 }
diff --git a/src/daft-sql/src/modules/utf8.rs b/src/daft-sql/src/modules/utf8.rs
index 263a8bd9e7..0ebff7e7ff 100644
--- a/src/daft-sql/src/modules/utf8.rs
+++ b/src/daft-sql/src/modules/utf8.rs
@@ -60,6 +60,72 @@ impl SQLFunction for Utf8Expr {
         let inputs = self.args_to_expr_unnamed(inputs, planner)?;
         to_expr(self, &inputs)
     }
+
+    fn docstrings(&self, _alias: &str) -> String {
+        match self {
+            Self::EndsWith => "Returns true if the string ends with the specified substring".to_string(),
+            Self::StartsWith => "Returns true if the string starts with the specified substring".to_string(),
+            Self::Contains => "Returns true if the string contains the specified substring".to_string(),
+            Self::Split(_) => "Splits the string by the specified delimiter and returns an array of substrings".to_string(),
+            Self::Match => "Returns true if the string matches the specified regular expression pattern".to_string(),
+            Self::Extract(_) => "Extracts the first substring that matches the specified regular expression pattern".to_string(),
+            Self::ExtractAll(_) => "Extracts all substrings that match the specified regular expression pattern".to_string(),
+            Self::Replace(_) => "Replaces all occurrences of a substring with a new string".to_string(),
+            Self::Like => "Returns true if the string matches the specified SQL LIKE pattern".to_string(),
+            Self::Ilike => "Returns true if the string matches the specified SQL LIKE pattern (case-insensitive)".to_string(),
+            Self::Length => "Returns the length of the string".to_string(),
+            Self::Lower => "Converts the string to lowercase".to_string(),
+            Self::Upper => "Converts the string to uppercase".to_string(),
+            Self::Lstrip => "Removes leading whitespace from the string".to_string(),
+            Self::Rstrip => "Removes trailing whitespace from the string".to_string(),
+            Self::Reverse => "Reverses the order of characters in the string".to_string(),
+            Self::Capitalize => "Capitalizes the first character of the string".to_string(),
+            Self::Left => "Returns the specified number of leftmost characters from the string".to_string(),
+            Self::Right => "Returns the specified number of rightmost characters from the string".to_string(),
+            Self::Find => "Returns the index of the first occurrence of a substring within the string".to_string(),
+            Self::Rpad => "Pads the string on the right side with the specified string until it reaches the specified length".to_string(),
+            Self::Lpad => "Pads the string on the left side with the specified string until it reaches the specified length".to_string(),
+            Self::Repeat => "Repeats the string the specified number of times".to_string(),
+            Self::Substr => "Returns a substring of the string starting at the specified position and length".to_string(),
+            Self::ToDate(_) => "Parses the string as a date using the specified format.".to_string(),
+            Self::ToDatetime(_, _) => "Parses the string as a datetime using the specified format.".to_string(),
+            Self::LengthBytes => "Returns the length of the string in bytes".to_string(),
+            Self::Normalize(_) => unimplemented!("Normalize not implemented"),
+        }
+    }
+
+    fn arg_names(&self) -> &'static [&'static str] {
+        match self {
+            Self::EndsWith => &["string_input", "substring"],
+            Self::StartsWith => &["string_input", "substring"],
+            Self::Contains => &["string_input", "substring"],
+            Self::Split(_) => &["string_input", "delimiter"],
+            Self::Match => &["string_input", "pattern"],
+            Self::Extract(_) => &["string_input", "pattern"],
+            Self::ExtractAll(_) => &["string_input", "pattern"],
+            Self::Replace(_) => &["string_input", "pattern", "replacement"],
+            Self::Like => &["string_input", "pattern"],
+            Self::Ilike => &["string_input", "pattern"],
+            Self::Length => &["string_input"],
+            Self::Lower => &["string_input"],
+            Self::Upper => &["string_input"],
+            Self::Lstrip => &["string_input"],
+            Self::Rstrip => &["string_input"],
+            Self::Reverse => &["string_input"],
+            Self::Capitalize => &["string_input"],
+            Self::Left => &["string_input", "length"],
+            Self::Right => &["string_input", "length"],
+            Self::Find => &["string_input", "substring"],
+            Self::Rpad => &["string_input", "length", "pad"],
+            Self::Lpad => &["string_input", "length", "pad"],
+            Self::Repeat => &["string_input", "count"],
+            Self::Substr => &["string_input", "start", "length"],
+            Self::ToDate(_) => &["string_input", "format"],
+            Self::ToDatetime(_, _) => &["string_input", "format"],
+            Self::LengthBytes => &["string_input"],
+            Self::Normalize(_) => unimplemented!("Normalize not implemented"),
+        }
+    }
 }
 
 fn to_expr(expr: &Utf8Expr, args: &[ExprRef]) -> SQLPlannerResult {