Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into andrew/clippy
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewgazelka committed Oct 4, 2024
2 parents 696f1f1 + 62d0581 commit a165fcf
Show file tree
Hide file tree
Showing 65 changed files with 3,312 additions and 755 deletions.
1 change: 1 addition & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
d5e444d0a71409ae3701d4249ad877f1fb9e2235 # introduced `rustfmt.toml` and ran formatter; ignoring large formatting changes
45e2944e252ccdd563dc20edd9b29762e05cec1d # auto-fix prefer `Self` over explicit type
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1201,9 +1201,17 @@ def minhash(
# -----
# SQL functions
# -----
class SQLFunctionStub:
    """Metadata stub describing a SQL function: its name, docstring, and argument names."""

    @property
    def name(self) -> str:
        """Name of the SQL function."""
        ...
    @property
    def docstring(self) -> str:
        """Documentation string for the SQL function."""
        ...
    @property
    def arg_names(self) -> list[str]:
        """Names of the SQL function's arguments."""
        ...

def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ...
def sql_expr(sql: str) -> PyExpr: ...
def list_sql_functions() -> list[str]: ...
def list_sql_functions() -> list[SQLFunctionStub]: ...
def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ...
def to_struct(inputs: list[PyExpr]) -> PyExpr: ...

Expand Down
30 changes: 30 additions & 0 deletions daft/sql/_sql_funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow
Sphinx to generate documentation pages for every SQL function.
"""

from __future__ import annotations

from inspect import Parameter as _Parameter
from inspect import Signature as _Signature

from daft.daft import list_sql_functions as _list_sql_functions


def _create_sql_function(func_name: str, docstring: str, arg_names: list[str]):
def sql_function(*args, **kwargs):
raise NotImplementedError("This function is for documentation purposes only and should not be called.")

sql_function.__name__ = func_name
sql_function.__qualname__ = func_name
sql_function.__doc__ = docstring
sql_function.__signature__ = _Signature([_Parameter(name, _Parameter.POSITIONAL_OR_KEYWORD) for name in arg_names]) # type: ignore[attr-defined]

# Register the function in the current module
globals()[func_name] = sql_function


# Public API of this module: one generated stub per SQL function.
__all__ = []

# For every SQL function reported by `daft.daft.list_sql_functions`, generate a
# documentation stub in this module and export it by name so Sphinx picks it up.
for sql_function_stub in _list_sql_functions():
    _create_sql_function(sql_function_stub.name, sql_function_stub.docstring, sql_function_stub.arg_names)
    __all__.append(sql_function_stub.name)
118 changes: 108 additions & 10 deletions daft/sql/sql.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# isort: dont-add-import: from __future__ import annotations

import inspect
from typing import Optional, overload
from typing import Optional

from daft.api_annotations import PublicAPI
from daft.context import get_context
Expand Down Expand Up @@ -38,22 +38,120 @@ def _copy_from(self, other: "SQLCatalog") -> None:

@PublicAPI
def sql_expr(sql: str) -> Expression:
return Expression._from_pyexpr(_sql_expr(sql))

"""Parses a SQL string into a Daft Expression
@overload
def sql(sql: str) -> DataFrame: ...
This function allows you to create Daft Expressions from SQL snippets, which can then be used
in Daft operations or combined with other Daft Expressions.
Args:
sql (str): A SQL string to be parsed into a Daft Expression.
@overload
def sql(sql: str, catalog: SQLCatalog, register_globals: bool = ...) -> DataFrame: ...
Returns:
Expression: A Daft Expression representing the parsed SQL.
Examples:
Create a simple SQL expression:
>>> import daft
>>> expr = daft.sql_expr("1 + 2")
>>> print(expr)
lit(1) + lit(2)
Use SQL expression in a Daft DataFrame operation:
>>> df = daft.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> df = df.with_column("c", daft.sql_expr("a + b"))
>>> df.show()
╭───────┬───────┬───────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Int64 ┆ Int64 │
╞═══════╪═══════╪═══════╡
│ 1 ┆ 4 ┆ 5 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 5 ┆ 7 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 6 ┆ 9 │
╰───────┴───────┴───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
`daft.sql_expr` is also called automatically for you in some DataFrame operations such as filters:
>>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]})
>>> result = df.where("x < 3 AND y > 4")
>>> result.show()
╭───────┬───────╮
│ x ┆ y │
│ --- ┆ --- │
│ Int64 ┆ Int64 │
╞═══════╪═══════╡
│ 2 ┆ 5 │
╰───────┴───────╯
<BLANKLINE>
(Showing first 1 of 1 rows)
"""
return Expression._from_pyexpr(_sql_expr(sql))


@PublicAPI
def sql(sql: str, catalog: Optional[SQLCatalog] = None, register_globals: bool = True) -> DataFrame:
"""Create a DataFrame from an SQL query.
EXPERIMENTAL: This feature is early in development and will change.
"""Run a SQL query, returning the results as a DataFrame
.. WARNING::
This feature is early in development and will likely experience API changes.
Examples:
A simple example joining 2 dataframes together using a SQL statement, relying on Daft to detect the names of
SQL tables using their corresponding Python variable names.
>>> import daft
>>>
>>> df1 = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>> df2 = daft.from_pydict({"a": [1, 2, 3], "c": ["daft", None, None]})
>>>
>>> # Daft automatically detects `df1` and `df2` from your Python global namespace
>>> result_df = daft.sql("SELECT * FROM df1 JOIN df2 ON df1.a = df2.a")
>>> result_df.show()
╭───────┬──────┬──────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Utf8 ┆ Utf8 │
╞═══════╪══════╪══════╡
│ 1 ┆ foo ┆ daft │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ bar ┆ None │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ baz ┆ None │
╰───────┴──────┴──────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
A more complex example using a SQLCatalog to create a named table called `"my_table"`, which can then be referenced from inside your SQL statement.
>>> import daft
>>> from daft.sql import SQLCatalog
>>>
>>> df = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>>
>>> # Register dataframes as tables in SQL explicitly with names
>>> catalog = SQLCatalog({"my_table": df})
>>>
>>> daft.sql("SELECT a FROM my_table", catalog=catalog).show()
╭───────╮
│ a │
│ --- │
│ Int64 │
╞═══════╡
│ 1 │
├╌╌╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌╌╌┤
│ 3 │
╰───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
Args:
sql (str): SQL query to execute
Expand Down
4 changes: 2 additions & 2 deletions docs/source/10-min.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"See: [Expressions](user_guide/basic_concepts/expressions.rst)\n",
"See: [Expressions](user_guide/expressions.rst)\n",
"\n",
"Expressions are an API for defining computation that needs to happen over your columns.\n",
"\n",
Expand Down Expand Up @@ -1516,7 +1516,7 @@
"source": [
"### User-Defined Functions\n",
"\n",
"See: [UDF User Guide](user_guide/daft_in_depth/udf)"
"See: [UDF User Guide](user_guide/udf)"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ API Documentation
Table of Contents <self>
creation
dataframe
sql
expressions
schema
datatype
Expand Down
15 changes: 15 additions & 0 deletions docs/source/api_docs/sql.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SQL
===

.. autofunction:: daft.sql

.. autofunction:: daft.sql_expr

SQL Functions
-------------

This is a full list of functions that can be used from within SQL.


.. sql-autosummary::
:toctree: doc_gen/sql_funcs
16 changes: 16 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
import inspect
import os
import subprocess
import sys

import sphinx_autosummary_accessors

# Set environment variable to help code determine whether or not we are running a Sphinx doc build process
os.environ["DAFT_SPHINX_BUILD"] = "1"

# Help Sphinx find local custom extensions/directives that we build
sys.path.insert(0, os.path.abspath("ext"))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "Daft"
Expand Down Expand Up @@ -45,10 +49,15 @@
"myst_nb",
"sphinx_copybutton",
"sphinx_autosummary_accessors",
"sphinx_tabs.tabs",
# Local extensions
"sql_autosummary",
]

templates_path = ["_templates", sphinx_autosummary_accessors.templates_path]

# Removes module names that prefix our classes
add_module_names = False

# -- Options for Notebook rendering
# https://myst-nb.readthedocs.io/en/latest/configuration.html?highlight=nb_execution_mode#execution
Expand Down Expand Up @@ -86,6 +95,13 @@
"learn/user_guides/remote_cluster_execution": "distributed-computing.html",
"learn/quickstart": "learn/10-min.html",
"learn/10-min": "../10-min.html",
"user_guide/basic_concepts/expressions": "user_guide/expressions",
"user_guide/basic_concepts/dataframe_introduction": "user_guide/basic_concepts",
"user_guide/basic_concepts/introduction": "user_guide/basic_concepts",
"user_guide/daft_in_depth/aggregations": "user_guide/aggregations",
"user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations",
"user_guide/daft_in_depth/datatypes": "user_guide/datatypes",
"user_guide/daft_in_depth/udf": "user_guide/udf",
}

# Resolving code links to github
Expand Down
Empty file added docs/source/ext/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions docs/source/ext/sql_autosummary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import inspect
import os

from sphinx.ext.autosummary import Autosummary
from sphinx.util import logging

logger = logging.getLogger(__name__)


TOCTREE = "doc_gen/sql_funcs"
SQL_MODULE_NAME = "daft.sql._sql_funcs"

STUB_TEMPLATE = """
.. currentmodule:: None
.. autofunction:: {module_name}.{name}
"""


class SQLAutosummary(Autosummary):
    """Autosummary directive that lists every generated SQL function stub."""

    def run(self):
        # Point autosummary at the procedurally-generated SQL function docs,
        # then let the base class build the summary table as usual.
        names = get_sql_func_names()
        self.content = [f"~{SQL_MODULE_NAME}.{name}" for name in names]
        return super().run()

    def get_sql_module_name(self):
        # The first directive argument names the module being documented.
        return self.arguments[0]


def get_sql_func_names():
    """Return the names of all public SQL functions in the stub module.

    Imports the procedurally-generated module named by ``SQL_MODULE_NAME`` and
    collects every function whose name does not start with an underscore.
    """
    # `importlib.import_module` is the documented, idiomatic replacement for
    # the `__import__(name, fromlist=[""])` hack.
    import importlib

    module = importlib.import_module(SQL_MODULE_NAME)

    return [
        name
        for name, obj in inspect.getmembers(module, inspect.isfunction)
        if not name.startswith("_")
    ]


def generate_stub(name: str):
    """Build the .rst stub page contents for a single SQL function."""
    title = name
    underline = "=" * len(title)
    body = STUB_TEMPLATE.format(module_name=SQL_MODULE_NAME, name=name)
    return f"{title}\n{underline}\n\n{body}"


def generate_files(app):
    """Write one .rst stub file per SQL function into the api_docs toctree dir.

    Runs at builder init so the generated pages exist before Sphinx reads docs.
    """
    # Determine where to write .rst files to.
    target_dir = os.path.join(app.srcdir, "api_docs", TOCTREE)
    os.makedirs(target_dir, exist_ok=True)

    # Write one stub file per SQL function.
    for func_name in get_sql_func_names():
        path = os.path.join(target_dir, f"{SQL_MODULE_NAME}.{func_name}.rst")
        with open(path, "w") as out:
            out.write(generate_stub(func_name))

        # HACK: Not sure if this is ok? Registers the generated file with the
        # Sphinx environment so it is treated as a known document.
        app.env.found_docs.add(path)


def setup(app):
    """Sphinx extension entry point: register the directive and the generation hook."""
    app.add_directive("sql-autosummary", SQLAutosummary)

    # Generate and register the stub files as soon as the builder is initialized.
    app.connect("builder-inited", generate_files)

    metadata = {
        "version": "0.1",
        "parallel_read_safe": True,
        "parallel_write_safe": True,
    }
    return metadata
Loading

0 comments on commit a165fcf

Please sign in to comment.