Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOCS] Fix docs to add SQL capabilities #2931

Merged
merged 21 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1203,7 +1203,7 @@ def minhash(
# -----
def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ...
def sql_expr(sql: str) -> PyExpr: ...
def list_sql_functions() -> list[str]: ...
def list_sql_functions() -> list[tuple[str, str, list[str]]]: ...
def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ...
def to_struct(inputs: list[PyExpr]) -> PyExpr: ...

Expand Down
30 changes: 30 additions & 0 deletions daft/sql/_sql_funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow
Sphinx to generate documentation pages for every SQL function.
"""

from __future__ import annotations

Check warning on line 5 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L5

Added line #L5 was not covered by tests

from inspect import Parameter as _Parameter
from inspect import Signature as _Signature

Check warning on line 8 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L7-L8

Added lines #L7 - L8 were not covered by tests

from daft.daft import list_sql_functions as _list_sql_funcstions

Check warning on line 10 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L10

Added line #L10 was not covered by tests


def _create_sql_function(func_name: str, docstring: str, arg_names: list[str]):
    """Build a documentation-only stub for one SQL function and publish it on this module.

    The stub is given the requested name, docstring and a synthetic signature
    so that it can be documented like a regular Python function. Calling the
    stub always raises ``NotImplementedError``.

    Args:
        func_name: Name assigned to the stub and used as its key in this module's globals.
        docstring: Text assigned to the stub's ``__doc__``.
        arg_names: Parameter names, in order, exposed by the stub's signature.
    """

    def sql_function(*args, **kwargs):
        raise NotImplementedError("This function is for documentation purposes only and should not be called.")

    # One positional-or-keyword parameter per documented argument.
    parameters = [_Parameter(arg_name, _Parameter.POSITIONAL_OR_KEYWORD) for arg_name in arg_names]

    # Overwrite the metadata that introspection tools read from the function object.
    metadata = {
        "__name__": func_name,
        "__qualname__": func_name,
        "__doc__": docstring,
        "__signature__": _Signature(parameters),
    }
    for attribute, value in metadata.items():
        setattr(sql_function, attribute, value)

    # Expose the stub as an attribute of the current module.
    globals().update({func_name: sql_function})

Check warning on line 23 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L23

Added line #L23 was not covered by tests


__all__ = []

Check warning on line 26 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L26

Added line #L26 was not covered by tests

# Create a documentation stub for every (name, docstring, argument names) entry
# returned by ``daft.daft.list_sql_functions`` and export each stub via ``__all__``.
for sql_func_name, docstring, arg_names in _list_sql_funcstions():
    _create_sql_function(sql_func_name, docstring, arg_names)
    __all__.append(sql_func_name)

Check warning on line 30 in daft/sql/_sql_funcs.py

View check run for this annotation

Codecov / codecov/patch

daft/sql/_sql_funcs.py#L28-L30

Added lines #L28 - L30 were not covered by tests
118 changes: 108 additions & 10 deletions daft/sql/sql.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# isort: dont-add-import: from __future__ import annotations

import inspect
from typing import Optional, overload
from typing import Optional

from daft.api_annotations import PublicAPI
from daft.context import get_context
Expand Down Expand Up @@ -38,22 +38,120 @@ def _copy_from(self, other: "SQLCatalog") -> None:

@PublicAPI
def sql_expr(sql: str) -> Expression:
return Expression._from_pyexpr(_sql_expr(sql))

"""Parses a SQL string into a Daft Expression

@overload
def sql(sql: str) -> DataFrame: ...
This function allows you to create Daft Expressions from SQL snippets, which can then be used
in Daft operations or combined with other Daft Expressions.

Args:
sql (str): A SQL string to be parsed into a Daft Expression.

@overload
def sql(sql: str, catalog: SQLCatalog, register_globals: bool = ...) -> DataFrame: ...
Returns:
Expression: A Daft Expression representing the parsed SQL.

Examples:
Create a simple SQL expression:

>>> import daft
>>> expr = daft.sql_expr("1 + 2")
>>> print(expr)
lit(1) + lit(2)

Use SQL expression in a Daft DataFrame operation:

>>> df = daft.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> df = df.with_column("c", daft.sql_expr("a + b"))
>>> df.show()
╭───────┬───────┬───────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Int64 ┆ Int64 │
╞═══════╪═══════╪═══════╡
│ 1 ┆ 4 ┆ 5 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 5 ┆ 7 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 6 ┆ 9 │
╰───────┴───────┴───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)

`daft.sql_expr` is also called automatically for you in some DataFrame operations such as filters:

>>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]})
>>> result = df.where("x < 3 AND y > 4")
>>> result.show()
╭───────┬───────╮
│ x ┆ y │
│ --- ┆ --- │
│ Int64 ┆ Int64 │
╞═══════╪═══════╡
│ 2 ┆ 5 │
╰───────┴───────╯
<BLANKLINE>
(Showing first 1 of 1 rows)
"""
return Expression._from_pyexpr(_sql_expr(sql))


@PublicAPI
def sql(sql: str, catalog: Optional[SQLCatalog] = None, register_globals: bool = True) -> DataFrame:
"""Create a DataFrame from an SQL query.

EXPERIMENTAL: This features is early in development and will change.
"""Run a SQL query, returning the results as a DataFrame

.. WARNING::
This feature is early in development and will likely experience API changes.

Examples:

A simple example joining 2 dataframes together using a SQL statement, relying on Daft to detect the names of
SQL tables using their corresponding Python variable names.

>>> import daft
>>>
>>> df1 = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>> df2 = daft.from_pydict({"a": [1, 2, 3], "c": ["daft", None, None]})
>>>
>>> # Daft automatically detects `df1` and `df2` from your Python global namespace
>>> result_df = daft.sql("SELECT * FROM df1 JOIN df2 ON df1.a = df2.a")
>>> result_df.show()
╭───────┬──────┬──────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Utf8 ┆ Utf8 │
╞═══════╪══════╪══════╡
│ 1 ┆ foo ┆ daft │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ bar ┆ None │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ baz ┆ None │
╰───────┴──────┴──────╯
<BLANKLINE>
(Showing first 3 of 3 rows)

A more complex example using a SQLCatalog to create a named table called `"my_table"`, which can then be referenced from inside your SQL statement.

>>> import daft
>>> from daft.sql import SQLCatalog
>>>
>>> df = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>>
>>> # Register dataframes as tables in SQL explicitly with names
>>> catalog = SQLCatalog({"my_table": df})
>>>
>>> daft.sql("SELECT a FROM my_table", catalog=catalog).show()
╭───────╮
│ a │
│ --- │
│ Int64 │
╞═══════╡
│ 1 │
├╌╌╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌╌╌┤
│ 3 │
╰───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)

Args:
sql (str): SQL query to execute
Expand Down
4 changes: 2 additions & 2 deletions docs/source/10-min.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"See: [Expressions](user_guide/basic_concepts/expressions.rst)\n",
"See: [Expressions](user_guide/expressions.rst)\n",
"\n",
"Expressions are an API for defining computation that needs to happen over your columns.\n",
"\n",
Expand Down Expand Up @@ -1516,7 +1516,7 @@
"source": [
"### User-Defined Functions\n",
"\n",
"See: [UDF User Guide](user_guide/daft_in_depth/udf)"
"See: [UDF User Guide](user_guide/udf)"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ API Documentation
Table of Contents <self>
creation
dataframe
sql
expressions
schema
datatype
Expand Down
18 changes: 18 additions & 0 deletions docs/source/api_docs/sql.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SQL
===

Running SQL
-----------

.. autofunction:: daft.sql

.. autofunction:: daft.sql_expr

SQL Functions
-------------

This is a full list of functions that can be used from within SQL.


.. sql-autosummary::
:toctree: doc_gen/sql_funcs
14 changes: 14 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
import inspect
import os
import subprocess
import sys

import sphinx_autosummary_accessors

# Set environment variable to help code determine whether or not we are running a Sphinx doc build process
os.environ["DAFT_SPHINX_BUILD"] = "1"

# Help Sphinx find local custom extensions/directives that we build
sys.path.insert(0, os.path.abspath("ext"))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "Daft"
Expand Down Expand Up @@ -45,6 +49,9 @@
"myst_nb",
"sphinx_copybutton",
"sphinx_autosummary_accessors",
"sphinx_tabs.tabs",
# Local extensions
"sql_autosummary",
]

templates_path = ["_templates", sphinx_autosummary_accessors.templates_path]
Expand Down Expand Up @@ -86,6 +93,13 @@
"learn/user_guides/remote_cluster_execution": "distributed-computing.html",
"learn/quickstart": "learn/10-min.html",
"learn/10-min": "../10-min.html",
"user_guide/basic_concepts/expressions": "user_guide/expressions",
"user_guide/basic_concepts/dataframe_introduction": "user_guide/basic_concepts",
"user_guide/basic_concepts/introduction": "user_guide/basic_concepts",
"user_guide/daft_in_depth/aggregations": "user_guide/aggregations",
"user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations",
"user_guide/daft_in_depth/datatypes": "user_guide/datatypes",
"user_guide/daft_in_depth/udf": "user_guide/udf",
}

# Resolving code links to github
Expand Down
Empty file added docs/source/ext/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions docs/source/ext/sql_autosummary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import inspect
import os

from sphinx.ext.autosummary import Autosummary
from sphinx.util import logging

logger = logging.getLogger(__name__)


# Sub-directory (below ``api_docs`` in the Sphinx source tree) that receives the
# generated per-function stub pages; see ``generate_files``.
TOCTREE = "doc_gen/sql_funcs"
# Dotted path of the module whose public functions are listed and documented.
SQL_MODULE_NAME = "daft.sql._sql_funcs"

# reStructuredText appended to every stub page; ``generate_stub`` fills in the
# ``{module_name}`` and ``{name}`` placeholders.
STUB_TEMPLATE = """
.. currentmodule:: {module_name}

.. autofunction:: {name}
"""


class SQLAutosummary(Autosummary):
    """Autosummary directive that lists every function of the SQL functions module.

    Whatever content is written under the directive is discarded: it is
    replaced by one entry per name returned by ``get_sql_func_names`` before
    the regular autosummary processing runs.
    """

    def run(self):
        """Fill the directive content with the SQL function names and render the summary.

        Returns:
            The nodes produced by ``Autosummary.run``.
        """
        # Override self.content, then let the normal autosummary machinery run.
        # A leading "~" asks autosummary to display only the last name component.
        self.content = [f"~{SQL_MODULE_NAME}.{func_name}" for func_name in get_sql_func_names()]
        return super().run()

    def get_sql_module_name(self):
        """Return the module name passed as directive argument, else ``SQL_MODULE_NAME``."""
        # The directive is written without arguments in the docs, so indexing
        # ``self.arguments[0]`` unconditionally would raise IndexError.
        if self.arguments:
            return self.arguments[0]
        return SQL_MODULE_NAME


def get_sql_func_names(module_name=None):
    """Return the names of the public functions found on the SQL functions module.

    Args:
        module_name: Dotted path of the module to inspect. Defaults to
            ``SQL_MODULE_NAME`` when omitted.

    Returns:
        list[str]: Names of all module members that are Python functions and do
        not start with an underscore, in the name-sorted order produced by
        ``inspect.getmembers``.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    import importlib

    if module_name is None:
        module_name = SQL_MODULE_NAME

    # import_module returns the leaf module directly, replacing the
    # ``__import__(..., fromlist=[""])`` workaround.
    module = importlib.import_module(module_name)

    return [name for name, _ in inspect.getmembers(module, inspect.isfunction) if not name.startswith("_")]


def generate_stub(name: str):
    """Build the reStructuredText source of the stub page for one SQL function.

    The page is made of ``name`` as title, an ``=`` underline of the same
    length, a blank line, and ``STUB_TEMPLATE`` rendered with the SQL module
    name and ``name``.
    """
    underline = "=" * len(name)
    directives = STUB_TEMPLATE.format(module_name=SQL_MODULE_NAME, name=name)
    return f"{name}\n{underline}\n\n{directives}"


def generate_files(app):
    """Write one stub ``.rst`` page per SQL function into the Sphinx source tree.

    Connected to the ``builder-inited`` event by ``setup``.

    Args:
        app: The Sphinx application. ``app.srcdir`` locates the source tree and
            ``app.env`` is told about each generated document.
    """
    # Determine where to write .rst files to
    output_dir = os.path.join(app.srcdir, "api_docs", TOCTREE)
    os.makedirs(output_dir, exist_ok=True)

    # Write stubfiles
    for name in get_sql_func_names():
        filepath = os.path.join(output_dir, f"{SQL_MODULE_NAME}.{name}.rst")
        # Explicit encoding so the output does not depend on the platform locale.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(generate_stub(name))

        # ``found_docs`` is a set of docnames (source-relative, suffix-less),
        # not filesystem paths, so translate the path before registering it.
        docname = app.env.path2doc(filepath)
        if docname is not None:
            app.env.found_docs.add(docname)


def setup(app):
    """Sphinx extension entry point.

    Registers the ``sql-autosummary`` directive and hooks stub generation onto
    the ``builder-inited`` event.

    Returns:
        dict: Extension metadata giving the version and marking the extension
        as safe for parallel reading and writing.
    """
    app.add_directive("sql-autosummary", SQLAutosummary)

    # Stub pages are generated and registered once the builder is initialized.
    app.connect("builder-inited", generate_files)

    extension_metadata = dict(
        version="0.1",
        parallel_read_safe=True,
        parallel_write_safe=True,
    )
    return extension_metadata
Loading
Loading