Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into andrew/clippy
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewgazelka committed Oct 4, 2024
2 parents 696f1f1 + 62d0581 commit a165fcf
Show file tree
Hide file tree
Showing 65 changed files with 3,312 additions and 755 deletions.
1 change: 1 addition & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
d5e444d0a71409ae3701d4249ad877f1fb9e2235 # introduced `rustfmt.toml` and ran formatter; ignoring large formatting changes
45e2944e252ccdd563dc20edd9b29762e05cec1d # auto-fix prefer `Self` over explicit type
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1201,9 +1201,17 @@ def minhash(
# -----
# SQL functions
# -----
class SQLFunctionStub:
    """Metadata stub describing a SQL function: its name, docstring, and argument names."""

    @property
    def name(self) -> str:
        """Name of the SQL function."""
        ...
    @property
    def docstring(self) -> str:
        """Documentation string for the SQL function."""
        ...
    @property
    def arg_names(self) -> list[str]:
        """Names of the SQL function's arguments."""
        ...

def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ...
def sql_expr(sql: str) -> PyExpr: ...
def list_sql_functions() -> list[str]: ...
def list_sql_functions() -> list[SQLFunctionStub]: ...
def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ...
def to_struct(inputs: list[PyExpr]) -> PyExpr: ...

Expand Down
30 changes: 30 additions & 0 deletions daft/sql/_sql_funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow
Sphinx to generate documentation pages for every SQL function.
"""

from __future__ import annotations

from inspect import Parameter as _Parameter
from inspect import Signature as _Signature

from daft.daft import list_sql_functions as _list_sql_functions


def _create_sql_function(func_name: str, docstring: str, arg_names: list[str]):
def sql_function(*args, **kwargs):
raise NotImplementedError("This function is for documentation purposes only and should not be called.")

sql_function.__name__ = func_name
sql_function.__qualname__ = func_name
sql_function.__doc__ = docstring
sql_function.__signature__ = _Signature([_Parameter(name, _Parameter.POSITIONAL_OR_KEYWORD) for name in arg_names]) # type: ignore[attr-defined]

# Register the function in the current module
globals()[func_name] = sql_function


# Public API of this module: one generated stub per SQL function.
__all__ = []

# For every SQL function reported by `daft.daft.list_sql_functions`, generate a
# documentation stub in this module and export it by name so Sphinx picks it up.
for sql_function_stub in _list_sql_functions():
    _create_sql_function(sql_function_stub.name, sql_function_stub.docstring, sql_function_stub.arg_names)
    __all__.append(sql_function_stub.name)
118 changes: 108 additions & 10 deletions daft/sql/sql.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# isort: dont-add-import: from __future__ import annotations

import inspect
from typing import Optional, overload
from typing import Optional

from daft.api_annotations import PublicAPI
from daft.context import get_context
Expand Down Expand Up @@ -38,22 +38,120 @@ def _copy_from(self, other: "SQLCatalog") -> None:

@PublicAPI
def sql_expr(sql: str) -> Expression:
return Expression._from_pyexpr(_sql_expr(sql))

"""Parses a SQL string into a Daft Expression
@overload
def sql(sql: str) -> DataFrame: ...
This function allows you to create Daft Expressions from SQL snippets, which can then be used
in Daft operations or combined with other Daft Expressions.
Args:
sql (str): A SQL string to be parsed into a Daft Expression.
@overload
def sql(sql: str, catalog: SQLCatalog, register_globals: bool = ...) -> DataFrame: ...
Returns:
Expression: A Daft Expression representing the parsed SQL.
Examples:
Create a simple SQL expression:
>>> import daft
>>> expr = daft.sql_expr("1 + 2")
>>> print(expr)
lit(1) + lit(2)
Use SQL expression in a Daft DataFrame operation:
>>> df = daft.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> df = df.with_column("c", daft.sql_expr("a + b"))
>>> df.show()
╭───────┬───────┬───────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Int64 ┆ Int64 │
╞═══════╪═══════╪═══════╡
│ 1 ┆ 4 ┆ 5 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 5 ┆ 7 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 6 ┆ 9 │
╰───────┴───────┴───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
`daft.sql_expr` is also called automatically for you in some DataFrame operations such as filters:
>>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]})
>>> result = df.where("x < 3 AND y > 4")
>>> result.show()
╭───────┬───────╮
│ x ┆ y │
│ --- ┆ --- │
│ Int64 ┆ Int64 │
╞═══════╪═══════╡
│ 2 ┆ 5 │
╰───────┴───────╯
<BLANKLINE>
(Showing first 1 of 1 rows)
"""
return Expression._from_pyexpr(_sql_expr(sql))


@PublicAPI
def sql(sql: str, catalog: Optional[SQLCatalog] = None, register_globals: bool = True) -> DataFrame:
"""Create a DataFrame from an SQL query.
EXPERIMENTAL: This feature is early in development and will change.
"""Run a SQL query, returning the results as a DataFrame
.. WARNING::
This feature is early in development and will likely experience API changes.
Examples:
A simple example joining 2 dataframes together using a SQL statement, relying on Daft to detect the names of
SQL tables using their corresponding Python variable names.
>>> import daft
>>>
>>> df1 = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>> df2 = daft.from_pydict({"a": [1, 2, 3], "c": ["daft", None, None]})
>>>
>>> # Daft automatically detects `df1` and `df2` from your Python global namespace
>>> result_df = daft.sql("SELECT * FROM df1 JOIN df2 ON df1.a = df2.a")
>>> result_df.show()
╭───────┬──────┬──────╮
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Utf8 ┆ Utf8 │
╞═══════╪══════╪══════╡
│ 1 ┆ foo ┆ daft │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ bar ┆ None │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ baz ┆ None │
╰───────┴──────┴──────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
A more complex example using a SQLCatalog to create a named table called `"my_table"`, which can then be referenced from inside your SQL statement.
>>> import daft
>>> from daft.sql import SQLCatalog
>>>
>>> df = daft.from_pydict({"a": [1, 2, 3], "b": ["foo", "bar", "baz"]})
>>>
>>> # Register dataframes as tables in SQL explicitly with names
>>> catalog = SQLCatalog({"my_table": df})
>>>
>>> daft.sql("SELECT a FROM my_table", catalog=catalog).show()
╭───────╮
│ a │
│ --- │
│ Int64 │
╞═══════╡
│ 1 │
├╌╌╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌╌╌┤
│ 3 │
╰───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
Args:
sql (str): SQL query to execute
Expand Down
4 changes: 2 additions & 2 deletions docs/source/10-min.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"See: [Expressions](user_guide/basic_concepts/expressions.rst)\n",
"See: [Expressions](user_guide/expressions.rst)\n",
"\n",
"Expressions are an API for defining computation that needs to happen over your columns.\n",
"\n",
Expand Down Expand Up @@ -1516,7 +1516,7 @@
"source": [
"### User-Defined Functions\n",
"\n",
"See: [UDF User Guide](user_guide/daft_in_depth/udf)"
"See: [UDF User Guide](user_guide/udf)"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ API Documentation
Table of Contents <self>
creation
dataframe
sql
expressions
schema
datatype
Expand Down
15 changes: 15 additions & 0 deletions docs/source/api_docs/sql.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SQL
===

.. autofunction:: daft.sql

.. autofunction:: daft.sql_expr

SQL Functions
-------------

This is a full list of functions that can be used from within SQL.


.. sql-autosummary::
:toctree: doc_gen/sql_funcs
16 changes: 16 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
import inspect
import os
import subprocess
import sys

import sphinx_autosummary_accessors

# Set environment variable to help code determine whether or not we are running a Sphinx doc build process
os.environ["DAFT_SPHINX_BUILD"] = "1"

# Help Sphinx find local custom extensions/directives that we build
sys.path.insert(0, os.path.abspath("ext"))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "Daft"
Expand Down Expand Up @@ -45,10 +49,15 @@
"myst_nb",
"sphinx_copybutton",
"sphinx_autosummary_accessors",
"sphinx_tabs.tabs",
# Local extensions
"sql_autosummary",
]

templates_path = ["_templates", sphinx_autosummary_accessors.templates_path]

# Removes module names that prefix our classes
add_module_names = False

# -- Options for Notebook rendering
# https://myst-nb.readthedocs.io/en/latest/configuration.html?highlight=nb_execution_mode#execution
Expand Down Expand Up @@ -86,6 +95,13 @@
"learn/user_guides/remote_cluster_execution": "distributed-computing.html",
"learn/quickstart": "learn/10-min.html",
"learn/10-min": "../10-min.html",
"user_guide/basic_concepts/expressions": "user_guide/expressions",
"user_guide/basic_concepts/dataframe_introduction": "user_guide/basic_concepts",
"user_guide/basic_concepts/introduction": "user_guide/basic_concepts",
"user_guide/daft_in_depth/aggregations": "user_guide/aggregations",
"user_guide/daft_in_depth/dataframe-operations": "user_guide/dataframe-operations",
"user_guide/daft_in_depth/datatypes": "user_guide/datatypes",
"user_guide/daft_in_depth/udf": "user_guide/udf",
}

# Resolving code links to github
Expand Down
Empty file added docs/source/ext/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions docs/source/ext/sql_autosummary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import inspect
import os

from sphinx.ext.autosummary import Autosummary
from sphinx.util import logging

logger = logging.getLogger(__name__)


TOCTREE = "doc_gen/sql_funcs"
SQL_MODULE_NAME = "daft.sql._sql_funcs"

STUB_TEMPLATE = """
.. currentmodule:: None
.. autofunction:: {module_name}.{name}
"""


class SQLAutosummary(Autosummary):
    """Autosummary directive that lists every generated SQL function stub."""

    def run(self):
        # Point autosummary at the procedurally-generated SQL function docs,
        # then let the base class build the summary table as usual.
        names = get_sql_func_names()
        self.content = [f"~{SQL_MODULE_NAME}.{name}" for name in names]
        return super().run()

    def get_sql_module_name(self):
        # The first directive argument names the module being documented.
        return self.arguments[0]


def get_sql_func_names():
    """Return the names of all public SQL functions in the stub module.

    Imports the procedurally-generated module named by ``SQL_MODULE_NAME`` and
    collects every function whose name does not start with an underscore.
    """
    # `importlib.import_module` is the documented, idiomatic replacement for
    # the `__import__(name, fromlist=[""])` hack.
    import importlib

    module = importlib.import_module(SQL_MODULE_NAME)

    return [
        name
        for name, obj in inspect.getmembers(module, inspect.isfunction)
        if not name.startswith("_")
    ]


def generate_stub(name: str):
    """Build the .rst stub page contents for a single SQL function."""
    title = name
    underline = "=" * len(title)
    body = STUB_TEMPLATE.format(module_name=SQL_MODULE_NAME, name=name)
    return f"{title}\n{underline}\n\n{body}"


def generate_files(app):
    """Write one .rst stub file per SQL function into the api_docs toctree dir.

    Runs at builder init so the generated pages exist before Sphinx reads docs.
    """
    # Determine where to write .rst files to.
    target_dir = os.path.join(app.srcdir, "api_docs", TOCTREE)
    os.makedirs(target_dir, exist_ok=True)

    # Write one stub file per SQL function.
    for func_name in get_sql_func_names():
        path = os.path.join(target_dir, f"{SQL_MODULE_NAME}.{func_name}.rst")
        with open(path, "w") as out:
            out.write(generate_stub(func_name))

        # HACK: Not sure if this is ok? Registers the generated file with the
        # Sphinx environment so it is treated as a known document.
        app.env.found_docs.add(path)


def setup(app):
    """Sphinx extension entry point: register the directive and the generation hook."""
    app.add_directive("sql-autosummary", SQLAutosummary)

    # Generate and register the stub files as soon as the builder is initialized.
    app.connect("builder-inited", generate_files)

    metadata = {
        "version": "0.1",
        "parallel_read_safe": True,
        "parallel_write_safe": True,
    }
    return metadata
Loading

0 comments on commit a165fcf

Please sign in to comment.