feat!: remove necessary_buffers
agoose77 committed Oct 3, 2023
1 parent 4c72a94 commit 8e54d0b
Showing 3 changed files with 3 additions and 75 deletions.
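
The "feat!" prefix marks this as a breaking change: necessary_buffers is dropped from the public dask_awkward and dask_awkward.lib namespaces, so downstream code that imported it now fails at import time. A minimal defensive sketch for consumers (the None fallback is illustrative, not something this commit provides):

    # Probe for the removed API; after this commit the import raises ImportError.
    try:
        from dask_awkward import necessary_buffers
    except ImportError:
        necessary_buffers = None  # callers must handle the missing feature
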
2 changes: 1 addition & 1 deletion src/dask_awkward/__init__.py
@@ -16,7 +16,7 @@
     partition_compatibility,
 )
 from dask_awkward.lib.describe import fields
-from dask_awkward.lib.inspect import necessary_buffers, sample
+from dask_awkward.lib.inspect import sample
 from dask_awkward.lib.io.io import (
     ImplementsFormTransformation,
     from_awkward,
2 changes: 1 addition & 1 deletion src/dask_awkward/lib/__init__.py
@@ -7,7 +7,7 @@
     partition_compatibility,
 )
 from dask_awkward.lib.describe import fields
-from dask_awkward.lib.inspect import necessary_buffers, sample
+from dask_awkward.lib.inspect import sample
 from dask_awkward.lib.io.io import (
     ImplementsFormTransformation,
     from_awkward,
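
Both hunks above touch only re-exports: the same one-line change is applied to the top-level namespace and to dask_awkward.lib, leaving sample as the only name still re-exported from dask_awkward.lib.inspect. A quick check of the surviving surface (only names visible in these diffs are asserted):

    import dask_awkward as dak

    dak.sample   # still re-exported from dask_awkward.lib.inspect
    dak.fields   # unchanged, re-exported from dask_awkward.lib.describe
    # dak.necessary_buffers now raises AttributeError
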
74 changes: 1 addition & 73 deletions src/dask_awkward/lib/inspect.py
@@ -1,85 +1,13 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING

 import numpy as np
-from dask.base import unpack_collections
-from dask.highlevelgraph import HighLevelGraph

 if TYPE_CHECKING:
     from dask_awkward.lib.core import Array


-def necessary_buffers(
-    *args: Any, traverse: bool = True
-) -> dict[str, dict[str, set[str]]]:
-    r"""Determine the columns necessary to compute a collection.
-
-    Parameters
-    ----------
-    *args : Dask collections or HighLevelGraphs
-        The collection (or collection graph) of interest. These can be
-        individual objects, lists, sets, or dictionaries.
-    traverse : bool, optional
-        If True (default), builtin Python collections are traversed
-        looking for any Dask collections they might contain.
-
-    Returns
-    -------
-    dict[str, dict[str, set[str]]]
-        Mapping that pairs the input layers in the graph to the
-        (data, shape) buffers that have been determined necessary from that layer.
-        These are not necessarily in the same order as the original input.
-
-    Examples
-    --------
-    If we have a hypothetical parquet dataset (``ds``) with the fields
-
-    - "foo"
-    - "bar"
-    - "baz"
-
-    And the "baz" field has fields
-
-    - "x"
-    - "y"
-
-    The calculation of ``ds.bar + ds.baz.x`` will only require the
-    ``bar`` and ``baz.x`` columns from the parquet file.
-
-    >>> import dask_awkward as dak
-    >>> ds = dak.from_parquet("some-dataset")
-    >>> ds.fields
-    ["foo", "bar", "baz"]
-    >>> ds.baz.fields
-    ["x", "y"]
-    >>> x = ds.bar + ds.baz.x
-    >>> dak.necessary_buffers(x)
-    {"from-parquet-abc123": {"data": ["bar", "baz.x"], "shape": []}}
-
-    Notice that ``foo`` and ``baz.y`` are not determined to be
-    necessary.
-    """
-    import dask_awkward.lib.optimize as o
-
-    collections, _ = unpack_collections(*args, traverse=traverse)
-    if not collections:
-        return {}
-
-    out: dict[str, dict[str, set[str]]] = {}
-    for obj in collections:
-        dsk = obj if isinstance(obj, HighLevelGraph) else obj.dask
-        dsk_buffers = o._necessary_buffers(dsk)
-
-        for key, buffers in dsk_buffers.items():
-            out_key_buffers = out.setdefault(key, {"data": set(), "shape": set()})
-            out_key_buffers["data"].update(buffers["data"])
-            out_key_buffers["shape"].update(buffers["shape"])
-
-    return out


 def _random_boolean_like(array_like, probability):
     import awkward as ak

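The deleted helper aggregated per-graph reports: it unpacked the input Dask collections, took each collection's HighLevelGraph, asked the optimizer which (data, shape) buffers each input layer needs, and merged the resulting sets per layer key. Code that depended on it can reproduce that aggregation against the same private hook the deleted body called (a sketch only: _necessary_buffers is internal API at this commit and is itself subject to change):

    from dask.base import unpack_collections
    from dask.highlevelgraph import HighLevelGraph

    import dask_awkward.lib.optimize as o


    def necessary_buffers_compat(*args, traverse=True):
        """Merge per-layer (data, shape) buffer reports, as the removed helper did."""
        collections, _ = unpack_collections(*args, traverse=traverse)
        out = {}
        for obj in collections:
            dsk = obj if isinstance(obj, HighLevelGraph) else obj.dask
            for key, buffers in o._necessary_buffers(dsk).items():
                merged = out.setdefault(key, {"data": set(), "shape": set()})
                merged["data"].update(buffers["data"])
                merged["shape"].update(buffers["shape"])
        return out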
