docs/source/build_datastructure_doc.py

"""Create datastructure documentation page.

This will add a page with various svg graphs and html tables
describing the datastructure: dependencies, columns provided,
and configuration options that apply to each plugin.

For extra credit, the SVGs are clickable.

"""

from collections import defaultdict
import os
import shutil
from immutabledict import immutabledict
import numpy as np
import pandas as pd
import graphviz
import strax
import straxen
from straxen import kind_colors
from straxen.docs_utils import add_spaces, add_deps_to_graph_tree

# Directory containing this script. The temporary graph folders and the generated
# reference/*.rst pages are addressed relative to it.
this_dir = os.path.dirname(os.path.realpath(__file__))

# reStructuredText header that opens every generated datastructure page.
# Filled with str.format; placeholders: ``title`` and ``context``.
page_header = """
{title}
===========================================================

This page is an autogenerated reference for all the plugins in straxen's
`{context}` context.

The figures presented are dependency diagrams, meaning that an arrow stemming from a node points to a node that the stem depends on (i.e. "A depends on B" is equivalent to "A ---> B").

Colors indicate data kinds. To load tables with different data kinds,
you currently need more than one `get_df` (or `get_array`) commands.

"""

# reStructuredText section describing a single data type. Filled with str.format;
# placeholders:
#   data_type       section title
#   p               plugin instance, only ``p.__class__.__name__`` is rendered
#   module          slash-separated module path, used to build the GitHub link
#   kind            data kind of the data type
#   docstring       description text of the plugin
#   columns         html table, placed under a ``.. raw:: html`` directive
#   svg             dependency graph, placed under a ``.. raw:: html`` directive
#   config_options  html table, placed under a ``.. raw:: html`` directive
# The three raw-html values have to arrive already indented, since they form the
# body of their directive.
template = """
{data_type}
--------------------------------------------------------

Description
~~~~~~~~~~~~~~~~~~~~~~

Provided by plugin: `{p.__class__.__name__} <https://github.com/XENONnT/straxen/blob/master/{module}.py>`_

Data kind: {kind}

{docstring}


Columns provided
~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html

{columns}


Dependencies
~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html

{svg}


Configuration options
~~~~~~~~~~~~~~~~~~~~~~~

These are all options that affect this data type.
This also includes options taken by dependencies of this datatype,
because changing any of those options affect this data indirectly.

.. raw:: html

{config_options}


------------------
"""

# reStructuredText header of the data-kinds page. Filled with str.format;
# the single placeholder ``svg`` is the (already indented) overview graph that is
# placed under the ``.. raw:: html`` directive.
data_kinds_header = """
XENONnT data kinds
====================
As explained in the
`demo <https://straxen.readthedocs.io/en/latest/tutorials/strax_demo.html>`_,
in straxen, we have **data types** and **data kinds**. The **data types** are
documented in `the datastructure <https://straxen.readthedocs.io/en/latest/reference/datastructure.html>`_
page and are the type of data that one can load in straxen using
``st.get_array(<RUN_ID>, <DATA_TYPE>)`` or ``st.get_df(<RUN_ID>, <DATA_TYPE>)``.

Additionally, each data type also has a data kind. Each data kind has a group
of data types associated to it. All data of a given data kind has the same number
of entities. As such, different data types can be loaded simultaneously if they
are of the same data kind. For example, `peak_basics` and `peak_positions` are
two data types but they contain information about the same data kind: `peaks`.

When writing a plugin, the ``plugin.compute(self, <DATA KIND>)`` method takes the **data kind**.

--------------------------------------------------------

.. raw:: html

{svg}


"""

# Page title for every detector suffix; the empty suffix refers to the TPC.
titles = {
    "": "Straxen datastructure",
    "_he": "Straxen datastructure for high energy channels",
    "_nv": "Straxen datastructure for neutron veto",
    "_mv": "Straxen datastructure for muon veto",
}
# All suffixes, in the insertion order of ``titles``.
tree_suffices = list(titles)

# Give the suffixed data kinds the same color as the kind they are derived from.
suffices = ["_he", "_nv", "_mv"]
for suffix in suffices:
    # Iterate over a snapshot of the keys because entries are added inside the loop.
    # NOTE: the snapshot is taken per suffix, so keys added for an earlier suffix
    # are extended again by the later ones.
    for base_kind in tuple(kind_colors):
        kind_colors[base_kind + suffix] = kind_colors[base_kind]

def get_plugins_deps(st):
    """Group the registered data types of a context by suffix and by plugin count.

    A data type is assigned to a non-empty suffix of tree_suffices when the suffix occurs
    in its name, and to the empty suffix when none of the non-empty suffixes occurs.

    :param st: Strax.Context
    :return: dict {suffix: defaultdict(list)}, where each defaultdict maps the number of
        plugins returned by st._get_plugins for a data type to the list of data type names
        that have this number.

    """
    named_suffices = [s for s in tree_suffices if s != ""]
    plugins_by_deps = {k: defaultdict(list) for k in tree_suffices}
    for det_suffix in tree_suffices:
        by_count = plugins_by_deps[det_suffix]
        for plugin_name in st._plugin_class_registry:
            if det_suffix not in plugin_name:
                continue
            # The empty suffix is contained in every name, so exclude the names that
            # belong to one of the other suffixes explicitly.
            if det_suffix == "" and any(s in plugin_name for s in named_suffices):
                continue
            n_plugins = len(st._get_plugins((plugin_name,), run_id="0"))
            # Clear cache, otherwise we might be getting more than we asked for from the cache
            st._fixed_plugin_cache = {}
            by_count[n_plugins].append(plugin_name)
    return plugins_by_deps


def get_context():
    """Create the context that is documented, without initializing the runs_db.

    Initializing the runs_db requires the appropriate passwords, so it is skipped here.

    :return: straxen context that mimics the xenonnt_online context without the rundb init

    """
    context = straxen.contexts.xenonnt_online(_database_init=False)
    # Forbid the creation of everything the DAQReader provides
    daq_reader_provides = straxen.daqreader.DAQReader.provides
    context.context_config["forbid_creation_of"] = daq_reader_provides
    return context


def _config_table(st, data_type):
    """Return the configuration options shown for data_type.

    :param st: Strax.Context
    :param data_type: name of the data type
    :return: pd.DataFrame with the columns option, default, current and help, sorted by
        option, in which long string defaults are shortened.

    """
    config_df = st.show_config(data_type).sort_values(by="option")
    # Filter out the config options of lower level datatypes
    applies = np.array(
        [any(data_type in a for a in ap_to) for ap_to in config_df["applies_to"].values],
        dtype=bool,
    )
    # Select rows and columns in one step and take an explicit copy, so that the
    # assignment below does not write to a slice of the frame returned by show_config.
    config_df = config_df.loc[applies, ["option", "default", "current", "help"]].copy()

    # Shorten long default values
    config_df["default"] = [
        x[:10] + "..." + x[-10:] if isinstance(x, str) and len(x) > 30 else x
        for x in config_df["default"].values
    ]
    return config_df


def _data_type_section(st, data_type, graph_dir):
    """Return the filled-in ``template`` section for one data type.

    :param st: Strax.Context
    :param data_type: name of the data type
    :param graph_dir: existing directory in which the dependency graph is rendered
    :return: str, the reStructuredText of the section

    """
    plugin = st._get_plugins(targets=(data_type,), run_id="0")[data_type]

    # Create dependency graph, adding plugins and dependencies recursively
    graph_tree = graphviz.Digraph(format="svg")
    add_deps_to_graph_tree(graph_tree, plugin, data_type)

    # Where to save this node
    fn = os.path.join(graph_dir, data_type)
    graph_tree.render(fn)
    with open(f"{fn}.svg", mode="r", encoding="utf-8") as f:
        # The first five lines of the rendered file are dropped
        # (presumably the XML/DOCTYPE preamble -- TODO confirm).
        svg = add_spaces(f.readlines()[5:])

    config_df = _config_table(st, data_type)

    return template.format(
        p=plugin,
        module=str(plugin.__module__).replace(".", "/"),
        svg=svg,
        data_type=data_type,
        columns=add_spaces(st.data_info(data_type).to_html(index=False)),
        kind=plugin.data_kind_for(data_type),
        docstring=plugin.__doc__ or "(no plugin description)",
        config_options=add_spaces(config_df.to_html(index=False)),
    )


def build_datastructure_doc():
    """Write one datastructure reference page per detector suffix.

    For every suffix in tree_suffices the file ``reference/datastructure{suffix}.rst`` is
    written next to this script. A page consists of ``page_header`` followed by one
    ``template`` section per data type; data types whose plugin count is largest come first.

    The graphs are rendered into a temporary ``graphs{suffix}`` directory, which is removed
    again once the sections are built, also when building them fails.

    """
    # The html tables are created with DataFrame.to_html; lift the column width limit
    # so that long cells are not truncated.
    pd.set_option("display.max_colwidth", int(1e9))

    st = get_context()
    # Too lazy to write proper graph sorter
    # Make dictionary {total number of dependencies below -> list of plugins}
    plugins_by_deps = get_plugins_deps(st)

    # Make a page for each suffix ('' referring to TPC)
    for suffix in tree_suffices:
        out = page_header.format(title=titles[suffix], context="xenonnt_online")

        print(f"------------ {suffix} ------------")
        graph_dir = os.path.join(this_dir, f"graphs{suffix}")
        os.makedirs(graph_dir, exist_ok=True)
        try:
            for n_deps in sorted(plugins_by_deps[suffix], reverse=True):
                for this_data_type in plugins_by_deps[suffix][n_deps]:
                    out += _data_type_section(st, this_data_type, graph_dir)
        finally:
            # The rendered files are only needed while the sections are being built
            shutil.rmtree(graph_dir)

        page_path = os.path.join(this_dir, "reference", f"datastructure{suffix}.rst")
        with open(page_path, mode="w", encoding="utf-8") as f:
            f.write(out)


def tree_to_svg(graph_tree, save_as="data_kinds"):
    """Render a graph and return its svg as an indented string.

    The graph is rendered to ``save_as`` / ``{save_as}.svg``. Both files are removed again
    before returning, also when reading the svg fails.

    :param graph_tree: graph with a ``render`` method (graphviz.Digraph or graphviz.Graph)
    :param save_as: path (without extension) used for the temporary files
    :return: str, the svg passed through add_spaces

    """
    graph_tree.render(save_as)
    svg_path = f"{save_as}.svg"
    try:
        with open(svg_path, mode="r", encoding="utf-8") as f:
            # The first five lines of the rendered file are dropped
            # (presumably the XML/DOCTYPE preamble -- TODO confirm).
            svg = add_spaces(f.readlines()[5:])
    finally:
        os.remove(svg_path)
        os.remove(save_as)
    return svg


def write_data_kind_dep_tree():
    """Write the ``reference/data_kinds.rst`` page next to this script.

    The page consists of ``data_kinds_header``, holding a graph of the dependencies between
    the data kinds, followed by one section per data kind that lists and draws the data
    types of that kind. The sections are ordered by decreasing number of data types.

    """
    print("------------ data kinds ------------")
    st = get_context()

    def get_plugin(pov):
        return st._get_plugins((pov,), "0")[pov]

    def kind_of(plugin, data_type):
        # plugin.data_kind is either the kind itself or a mapping {data type: kind}
        kind = plugin.data_kind
        if isinstance(kind, (dict, immutabledict)):
            kind = kind[data_type]
        return kind

    def anchor_of(kind):
        # Link target of the section of this data kind on the page
        return "#" + kind.replace("_", "-") + "-data-kind"

    # {data kind: set of data kinds it depends on}
    tree = defaultdict(set)
    # {data kind: list of data types of this kind}
    data_kinds = defaultdict(list)
    for data_type in st._plugin_class_registry.keys():
        this_plugin = get_plugin(data_type)
        this_data_kind = kind_of(this_plugin, data_type)
        depends_on = {
            kind_of(get_plugin(dep), dep) for dep in strax.to_str_tuple(this_plugin.depends_on)
        }

        for k in strax.to_str_tuple(this_data_kind):
            tree[k] |= depends_on
        data_kinds[this_data_kind].append(data_type)

    graph_tree = graphviz.Digraph(format="svg")
    graph_tree.attr(rankdir="RL")
    for data_kind, kind_deps in tree.items():
        graph_tree.node(
            data_kind,
            style="filled",
            href=anchor_of(data_kind),
            fillcolor=kind_colors.get(data_kind, "grey"),
            shape="box3d",
        )
        # Sets of strings are not iterated in a reproducible order between interpreter
        # runs; sort so that the same graph source is produced every time.
        for d in sorted(kind_deps):
            graph_tree.edge(data_kind, d)

    svg = tree_to_svg(graph_tree, save_as="data_kinds")
    output = data_kinds_header.format(svg=svg)

    # Sort by largest first (ties are ordered by name)
    for data_kind in sorted(data_kinds, key=lambda kind: (-len(data_kinds[kind]), kind)):
        data_types = data_kinds[data_kind]
        kind_node = data_kind + "-data-kind"
        graph_tree = graphviz.Graph(format="svg")
        graph_tree.attr(rankdir="LR")
        graph_tree.node(
            kind_node,
            style="filled",
            href=anchor_of(data_kind),
            fillcolor=kind_colors.get(data_kind, "grey"),
            shape="box3d",
        )

        for dtype in data_types:
            graph_tree.node(
                dtype,
                style="filled",
                href=anchor_of(data_kind),
                fillcolor=kind_colors.get(data_kind, "grey"),
            )
            graph_tree.edge(kind_node, dtype)

        # Insert the list and the svg directly. Running str.format over the accumulated
        # page would fail as soon as an svg or a name contains a literal brace.
        type_list = "".join(f"\n - ``{d}``" for d in data_types)
        kind_svg = tree_to_svg(graph_tree, save_as=f"{data_kind}_kind")
        output += f"""

{data_kind}-data kind
--------------------------------------------------------
The ``{data_kind}``-data kind includes the following data types:
{type_list}

.. raw:: html


{kind_svg}


        """

    page_path = os.path.join(this_dir, "reference", "data_kinds.rst")
    with open(page_path, mode="w", encoding="utf-8") as f:
        f.write(output)
    # Sanity check that the page was written
    assert os.path.exists(page_path)


if __name__ == "__main__":
    # When executed as a script: first write the data-kinds page, then the
    # datastructure pages for all suffixes.
    write_data_kind_dep_tree()
    build_datastructure_doc()