Skip to content

Commit

Permalink
feat: Output detailed lineage graph (#4533)
Browse files Browse the repository at this point in the history
  • Loading branch information
kgutwin authored Jun 5, 2024
1 parent cac9caf commit 1a3fc6f
Show file tree
Hide file tree
Showing 41 changed files with 740 additions and 35 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
- Join `side` parameter can take a reference that resolves to a literal (note:
this is an experimental feature which may change in the future) (@kgutwin,
#4499)
- Add `prqlc debug lineage` command to the CLI, creating an expression lineage
graph from a query (@kgutwin, #4533)

**Fixes**:

Expand Down
1 change: 1 addition & 0 deletions prqlc/bindings/prqlc-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ features = ["pyo3/extension-module"]

# The module is named `prqlc` rather than `prqlc-python`.
module-name = "prqlc"
python-source = "python"

[project.optional-dependencies]
dev = [
Expand Down
8 changes: 8 additions & 0 deletions prqlc/bindings/prqlc-python/python/prqlc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# ruff: noqa: F403, F405
#
# This is the default module init provided automatically by Maturin.
from .prqlc import *

__doc__ = prqlc.__doc__
if hasattr(prqlc, "__all__"):
__all__ = prqlc.__all__
File renamed without changes.
2 changes: 2 additions & 0 deletions prqlc/bindings/prqlc-python/python/prqlc/debug.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def prql_lineage(prql_query: str) -> str: ...
def pl_to_lineage(pl_json: str) -> str: ...
Empty file.
9 changes: 9 additions & 0 deletions prqlc/bindings/prqlc-python/python/tests/test_all.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import prqlc


Expand Down Expand Up @@ -80,3 +82,10 @@ def test_compile_options() -> None:
assert res.startswith(
"SELECT\n *\nFROM\n a\nORDER BY\n (\n SELECT\n NULL\n ) OFFSET 0 ROWS\nFETCH FIRST\n 3 ROWS ONLY"
)


def test_debug_functions() -> None:
prql_query = "from invoices | select { id, customer_id }"

lineage = json.loads(prqlc.debug.prql_lineage(prql_query))
assert lineage.keys() == {"frames", "nodes", "ast"}
44 changes: 44 additions & 0 deletions prqlc/bindings/prqlc-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,45 @@ pub fn rq_to_sql(rq_json: &str, options: Option<CompileOptions>) -> PyResult<Str
.map_err(|err| (PyErr::new::<exceptions::PyValueError, _>(err.to_json())))
}

mod debug {
use super::*;

#[pyfunction]
pub fn prql_lineage(prql_query: &str) -> PyResult<String> {
prqlc_lib::prql_to_pl(prql_query)
.and_then(prqlc_lib::debug::pl_to_lineage)
.and_then(|x| prqlc_lib::debug::json::from_lineage(&x))
.map_err(|err| (PyErr::new::<exceptions::PyValueError, _>(err.to_json())))
}

#[pyfunction]
pub fn pl_to_lineage(pl_json: &str) -> PyResult<String> {
prqlc_lib::json::to_pl(pl_json)
.and_then(prqlc_lib::debug::pl_to_lineage)
.and_then(|x| prqlc_lib::debug::json::from_lineage(&x))
.map_err(|err| (PyErr::new::<exceptions::PyValueError, _>(err.to_json())))
}
}

#[pymodule]
fn prqlc(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(compile, m)?)?;
m.add_function(wrap_pyfunction!(prql_to_pl, m)?)?;
m.add_function(wrap_pyfunction!(pl_to_rq, m)?)?;
m.add_function(wrap_pyfunction!(rq_to_sql, m)?)?;
m.add_function(wrap_pyfunction!(get_targets, m)?)?;

m.add_class::<CompileOptions>()?;
// From https://github.com/PyO3/maturin/issues/100
m.add("__version__", env!("CARGO_PKG_VERSION"))?;

// add debug submodule
let debug_module = PyModule::new(_py, "debug")?;
debug_module.add_function(wrap_pyfunction!(debug::prql_lineage, debug_module)?)?;
debug_module.add_function(wrap_pyfunction!(debug::pl_to_lineage, debug_module)?)?;

m.add_submodule(debug_module)?;

Ok(())
}

Expand Down Expand Up @@ -183,4 +211,20 @@ mod test {
id IN (1, 2, 3)
"###);
}

#[test]
fn debug_prql_lineage() {
assert_snapshot!(
debug::prql_lineage(r#"from a"#).unwrap(),
@r#"{"frames":[],"nodes":[{"id":115,"kind":"Ident","span":"1:0-6","ident":{"Ident":["default_db","a"]}}],"ast":{"name":"Project","stmts":[{"VarDef":{"kind":"Main","name":"main","value":{"FuncCall":{"name":{"Ident":"from"},"args":[{"Ident":"a"}]}}},"span":"1:0-6"}]}}"#
);
}

#[test]
fn debug_pl_to_lineage() {
assert_snapshot!(
prql_to_pl(r#"from a"#).and_then(|x| debug::pl_to_lineage(&x)).unwrap(),
@r#"{"frames":[],"nodes":[{"id":115,"kind":"Ident","ident":{"Ident":["default_db","a"]}}],"ast":{"name":"Project","stmts":[{"VarDef":{"kind":"Main","name":"main","value":{"FuncCall":{"name":{"Ident":"from"},"args":[{"Ident":"a"}]}}},"span":"1:0-6"}]}}"#
);
}
}
58 changes: 56 additions & 2 deletions prqlc/prqlc/src/cli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use clio::has_extension;
use clio::Output;
use is_terminal::IsTerminal;
use itertools::Itertools;
use prqlc::debug::pl_to_lineage;
use prqlc::semantic;
use prqlc::semantic::reporting::{collect_frames, label_references};
use prqlc::semantic::NS_DEFAULT_DB;
Expand Down Expand Up @@ -173,7 +174,7 @@ enum Command {

/// Commands for meant for debugging, prone to change
#[derive(Subcommand, Debug, Clone)]
pub enum DebugCommand {
enum DebugCommand {
/// Parse & and expand into PL, but don't resolve
ExpandPL(IoArgs),

Expand All @@ -189,6 +190,48 @@ pub enum DebugCommand {
/// Parse, resolve & combine source with comments annotating relation type
Annotate(IoArgs),

/// Output column-level lineage graph
///
/// The returned data includes:
///
/// * "frames": a list of Span and Lineage records corresponding to each
/// transformation frame in the main pipeline.
///
/// * "nodes": a list of expression graph nodes.
///
/// * "ast": the parsed PL abstract syntax tree.
///
/// Each expression node has attributes:
///
/// * "id": A unique ID for each expression.
///
/// * "kind": Descriptive text about the expression type.
///
/// * "span": Position of the expression in the original source (optional).
///
/// * "alias": When this expression is part of a Tuple, this is its alias
/// (optional).
///
/// * "ident": When this expression is an Ident, this is its reference
/// (optional).
///
/// * "targets": Any upstream sources of data for this expression, as a list
/// of node IDs (optional).
///
/// * "children": A list of expression IDs contained within this expression
/// (optional).
///
/// * "parent": The expression ID that contains this expression (optional).
///
/// A Python script for rendering this output as a GraphViz visualization is
/// available at https://gist.github.com/kgutwin/efe5f03df5ff930d899249018a0a551b.
Lineage {
#[command(flatten)]
io_args: IoArgs,
#[arg(value_enum, long, default_value = "yaml")]
format: Format,
},

/// Print info about the AST data structure
Ast,
}
Expand Down Expand Up @@ -406,14 +449,23 @@ impl Command {
let ctx = semantic::resolve(root_mod, Default::default())?;

let frames = if let Ok((main, _)) = ctx.find_main_rel(&[]) {
collect_frames(*main.clone().into_relation_var().unwrap())
collect_frames(*main.clone().into_relation_var().unwrap()).frames
} else {
vec![]
};

// combine with source
combine_prql_and_frames(&source, frames).as_bytes().to_vec()
}
Command::Debug(DebugCommand::Lineage { format, .. }) => {
let stmts = prql_to_pl_tree(sources)?;
let fc = pl_to_lineage(stmts)?;

match format {
Format::Json => serde_json::to_string_pretty(&fc)?.into_bytes(),
Format::Yaml => serde_yaml::to_string(&fc)?.into_bytes(),
}
}
Command::Debug(DebugCommand::Eval(_)) => {
let root_mod = prql_to_pl_tree(sources)?;

Expand Down Expand Up @@ -508,6 +560,7 @@ impl Command {
DebugCommand::Resolve(io_args)
| DebugCommand::ExpandPL(io_args)
| DebugCommand::Annotate(io_args)
| DebugCommand::Lineage { io_args, .. }
| DebugCommand::Eval(io_args),
) => io_args,
Experimental(ExperimentalCommand::GenerateDocs(io_args)) => io_args,
Expand Down Expand Up @@ -551,6 +604,7 @@ impl Command {
DebugCommand::Resolve(io_args)
| DebugCommand::ExpandPL(io_args)
| DebugCommand::Annotate(io_args)
| DebugCommand::Lineage { io_args, .. }
| DebugCommand::Eval(io_args),
) => io_args.output.clone(),
Experimental(ExperimentalCommand::GenerateDocs(io_args)) => io_args.output.clone(),
Expand Down
36 changes: 36 additions & 0 deletions prqlc/prqlc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,42 @@ impl<S: ToString> From<S> for SourceTree {
}
}

/// Debugging and unstable API functions
pub mod debug {
use super::*;

/// Create column-level lineage graph
pub fn pl_to_lineage(
pl: ast::ModuleDef,
) -> Result<semantic::reporting::FrameCollector, ErrorMessages> {
let ast = Some(pl.clone());

let root_module = semantic::resolve(pl, Default::default()).map_err(ErrorMessages::from)?;

let (main, _) = root_module.find_main_rel(&[]).unwrap();
let mut fc =
semantic::reporting::collect_frames(*main.clone().into_relation_var().unwrap());
fc.ast = ast;

Ok(fc)
}

pub mod json {
use super::*;

/// JSON serialization of FrameCollector lineage
pub fn from_lineage(
fc: &semantic::reporting::FrameCollector,
) -> Result<String, ErrorMessages> {
serde_json::to_string(fc).map_err(convert_json_err)
}

fn convert_json_err(err: serde_json::Error) -> ErrorMessages {
ErrorMessages::from(Error::new_simple(err.to_string()))
}
}
}

#[cfg(test)]
mod tests {
use std::str::FromStr;
Expand Down
Loading

0 comments on commit 1a3fc6f

Please sign in to comment.