feat: implement parser for type derivation programs #43

Merged · 11 commits · Sep 6, 2022
9 changes: 9 additions & 0 deletions .github/workflows/misc.yml
@@ -28,6 +28,15 @@ jobs:
      - name: Check version update patch file
        run: python3 ci/version.py check

  verify-antlr:
    name: Verify ANTLR-generated code
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Check
        working-directory: rs/antlr
        run: python3 generate.py --ci

  commitlint:
    name: Lint commits for semantic-release
    runs-on: ubuntu-latest
45 changes: 45 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

4 changes: 4 additions & 0 deletions proto/substrait/validator/validator.proto
@@ -73,6 +73,10 @@ message Node {
    // represents the parse result of the referred file.
    string resolved_uri = 9;

    // This node represents an abstract syntax tree node, used for representing
    // complex YAML string parse results.
    google.protobuf.Empty ast_node = 10;

    // No longer used. The more generic ResolvedUri type is used instead.
    YamlReference yaml_reference = 5 [deprecated = true];
  }
3 changes: 3 additions & 0 deletions rs/Cargo.toml
@@ -46,6 +46,9 @@ serde_json = "1"
# being useful elsewhere too).
regex = "1.5"

# Used for the type derivation DSL.
antlr-rust = "0.3.0-beta"

# Used for checking URI syntax.
uriparse = "0.6"

2 changes: 2 additions & 0 deletions rs/antlr/.gitignore
@@ -0,0 +1,2 @@
antlr.jar
__pycache__
10 changes: 10 additions & 0 deletions rs/antlr/README.md
@@ -0,0 +1,10 @@
# ANTLR code generation logic

The validator includes a parser for type expressions based on an ANTLR grammar.
Unfortunately, the ANTLR code generator is written in Java, and would thus add
a huge build dependency (a JRE) to the validator build environment. This is
especially problematic for the distribution of Cargo crates, which are
fundamentally source distributions that should not depend on anything other
than other Rust crates. Therefore, the generated files are checked into git
and distributed with the crate, so regeneration must be done manually by
running the generate.py script.
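
For example, from this directory (the CI job runs the `--ci` form to check
that the checked-in files are still up to date):

```sh
python3 generate.py        # regenerate the checked-in parser sources
python3 generate.py --ci   # verify that they match the grammar (used in CI)
```

The ANTLR jar itself is downloaded automatically; a Java runtime is required
to run it.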
179 changes: 179 additions & 0 deletions rs/antlr/antlr.py
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0

"""Wrapper script to download and run a suitable version of ANTLR for
generating or verifying the Rust bindings for a given grammar."""

import urllib.request
import os
import sys
import hashlib
import logging
import tempfile
import shutil
import filecmp
import subprocess
import difflib
import argparse


# NOTE: the Rust bindings for ANTLR are not (yet) official, so we need to
# download a forked ANTLR build.
ANTLR_URL = "https://github.com/rrevenantt/antlr4rust/releases/download/antlr4-4.8-2-Rust0.3.0-beta/antlr4-4.8-2-SNAPSHOT-complete.jar"
ANTLR_SHA1 = "775d24ac1ad5df1eb0ed0e802f0fb2a5aeace43c"


class Failure(Exception):
    """Used for fatal errors."""


def fail(msg):
    """Logs and throws an error message."""
    logging.error(msg)
    raise Failure(msg)


def download_file(fname, url):
    """Downloads a file if it does not already exist."""
    if not os.path.isfile(fname):
        logging.info(f"Downloading {fname}...")
        urllib.request.urlretrieve(url, fname)


def verify_file_hash(fname, hash_str):
    """Verifies the hash of a (downloaded) file."""
    logging.info(f"Verifying {fname}...")
    with open(fname, "rb") as f:
        file_hash = hashlib.sha1()
        while chunk := f.read(8192):
            file_hash.update(chunk)
    actual = file_hash.hexdigest()
    if hash_str != actual:
        fail(f"Verification failed; hash should be {hash_str} but was {actual}")


def verify_file_identical(new, old):
    """Verifies that two text files are identical, printing a diff if not."""
    logging.info(f"Verifying {new} against {old}...")
    if not os.path.isfile(new):
        fail(f"{new} does not exist")
    if not os.path.isfile(old):
        fail(f"{old} does not exist")
    if not filecmp.cmp(new, old, shallow=False):
        with open(new, "r") as f:
            new_data = f.readlines()
        with open(old, "r") as f:
            old_data = f.readlines()
        sys.stdout.writelines(difflib.unified_diff(old_data, new_data, old, new))
        fail(f"{new} is different, see diff")


def run_antlr(antlr, grammar, output_dir, verify=False, java="java"):
    """Runs the given ANTLR JAR on the given grammar, sending outputs to
    output_dir. If verify is set, instead of copying the newly-generated files,
    this checks that there are no differences between the newly and previously
    generated files."""
    logging.info("Running ANTLR...")

    # Determine the names of the generated files that we're interested in.
    name = os.path.basename(grammar).split(".")[0].lower()
    expected_files = [f"{name}lexer.rs", f"{name}parser.rs", f"{name}listener.rs"]

    # Run in a temporary directory, because ANTLR spams random files we didn't
    # ask for in its working directory.
    with tempfile.TemporaryDirectory() as generate_dir:
        shutil.copyfile(grammar, os.path.join(generate_dir, os.path.basename(grammar)))
        subprocess.run(
            [
                java,
                "-jar",
                os.path.realpath(antlr),
                "-Dlanguage=Rust",
                os.path.basename(grammar),
            ],
            cwd=generate_dir,
        )

        logging.info("Copying/verifying output files...")
        for expected_file in expected_files:
            src = os.path.join(generate_dir, expected_file)
            dest = os.path.join(output_dir, expected_file)
            if not os.path.isfile(src):
                fail(f"ANTLR failed to generate {expected_file}")
            with open(src, "r+") as f:
                data = f.read()
                data = (
                    "// SPDX-License-Identifier: Apache-2.0\n"
                    "#![allow(clippy::all)]\n"
                    "#![cfg_attr(rustfmt, rustfmt_skip)]\n"
                    f"{data}"
                )
                f.seek(0)
                f.write(data)
            if verify:
                verify_file_identical(src, dest)
            else:
                if os.path.exists(dest):
                    os.unlink(dest)
                shutil.copyfile(src, dest)


def main(*args):
    """Utility to generate Rust bindings for an ANTLR grammar."""
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        "--antlr",
        metavar="antlr.jar",
        default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "antlr.jar"),
        help="alternate location for the ANTLR jar",
    )
    parser.add_argument(
        "--no-download",
        action="store_true",
        help="don't attempt to download the ANTLR jar",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        help="don't attempt to verify the hash of the ANTLR jar",
    )
    parser.add_argument(
        "--java", default="java", help="path to java executable to call ANTLR with"
    )
    parser.add_argument(
        "--ci-check",
        action="store_true",
        help="instead of regenerating the files, assert that the files do not need to be regenerated",
    )
    parser.add_argument("grammar", help="the .g4 grammar file to generate")
    parser.add_argument(
        "dest_dir", default=".", nargs="?", help="where to copy the generated files to"
    )
    args = parser.parse_args(args)

    logging.basicConfig(level=logging.INFO)

    # Acquire ANTLR jar.
    if args.no_download:
        if not os.path.isfile(args.antlr):
            parser.error(f"{args.antlr} does not exist and auto-download is disabled")
    else:
        download_file(args.antlr, ANTLR_URL)
        if not args.no_verify:
            verify_file_hash(args.antlr, ANTLR_SHA1)

    # Run ANTLR.
    if not os.path.isfile(args.grammar):
        parser.error(f"{args.grammar} does not exist")
    run_antlr(
        args.antlr, args.grammar, args.dest_dir, verify=args.ci_check, java=args.java
    )


if __name__ == "__main__":
    try:
        main(*sys.argv[1:])
        logging.info("Done")
    except Failure:
        logging.info("Returning failure exit status")
        sys.exit(1)
38 changes: 38 additions & 0 deletions rs/antlr/generate.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0

"""Script for regenerating or verifying all the ANTLR-generated files of the
validator."""

import sys
import logging
import argparse
import antlr


def main(*args):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--ci",
        action="store_true",
        help="instead of regenerating, verify that the files don't need to be regenerated",
    )
    args = parser.parse_args(args)

    logging.basicConfig(level=logging.INFO)

    ci = ["--ci-check"] if args.ci else []
    antlr.main(
        "../src/parse/extensions/simple/derivations/SubstraitType.g4",
        "../src/parse/extensions/simple/derivations",
        *ci,
    )


if __name__ == "__main__":
    try:
        main(*sys.argv[1:])
        logging.info("Done")
    except antlr.Failure:
        logging.info("Returning failure exit status")
        sys.exit(1)
1 change: 1 addition & 0 deletions rs/src/export/html/mod.rs
@@ -524,6 +524,7 @@ fn format_node_tree(
        tree::NodeType::YamlMap => format!("{brief} {}", format_span("type", "YAML map")),
        tree::NodeType::YamlArray => format!("{brief} {}", format_span("type", "YAML array")),
        tree::NodeType::YamlPrimitive(data) => format!("= {}{brief}", format_span("value", data)),
        tree::NodeType::AstNode => format!("{brief} {}", format_span("type", "AST node")),
    };
    let header = format!(
        "{} {value} {}",
1 change: 1 addition & 0 deletions rs/src/export/proto.rs
@@ -188,6 +188,7 @@ impl From<&tree::NodeType> for validator::node::NodeType {
                validator::node::NodeType::YamlPrimitive(data.into())
            }
            tree::NodeType::ResolvedUri(uri) => validator::node::NodeType::ResolvedUri(uri.clone()),
            tree::NodeType::AstNode => validator::node::NodeType::AstNode(()),
        }
    }
}