diff --git a/Cargo.toml b/Cargo.toml index 5f2805d4..50f4c1e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ edition = "2021" # We need at least 1.65 for GATs[1] and 1.67 for `ilog`[2] # [1] https://blog.rust-lang.org/2022/11/03/Rust-1.65.0.html # [2] https://blog.rust-lang.org/2023/01/26/Rust-1.67.0.html#stabilized-apis -rust-version = "1.67" +rust-version = "1.77" [features] default = [] @@ -63,6 +63,7 @@ smallvec = { version = "1.9.0", features = ["const_generics"] } bumpalo = { version = "3.15.3", features = ["collections", "std"] } digest = { version = "0.9", optional = true } ice_code = "0.1.4" +rustc-hash = "2.0.0" sha2 = { version = "0.9", optional = true } serde = { version = "1.0", features = ["derive"], optional = true } serde_with = { version = "3.7.0", optional = true } diff --git a/benches/read_many_structs.rs b/benches/read_many_structs.rs index e2072fe3..508d477b 100644 --- a/benches/read_many_structs.rs +++ b/benches/read_many_structs.rs @@ -3,27 +3,260 @@ use criterion::{criterion_group, criterion_main}; #[cfg(not(feature = "experimental"))] mod benchmark { use criterion::Criterion; + pub fn criterion_benchmark(_c: &mut Criterion) { panic!("This benchmark requires the 'experimental' feature to work; try again with `--features experimental`"); } } +/// An Ion 1.1 test stream to benchmark. Each instance of this type contains data that was encoded +/// with different settings; for example, using more or fewer macros, or using length-prefixing. +#[allow(non_camel_case_types)] +pub struct TestData_1_1 { + name: String, + template_definition_text: String, + text_data: String, + binary_data: Vec, +} + +/// Produces an Ion 1.1 stream with `num_values` top-level e-expressions. Each e-expression invokes +/// a template that makes more extensive use of macros to minimize the size of each invocation. This +/// makes the stream much more compact, but comes at the expense of evaluation overhead when the +/// stream is read. 
If only a subset of the fields are read from each value, this overhead will be +/// minimal. +fn maximally_compact_1_1_data(num_values: usize) -> TestData_1_1 { + let template_definition_text: String = r#" + (macro event (timestamp thread_id thread_name client_num host_id parameters*) + { + 'timestamp': timestamp, + 'threadId': thread_id, + 'threadName': (make_string "scheduler-thread-" thread_name), + 'loggerName': "com.example.organization.product.component.ClassName", + 'logLevel': (literal INFO), + 'format': "Request status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", + 'parameters': [ + "SUCCESS", + (make_string "example-client-" client_num), + (make_string "aws-us-east-5f-" host_id), + parameters + ] + } + ) + "#.to_owned(); + + let text_1_1_data = r#"(:event 1670446800245 418 "6" "1" "abc-123" (: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); + + let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM + #[rustfmt::skip] + let mut binary_1_1_data_body: Vec<u8> = vec![ + 0x03, // Macro ID 3 + 0b10, // [NOTE: `0b`] `parameters*` arg is an arg group + 0x66, // 6-byte integer (`timestamp` param) + 0x75, 0x5D, 0x63, 0xEE, 0x84, 0x01, + 0x62, // 2-byte integer (`thread_id` param) + 0xA2, 0x01, + 0x91, // 1-byte string (`thread_name` param) + 0x36, + 0x91, // 1-byte string (`client_num` param) + 0x31, + 0x97, // 7-byte string (`host_id` param): "abc-123", matching the text stream + 0x61, 0x62, 0x63, 0x2D, 0x31, 0x32, 0x33, + 0x4D, // Arg group length prefix + 0x98, // 8-byte string + 0x72, 0x65, 0x67, 0x69, + 0x6F, 0x6E, 0x20, 0x34, + 0xF9, // Long-form, 27-byte string + 0x37, 0x32, 0x30, 0x32, + 0x32, 0x2D, 0x31, 0x32, + 0x2D, 0x30, 0x37, 0x54, + 0x32, 0x30, 0x3A, 0x35, + 0x39, 0x3A, 0x35, 0x39, + 0x2E, 0x37, 0x34, 0x34, + 0x30, 0x30, 0x30, 0x5A, + ].repeat(num_values); + binary_1_1_data.append(&mut binary_1_1_data_body); + TestData_1_1 { + name: "maximally compact".to_owned(), + template_definition_text, + text_data: text_1_1_data, + binary_data: 
binary_1_1_data, + } +} + +/// Produces an Ion 1.1 stream with `num_values` top-level e-expressions. Each e-expression invokes +/// a template that does not use additional macros. This makes the stream compact relative to its +/// Ion 1.0 equivalent, but not as compact as it is in the "maximally compact" configuration above. +/// The lighter use of macros means that there is less evaluation overhead at read time. +fn moderately_compact_1_1_data(num_values: usize) -> TestData_1_1 { + let template_definition_text = r#" + (macro event (timestamp thread_id thread_name client_num host_id parameters*) + { + 'timestamp': timestamp, + 'threadId': thread_id, + 'threadName': thread_name, + 'loggerName': "com.example.organization.product.component.ClassName", + 'logLevel': (literal INFO), + 'format': "Request status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", + 'parameters': [ + "SUCCESS", + client_num, + host_id, + parameters + ] + } + ) + "#; + + let text_1_1_data = r#"(:event 1670446800245 418 "scheduler-thread-6" "example-client-1" "aws-us-east-5f-abc-123" (: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); + let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM + #[rustfmt::skip] + let mut binary_1_1_data_body: Vec<u8> = vec![ + 0x03, // Macro ID 3 + 0b10, // [NOTE: `0b` prefix] `parameters*` arg is an arg group + 0x66, // 6-byte integer (`timestamp` param) + 0x75, 0x5D, 0x63, 0xEE, 0x84, 0x01, + 0x62, // 2-byte integer (`thread_id` param) + 0xA2, 0x01, + 0xF9, // long-form string (`thread_name` param) + 0x25, // FlexUInt byte length 18 + // "scheduler-thread-6" + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6C, 0x65, 0x72, 0x2D, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x2D, 0x36, + 0xF9, // long-form string (`client_num` param) + 0x21, // FlexUInt byte length 16 + // "example-client-1" + 0x65, 0x78, 0x61, 0x6D, 0x70, 0x6C, 0x65, 0x2D, 0x63, 0x6C, 0x69, 0x65, 0x6E, 0x74, 0x2D, 0x31, + 0xF9, // long-form string (`host_id` param) + 0x2D, // FlexUInt byte 
length 22 + // "aws-us-east-5f-abc-123" + 0x61, 0x77, 0x73, 0x2D, 0x75, 0x73, + 0x2D, 0x65, 0x61, 0x73, 0x74, 0x2D, + 0x35, 0x66, 0x2D, 0x61, 0x62, 0x63, 0x2D, 0x31, 0x32, 0x33, + 0x4D, // Arg group length prefix + 0x98, // 8-byte string + 0x72, 0x65, 0x67, 0x69, + 0x6F, 0x6E, 0x20, 0x34, + 0xF9, // Long-form, 27-byte string + 0x37, 0x32, 0x30, 0x32, + 0x32, 0x2D, 0x31, 0x32, + 0x2D, 0x30, 0x37, 0x54, + 0x32, 0x30, 0x3A, 0x35, + 0x39, 0x3A, 0x35, 0x39, + 0x2E, 0x37, 0x34, 0x34, + 0x30, 0x30, 0x30, 0x5A, + ].repeat(num_values); + + binary_1_1_data.append(&mut binary_1_1_data_body); + TestData_1_1 { + name: "moderately compact".to_owned(), + template_definition_text: template_definition_text.to_owned(), + text_data: text_1_1_data, + binary_data: binary_1_1_data, + } +} + +/// Like `moderately_compact_1_1_data` above, but each top-level e-expression in the stream is +/// length-prefixed. This allows the reader to step over e-expressions without fully parsing them, +/// making top-level skip-scanning highly efficient at the expense of 1-2 extra bytes per +/// e-expression. 
+fn length_prefixed_moderately_compact_1_1_data(num_values: usize) -> TestData_1_1 { + let template_definition_text = r#" + (macro event (timestamp thread_id thread_name client_num host_id parameters*) + { + 'timestamp': timestamp, + 'threadId': thread_id, + 'threadName': thread_name, + 'loggerName': "com.example.organization.product.component.ClassName", + 'logLevel': (literal INFO), + 'format': "Request status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", + 'parameters': [ + "SUCCESS", + client_num, + host_id, + parameters + ] + } + ) + "#; + + let text_1_1_data = r#"(:event 1670446800245 418 "scheduler-thread-6" "example-client-1" "aws-us-east-5f-abc-123" (: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); + let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM + #[rustfmt::skip] + let mut binary_1_1_data_body: Vec<u8> = vec![ + 0xF5, // LP invocation + 0x07, // Macro ID 3 + 0xE1, // Length prefix: FlexUInt 112 + 0b10, // [NOTE: `0b` prefix] `parameters*` arg is an arg group + 0x66, // 6-byte integer (`timestamp` param) + 0x75, 0x5D, 0x63, 0xEE, 0x84, 0x01, + 0x62, // 2-byte integer (`thread_id` param) + 0xA2, 0x01, + 0xF9, // long-form string (`thread_name` param) + 0x25, // FlexUInt byte length 18 + // "scheduler-thread-6" + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6C, 0x65, 0x72, 0x2D, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x2D, 0x36, + 0xF9, // long-form string (`client_num` param) + 0x21, // FlexUInt byte length 16 + // "example-client-1" + 0x65, 0x78, 0x61, 0x6D, 0x70, 0x6C, 0x65, 0x2D, 0x63, 0x6C, 0x69, 0x65, 0x6E, 0x74, 0x2D, 0x31, + 0xF9, // long-form string (`host_id` param) + 0x2D, // FlexUInt byte length 22 + // "aws-us-east-5f-abc-123" + 0x61, 0x77, 0x73, 0x2D, 0x75, 0x73, + 0x2D, 0x65, 0x61, 0x73, 0x74, 0x2D, + 0x35, 0x66, 0x2D, 0x61, 0x62, 0x63, 0x2D, 0x31, 0x32, 0x33, + 0x4D, // Arg group length prefix + 0x98, // 8-byte string + 0x72, 0x65, 0x67, 0x69, + 0x6F, 0x6E, 0x20, 0x34, + 0xF9, // Long-form, 27-byte string + 
0x37, 0x32, 0x30, 0x32, + 0x32, 0x2D, 0x31, 0x32, + 0x2D, 0x30, 0x37, 0x54, + 0x32, 0x30, 0x3A, 0x35, + 0x39, 0x3A, 0x35, 0x39, + 0x2E, 0x37, 0x34, 0x34, + 0x30, 0x30, 0x30, 0x5A, + ].repeat(num_values); + + binary_1_1_data.append(&mut binary_1_1_data_body); + TestData_1_1 { + name: "moderately compact w/length-prefixed top level".to_owned(), + template_definition_text: template_definition_text.to_owned(), + text_data: text_1_1_data, + binary_data: binary_1_1_data, + } +} + #[cfg(feature = "experimental")] mod benchmark { use criterion::{black_box, Criterion}; - use ion_rs::{v1_0, v1_1, ElementReader, Encoding, IonData, Reader, WriteConfig}; + + use crate::{ + length_prefixed_moderately_compact_1_1_data, maximally_compact_1_1_data, + moderately_compact_1_1_data, TestData_1_1, + }; + use ion_rs::{ + v1_0, v1_1, ElementReader, Encoding, EncodingContext, IonData, IonVersion, + LazyRawBinaryReader_1_1, RawEExpression, RawStreamItem, Reader, Sequence, TemplateCompiler, + ValueExpr, WriteConfig, + }; use ion_rs::{Decoder, Element, IonResult, LazyStruct, LazyValue, ValueRef}; - fn rewrite_as( - pretty_ion: &str, - config: impl Into>, - ) -> IonResult> { - let values = Element::read_all(pretty_ion).unwrap(); - let mut buffer = Vec::new(); - values.encode_to(&mut buffer, config)?; - Ok(buffer) + /// The entrypoint for the benchmark. + pub fn criterion_benchmark(c: &mut Criterion) { + const NUM_VALUES: usize = 10_000; + let seq_1_0 = benchmark_1_0(c, NUM_VALUES).unwrap(); + benchmark_1_1(c, &seq_1_0, maximally_compact_1_1_data(NUM_VALUES)).unwrap(); + benchmark_1_1(c, &seq_1_0, moderately_compact_1_1_data(NUM_VALUES)).unwrap(); + benchmark_1_1( + c, + &seq_1_0, + length_prefixed_moderately_compact_1_1_data(NUM_VALUES), + ) + .unwrap(); } + /// Reads this value and, if it's a container, any nested values. Returns the number of values read. fn count_value_and_children(lazy_value: &LazyValue<'_, D>) -> IonResult { use ValueRef::*; let child_count = match lazy_value.read()? 
{ @@ -38,6 +271,7 @@ mod benchmark { Ok(1 + child_count) } + /// Reads the child values of a list or s-expression. Returns the number of values read. fn count_sequence_children<'a, D: Decoder>( lazy_sequence: impl Iterator>>, ) -> IonResult { @@ -48,6 +282,7 @@ mod benchmark { Ok(count) } + /// Reads the field values of a struct. Returns the number of values read. fn count_struct_children(lazy_struct: &LazyStruct<'_, D>) -> IonResult { let mut count = 0; for field in lazy_struct { @@ -56,8 +291,10 @@ mod benchmark { Ok(count) } - pub fn criterion_benchmark(c: &mut Criterion) { - const NUM_VALUES: usize = 10_000; + /// Constructs and benchmarks the 'baseline' Ion 1.0 data stream with `num_values` top-level values. + /// Returns the materialized `Sequence` representation of the stream so other benchmarks can + /// confirm that they are reading Ion-equivalent data. + pub fn benchmark_1_0(c: &mut Criterion, num_values: usize) -> IonResult { let pretty_data_1_0 = r#"{ 'timestamp': 1670446800245, 'threadId': 418, @@ -65,48 +302,55 @@ mod benchmark { 'loggerName': "com.example.organization.product.component.ClassName", 'logLevel': INFO, 'format': "Request status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", - 'parameters': ["SUCCESS","example-client-1","aws-us-east-5f-18b4fa","region 4","2022-12-07T20:59:59.744000Z",], - }"#.repeat(NUM_VALUES); + 'parameters': ["SUCCESS","example-client-1","aws-us-east-5f-abc-123","region 4","2022-12-07T20:59:59.744000Z",], + }"#.repeat(num_values); let text_1_0_data = rewrite_as(&pretty_data_1_0, v1_0::Text).unwrap(); let binary_1_0_data = rewrite_as(&pretty_data_1_0, v1_0::Binary).unwrap(); - let template_text = r#" - (macro event (timestamp thread_id thread_name client_num host_id parameters) - { - 'timestamp': timestamp, - 'threadId': thread_id, - 'threadName': (make_string "scheduler-thread-" thread_name), - 'loggerName': "com.example.organization.product.component.ClassName", - 'logLevel': (quote INFO), - 
'format': "Request status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", - 'parameters': [ - "SUCCESS", - (make_string "example-client-" client_num), - (make_string "aws-us-east-5f-" host_id), - parameters - ] - } - ) - "#; - let text_1_1_data = r#"(:event 1670446800245 418 "6" "1" "18b4fa" (:values "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(NUM_VALUES); - - println!("Bin Ion 1.0 data size: {} bytes", binary_1_0_data.len()); println!("Text Ion 1.0 data size: {} bytes", text_1_0_data.len()); - println!("Text Ion 1.1 data size: {} bytes", text_1_1_data.len()); + println!("Bin Ion 1.0 data size: {} bytes", binary_1_0_data.len()); - // As a sanity check, materialize the data from both the Ion 1.0 and 1.1 streams and make sure - // that they are equivalent before we start measuring the time needed to read them. + // Load the Ion 1.0 values into a Sequence. We'll compare our Ion 1.1 streams' data to this + // sequence to make sure that all of the tests are working on equivalent data. let seq_1_0 = Reader::new(v1_1::Text, text_1_0_data.as_slice()) .unwrap() - .read_all_elements() - .unwrap(); - let mut reader_1_1 = Reader::new(v1_1::Text, text_1_1_data.as_bytes()).unwrap(); - reader_1_1.register_template(template_text).unwrap(); - let seq_1_1 = reader_1_1.read_all_elements().unwrap(); - assert!( - IonData::eq(&seq_1_0, &seq_1_1), - "Ion 1.0 sequence was not equal to the Ion 1.1 sequence" - ); + .read_all_elements()?; + + let mut text_1_0_group = c.benchmark_group("text 1.0"); + // Visit each top level value in the stream without reading it. + text_1_0_group.bench_function("scan all", |b| { + b.iter(|| { + let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + while let Some(item) = reader.next().unwrap() { + black_box(item); + } + }) + }); + // Read every value in the stream, however deeply nested. 
+ text_1_0_group.bench_function("read all", |b| { + b.iter(|| { + let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + let mut num_values = 0usize; + while let Some(item) = reader.next().unwrap() { + num_values += count_value_and_children(&item).unwrap(); + } + let _ = black_box(num_values); + }) + }); + // Read the 'format' field from each top-level struct in the stream. + text_1_0_group.bench_function("read 'format' field", |b| { + b.iter(|| { + let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + let mut num_values = 0usize; + while let Some(value) = reader.next().unwrap() { + let s = value.read().unwrap().expect_struct().unwrap(); + let parameters_list = s.find_expected("format").unwrap(); + num_values += count_value_and_children(&parameters_list).unwrap(); + } + let _ = black_box(num_values); + }) + }); + text_1_0_group.finish(); let mut binary_1_0_group = c.benchmark_group("binary 1.0"); binary_1_0_group.bench_function("scan all", |b| { @@ -127,20 +371,138 @@ mod benchmark { let _ = black_box(num_values); }) }); + binary_1_0_group.bench_function("read 'format' field", |b| { + b.iter(|| { + let mut reader = Reader::new(v1_0::Binary, binary_1_0_data.as_slice()).unwrap(); + let mut num_values = 0usize; + while let Some(value) = reader.next().unwrap() { + let s = value.read().unwrap().expect_struct().unwrap(); + let parameters_list = s.find_expected("format").unwrap(); + num_values += count_value_and_children(&parameters_list).unwrap(); + } + let _ = black_box(num_values); + }) + }); binary_1_0_group.finish(); - let mut text_1_0_group = c.benchmark_group("text 1.0"); - text_1_0_group.bench_function("scan all", |b| { + Ok(seq_1_0) + } + + /// Benchmarks reading the provided Ion 1.1-encoded test data using various access patterns. + /// Before benchmarking begins, tests to make sure that the Ion 1.1 data is Ion-equivalent to + /// the Ion 1.0-encoded sequence `seq_1_0`. 
+ pub fn benchmark_1_1( + c: &mut Criterion, + seq_1_0: &Sequence, + test_data_1_1: TestData_1_1, + ) -> IonResult<()> { + let text_1_1_data = test_data_1_1.text_data.as_str(); + let binary_1_1_data = test_data_1_1.binary_data.as_slice(); + let name = test_data_1_1.name.as_str(); + + let empty_context = EncodingContext::for_ion_version(IonVersion::v1_1); + let compiled_macro = TemplateCompiler::compile_from_text( + empty_context.get_ref(), + &test_data_1_1.template_definition_text, + ) + .unwrap(); + + println!("=== v1.1: {name} ==="); + println!("Binary data size: {} bytes", binary_1_1_data.len()); + println!("Text data size: {} bytes", text_1_1_data.len()); + + // === Binary equivalence check === + let mut reader_1_1 = Reader::new(v1_1::Binary, binary_1_1_data).unwrap(); + reader_1_1.register_template(compiled_macro.clone())?; + let seq_1_1 = reader_1_1.read_all_elements().unwrap(); + assert!( + IonData::eq(seq_1_0, &seq_1_1), + "{name} binary Ion 1.1 sequence was not equal to the original Ion 1.0 sequence" + ); + + // === Text equivalence check === + let mut reader_1_1 = Reader::new(v1_1::Text, text_1_1_data).unwrap(); + reader_1_1 + .register_template(compiled_macro.clone()) + .unwrap(); + let seq_1_1 = reader_1_1.read_all_elements().unwrap(); + assert!( + IonData::eq(seq_1_0, &seq_1_1), + "{name} text Ion 1.1 sequence was not equal to the original Ion 1.0 sequence" + ); + + // Reads each raw top-level e-expression in full without performing evaluation. This is an + // optional "fast path" for macros that are known at compile time; a program can access their + // component values without performing evaluation. + // TODO: The macro table should have a reasonable interface for 'intercepting' e-expressions + // before they are evaluated when a type knows how to interpret them without evaluation. 
+ let mut binary_1_1_group = c.benchmark_group(format!("{name} binary 1.1")); + binary_1_1_group.bench_function("read all from eexp", |b| { + let mut context = EncodingContext::for_ion_version(IonVersion::v1_1); + context + .macro_table_mut() + .add_macro(compiled_macro.clone()) + .unwrap(); + let context_ref = context.get_ref(); b.iter(|| { - let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + // We don't have an API for doing this with the application-level reader yet, so + // for now we use a manually configured context and a raw reader. + let mut reader = LazyRawBinaryReader_1_1::new(binary_1_1_data); + let mut num_top_level_values: usize = 0; + // Skip past the IVM + reader.next(context_ref).unwrap().expect_ivm().unwrap(); + // Expect every top-level item to be an e-expression. + while let RawStreamItem::EExp(raw_eexp) = reader.next(context_ref).unwrap() { + num_top_level_values += 1; + // Look up the e-expression's invoked macro ID in the encoding context. + let eexp = raw_eexp.resolve(context_ref).unwrap(); + // Visit and read all of the e-expression's arguments. + for arg in eexp.arguments() { + match arg.unwrap() { + // If the argument is a value literal, read it. + ValueExpr::ValueLiteral(value) => { + black_box(value.read_resolved().unwrap()); + } + // TODO: Support macro invocations (not just arg groups) as arguments in the benchmark + ValueExpr::MacroInvocation(macro_expr) => { + use ion_rs::MacroExprKind::*; + match macro_expr.kind() { + // If the argument is a group, read all of its contained expressions. 
+ EExpArgGroup(group) => { + for expr in group.expressions() { + match expr.unwrap() { + ValueExpr::ValueLiteral(value) => { + black_box(value.read_resolved().unwrap()); + } + ValueExpr::MacroInvocation(_) => { + todo!("arg groups of macro invocations in benchmark") + } + } + } + } + _ => todo!("other macro types as e-expr args in benchmark"), + } + } + }; + } + } + assert_eq!(num_top_level_values, seq_1_1.len()); + black_box(num_top_level_values); + }) + }); + binary_1_1_group.bench_function("scan all", |b| { + b.iter(|| { + let mut reader = Reader::new(v1_1::Binary, binary_1_1_data).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); while let Some(item) = reader.next().unwrap() { black_box(item); } }) }); - text_1_0_group.bench_function("read all", |b| { + binary_1_1_group.bench_function("read all", |b| { b.iter(|| { - let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + let mut reader = Reader::new(v1_1::Binary, binary_1_1_data).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); let mut num_values = 0usize; while let Some(item) = reader.next().unwrap() { num_values += count_value_and_children(&item).unwrap(); @@ -148,25 +510,26 @@ mod benchmark { let _ = black_box(num_values); }) }); - text_1_0_group.bench_function("read 'format' field", |b| { + binary_1_1_group.bench_function("read 'format' field", |b| { b.iter(|| { - let mut reader = Reader::new(v1_1::Text, text_1_0_data.as_slice()).unwrap(); + let mut reader = Reader::new(v1_1::Binary, binary_1_1_data).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); let mut num_values = 0usize; while let Some(value) = reader.next().unwrap() { let s = value.read().unwrap().expect_struct().unwrap(); - let parameters_list = s.find_expected("format").unwrap(); - num_values += count_value_and_children(&parameters_list).unwrap(); + let format_field_value = s.find_expected("format").unwrap(); + num_values += 
count_value_and_children(&format_field_value).unwrap(); } let _ = black_box(num_values); }) }); - text_1_0_group.finish(); + binary_1_1_group.finish(); - let mut text_1_1_group = c.benchmark_group("text 1.1"); + let mut text_1_1_group = c.benchmark_group(format!("{} text 1.1", &test_data_1_1.name)); text_1_1_group.bench_function("scan all", |b| { b.iter(|| { let mut reader = Reader::new(v1_1::Text, text_1_1_data.as_bytes()).unwrap(); - reader.register_template(template_text).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); while let Some(item) = reader.next().unwrap() { black_box(item); } @@ -175,7 +538,7 @@ mod benchmark { text_1_1_group.bench_function("read all", |b| { b.iter(|| { let mut reader = Reader::new(v1_1::Text, text_1_1_data.as_bytes()).unwrap(); - reader.register_template(template_text).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); let mut num_values = 0usize; while let Some(item) = reader.next().unwrap() { num_values += count_value_and_children(&item).unwrap(); @@ -186,7 +549,7 @@ mod benchmark { text_1_1_group.bench_function("read 'format' field", |b| { b.iter(|| { let mut reader = Reader::new(v1_1::Text, text_1_1_data.as_bytes()).unwrap(); - reader.register_template(template_text).unwrap(); + reader.register_template(compiled_macro.clone()).unwrap(); let mut num_values = 0usize; while let Some(value) = reader.next().unwrap() { let s = value.read().unwrap().expect_struct().unwrap(); @@ -197,6 +560,18 @@ mod benchmark { }) }); text_1_1_group.finish(); + Ok(()) + } + + /// Transcodes the provided text Ion using the specified `WriteConfig`. 
+ fn rewrite_as( + pretty_ion: &str, + config: impl Into>, + ) -> IonResult> { + let values = Element::read_all(pretty_ion).unwrap(); + let mut buffer = Vec::new(); + values.encode_to(&mut buffer, config)?; + Ok(buffer) } } diff --git a/benches/write_many_structs.rs b/benches/write_many_structs.rs index 13ebe77a..2d4455a7 100644 --- a/benches/write_many_structs.rs +++ b/benches/write_many_structs.rs @@ -42,7 +42,7 @@ mod benchmark { &[ black_box("SUCCESS"), black_box("example-client-1"), - black_box("aws-us-east-5f-18b4fa"), + black_box("aws-us-east-5f-abc-123"), black_box("region 4"), black_box("2022-12-07T20:59:59.744000Z"), ], @@ -73,7 +73,7 @@ mod benchmark { symbol_id(black_box(21)), // $22 = example-client-1 symbol_id(black_box(22)), - // $23 = aws-us-east-5f-18b4fa + // $23 = aws-us-east-5f-abc-123 symbol_id(black_box(23)), // $24 = region 4 symbol_id(black_box(24)), @@ -92,7 +92,7 @@ mod benchmark { // them wouldn't be beneficial. .write(black_box("6"))? // thread_name .write(black_box("1"))? // client_num - .write(symbol_id(black_box(10)))?; // host_id: "18b4fa" ($10) + .write(symbol_id(black_box(10)))?; // host_id: "abc-123" ($10) let mut nested_eexp = eexp.eexp_writer(1)?; nested_eexp // $11 = region 4 @@ -109,7 +109,7 @@ mod benchmark { .write(black_box(418))? // thread_id .write(black_box("6"))? // thread_name .write(black_box("1"))? // client_num - .write(black_box("18b4fa"))?; // host_id + .write(black_box("abc-123"))?; // host_id let mut nested_eexp = eexp.eexp_writer(1)?; nested_eexp .write(black_box("region 4"))? diff --git a/src/element/sequence.rs b/src/element/sequence.rs index 632612fe..fed8edd5 100644 --- a/src/element/sequence.rs +++ b/src/element/sequence.rs @@ -6,17 +6,34 @@ use crate::lazy::encoding::Encoding; use crate::write_config::WriteConfig; use crate::IonResult; use std::cmp::Ordering; +use std::fmt::{Debug, Formatter}; use std::io; /// An iterable, addressable series of Ion [`Element`]s. 
/// /// A `Sequence` is not itself an Ion value type, but can represent a series of Ion values appearing /// in a [`List`](crate::List), a [`SExp`](crate::SExp), or at the top level. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq)] pub struct Sequence { elements: Vec<Element>, } +impl Debug for Sequence { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Sequence<")?; + let mut is_first = true; + for element in self { + if is_first { + write!(f, "{element}")?; + is_first = false; + } else { + write!(f, ", {element}")?; + } + } + write!(f, ">") + } +} + impl Sequence { pub fn new<E: Into<Element>, I: IntoIterator<Item = E>>(elements: I) -> Sequence { let elements = elements.into_iter().map(|e| e.into()).collect(); diff --git a/src/lazy/any_encoding.rs b/src/lazy/any_encoding.rs index 606ec5b8..3a643c81 100644 --- a/src/lazy/any_encoding.rs +++ b/src/lazy/any_encoding.rs @@ -11,7 +11,9 @@ use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0; use crate::lazy::binary::raw::sequence::{ LazyRawBinaryList_1_0, LazyRawBinarySExp_1_0, RawBinarySequenceIterator_1_0, }; -use crate::lazy::binary::raw::v1_1::e_expression::RawBinaryEExpression_1_1; +use crate::lazy::binary::raw::v1_1::e_expression::{ + BinaryEExpArgGroup, BinaryEExpArgGroupIterator, BinaryEExpression_1_1, +}; use crate::lazy::binary::raw::v1_1::r#struct::{ LazyRawBinaryFieldName_1_1, LazyRawBinaryStruct_1_1, RawBinaryStructIterator_1_1, }; @@ -33,7 +35,11 @@ use crate::lazy::decoder::{ use crate::lazy::encoding::{ BinaryEncoding_1_0, BinaryEncoding_1_1, TextEncoding_1_0, TextEncoding_1_1, }; -use crate::lazy::expanded::macro_evaluator::RawEExpression; +use crate::lazy::expanded::e_expression::ArgGroup; +use crate::lazy::expanded::macro_evaluator::{ + EExpArgGroupIterator, EExpressionArgGroup, RawEExpression, +}; +use crate::lazy::expanded::template::ParameterEncoding; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::LazyRawStreamItem; use 
crate::lazy::raw_value_ref::RawValueRef; @@ -45,16 +51,19 @@ use crate::lazy::text::raw::reader::LazyRawTextReader_1_0; use crate::lazy::text::raw::sequence::{ LazyRawTextList_1_0, LazyRawTextSExp_1_0, RawTextListIterator_1_0, RawTextSExpIterator_1_0, }; +use crate::lazy::text::raw::v1_1::arg_group::{ + EExpArg, EExpArgExpr, TextEExpArgGroup, TextEExpArgGroupIterator, +}; use crate::lazy::text::raw::v1_1::reader::{ LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextReader_1_1, LazyRawTextSExp_1_1, - LazyRawTextStruct_1_1, MacroIdRef, RawTextEExpression_1_1, RawTextSequenceCacheIterator_1_1, - RawTextStructCacheIterator_1_1, + LazyRawTextStruct_1_1, MacroIdRef, RawTextSequenceCacheIterator_1_1, + RawTextStructCacheIterator_1_1, TextEExpression_1_1, }; use crate::lazy::text::value::{ LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker_1_0, LazyRawTextVersionMarker_1_1, RawTextAnnotationsIterator, }; -use crate::{Encoding, IonResult, IonType, RawSymbolRef}; +use crate::{try_next, Encoding, IonResult, IonType, RawStreamItem, RawSymbolRef}; /// An implementation of the `LazyDecoder` trait that can read any encoding of Ion. #[derive(Debug, Clone, Copy)] @@ -65,7 +74,6 @@ pub struct AnyEncoding; // underlying type. 
impl Decoder for AnyEncoding { type Reader<'data> = LazyRawAnyReader<'data>; - type ReaderSavedState = IonEncoding; type Value<'top> = LazyRawAnyValue<'top>; type SExp<'top> = LazyRawAnySExp<'top>; type List<'top> = LazyRawAnyList<'top>; @@ -126,13 +134,23 @@ impl<'top> HasRange for LazyRawAnyVersionMarker<'top> { } impl<'top> RawVersionMarker<'top> for LazyRawAnyVersionMarker<'top> { - fn version(&self) -> (u8, u8) { + fn major_minor(&self) -> (u8, u8) { use LazyRawAnyVersionMarkerKind::*; match self.encoding { - Text_1_0(marker) => marker.version(), - Binary_1_0(marker) => marker.version(), - Text_1_1(marker) => marker.version(), - Binary_1_1(marker) => marker.version(), + Text_1_0(marker) => marker.major_minor(), + Binary_1_0(marker) => marker.major_minor(), + Text_1_1(marker) => marker.major_minor(), + Binary_1_1(marker) => marker.major_minor(), + } + } + + fn stream_encoding_before_marker(&self) -> IonEncoding { + use LazyRawAnyVersionMarkerKind::*; + match self.encoding { + Text_1_0(_) => IonEncoding::Text_1_0, + Binary_1_0(_) => IonEncoding::Binary_1_0, + Text_1_1(_) => IonEncoding::Text_1_1, + Binary_1_1(_) => IonEncoding::Binary_1_1, } } } @@ -173,8 +191,8 @@ pub struct LazyRawAnyEExpression<'top> { #[derive(Debug, Copy, Clone)] pub enum LazyRawAnyEExpressionKind<'top> { - Text_1_1(RawTextEExpression_1_1<'top>), - Binary_1_1(RawBinaryEExpression_1_1<'top>), + Text_1_1(TextEExpression_1_1<'top>), + Binary_1_1(&'top BinaryEExpression_1_1<'top>), } impl<'top> LazyRawAnyEExpression<'top> { @@ -187,15 +205,15 @@ impl<'top> LazyRawAnyEExpression<'top> { } } -impl<'top> From> for LazyRawAnyEExpression<'top> { - fn from(text_invocation: RawTextEExpression_1_1<'top>) -> Self { +impl<'top> From> for LazyRawAnyEExpression<'top> { + fn from(text_invocation: TextEExpression_1_1<'top>) -> Self { LazyRawAnyEExpression { encoding: LazyRawAnyEExpressionKind::Text_1_1(text_invocation), } } } -impl<'top> From> for LazyRawAnyEExpression<'top> { - fn from(binary_invocation: 
RawBinaryEExpression_1_1<'top>) -> Self { +impl<'top> From<&'top BinaryEExpression_1_1<'top>> for LazyRawAnyEExpression<'top> { + fn from(binary_invocation: &'top BinaryEExpression_1_1<'top>) -> Self { LazyRawAnyEExpression { encoding: LazyRawAnyEExpressionKind::Binary_1_1(binary_invocation), } @@ -223,76 +241,179 @@ impl<'top> HasRange for LazyRawAnyEExpression<'top> { } impl<'top> RawEExpression<'top, AnyEncoding> for LazyRawAnyEExpression<'top> { - type RawArgumentsIterator<'a> = LazyRawAnyMacroArgsIterator<'top,> where Self: 'a; + type RawArgumentsIterator = AnyEExpArgsIterator<'top>; + type ArgGroup = AnyEExpArgGroup<'top>; - fn id(&self) -> MacroIdRef<'top> { + fn id(self) -> MacroIdRef<'top> { use LazyRawAnyEExpressionKind::*; match self.encoding { Text_1_1(ref m) => m.id(), - Binary_1_1(ref m) => m.id(), + Binary_1_1(m) => m.id(), } } - fn raw_arguments(&self) -> Self::RawArgumentsIterator<'_> { + fn raw_arguments(&self) -> Self::RawArgumentsIterator { use LazyRawAnyEExpressionKind::*; match self.encoding { - Text_1_1(e) => LazyRawAnyMacroArgsIterator { - encoding: LazyRawAnyMacroArgsIteratorKind::Text_1_1(e.raw_arguments()), + Text_1_1(e) => AnyEExpArgsIterator { + encoding: LazyRawAnyEExpArgsIteratorKind::Text_1_1(e.raw_arguments()), + }, + Binary_1_1(e) => AnyEExpArgsIterator { + encoding: LazyRawAnyEExpArgsIteratorKind::Binary_1_1(e.raw_arguments()), + }, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub struct AnyEExpArgGroup<'top> { + kind: AnyEExpArgGroupKind<'top>, +} + +#[derive(Copy, Clone, Debug)] +pub enum AnyEExpArgGroupKind<'top> { + Text_1_1(TextEExpArgGroup<'top>), + Binary_1_1(BinaryEExpArgGroup<'top>), +} + +impl<'top> HasRange for AnyEExpArgGroup<'top> { + fn range(&self) -> Range { + match self.kind { + AnyEExpArgGroupKind::Text_1_1(group) => group.range(), + AnyEExpArgGroupKind::Binary_1_1(group) => group.range(), + } + } +} + +impl<'top> HasSpan<'top> for AnyEExpArgGroup<'top> { + fn span(&self) -> Span<'top> { + match self.kind { + 
AnyEExpArgGroupKind::Text_1_1(group) => group.span(), + AnyEExpArgGroupKind::Binary_1_1(group) => group.span(), + } + } +} + +#[derive(Copy, Clone, Debug)] +pub struct AnyEExpArgGroupIterator<'top> { + kind: AnyEExpArgGroupIteratorKind<'top>, +} + +impl< + 'top, + D: Decoder = LazyRawAnyValue<'top>, EExp<'top> = LazyRawAnyEExpression<'top>>, + > EExpArgGroupIterator<'top, D> for AnyEExpArgGroupIterator<'top> +{ + fn is_exhausted(&self) -> bool { + match self.kind { + AnyEExpArgGroupIteratorKind::Text_1_1(ref i) => i.is_exhausted(), + AnyEExpArgGroupIteratorKind::Binary_1_1(ref i) => i.is_exhausted(), + } + } +} + +impl<'top> IntoIterator for AnyEExpArgGroup<'top> { + type Item = IonResult>; + type IntoIter = AnyEExpArgGroupIterator<'top>; + + fn into_iter(self) -> Self::IntoIter { + match self.kind { + AnyEExpArgGroupKind::Text_1_1(group) => AnyEExpArgGroupIterator { + kind: AnyEExpArgGroupIteratorKind::Text_1_1(group.into_iter()), }, - Binary_1_1(e) => LazyRawAnyMacroArgsIterator { - encoding: LazyRawAnyMacroArgsIteratorKind::Binary_1_1(e.raw_arguments()), + AnyEExpArgGroupKind::Binary_1_1(group) => AnyEExpArgGroupIterator { + kind: AnyEExpArgGroupIteratorKind::Binary_1_1(group.into_iter()), }, } } } -pub enum LazyRawAnyMacroArgsIteratorKind<'top> { +#[derive(Copy, Clone, Debug)] +pub enum AnyEExpArgGroupIteratorKind<'top> { + Text_1_1(TextEExpArgGroupIterator<'top>), + Binary_1_1(BinaryEExpArgGroupIterator<'top>), +} + +impl<'top> Iterator for AnyEExpArgGroupIterator<'top> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + match self.kind { + AnyEExpArgGroupIteratorKind::Text_1_1(ref mut i) => { + Some(Ok(try_next!(i.next()).into())) + } + AnyEExpArgGroupIteratorKind::Binary_1_1(ref mut i) => { + Some(Ok(try_next!(i.next()).into())) + } + } + } +} + +impl<'top> EExpressionArgGroup<'top, AnyEncoding> for AnyEExpArgGroup<'top> { + type Iterator = AnyEExpArgGroupIterator<'top>; + + fn encoding(&self) -> ParameterEncoding { + match self.kind { + 
AnyEExpArgGroupKind::Text_1_1(g) => g.encoding(), + AnyEExpArgGroupKind::Binary_1_1(g) => g.encoding(), + } + } + + fn resolve(self, context: EncodingContextRef<'top>) -> ArgGroup<'top, AnyEncoding> { + ArgGroup::new(self, context) + } +} + +#[derive(Copy, Clone, Debug)] +pub enum LazyRawAnyEExpArgsIteratorKind<'top> { Text_1_1( - as RawEExpression< + as RawEExpression< 'top, TextEncoding_1_1, - >>::RawArgumentsIterator<'top>, + >>::RawArgumentsIterator, ), Binary_1_1( - as RawEExpression< + <&'top BinaryEExpression_1_1<'top> as RawEExpression< 'top, BinaryEncoding_1_1, - >>::RawArgumentsIterator<'top>, + >>::RawArgumentsIterator, ), } -pub struct LazyRawAnyMacroArgsIterator<'top> { - encoding: LazyRawAnyMacroArgsIteratorKind<'top>, + +#[derive(Copy, Clone, Debug)] +pub struct AnyEExpArgsIterator<'top> { + encoding: LazyRawAnyEExpArgsIteratorKind<'top>, } -impl<'top> Iterator for LazyRawAnyMacroArgsIterator<'top> { - type Item = IonResult>; +impl<'top> Iterator for AnyEExpArgsIterator<'top> { + type Item = IonResult>; fn next(&mut self) -> Option { match &mut self.encoding { - LazyRawAnyMacroArgsIteratorKind::Text_1_1(ref mut iter) => match iter.next() { - Some(Ok(RawValueExpr::ValueLiteral(value))) => { - Some(Ok(RawValueExpr::ValueLiteral(LazyRawAnyValue::from(value)))) - } - Some(Ok(RawValueExpr::EExp(invocation))) => { - Some(Ok(RawValueExpr::EExp(LazyRawAnyEExpression { - encoding: LazyRawAnyEExpressionKind::Text_1_1(invocation), - }))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - }, - LazyRawAnyMacroArgsIteratorKind::Binary_1_1(ref mut iter) => match iter.next() { - Some(Ok(RawValueExpr::ValueLiteral(value))) => { - Some(Ok(RawValueExpr::ValueLiteral(LazyRawAnyValue::from(value)))) - } - Some(Ok(RawValueExpr::EExp(invocation))) => { - Some(Ok(RawValueExpr::EExp(LazyRawAnyEExpression { - encoding: LazyRawAnyEExpressionKind::Binary_1_1(invocation), - }))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - }, + 
LazyRawAnyEExpArgsIteratorKind::Text_1_1(ref mut iter) => { + let arg = try_next!(iter.next()); + use EExpArgExpr::*; + let any_expr = match arg.expr() { + ValueLiteral(v) => ValueLiteral(LazyRawAnyValue::from(*v)), + EExp(e) => EExp(LazyRawAnyEExpression::from(*e)), + ArgGroup(g) => ArgGroup(AnyEExpArgGroup { + kind: AnyEExpArgGroupKind::Text_1_1(*g), + }), + }; + Some(Ok(EExpArg::new(arg.encoding(), any_expr))) + } + LazyRawAnyEExpArgsIteratorKind::Binary_1_1(ref mut iter) => { + let arg = try_next!(iter.next()); + use EExpArgExpr::*; + let any_expr = match arg.expr() { + ValueLiteral(v) => ValueLiteral(LazyRawAnyValue::from(*v)), + EExp(e) => EExp(LazyRawAnyEExpression::from(*e)), + ArgGroup(g) => ArgGroup(AnyEExpArgGroup { + kind: AnyEExpArgGroupKind::Binary_1_1(*g), + }), + }; + Some(Ok(EExpArg::new(arg.encoding(), any_expr))) + } } } } @@ -301,7 +422,11 @@ impl<'top> Iterator for LazyRawAnyMacroArgsIterator<'top> { /// A lazy raw reader that can decode both text and binary Ion. pub struct LazyRawAnyReader<'data> { - encoding: RawReaderKind<'data>, + // If the reader encounters an IVM that changes the encoding, the new encoding will be stored + // here until `next()` is called again, at which point the reader will be swapped out for one + // that can read the new encoding. 
+ new_encoding: Option, + encoding_reader: RawReaderKind<'data>, } impl<'data> LazyRawAnyReader<'data> { @@ -316,6 +441,15 @@ impl<'data> LazyRawAnyReader<'data> { } } +impl<'data> From> for LazyRawAnyReader<'data> { + fn from(encoding: RawReaderKind<'data>) -> Self { + Self { + new_encoding: None, + encoding_reader: encoding, + } + } +} + pub enum RawReaderKind<'data> { Text_1_0(LazyRawTextReader_1_0<'data>), Binary_1_0(LazyRawBinaryReader_1_0<'data>), @@ -323,7 +457,39 @@ pub enum RawReaderKind<'data> { Binary_1_1(LazyRawBinaryReader_1_1<'data>), } -#[derive(Default, Debug, Copy, Clone)] +impl<'data> RawReaderKind<'data> { + fn resume_at_offset( + data: &'data [u8], + stream_offset: usize, + encoding_hint: IonEncoding, + ) -> RawReaderKind { + use IonEncoding::*; + match encoding_hint { + Text_1_0 => RawReaderKind::Text_1_0(LazyRawTextReader_1_0::resume_at_offset( + data, + stream_offset, + encoding_hint, + )), + Binary_1_0 => RawReaderKind::Binary_1_0(LazyRawBinaryReader_1_0::resume_at_offset( + data, + stream_offset, + encoding_hint, + )), + Text_1_1 => RawReaderKind::Text_1_1(LazyRawTextReader_1_1::resume_at_offset( + data, + stream_offset, + encoding_hint, + )), + Binary_1_1 => RawReaderKind::Binary_1_1(LazyRawBinaryReader_1_1::resume_at_offset( + data, + stream_offset, + encoding_hint, + )), + } + } +} + +#[derive(Default, Debug, Copy, Clone, PartialEq)] #[non_exhaustive] pub enum IonEncoding { // In the absence of a binary IVM, readers must assume Ion 1.0 text data until a @@ -356,79 +522,99 @@ impl IonEncoding { } } - pub fn version(&self) -> (u8, u8) { + pub fn version(&self) -> IonVersion { use IonEncoding::*; match self { - Text_1_0 | Binary_1_0 => (1, 0), - Text_1_1 | Binary_1_1 => (1, 1), + Text_1_0 | Binary_1_0 => IonVersion::v1_0, + Text_1_1 | Binary_1_1 => IonVersion::v1_1, + } + } +} + +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub enum IonVersion { + #[default] + v1_0, + v1_1, +} + +impl IonVersion { + pub fn major_minor(&self) -> (u8, 
u8) { + use IonVersion::*; + match self { + v1_0 => (1, 0), + v1_1 => (1, 1), } } } impl<'data> From> for LazyRawAnyReader<'data> { fn from(reader: LazyRawTextReader_1_0<'data>) -> Self { - LazyRawAnyReader { - encoding: RawReaderKind::Text_1_0(reader), - } + RawReaderKind::Text_1_0(reader).into() } } impl<'data> From> for LazyRawAnyReader<'data> { fn from(reader: LazyRawTextReader_1_1<'data>) -> Self { - LazyRawAnyReader { - encoding: RawReaderKind::Text_1_1(reader), - } + RawReaderKind::Text_1_1(reader).into() } } impl<'data> From> for LazyRawAnyReader<'data> { fn from(reader: LazyRawBinaryReader_1_0<'data>) -> Self { - LazyRawAnyReader { - encoding: RawReaderKind::Binary_1_0(reader), - } + RawReaderKind::Binary_1_0(reader).into() } } impl<'data> From> for LazyRawAnyReader<'data> { fn from(reader: LazyRawBinaryReader_1_1<'data>) -> Self { - LazyRawAnyReader { - encoding: RawReaderKind::Binary_1_1(reader), - } + RawReaderKind::Binary_1_1(reader).into() } } impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { fn new(data: &'data [u8]) -> Self { - let reader_type = Self::detect_encoding(data); - Self::resume_at_offset(data, 0, reader_type) + Self::resume_at_offset(data, 0, IonEncoding::default()) } - fn resume_at_offset( - data: &'data [u8], - offset: usize, - mut raw_reader_type: IonEncoding, - ) -> Self { + fn resume_at_offset(data: &'data [u8], offset: usize, mut encoding_hint: IonEncoding) -> Self { if offset == 0 { - // If we're at the beginning of the stream, the provided `raw_reader_type` may be a + // If we're at the beginning of the stream, the provided `encoding_hint` may be a // default. We need to inspect the bytes to see if we should override it. 
- raw_reader_type = Self::detect_encoding(data); + encoding_hint = Self::detect_encoding(data); } - match raw_reader_type { + match encoding_hint { IonEncoding::Text_1_0 => { - LazyRawTextReader_1_0::resume_at_offset(data, offset, ()).into() + LazyRawTextReader_1_0::resume_at_offset(data, offset, encoding_hint).into() } IonEncoding::Binary_1_0 => { - LazyRawBinaryReader_1_0::resume_at_offset(data, offset, ()).into() + LazyRawBinaryReader_1_0::resume_at_offset(data, offset, encoding_hint).into() } IonEncoding::Text_1_1 => { - LazyRawTextReader_1_0::resume_at_offset(data, offset, ()).into() + LazyRawTextReader_1_1::resume_at_offset(data, offset, encoding_hint).into() } IonEncoding::Binary_1_1 => { - LazyRawBinaryReader_1_1::resume_at_offset(data, offset, ()).into() + LazyRawBinaryReader_1_1::resume_at_offset(data, offset, encoding_hint).into() } } } + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + use RawReaderKind::*; + let (remaining_data, stream_offset, mut encoding) = match &self.encoding_reader { + Text_1_0(r) => r.stream_data(), + Binary_1_0(r) => r.stream_data(), + Text_1_1(r) => r.stream_data(), + Binary_1_1(r) => r.stream_data(), + }; + // If we hit an IVM that changed the encoding but we haven't changed our reader yet, + // we still want to report the new encoding. + if let Some(new_encoding) = self.new_encoding { + encoding = new_encoding; + } + (remaining_data, stream_offset, encoding) + } + fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, @@ -436,23 +622,41 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { where 'data: 'top, { + // If we previously ran into an IVM that changed the stream encoding, replace our reader + // with one that can read the new encoding. 
+ if let Some(new_encoding) = self.new_encoding.take() { + let (remaining_data, stream_offset, _) = self.stream_data(); + let new_encoding_reader = + RawReaderKind::resume_at_offset(remaining_data, stream_offset, new_encoding); + self.encoding_reader = new_encoding_reader; + } + use RawReaderKind::*; - match &mut self.encoding { - Text_1_0(r) => Ok(r.next(context)?.into()), - Binary_1_0(r) => Ok(r.next()?.into()), - Text_1_1(r) => Ok(r.next(context)?.into()), - Binary_1_1(r) => Ok(r.next(context)?.into()), + let item: LazyRawStreamItem = match &mut self.encoding_reader { + Text_1_0(r) => r.next(context)?.into(), + Binary_1_0(r) => r.next()?.into(), + Text_1_1(r) => r.next(context)?.into(), + Binary_1_1(r) => r.next(context)?.into(), + }; + + // If this item is an IVM: + // * the encoding context will be reset, but this is handled by higher-level readers. + // * the encoding itself may change, and we need to handle that at this level. + if let RawStreamItem::VersionMarker(ivm) = item { + let ivm_old_encoding = ivm.stream_encoding_before_marker(); + let ivm_new_encoding = ivm.stream_encoding_after_marker()?; + if ivm_new_encoding != ivm_old_encoding { + // Save the new encoding; when `next()` is called again, we'll make a new reader. + self.new_encoding = Some(ivm_new_encoding); + } } - } - #[inline] - fn save_state(&self) -> ::ReaderSavedState { - self.encoding() + Ok(item) } fn position(&self) -> usize { use RawReaderKind::*; - match &self.encoding { + match &self.encoding_reader { Text_1_0(r) => r.position(), Binary_1_0(r) => r.position(), Text_1_1(r) => r.position(), @@ -462,7 +666,16 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { fn encoding(&self) -> IonEncoding { use RawReaderKind::*; - match &self.encoding { + // If we hit an IVM that changed the encoding but we haven't changed our reader yet, + // we still want to report the new encoding. 
This is a niche case -- it can only arise + // when the reader has hit an IVM (in which case `next()` mutably borrowed the reader + // and `reader.encoding()` cannot be called) and then dropped the IVM. At that point, + // the reader is available again and has moved beyond the IVM, so the new encoding is in + // effect even though we have not encountered our first item in the new encoding. + if let Some(new_encoding) = self.new_encoding { + return new_encoding; + } + match &self.encoding_reader { Text_1_0(_) => IonEncoding::Text_1_0, Binary_1_0(_) => IonEncoding::Binary_1_0, Text_1_1(_) => IonEncoding::Text_1_1, @@ -501,7 +714,7 @@ pub enum LazyRawValueKind<'top> { Text_1_0(LazyRawTextValue_1_0<'top>), Binary_1_0(LazyRawBinaryValue_1_0<'top>), Text_1_1(LazyRawTextValue_1_1<'top>), - Binary_1_1(LazyRawBinaryValue_1_1<'top>), + Binary_1_1(&'top LazyRawBinaryValue_1_1<'top>), } impl<'top> From> for LazyRawAnyValue<'top> { @@ -528,8 +741,8 @@ impl<'top> From> for LazyRawAnyValue<'top> { } } -impl<'top> From> for LazyRawAnyValue<'top> { - fn from(value: LazyRawBinaryValue_1_1<'top>) -> Self { +impl<'top> From<&'top LazyRawBinaryValue_1_1<'top>> for LazyRawAnyValue<'top> { + fn from(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { LazyRawAnyValue { encoding: LazyRawValueKind::Binary_1_1(value), } @@ -681,7 +894,7 @@ impl<'top> From> LazyRawStreamItem::::Value(value) => { LazyRawStreamItem::::Value(value.into()) } - LazyRawStreamItem::::EExpression(_) => { + LazyRawStreamItem::::EExp(_) => { unreachable!("Ion 1.0 does not support macro invocations") } LazyRawStreamItem::::EndOfStream(end) => { @@ -702,7 +915,7 @@ impl<'top> From> LazyRawStreamItem::::Value(value) => { LazyRawStreamItem::::Value(value.into()) } - LazyRawStreamItem::::EExpression(_) => { + LazyRawStreamItem::::EExp(_) => { unreachable!("Ion 1.0 does not support macro invocations") } LazyRawStreamItem::::EndOfStream(end) => { @@ -723,8 +936,8 @@ impl<'top> From> LazyRawStreamItem::::Value(value) => { 
LazyRawStreamItem::::Value(value.into()) } - LazyRawStreamItem::::EExpression(invocation) => { - LazyRawStreamItem::::EExpression(LazyRawAnyEExpression { + LazyRawStreamItem::::EExp(invocation) => { + LazyRawStreamItem::::EExp(LazyRawAnyEExpression { encoding: LazyRawAnyEExpressionKind::Text_1_1(invocation), }) } @@ -746,8 +959,8 @@ impl<'top> From> LazyRawStreamItem::::Value(value) => { LazyRawStreamItem::::Value(value.into()) } - LazyRawStreamItem::::EExpression(eexp) => { - LazyRawStreamItem::::EExpression(eexp.into()) + LazyRawStreamItem::::EExp(eexp) => { + LazyRawStreamItem::::EExp(eexp.into()) } LazyRawStreamItem::::EndOfStream(end) => { LazyRawStreamItem::::EndOfStream(end) @@ -1258,7 +1471,7 @@ impl<'top> HasRange for LazyRawAnyFieldName<'top> { } } -impl<'top> LazyRawFieldName<'top> for LazyRawAnyFieldName<'top> { +impl<'top> LazyRawFieldName<'top, AnyEncoding> for LazyRawAnyFieldName<'top> { fn read(&self) -> IonResult> { use LazyRawFieldNameKind::*; match self.encoding { @@ -1501,7 +1714,7 @@ mod tests { let context = encoding_context.get_ref(); let mut reader = LazyRawAnyReader::new(data); - assert_eq!(reader.next(context)?.expect_ivm()?.version(), (1, 0)); + assert_eq!(reader.next(context)?.expect_ivm()?.major_minor(), (1, 0)); let _strukt = reader .next(context)? .expect_value()? 
@@ -1569,4 +1782,160 @@ mod tests { Ok(()) } + + fn expect_version_change( + context_ref: EncodingContextRef, + reader: &mut LazyRawAnyReader, + encoding_before: IonEncoding, + encoding_after: IonEncoding, + ) -> IonResult<()> { + // The reader is using the expected encoding before we hit the IVM + assert_eq!(reader.encoding(), encoding_before); + // The next item is an IVM + let ivm = reader.next(context_ref)?.expect_ivm()?; + // The IVM correctly reports the expected before/after encodings + assert_eq!(ivm.stream_encoding_before_marker(), encoding_before); + assert_eq!(ivm.stream_encoding_after_marker()?, encoding_after); + // The reader is now using the new encoding + assert_eq!(reader.encoding(), encoding_after); + Ok(()) + } + + fn expect_int( + context_ref: EncodingContextRef, + reader: &mut LazyRawAnyReader, + expected_encoding: IonEncoding, + expected_int: i64, + ) -> IonResult<()> { + let value = reader.next(context_ref)?.expect_value()?; + let actual_int = value.read()?.expect_i64()?; + assert_eq!(actual_int, expected_int); + assert_eq!(reader.encoding(), expected_encoding); + Ok(()) + } + + #[test] + fn switch_text_versions() -> IonResult<()> { + const DATA: &str = r#" + 1 + $ion_1_0 + 2 + $ion_1_1 + 3 + $ion_1_1 + 4 + $ion_1_0 + 5 + "#; + + let mut reader = LazyRawAnyReader::new(DATA.as_bytes()); + let encoding_context = EncodingContext::empty(); + let context_ref = encoding_context.get_ref(); + + expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 1)?; + + // This IVM doesn't change the encoding. 
+ expect_version_change( + context_ref, + &mut reader, + IonEncoding::Text_1_0, + IonEncoding::Text_1_0, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 2)?; + + // This IVM changes the encoding from 1.0 text to 1.1 text + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Text_1_0, + IonEncoding::Text_1_1, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Text_1_1, 3)?; + + // This IVM doesn't change the encoding. + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Text_1_1, + IonEncoding::Text_1_1, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Text_1_1, 4)?; + + // This IVM changes the encoding from 1.1 text to 1.0 text + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Text_1_1, + IonEncoding::Text_1_0, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 5)?; + + Ok(()) + } + + #[test] + fn switch_binary_versions() -> IonResult<()> { + const DATA: &[u8] = &[ + 0xE0, 0x01, 0x00, 0xEA, // $ion_1_0 + 0x21, 0x02, // 2 + 0xE0, 0x01, 0x01, 0xEA, // $ion_1_1 + 0x61, 0x03, // 3 + 0xE0, 0x01, 0x01, 0xEA, // $ion_1_1 + 0x61, 0x04, // 4 + 0xE0, 0x01, 0x00, 0xEA, // $ion_1_0 + 0x21, 0x05, // 5 + ]; + + let mut reader = LazyRawAnyReader::new(DATA); + let encoding_context = EncodingContext::empty(); + let context_ref = encoding_context.get_ref(); + + // When the reader is constructed it peeks at the leading bytes to see if they're an IVM. + // In this case, they were a binary Ion v1.0 IVM, so the reader is already expecting to see + // binary 1.0 data. Reading the binary version marker tells the reader to switch encodings. 
+ expect_version_change( + context_ref, + &mut reader, + IonEncoding::Binary_1_0, + IonEncoding::Binary_1_0, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Binary_1_0, 2)?; + + // This IVM changes the encoding from 1.0 binary to 1.1 binary + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Binary_1_0, + IonEncoding::Binary_1_1, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Binary_1_1, 3)?; + + // This IVM doesn't change the encoding. + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Binary_1_1, + IonEncoding::Binary_1_1, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Binary_1_1, 4)?; + + // This IVM changes the encoding from 1.1 binary to 1.0 binary + expect_version_change( + context_ref, + &mut reader, + IonEncoding::Binary_1_1, + IonEncoding::Binary_1_0, + )?; + + expect_int(context_ref, &mut reader, IonEncoding::Binary_1_0, 5)?; + + Ok(()) + } } diff --git a/src/lazy/binary/encoded_value.rs b/src/lazy/binary/encoded_value.rs index f4608ffd..1e7410ba 100644 --- a/src/lazy/binary/encoded_value.rs +++ b/src/lazy/binary/encoded_value.rs @@ -72,11 +72,15 @@ pub(crate) struct EncodedValue { // We store the offset for the header byte because it is guaranteed to be present for all values. // Annotations appear earlier in the stream but are optional. - // The number of bytes used to encode the annotations wrapper (if present) preceding the Ion - // value. If `annotations` is empty, `annotations_header_length` will be zero. The annotations - // wrapper contains several fields: an opcode, a wrapper length, a sequence length, and the - // sequence itself. - pub annotations_header_length: u16, + // The number of bytes used to encode the header of the annotations wrapper preceding the Ion + // value. If the value has no annotations, `annotations_header_length` will be zero. 
+ // + // In Ion 1.0, the annotations header contains several fields: an opcode, a wrapper length, and + // the length of the sequence itself. It does not include the actual sequence of annotations. + // + // In Ion 1.1, the annotations header contains an opcode and (in the case of opcode 0xE9) a + // FlexUInt length. + pub annotations_header_length: u8, // The number of bytes used to encode the series of symbol IDs inside the annotations wrapper. pub annotations_sequence_length: u16, // Whether the annotations sequence is encoded as `FlexSym`s or as symbol addresses. @@ -154,37 +158,49 @@ impl EncodedValue { self.annotations_header_length > 0 } - /// Returns the number of bytes used to encode this value's annotations, if any. - /// While annotations envelope the value that they decorate, this function does not include - /// the length of the value itself. - pub fn annotations_header_length(&self) -> Option { - if self.annotations_header_length == 0 { - return None; - } - Some(self.annotations_header_length as usize) + /// Returns the number of bytes used to encode this value's annotations header, if any. + /// + /// In Ion 1.0, the annotations header contains several fields: an opcode, a wrapper length, and + /// the length of the sequence itself. It does not include the actual sequence of annotations. + /// + /// In Ion 1.1, the annotations header contains an opcode and (in the case of opcode 0xE9) a + /// FlexUInt representing the sequence length. + pub fn annotations_header_length(&self) -> usize { + self.annotations_header_length as usize } - /// Returns the number of bytes used to encode the series of VarUInt annotation symbol IDs, if + /// Returns the number of bytes used to encode the series of annotation symbols, if /// any. 
/// /// See: - pub fn annotations_sequence_length(&self) -> Option { - if self.annotations_header_length == 0 { - return None; - } - Some(self.annotations_sequence_length as usize) + pub fn annotations_sequence_length(&self) -> usize { + self.annotations_sequence_length as usize } - pub fn annotations_sequence_range(&self) -> Option> { - let wrapper_offset = self.annotations_offset()?; + /// Returns the combined length of the annotations header and sequence. + pub fn annotations_total_length(&self) -> usize { + self.annotations_header_length() + self.annotations_sequence_length() + } + + /// Returns the offset range of the bytes in the stream that encoded the value's annotations + /// sequence. + pub fn annotations_sequence_range(&self) -> Range { + let wrapper_offset = self + .annotations_offset() + .unwrap_or_else(|| self.header_offset()); let wrapper_exclusive_end = wrapper_offset + self.annotations_header_length as usize; let sequence_length = self.annotations_sequence_length as usize; - let sequence_offset = wrapper_exclusive_end - sequence_length; - Some(sequence_offset..wrapper_exclusive_end) + let sequence_offset = wrapper_exclusive_end; + let sequence_exclusive_end = sequence_offset + sequence_length; + debug_assert!(sequence_exclusive_end == self.header_offset); + sequence_offset..sequence_exclusive_end } pub fn annotations_sequence_offset(&self) -> Option { - Some(self.annotations_sequence_range()?.start) + if self.annotations_header_length() == 0 { + return None; + } + Some(self.header_offset - self.annotations_sequence_length()) } /// Returns the offset of the beginning of the annotations wrapper, if present. @@ -192,15 +208,15 @@ impl EncodedValue { if self.annotations_header_length == 0 { return None; } - Some(self.header_offset - self.annotations_header_length as usize) + Some(self.header_offset - self.annotations_total_length()) } - /// Returns an offset Range that includes the bytes used to encode this value's annotations, - /// if any. 
While annotations envelope the value that they modify, this function does not - /// include the bytes of the encoded value itself. + /// Returns an offset Range that includes the bytes used to encode this value's annotations + /// (including both the header and sequence), if any. pub fn annotations_range(&self) -> Option> { if let Some(start) = self.annotations_offset() { - let end = start + self.annotations_header_length as usize; + // The annotations sequence always ends at the value's opcode. + let end = self.header_offset(); return Some(start..end); } None @@ -217,7 +233,7 @@ impl EncodedValue { /// complete encoding, including annotations. pub fn annotated_value_range(&self) -> Range { // [ annotations? | header (type descriptor) | header_length? | value ] - let start = self.header_offset - self.annotations_header_length as usize; + let start = self.header_offset - self.annotations_total_length(); let end = start + self.total_length; start..end } @@ -227,7 +243,7 @@ impl EncodedValue { pub fn unannotated_value_range(&self) -> Range { // [ annotations? | header (type descriptor) | header_length? 
| value ] let start = self.header_offset; - let end = start + self.total_length - self.annotations_header_length as usize; + let end = start + self.total_length - self.annotations_total_length(); start..end } @@ -253,7 +269,7 @@ mod tests { ion_type_code: IonTypeCode::String, length_code: 3, }, - annotations_header_length: 3, + annotations_header_length: 2, annotations_sequence_length: 1, annotations_encoding: AnnotationsEncoding::SymbolAddress, header_offset: 200, @@ -275,10 +291,10 @@ mod tests { assert_eq!(value.header_range(), 200..201); assert!(value.has_annotations()); assert_eq!(value.annotations_range(), Some(197..200)); - assert_eq!(value.annotations_header_length(), Some(3)); + assert_eq!(value.annotations_header_length(), 2); assert_eq!(value.annotations_sequence_offset(), Some(199)); - assert_eq!(value.annotations_sequence_length(), Some(1)); - assert_eq!(value.annotations_sequence_range(), Some(199..200)); + assert_eq!(value.annotations_sequence_length(), 1); + assert_eq!(value.annotations_sequence_range(), 199..200); assert_eq!(value.value_body_length(), 3); assert_eq!(value.value_body_offset(), 201); assert_eq!(value.value_body_range(), 201..204); diff --git a/src/lazy/binary/immutable_buffer.rs b/src/lazy/binary/immutable_buffer.rs index ad3e56c0..0af21a15 100644 --- a/src/lazy/binary/immutable_buffer.rs +++ b/src/lazy/binary/immutable_buffer.rs @@ -436,15 +436,17 @@ impl<'a> ImmutableBuffer<'a> { input_after_annotations_length.offset(), ); } - let final_input = input_after_annotations_length.consume(annotations_length.value()); - - // Here, `self` is the (immutable) buffer we started with. Comparing it with `input` - // gets us the before-and-after we need to calculate the size of the header. - let annotations_header_length = final_input.offset() - self.offset(); + // Here, `self` is the (immutable) buffer we started with. 
Comparing it with `input_after_annotations_length` + // gets us the before-and-after comparison we need to calculate the size of the header. + // "Header" here refers to the annotations opcode, wrapper length, and sequence length. It does + // not include the length of the sequence itself. + let annotations_header_length = input_after_annotations_length.offset() - self.offset(); let annotations_header_length = u8::try_from(annotations_header_length).map_err(|_e| { IonError::decoding_error("found an annotations header greater than 255 bytes long") })?; + let final_input = input_after_annotations_length.consume(annotations_length.value()); + let annotations_sequence_length = u8::try_from(annotations_length.value()).map_err(|_e| { IonError::decoding_error( @@ -746,9 +748,10 @@ impl<'a> ImmutableBuffer<'a> { ); } - lazy_value.encoded_value.annotations_header_length = wrapper.header_length as u16; + lazy_value.encoded_value.annotations_header_length = wrapper.header_length; lazy_value.encoded_value.annotations_sequence_length = wrapper.sequence_length as u16; - lazy_value.encoded_value.total_length += wrapper.header_length as usize; + lazy_value.encoded_value.total_length += + lazy_value.encoded_value.annotations_total_length(); // Modify the input to include the annotations lazy_value.input = input; diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index 5b9b5a5b..e663d3bc 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -2,7 +2,7 @@ use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0; -use crate::lazy::decoder::{Decoder, HasRange, LazyRawFieldExpr, LazyRawReader, RawVersionMarker}; +use crate::lazy::decoder::{HasRange, LazyRawFieldExpr, LazyRawReader}; use crate::lazy::encoding::BinaryEncoding_1_0; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::result::IonFailure; @@ -42,12 +42,6 @@ impl<'data> 
LazyRawBinaryReader_1_0<'data> { 'data: 'top, { let (marker, _buffer_after_ivm) = buffer.read_ivm()?; - let (major, minor) = marker.version(); - if (major, minor) != (1, 0) { - return IonResult::decoding_error(format!( - "unsupported version of Ion: v{major}.{minor}; only 1.0 is supported" - )); - } self.data.buffer = buffer; self.data.bytes_to_skip = 4; // IVM length Ok(LazyRawStreamItem::::VersionMarker( @@ -113,7 +107,8 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0 fn resume_at_offset( data: &'data [u8], offset: usize, - _config: ::ReaderSavedState, + // This argument is ignored by all raw readers except LazyRawAnyReader + _encoding_hint: IonEncoding, ) -> Self { LazyRawBinaryReader_1_0 { data: DataSource { @@ -123,6 +118,15 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0 } } + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + let stream_offset = self.position(); + ( + &self.data.buffer.bytes()[self.data.bytes_to_skip..], + stream_offset, + IonEncoding::Binary_1_0, + ) + } + fn next<'top>( &'top mut self, _context: EncodingContextRef<'top>, @@ -321,7 +325,7 @@ mod tests { } Value(value) => println!("{:?}", value.read()?), EndOfStream(_) => break, - EExpression(_) => unreachable!("No macros in Ion 1.0"), + EExp(_) => unreachable!("No macros in Ion 1.0"), } } Ok(()) diff --git a/src/lazy/binary/raw/struct.rs b/src/lazy/binary/raw/struct.rs index e7f8a31d..ab247811 100644 --- a/src/lazy/binary/raw/struct.rs +++ b/src/lazy/binary/raw/struct.rs @@ -134,7 +134,7 @@ impl<'top> HasRange for LazyRawBinaryFieldName_1_0<'top> { } } -impl<'top> LazyRawFieldName<'top> for LazyRawBinaryFieldName_1_0<'top> { +impl<'top> LazyRawFieldName<'top, BinaryEncoding_1_0> for LazyRawBinaryFieldName_1_0<'top> { fn read(&self) -> IonResult> { Ok(RawSymbolRef::SymbolId(self.field_id)) } diff --git a/src/lazy/binary/raw/v1_1/e_expression.rs b/src/lazy/binary/raw/v1_1/e_expression.rs index 
55a29cc8..9c63f201 100644 --- a/src/lazy/binary/raw/v1_1/e_expression.rs +++ b/src/lazy/binary/raw/v1_1/e_expression.rs @@ -3,77 +3,443 @@ use std::fmt::{Debug, Formatter}; use std::ops::Range; -use crate::lazy::binary::raw::v1_1::immutable_buffer::ImmutableBuffer; +use crate::lazy::binary::raw::v1_1::immutable_buffer::{ + ArgGrouping, ArgGroupingBitmapIterator, ImmutableBuffer, +}; use crate::lazy::decoder::LazyRawValueExpr; -use crate::lazy::expanded::macro_evaluator::RawEExpression; +use crate::lazy::encoding::BinaryEncoding_1_1; +use crate::lazy::expanded::e_expression::ArgGroup; +use crate::lazy::expanded::macro_evaluator::{EExpArgGroupIterator, MacroExprKind}; +use crate::lazy::expanded::macro_evaluator::{EExpressionArgGroup, RawEExpression, ValueExpr}; +use crate::lazy::expanded::macro_table::MacroRef; +use crate::lazy::expanded::template::{MacroSignature, Parameter, ParameterEncoding}; +use crate::lazy::expanded::EncodingContextRef; +use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr}; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; -use crate::{v1_1, HasRange, HasSpan, IonResult, Span}; +use crate::{try_or_some_err, v1_1, Environment, HasRange, HasSpan, IonResult, Span}; #[derive(Copy, Clone)] -pub struct EncodedBinaryEExp { - // The number of bytes that were used to encode the e-expression's header (including its ID) - header_length: u16, +pub struct BinaryEExpHeader { + // The number of bytes that were used to encode the e-expression's opcode and address. + opcode_and_address_length: u8, + // The number of bytes that were used to encode the e-expression's arg grouping bitmap, if any. 
+ bitmap_length: u8, } -impl EncodedBinaryEExp { - pub fn new(header_length: u16) -> Self { - Self { header_length } +impl BinaryEExpHeader { + pub fn new(opcode_length: u8, bitmap_length: u8) -> Self { + Self { + opcode_and_address_length: opcode_length, + bitmap_length, + } + } + pub fn address_and_opcode_length(&self) -> usize { + self.opcode_and_address_length as usize + } + pub fn bitmap_length(&self) -> usize { + self.bitmap_length as usize + } + pub fn header_length(&self) -> usize { + self.address_and_opcode_length() + self.bitmap_length() } } +/// An e-expression which has been parsed from a binary Ion 1.1 stream. #[derive(Copy, Clone)] -pub struct RawBinaryEExpression_1_1<'top> { - pub(crate) encoded_expr: EncodedBinaryEExp, +pub struct BinaryEExpression_1_1<'top> { + // The arguments to the e-expression are parsed either: + // + // 1. when the e-expression is first encountered, if it is not length-prefixed. + // OR + // 2. when the e-expression is being evaluated, to populate its evaluation environment. + // + // In case #1, we store the parsed arguments in the bump allocator so that we don't have to + // re-parse them at evaluation time. We _could_ store the `LazyRawValueExpr<'top, BinaryEncoding_1_1>` + // representations of the expressions, but then we would need to make a separate array of the + // resolved versions of those expressions (`ValueExpr<'top, BinaryEncoding_1_1>`) to populate + // the environment. As an optimization, we store the fully resolved version of the arguments + // so that populating the environment is very close to a no-op. In the uncommon case that we need + // to iterate over the raw argument expressions (usually for tooling), we can do so by + // "unpacking" their resolved representations. See `make_evaluation_environment` for details. + cache: Option<&'top [ValueExpr<'top, BinaryEncoding_1_1>]>, + macro_ref: MacroRef<'top>, + bitmap_bits: u64, + // The index of `input` at which the bitmap can be found. 
If there is no bitmap, this index + // will be the beginning of the encoded arguments. + bitmap_offset: u8, + // The index at which the arguments to the e-expression begin within `input`. This index is + // the first position after the opcode, address, length, and bitmap. + args_offset: u8, + pub(crate) input: ImmutableBuffer<'top>, - pub(crate) id: MacroIdRef<'top>, - pub(crate) arg_expr_cache: &'top [LazyRawValueExpr<'top, v1_1::Binary>], } -impl<'top> RawBinaryEExpression_1_1<'top> { +impl<'top> BinaryEExpression_1_1<'top> { pub fn new( - id: MacroIdRef<'top>, - encoded_expr: EncodedBinaryEExp, + macro_ref: MacroRef<'top>, + bitmap_bits: u64, input: ImmutableBuffer<'top>, - arg_expr_cache: &'top [LazyRawValueExpr<'top, v1_1::Binary>], + bitmap_offset: u8, + args_offset: u8, ) -> Self { Self { - encoded_expr, + bitmap_bits, input, - id, - arg_expr_cache, + macro_ref, + bitmap_offset, + args_offset, + cache: None, } } + + pub fn with_arg_expr_cache( + mut self, + cache: &'top [ValueExpr<'top, BinaryEncoding_1_1>], + ) -> Self { + self.cache = Some(cache); + self + } } -impl<'top> HasSpan<'top> for RawBinaryEExpression_1_1<'top> { +impl<'top> HasSpan<'top> for &'top BinaryEExpression_1_1<'top> { fn span(&self) -> Span<'top> { Span::with_offset(self.input.offset(), self.input.bytes()) } } -impl<'top> HasRange for RawBinaryEExpression_1_1<'top> { +impl<'top> HasRange for &'top BinaryEExpression_1_1<'top> { fn range(&self) -> Range { self.input.range() } } -impl<'top> Debug for RawBinaryEExpression_1_1<'top> { +impl<'top> Debug for &'top BinaryEExpression_1_1<'top> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "", self.id()) } } -impl<'top> RawEExpression<'top, v1_1::Binary> for RawBinaryEExpression_1_1<'top> { - type RawArgumentsIterator<'a> = RawBinarySequenceCacheIterator_1_1<'top> - where - Self: 'a; +impl<'top> RawEExpression<'top, v1_1::Binary> for &'top BinaryEExpression_1_1<'top> { + type RawArgumentsIterator = 
BinaryEExpArgsIterator_1_1<'top>; + type ArgGroup = BinaryEExpArgGroup<'top>; + + fn id(self) -> MacroIdRef<'top> { + MacroIdRef::LocalAddress(self.macro_ref.address()) + } + + fn raw_arguments(&self) -> Self::RawArgumentsIterator { + let signature = self.macro_ref.signature(); + let args_input = self.input.consume(self.args_offset as usize); + if let Some(cache) = self.cache { + return BinaryEExpArgsIterator_1_1::for_cache(signature, args_input.offset(), cache); + } + let bitmap_iterator = ArgGroupingBitmapIterator::new(signature.len(), self.bitmap_bits); + BinaryEExpArgsIterator_1_1::for_input(bitmap_iterator, args_input, signature) + } + + fn make_evaluation_environment( + &self, + context: EncodingContextRef<'top>, + ) -> IonResult> { + // If we've already parsed and resolved the e-expression's arguments, use our cache as the + // new environment. + if let Some(cache) = self.cache { + return Ok(Environment::new(cache)); + } + // Otherwise, we parse the arguments and add them to a new environment as we go. + // Note that (as currently designed) we cannot then populate the cache as we do not have + // a mutable reference to `self` and getting one would be a non-trivial change. However, + // in the vast majority of use cases, e-expressions are not evaluated more than once, which + // means that populating the cache would be of little value. + Environment::for_eexp(context, *self) + } +} + +#[derive(Debug, Copy, Clone)] +pub enum BinaryEExpArgsSource<'top> { + // If the e-expression arguments were parsed when it was first encountered, their resolved + // representations are stored in a bump-allocated array. This iterator kind will iterate over + // the array. + Cache(BinaryEExpArgsCacheIter<'top>), + // If the e-expression was length-prefixed, the cache will not be populated. This iterator kind + // will incrementally parse the contents of the buffer range identified by the length prefix. 
+ Input(BinaryEExpArgsInputIter<'top>), +} + +#[derive(Debug, Copy, Clone)] +pub struct BinaryEExpArgsIterator_1_1<'top> { + source: BinaryEExpArgsSource<'top>, +} + +impl<'top> BinaryEExpArgsIterator_1_1<'top> { + pub fn for_input( + groupings_iter: ArgGroupingBitmapIterator, + remaining_args_buffer: ImmutableBuffer<'top>, + signature: &'top MacroSignature, + ) -> Self { + Self { + source: BinaryEExpArgsSource::Input(BinaryEExpArgsInputIter { + bitmap_iter: groupings_iter, + remaining_args_buffer, + param_index: 0, + signature, + }), + } + } + + pub fn for_cache( + signature: &'top MacroSignature, + initial_offset: usize, + cache: &'top [ValueExpr<'top, BinaryEncoding_1_1>], + ) -> Self { + Self { + source: BinaryEExpArgsSource::Cache(BinaryEExpArgsCacheIter { + cache_exprs: cache, + initial_offset, + expr_index: 0, + signature, + }), + } + } + + /// Reports the position of the iterator within the overall stream. Before `next()` has been + /// called for the first time, the position will be the first offset after the + /// opcode/address/length/bitmap. When the iterator is exhausted, the position will be + /// the first offset beyond the end of the e-expression. + pub fn offset(&self) -> usize { + match &self.source { + BinaryEExpArgsSource::Input(i) => i.remaining_args_buffer.offset(), + // If there weren't any args, then the iterator's position is where it started. + BinaryEExpArgsSource::Cache(c) if c.cache_exprs.is_empty() => c.initial_offset, + BinaryEExpArgsSource::Cache(c) => { + match c.cache_exprs.get(c.expr_index) { + Some(value_expr) => value_expr.range().unwrap().end, + // If the iterator is exhausted, then its offset is the end of the last arg expr. 
+ None => c.cache_exprs[c.expr_index - 1].range().unwrap().end, + } + } + } + } +} + +impl<'top> Iterator for BinaryEExpArgsIterator_1_1<'top> { + type Item = IonResult>; + + #[inline(always)] + fn next(&mut self) -> Option { + match self.source { + BinaryEExpArgsSource::Input(ref mut input_iter) => input_iter.next(), + BinaryEExpArgsSource::Cache(ref mut cache_iter) => cache_iter.next(), + } + } + + fn size_hint(&self) -> (usize, Option) { + let signature = match self.source { + BinaryEExpArgsSource::Input(i) => i.signature, + BinaryEExpArgsSource::Cache(i) => i.signature, + }; + let num_args = signature.len(); + // Tells the macro evaluator how much space to allocate to hold these arguments + (num_args, Some(num_args)) + } +} + +/// An iterator that incrementally parses e-expression arguments from a provided buffer. +#[derive(Debug, Copy, Clone)] +pub struct BinaryEExpArgsInputIter<'top> { + bitmap_iter: ArgGroupingBitmapIterator, + remaining_args_buffer: ImmutableBuffer<'top>, + param_index: usize, + signature: &'top MacroSignature, +} + +impl<'top> Iterator for BinaryEExpArgsInputIter<'top> { + type Item = IonResult>; + + #[inline(always)] + fn next(&mut self) -> Option>> { + // We cannot read the arguments of a binary e-expression without first looking at the + // corresponding parameter's encoding and cardinality to know what bytes to expect. + // First, get the parameter from the signature. If `get` returns `None`, we've reached the + // end of the signature and can early-return. + let parameter = self.signature.parameters().get(self.param_index)?; + let arg_grouping = if parameter.is_variadic() { + // If it's a variadic parameter (that is: `?`, `*`, or `+`), pull two bits from the + // argument encoding bitmap to see how it was encoded. + try_or_some_err!(self.bitmap_iter.next().unwrap()) + } else { + // If it's a required parameter (`!`, or no modifier), there's no corresponding entry in the + // argument encoding bitmap. 
+ ArgGrouping::ValueExprLiteral + }; + // TODO: Tagless encodings + let (arg_expr, remaining_input) = match arg_grouping { + // If the encoding is `empty`, there's nothing to do. Make an empty slice at the current + // offset and build an empty BinaryEExpArgGroup with it. + ArgGrouping::Empty => { + let input = self.remaining_args_buffer.slice(0, 0); + let expr = EExpArgExpr::ArgGroup(BinaryEExpArgGroup::new(parameter, input, 0)); + (EExpArg::new(parameter, expr), self.remaining_args_buffer) + } + // If it's a tagged value expression, parse it as usual. + ArgGrouping::ValueExprLiteral => { + let (expr, remaining) = try_or_some_err! { + self + .remaining_args_buffer + .expect_eexp_arg_expr("reading tagged e-expr arg") + }; + (EExpArg::new(parameter, expr), remaining) + } + // If it's an argument group... + ArgGrouping::ArgGroup => { + //...then it starts with a FlexUInt that indicates whether the group is length-prefixed + // or delimited. + let (group_header_flex_uint, _remaining_args_input) = + try_or_some_err!(self.remaining_args_buffer.read_flex_uint()); + let bytes_to_read = match group_header_flex_uint.value() { + 0 => todo!("delimited argument groups"), + n_bytes => n_bytes as usize, + }; + // If it's length-prefixed, we don't need to inspect its contents. We can build an + // ArgGroup using the unexamined bytes; we'll parse them later if they get evaluated. + let arg_group_length = group_header_flex_uint.size_in_bytes() + bytes_to_read; + let arg_group = BinaryEExpArgGroup::new( + parameter, + self.remaining_args_buffer.slice(0, arg_group_length), + group_header_flex_uint.size_in_bytes() as u8, + ); + ( + EExpArg::new(parameter, EExpArgExpr::ArgGroup(arg_group)), + self.remaining_args_buffer.consume(arg_group_length), + ) + } + }; + + self.param_index += 1; + self.remaining_args_buffer = remaining_input; + Some(Ok(arg_expr)) + } +} + +/// An iterator that visits already-resolved `ValueExpr`s stored in an array. 
+#[derive(Debug, Copy, Clone)] +pub struct BinaryEExpArgsCacheIter<'top> { + initial_offset: usize, + cache_exprs: &'top [ValueExpr<'top, BinaryEncoding_1_1>], + expr_index: usize, + signature: &'top MacroSignature, +} + +impl<'top> BinaryEExpArgsCacheIter<'top> { + pub fn next(&mut self) -> Option>> { + let parameter = self.signature.parameters().get(self.expr_index)?; + let cache_entry = self.cache_exprs.get(self.expr_index).unwrap(); + self.expr_index += 1; + let next_expr = match cache_entry { + ValueExpr::ValueLiteral(value) => { + // We know that every ValueExpr in the cache is the resolved version of a raw value + // literal, so we can safely `unwrap` here. + let value_literal = value.expect_value_literal().unwrap(); + EExpArg::new(parameter, EExpArgExpr::ValueLiteral(value_literal)) + } + ValueExpr::MacroInvocation(invocation) => { + use MacroExprKind::*; + let expr = match invocation.source() { + TemplateMacro(_) => { + unreachable!("e-expression cannot be a TDL macro invocation") + } + EExp(eexp) => EExpArgExpr::EExp(eexp.raw_invocation), + EExpArgGroup(group) => EExpArgExpr::ArgGroup(group.raw_arg_group()), + }; + EExpArg::new(parameter, expr) + } + }; + Some(Ok(next_expr)) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct BinaryEExpArgGroup<'top> { + parameter: &'top Parameter, + input: ImmutableBuffer<'top>, + header_size: u8, +} + +impl<'top> BinaryEExpArgGroup<'top> { + pub fn new(parameter: &'top Parameter, input: ImmutableBuffer<'top>, header_size: u8) -> Self { + Self { + parameter, + input, + header_size, + } + } +} + +impl<'top> HasRange for BinaryEExpArgGroup<'top> { + fn range(&self) -> Range { + self.input.range() + } +} + +impl<'top> HasSpan<'top> for BinaryEExpArgGroup<'top> { + fn span(&self) -> Span<'top> { + Span::with_offset(self.input.offset(), self.input.bytes()) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct BinaryEExpArgGroupIterator<'top> { + parameter: &'top Parameter, + remaining_args_buffer: ImmutableBuffer<'top>, +} + 
+impl<'top> EExpArgGroupIterator<'top, BinaryEncoding_1_1> for BinaryEExpArgGroupIterator<'top> { + fn is_exhausted(&self) -> bool { + self.remaining_args_buffer.is_empty() + } +} + +impl<'top> Iterator for BinaryEExpArgGroupIterator<'top> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + if self.remaining_args_buffer.is_empty() { + return None; + } + let (expr, remaining) = try_or_some_err! { + // TODO: Other encodings + self.remaining_args_buffer.expect_sequence_value_expr("eexp arg group subarg") + }; + self.remaining_args_buffer = remaining; + Some(Ok(expr)) + } +} + +impl<'top> IntoIterator for BinaryEExpArgGroup<'top> { + type Item = IonResult>; + type IntoIter = BinaryEExpArgGroupIterator<'top>; + + fn into_iter(self) -> Self::IntoIter { + BinaryEExpArgGroupIterator { + parameter: self.parameter, + remaining_args_buffer: self.input.consume(self.header_size as usize), + } + } +} + +impl<'top> EExpressionArgGroup<'top, BinaryEncoding_1_1> for BinaryEExpArgGroup<'top> { + type Iterator = BinaryEExpArgGroupIterator<'top>; + + fn encoding(&self) -> ParameterEncoding { + self.parameter.encoding() + } - fn id(&self) -> MacroIdRef<'top> { - self.id + fn resolve(self, context: EncodingContextRef<'top>) -> ArgGroup<'top, BinaryEncoding_1_1> { + ArgGroup::new(self, context) } - fn raw_arguments(&self) -> Self::RawArgumentsIterator<'top> { - RawBinarySequenceCacheIterator_1_1::new(self.arg_expr_cache) + fn iter(self) -> Self::Iterator { + self.into_iter() } } diff --git a/src/lazy/binary/raw/v1_1/immutable_buffer.rs b/src/lazy/binary/raw/v1_1/immutable_buffer.rs index 5cb60868..6a66e72b 100644 --- a/src/lazy/binary/raw/v1_1/immutable_buffer.rs +++ b/src/lazy/binary/raw/v1_1/immutable_buffer.rs @@ -1,11 +1,14 @@ use std::fmt::{Debug, Formatter}; +use std::mem::size_of; use std::ops::Range; use bumpalo::collections::Vec as BumpVec; use crate::binary::constants::v1_1::IVM; use crate::lazy::binary::encoded_value::EncodedValue; -use 
crate::lazy::binary::raw::v1_1::e_expression::{EncodedBinaryEExp, RawBinaryEExpression_1_1}; +use crate::lazy::binary::raw::v1_1::e_expression::{ + BinaryEExpArgsIterator_1_1, BinaryEExpression_1_1, +}; use crate::lazy::binary::raw::v1_1::value::{ LazyRawBinaryValue_1_1, LazyRawBinaryVersionMarker_1_1, }; @@ -16,11 +19,11 @@ use crate::lazy::encoder::binary::v1_1::fixed_uint::FixedUInt; use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt; use crate::lazy::encoder::binary::v1_1::flex_sym::FlexSym; use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt; -use crate::lazy::expanded::macro_table::MacroKind; +use crate::lazy::expanded::macro_table::MacroRef; use crate::lazy::expanded::EncodingContextRef; -use crate::lazy::text::raw::v1_1::reader::MacroIdRef; +use crate::lazy::text::raw::v1_1::arg_group::EExpArgExpr; use crate::result::IonFailure; -use crate::{v1_1, HasRange, IonError, IonResult}; +use crate::{v1_1, IonError, IonResult}; /// A buffer of unsigned bytes that can be cheaply copied and which defines methods for parsing /// the various encoding elements of a binary Ion stream. @@ -53,6 +56,17 @@ impl<'a> Debug for ImmutableBuffer<'a> { } } +impl<'a> PartialEq for ImmutableBuffer<'a> { + fn eq(&self, other: &Self) -> bool { + // A definition of equality that ignores the `context` field. + self.offset == other.offset && self.data == other.data + // An argument could be made that two buffers are not equal if they're holding references to + // different contexts, but this is a very low-level, feature-gated construct so it's probably + // fine if the implementation is arguably imperfect. + } +} + +/// When `Ok`, contains the value that was matched/parsed and the remainder of the input buffer. 
pub(crate) type ParseResult<'a, T> = IonResult<(T, ImmutableBuffer<'a>)>; impl<'a> ImmutableBuffer<'a> { @@ -62,6 +76,7 @@ impl<'a> ImmutableBuffer<'a> { Self::new_with_offset(context, data, 0) } + #[inline] pub fn new_with_offset( context: EncodingContextRef<'a>, data: &'a [u8], @@ -74,6 +89,10 @@ impl<'a> ImmutableBuffer<'a> { } } + pub fn context(&self) -> EncodingContextRef<'a> { + self.context + } + /// Returns a slice containing all of the buffer's bytes. pub fn bytes(&self) -> &'a [u8] { self.data @@ -141,14 +160,33 @@ impl<'a> ImmutableBuffer<'a> { } } - /// Reads the first byte in the buffer and returns it as an [Opcode]. + /// Reads the first byte in the buffer and returns it as an [Opcode]. If the buffer is empty, + /// returns an `IonError::Incomplete`. #[inline] - pub(crate) fn peek_opcode(&self) -> IonResult { + pub(crate) fn expect_opcode(&self) -> IonResult { if self.is_empty() { return IonResult::incomplete("an opcode", self.offset()); } + Ok(self.peek_opcode_unchecked()) + } + + /// Reads the first byte in the buffer and returns it as an [Opcode]. If the buffer is empty, + /// returns `None`. + #[inline] + pub(crate) fn peek_opcode(&self) -> Option { + if let Some(&byte) = self.data.first() { + Some(ION_1_1_OPCODES[byte as usize]) + } else { + None + } + } + + /// Reads the first byte in the buffer *without confirming one is available* and returns it + /// as an [Opcode]. + #[inline] + pub(crate) fn peek_opcode_unchecked(&self) -> Opcode { let next_byte = self.data[0]; - Ok(ION_1_1_OPCODES[next_byte as usize]) + ION_1_1_OPCODES[next_byte as usize] } /// Reads the first four bytes in the buffer as an Ion version marker. If it is successful, @@ -201,7 +239,7 @@ impl<'a> ImmutableBuffer<'a> { // expose the ability to write them. As such, this method has been marked `inline(never)` to // allow the hot path to be better optimized. 
pub fn read_nop_pad(self) -> ParseResult<'a, usize> { - let opcode = self.peek_opcode()?; + let opcode = self.expect_opcode()?; // We need to determine the size of the nop.. let (size, remaining) = if opcode.low_nibble() == 0xC { @@ -226,9 +264,6 @@ impl<'a> ImmutableBuffer<'a> { /// Calls [`Self::read_nop_pad`] in a loop until the buffer is empty or an opcode /// is encountered that is not a NOP. #[inline(never)] - // NOP padding is not widely used in Ion 1.0. This method is annotated with `inline(never)` - // to avoid the compiler bloating other methods on the hot path with its rarely used - // instructions. pub fn consume_nop_padding(self, mut opcode: Opcode) -> ParseResult<'a, ()> { let mut buffer = self; // Skip over any number of NOP regions @@ -238,7 +273,7 @@ impl<'a> ImmutableBuffer<'a> { if buffer.is_empty() { break; } - opcode = buffer.peek_opcode()? + opcode = buffer.expect_opcode()? } Ok(((), buffer)) } @@ -247,59 +282,138 @@ impl<'a> ImmutableBuffer<'a> { /// from the buffer to interpret as the value's length. If it is successful, returns an `Ok(_)` /// containing a [FlexUInt] representation of the value's length. If no additional bytes were /// read, the returned `FlexUInt`'s `size_in_bytes()` method will return `0`. + #[inline] pub fn read_value_length(self, header: Header) -> ParseResult<'a, FlexUInt> { - let length = match header.length_type() { + match header.length_type() { LengthType::InOpcode(n) => { // FlexUInt represents the length, but is not physically present, hence the 0 size. - FlexUInt::new(0, n as u64) - } - LengthType::FlexUIntFollows => { - let (flexuint, _) = self.read_flex_uint()?; - flexuint + Ok((FlexUInt::new(0, n as u64), self)) } - }; + LengthType::FlexUIntFollows => self.read_flex_uint(), + } + } - let remaining = self; + /// Reads a single expression (value literal or e-expression) as an argument to an e-expression. 
+ #[inline] + pub(crate) fn expect_eexp_arg_expr( + self, + label: &'static str, + ) -> ParseResult<'a, EExpArgExpr<'a, v1_1::Binary>> { + let (raw_value_expr, remaining_input) = self.expect_sequence_value_expr(label)?; + let arg_expr = match raw_value_expr { + RawValueExpr::ValueLiteral(v) => EExpArgExpr::ValueLiteral(v), + RawValueExpr::EExp(e) => EExpArgExpr::EExp(e), + }; + Ok((arg_expr, remaining_input)) + } - // TODO: Validate length to ensure it is a reasonable value. + pub(crate) fn expect_sequence_value_expr( + self, + label: &'static str, + ) -> ParseResult<'a, LazyRawValueExpr<'a, v1_1::Binary>> { + match self.read_sequence_value_expr() { + Ok((Some(expr), remaining)) => Ok((expr, remaining)), + Ok((None, _)) => IonResult::incomplete(label, self.offset), + Err(e) => Err(e), + } + } - Ok((length, remaining)) + /// Returns `true` if the opcode was updated to one that follows the NOP. + /// Returns `false` if there was no more data following the NOP. + #[inline(never)] + pub(crate) fn opcode_after_nop(&mut self, opcode: &mut Opcode) -> IonResult { + let (_matched, input_after_nop) = self.consume_nop_padding(*opcode)?; + if let Some(new_opcode) = input_after_nop + .peek_next_byte() + .map(|b| ION_1_1_OPCODES[b as usize]) + { + *opcode = new_opcode; + *self = input_after_nop; + Ok(true) + } else { + Ok(false) + } } /// Reads a value without a field name from the buffer. This is applicable in lists, s-expressions, /// and at the top level. - pub(crate) fn peek_sequence_value_expr( + pub(crate) fn read_sequence_value_expr( self, - ) -> IonResult>> { - if self.is_empty() { - return Ok(None); + ) -> ParseResult<'a, Option>> { + let opcode = match self.peek_opcode() { + Some(opcode) => opcode, + None => return Ok((None, self)), + }; + + // Like RawValueExpr, but doesn't use references. 
+ enum ParseValueExprResult<'top> { + Value(ParseResult<'top, LazyRawBinaryValue_1_1<'top>>), + EExp(ParseResult<'top, BinaryEExpression_1_1<'top>>), } - let mut input = self; - let mut type_descriptor = input.peek_opcode()?; - // If we find a NOP... - if type_descriptor.is_nop() { - // ...skip through NOPs until we found the next non-NOP byte. - (_, input) = self.consume_nop_padding(type_descriptor)?; - // If there is no next byte, we're out of values. - if input.is_empty() { - return Ok(None); + + use OpcodeType::*; + let result = match opcode.opcode_type { + EExpressionWithAddress => { + ParseValueExprResult::EExp(self.read_eexp_with_address_in_opcode(opcode)) + } + EExpressionAddressFollows => todo!("eexp address follows"), + EExpressionWithLengthPrefix => { + ParseValueExprResult::EExp(self.read_eexp_with_length_prefix(opcode)) } - // Otherwise, there's a value. - type_descriptor = input.peek_opcode()?; + AnnotationFlexSym => ParseValueExprResult::Value(self.read_annotated_value(opcode)), + AnnotationSymAddress => todo!("symbol address annotations"), + _ if opcode.ion_type().is_some() => { + ParseValueExprResult::Value(self.read_value_without_annotations(opcode)) + } + _ => return self.read_nop_then_sequence_value(), + }; + let allocator = self.context().allocator(); + match result { + ParseValueExprResult::Value(Ok((value, remaining))) => { + let value_ref = &*allocator.alloc_with(|| value); + Ok(( + Some(LazyRawValueExpr::<'a, v1_1::Binary>::ValueLiteral( + value_ref, + )), + remaining, + )) + } + ParseValueExprResult::EExp(Ok((eexp, remaining))) => { + let eexp_ref = &*allocator.alloc_with(|| eexp); + Ok(( + Some(LazyRawValueExpr::<'a, v1_1::Binary>::EExp(eexp_ref)), + remaining, + )) + } + ParseValueExprResult::Value(Err(e)) => Err(e), + ParseValueExprResult::EExp(Err(e)) => Err(e), + } + } + + #[inline(never)] + fn read_nop_then_sequence_value( + self, + ) -> ParseResult<'a, Option>> { + let mut opcode = self.expect_opcode()?; + if !opcode.is_nop() { + 
return IonResult::decoding_error("found a non-value, non-eexp, non-nop in a sequence"); } - if type_descriptor.is_e_expression() { - return Ok(Some(RawValueExpr::EExp( - self.read_e_expression(type_descriptor)?, - ))); + let mut input = self; + // This test updates input and opcode. + if !input.opcode_after_nop(&mut opcode)? { + return Ok((None, input)); + } + // TODO: Make an `OpcodeClass` enum that captures groups like this for fewer branches + if opcode.is_e_expression() || opcode.ion_type.is_some() || opcode.is_annotations_sequence() + { + return input.read_sequence_value_expr(); } - Ok(Some(RawValueExpr::ValueLiteral( - input.read_value(type_descriptor)?, - ))) + IonResult::decoding_error("found a non-value, non-eexp after a nop pad") } /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that /// the next byte (`type_descriptor`) is not a NOP. - pub fn read_value(self, opcode: Opcode) -> IonResult> { + pub fn read_value(self, opcode: Opcode) -> ParseResult<'a, LazyRawBinaryValue_1_1<'a>> { if opcode.is_annotations_sequence() { self.read_annotated_value(opcode) } else { @@ -309,17 +423,24 @@ impl<'a> ImmutableBuffer<'a> { /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that /// the next byte (`type_descriptor`) is neither a NOP nor an annotations wrapper. + #[inline(always)] fn read_value_without_annotations( self, type_descriptor: Opcode, - ) -> IonResult> { + ) -> ParseResult<'a, LazyRawBinaryValue_1_1<'a>> { let input = self; let header = type_descriptor .to_header() .ok_or_else(|| IonError::decoding_error("found a non-value in value position"))?; let header_offset = input.offset(); - let (length, _) = input.consume(1).read_value_length(header)?; + + let length = match header.length_type() { + LengthType::InOpcode(n) => FlexUInt::new(0, n as u64), + // This call to `read_value_length` is not always inlined, so we avoid the method call + // if possible. 
+ _ => input.consume(1).read_value_length(header)?.0, + }; let length_length = length.size_in_bytes() as u8; let value_length = length.value() as usize; // ha let total_length = 1 // Header byte @@ -342,14 +463,14 @@ impl<'a> ImmutableBuffer<'a> { // If this value has a field ID or annotations, this will be replaced by the caller. input: self, }; - Ok(lazy_value) + Ok((lazy_value, self.consume(total_length))) } pub fn read_fixed_int(self, length: usize) -> ParseResult<'a, FixedInt> { let int_bytes = self .peek_n_bytes(length) .ok_or_else(|| IonError::incomplete("a FixedInt", self.offset()))?; - let fixed_int = FixedInt::read(int_bytes, length, 0)?; + let fixed_int = FixedInt::read(int_bytes, length, self.offset())?; Ok((fixed_int, self.consume(length))) } @@ -363,19 +484,20 @@ impl<'a> ImmutableBuffer<'a> { /// Reads an annotations wrapper and its associated value from the buffer. The caller must confirm /// that the next byte in the buffer (`type_descriptor`) begins an annotations wrapper. 
- fn read_annotated_value(self, opcode: Opcode) -> IonResult> { + fn read_annotated_value(self, opcode: Opcode) -> ParseResult<'a, LazyRawBinaryValue_1_1<'a>> { let (annotations_seq, input_after_annotations) = self.read_annotations_sequence(opcode)?; - let opcode = input_after_annotations.peek_opcode()?; - let mut value = input_after_annotations.read_value_without_annotations(opcode)?; + let opcode = input_after_annotations.expect_opcode()?; + let (mut value, input_after_value) = + input_after_annotations.read_value_without_annotations(opcode)?; let total_annotations_length = annotations_seq.header_length as usize + annotations_seq.sequence_length as usize; - value.encoded_value.annotations_header_length = total_annotations_length as u16; + value.encoded_value.annotations_header_length = annotations_seq.header_length; value.encoded_value.annotations_sequence_length = annotations_seq.sequence_length; value.encoded_value.annotations_encoding = annotations_seq.encoding; value.encoded_value.total_length += total_annotations_length; // Rewind the input to include the annotations sequence value.input = self; - Ok(value) + Ok((value, input_after_value)) } fn read_annotations_sequence(self, opcode: Opcode) -> ParseResult<'a, EncodedAnnotations> { @@ -455,67 +577,208 @@ impl<'a> ImmutableBuffer<'a> { todo!() } - fn read_e_expression(self, opcode: Opcode) -> IonResult> { + #[inline] + pub fn read_e_expression(self, opcode: Opcode) -> ParseResult<'a, BinaryEExpression_1_1<'a>> { use OpcodeType::*; - let (macro_id, buffer_after_id) = match opcode.opcode_type { - EExpressionWithAddress => ( - MacroIdRef::LocalAddress(opcode.byte as usize), - self.consume(1), - ), + match opcode.opcode_type { + EExpressionWithAddress => return self.read_eexp_with_address_in_opcode(opcode), EExpressionAddressFollows => todo!("e-expr with trailing address; {opcode:#0x?}",), + EExpressionWithLengthPrefix => return self.read_eexp_with_length_prefix(opcode), _ => unreachable!("read_e_expression 
called with invalid opcode"), }; + } - // TODO: When we support untagged parameter encodings, we need to use the signature's - // parameter encodings to drive this process. For now--while everything is tagged - // and cardinality is always required--we just loop `num_parameters` times. - let macro_def = self + fn read_eexp_with_address_in_opcode( + self, + opcode: Opcode, + ) -> ParseResult<'a, BinaryEExpression_1_1<'a>> { + let input_after_opcode = self.consume(1); + let macro_address = opcode.byte as usize; + + // Get a reference to the macro that lives at that address + let macro_ref = self .context - .macro_table - .macro_with_id(macro_id) - .ok_or_else(|| { - IonError::decoding_error(format!("invocation of unknown macro '{macro_id:?}'")) - })?; - use MacroKind::*; - let num_parameters = match macro_def.kind() { - Template(t) => t.signature().parameters().len(), - // Many system macros like `values`, `make_string`, etc take a variadic number of args. - _ => todo!("system macros require support for argument group encoding"), + .macro_table() + .macro_at_address(macro_address) + .ok_or_else( + #[inline(never)] + || { + IonError::decoding_error(format!( + "invocation of macro at unknown address '{macro_address:?}'" + )) + }, + )? + .reference(); + + let signature = macro_ref.signature(); + let bitmap_size_in_bytes = signature.bitmap_size_in_bytes(); + + let (bitmap_bits, input_after_bitmap) = if signature.num_variadic_params() == 0 { + (0, input_after_opcode) + } else { + input_after_opcode.read_eexp_bitmap(bitmap_size_in_bytes)? 
}; - let args_cache = self + let bitmap = ArgGroupingBitmap::new(signature.num_variadic_params(), bitmap_bits); + let mut args_iter = + BinaryEExpArgsIterator_1_1::for_input(bitmap.iter(), input_after_bitmap, signature); + let mut cache = + BumpVec::with_capacity_in(args_iter.size_hint().0, self.context.allocator()); + for arg in &mut args_iter { + let arg = arg?; + let value_expr = arg.resolve(self.context)?; + cache.push(value_expr); + } + + let eexp_total_length = args_iter.offset() - self.offset(); + let matched_eexp_bytes = self.slice(0, eexp_total_length); + let remaining_input = self.consume(matched_eexp_bytes.len()); + + let bitmap_offset = input_after_opcode.offset() - self.offset(); + let args_offset = input_after_bitmap.offset() - self.offset(); + Ok(( + { + BinaryEExpression_1_1::new( + MacroRef::new(macro_address, macro_ref), + bitmap_bits, + matched_eexp_bytes, + bitmap_offset as u8, + args_offset as u8, + ) + .with_arg_expr_cache(cache.into_bump_slice()) + }, + remaining_input, + )) + } + + fn read_eexp_with_length_prefix( + self, + _opcode: Opcode, + ) -> ParseResult<'a, BinaryEExpression_1_1<'a>> { + let input_after_opcode = self.consume(1); + let (macro_address_flex_uint, input_after_address) = input_after_opcode.read_flex_uint()?; + let (args_length_flex_uint, input_after_length) = input_after_address.read_flex_uint()?; + let header_length = input_after_length.offset() - self.offset(); + let macro_address = macro_address_flex_uint.value() as usize; + let args_length = args_length_flex_uint.value() as usize; + + let total_length = header_length + args_length; + let matched_bytes = self.slice(0, total_length); + let macro_ref = self .context - .allocator() - .alloc_with(|| BumpVec::with_capacity_in(num_parameters, self.context.allocator())); - // `args_buffer` will be partially consumed in each iteration of the loop below. 
- let mut args_buffer = buffer_after_id; - for _ in 0..num_parameters { - let value_expr = match args_buffer.peek_sequence_value_expr()? { - Some(expr) => expr, - None => { - return IonResult::incomplete( - "found an incomplete e-expression", - buffer_after_id.offset(), - ) + .macro_table() + .macro_at_address(macro_address) + .ok_or_else(|| { + IonError::decoding_error(format!( + "invocation of macro at unknown address '{macro_address:?}'" + )) + })? + .reference(); + // Offset from `self`, not offset from the beginning of the stream. + let bitmap_offset = (input_after_length.offset() - self.offset()) as u8; + let (bitmap_bits, _input_after_bitmap) = + input_after_length.read_eexp_bitmap(macro_ref.signature().bitmap_size_in_bytes())?; + let args_offset = bitmap_offset + macro_ref.signature().bitmap_size_in_bytes() as u8; + let remaining_input = self.consume(total_length); + return Ok(( + BinaryEExpression_1_1::new( + MacroRef::new(macro_address, macro_ref), + bitmap_bits, + matched_bytes, + bitmap_offset, + args_offset, + ), + remaining_input, + )); + } + + fn read_eexp_bitmap(self, bitmap_size_in_bytes: usize) -> ParseResult<'a, u64> { + let bitmap_bytes = self.peek_n_bytes(bitmap_size_in_bytes).ok_or_else(|| { + IonError::incomplete("parsing an e-exp arg grouping bitmap", self.offset) + })?; + if bitmap_size_in_bytes == 1 { + return Ok((bitmap_bytes[0] as u64, self.consume(1))); + } + let mut buffer = [0u8; size_of::()]; + let bitmap_bytes = self.peek_n_bytes(bitmap_size_in_bytes).ok_or_else(|| { + IonError::incomplete("parsing an e-exp arg grouping bitmap", self.offset) + })?; + buffer[..bitmap_size_in_bytes].copy_from_slice(bitmap_bytes); + let bitmap_u64 = u64::from_le_bytes(buffer); + Ok((bitmap_u64, self.consume(bitmap_size_in_bytes))) + } +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct ArgGroupingBitmap { + num_args: usize, + bits: u64, +} + +impl ArgGroupingBitmap { + const BITS_PER_VARIADIC_PARAM: usize = 2; + pub(crate) const 
MAX_VARIADIC_PARAMS: usize = + u64::BITS as usize / Self::BITS_PER_VARIADIC_PARAM; + pub(crate) fn new(num_args: usize, bits: u64) -> Self { + Self { num_args, bits } + } + #[inline] + pub fn iter(&self) -> ArgGroupingBitmapIterator { + ArgGroupingBitmapIterator { + remaining_args: self.num_args, + bits: self.bits, + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum ArgGrouping { + Empty, // 00 + ValueExprLiteral, // 01 + ArgGroup, // 10 +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct ArgGroupingBitmapIterator { + remaining_args: usize, + bits: u64, +} + +impl ArgGroupingBitmapIterator { + pub fn new(remaining_args: usize, bits: u64) -> Self { + Self { + remaining_args, + bits, + } + } +} + +impl Iterator for ArgGroupingBitmapIterator { + type Item = IonResult; + + fn next(&mut self) -> Option { + if self.remaining_args == 0 { + None + } else { + use ArgGrouping::*; + // Read the last two bits + let encoding = match self.bits & 0b11 { + 0b00 => Empty, + 0b01 => ValueExprLiteral, + 0b10 => ArgGroup, + _ => { + return Some(IonResult::decoding_error( + "found e-expression argument using reserved bitmap entry", + )) } }; - args_buffer = args_buffer.consume(value_expr.range().len()); - args_cache.push(value_expr); + // Discard the last two bits and decrement the number of remaining entries + self.bits >>= 2; + self.remaining_args -= 1; + Some(Ok(encoding)) } - let macro_id_encoded_length = buffer_after_id.offset() - self.offset(); - let args_length = args_buffer.offset() + args_buffer.len() - buffer_after_id.offset(); - let e_expression_buffer = self.slice(0, macro_id_encoded_length + args_length); - - let e_expression = RawBinaryEExpression_1_1::new( - macro_id, - EncodedBinaryEExp::new(macro_id_encoded_length as u16), - e_expression_buffer, - args_cache, - ); - Ok(e_expression) } } - #[derive(Clone, Copy, Debug, PartialEq)] pub enum AnnotationsEncoding { SymbolAddress, @@ -536,11 +799,46 @@ pub struct EncodedAnnotations { #[cfg(test)] mod tests 
{ - use super::*; + use rstest::rstest; + + use crate::ion_data::IonEq; + use crate::lazy::any_encoding::IonVersion; + use crate::lazy::binary::raw::v1_1::e_expression::BinaryEExpArgsIterator_1_1; use crate::lazy::expanded::compiler::TemplateCompiler; - use crate::lazy::expanded::macro_evaluator::RawEExpression; + use crate::lazy::expanded::macro_evaluator::{EExpressionArgGroup, RawEExpression}; + use crate::lazy::expanded::macro_table::MacroTable; use crate::lazy::expanded::EncodingContext; - use crate::lazy::text::raw::v1_1::reader::MacroAddress; + use crate::lazy::text::raw::v1_1::reader::{MacroAddress, MacroIdRef}; + use crate::v1_0::RawValueRef; + use crate::{Element, ElementReader, Reader}; + + use super::*; + + #[rstest] + #[case::no_args(0, &[0b00u8], &[])] + #[case::one_empty_arg(1, &[0b00u8], &[ArgGrouping::Empty])] + #[case::one_literal_arg(1, &[0b01u8], &[ArgGrouping::ValueExprLiteral])] + #[case::one_group_arg(1, &[0b10u8], &[ArgGrouping::ArgGroup])] + #[case::two_empty_args(2, &[0b0000u8], &[ArgGrouping::Empty, ArgGrouping::Empty])] + #[case::one_literal_one_group_arg(2, &[0b1001u8], &[ArgGrouping::ValueExprLiteral, ArgGrouping::ArgGroup])] + fn read_bitmaps( + #[case] num_args: usize, + #[case] bitmap_bytes: &[u8], + #[case] expected_entries: &[ArgGrouping], + ) -> IonResult<()> { + let context = EncodingContext::for_ion_version(IonVersion::v1_1); + let buffer = ImmutableBuffer::new(context.get_ref(), bitmap_bytes); + let bitmap = + ArgGroupingBitmap::new(num_args, buffer.read_eexp_bitmap(bitmap_bytes.len())?.0); + + // Sanity test for inputs + assert_eq!(num_args, expected_entries.len()); + + for (actual, expected) in bitmap.iter().zip(expected_entries.iter()) { + assert_eq!(&actual?, expected); + } + Ok(()) + } fn input_test>(input: A) { let empty_context = EncodingContext::empty(); @@ -594,7 +892,7 @@ mod tests { fn eexp_test( macro_source: &str, encode_macro_fn: impl FnOnce(MacroAddress) -> Vec, - test_fn: impl FnOnce(RawBinaryEExpression_1_1) -> 
IonResult<()>, + test_fn: impl FnOnce(BinaryEExpArgsIterator_1_1) -> IonResult<()>, ) -> IonResult<()> { let mut context = EncodingContext::empty(); let template_macro = TemplateCompiler::compile_from_text(context.get_ref(), macro_source)?; @@ -602,11 +900,12 @@ mod tests { let opcode_byte = u8::try_from(macro_address).unwrap(); let binary_ion = encode_macro_fn(opcode_byte as usize); let buffer = ImmutableBuffer::new(context.get_ref(), &binary_ion); - let eexp = buffer.read_e_expression(Opcode::from_byte(opcode_byte))?; + let eexp = buffer.read_e_expression(Opcode::from_byte(opcode_byte))?.0; + let eexp_ref = &*context.allocator.alloc_with(|| eexp); assert_eq!(eexp.id(), MacroIdRef::LocalAddress(macro_address)); - println!("{:?}", eexp); - assert_eq!(eexp.id, MacroIdRef::LocalAddress(opcode_byte as usize)); - test_fn(eexp) + println!("{:?}", &eexp); + assert_eq!(eexp.id(), MacroIdRef::LocalAddress(opcode_byte as usize)); + test_fn(eexp_ref.raw_arguments()) } #[test] @@ -618,8 +917,7 @@ mod tests { eexp_test( macro_source, encode_eexp_fn, - |eexp: RawBinaryEExpression_1_1| { - let mut args = eexp.raw_arguments(); + |mut args: BinaryEExpArgsIterator_1_1| { assert!(args.next().is_none()); Ok(()) }, @@ -643,11 +941,11 @@ mod tests { 0x4D, 0x69, 0x63, 0x68, 0x65, 0x6C, 0x6C, 0x65, ]; - let args_test = |eexp: RawBinaryEExpression_1_1| { - let mut args = eexp.raw_arguments(); + let args_test = |mut args: BinaryEExpArgsIterator_1_1| { assert_eq!( args.next() .unwrap()? + .expr() .expect_value()? .read()? .expect_string()?, @@ -680,11 +978,11 @@ mod tests { 0x54, 0x75, 0x65, 0x73, 0x64, 0x61, 0x79, ]; - let args_test = |eexp: RawBinaryEExpression_1_1| { - let mut args = eexp.raw_arguments(); + let args_test = |mut args: BinaryEExpArgsIterator_1_1| { assert_eq!( args.next() .unwrap()? + .expr() .expect_value()? .read()? .expect_string()?, @@ -693,6 +991,7 @@ mod tests { assert_eq!( args.next() .unwrap()? + .expr() .expect_value()? .read()? 
.expect_string()?, @@ -703,4 +1002,227 @@ mod tests { eexp_test(macro_source, encode_eexp_fn, args_test) } + + #[test] + fn read_eexp_with_star_parameter_empty() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + #[rustfmt::skip] + let encode_eexp_fn = |address: MacroAddress| vec![ + // === Invoke macro ==== + address as u8, + // === Argument grouping bitmap: empty === + 0b00, + ]; + + let args_test = |mut args: BinaryEExpArgsIterator_1_1| { + let arg_group = args.next().unwrap()?.expr().expect_arg_group()?; + let mut group_args = arg_group.iter(); + assert!(group_args.next().is_none()); + Ok(()) + }; + + eexp_test(macro_source, encode_eexp_fn, args_test) + } + + #[test] + fn read_eexp_with_star_parameter_value_literal() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + #[rustfmt::skip] + let encode_eexp_fn = |address: MacroAddress| vec![ + // === Invoke macro ==== + address as u8, + // === Argument grouping bitmap: value literal === + 0b01, + // === Value Literal === + 0x61, 0x01 + ]; + + let args_test = |mut args: BinaryEExpArgsIterator_1_1| { + let arg1 = args.next().unwrap()?.expr().expect_value()?; + assert_eq!(arg1.read()?, RawValueRef::Int(1.into())); + Ok(()) + }; + + eexp_test(macro_source, encode_eexp_fn, args_test) + } + + #[test] + fn read_eexp_with_star_parameter_arg_group() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + #[rustfmt::skip] + let encode_eexp_fn = |address: MacroAddress| vec![ + // === Invoke macro ==== + address as u8, + // === Argument group header: arg group === + 0b10, + // === Arg group === + 0x0D, // FlexUInt: Byte length 6 + 0x61, 0x01, // Int 1 + 0x61, 0x02, // Int 2 + 0x61, 0x03, // Int 3 + ]; + + let args_test = |mut args: BinaryEExpArgsIterator_1_1| { + let arg_group = args.next().unwrap()?.expr().expect_arg_group()?; + let mut group_exprs = 
arg_group.iter(); + let group_arg1 = group_exprs.next().unwrap()?; + let group_arg2 = group_exprs.next().unwrap()?; + let group_arg3 = group_exprs.next().unwrap()?; + assert_eq!( + group_arg1.expect_value()?.read()?, + RawValueRef::Int(1.into()) + ); + assert_eq!( + group_arg2.expect_value()?.read()?, + RawValueRef::Int(2.into()) + ); + assert_eq!( + group_arg3.expect_value()?.read()?, + RawValueRef::Int(3.into()) + ); + assert!(group_exprs.next().is_none()); + Ok(()) + }; + + eexp_test(macro_source, encode_eexp_fn, args_test) + } + + #[test] + fn read_eexp_with_star_parameter_arg_group_nested_eexp() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + let expected_text = r#" + [ + "first", + 1, + ["first", "last"], + 3, + "last", + ] + "#; + + let expected = Element::read_all(expected_text)?; + + let macro_address = MacroTable::FIRST_USER_MACRO_ID as u8; + #[rustfmt::skip] + let data = vec![ + // === Invoke macro ==== + macro_address, + // === Argument group header: arg group === + 0b10, + // === Arg group === + 0x0D, // FlexUInt: Byte length 6 + 0x61, 0x01, // Int 1 + macro_address, // Nested invocation of same macro + 0b00, // Empty group + 0x61, 0x03, // Int 3 + ]; + + let mut reader = Reader::new(v1_1::Binary, data)?; + reader.register_template_src(macro_source)?; + let actual = reader.read_all_elements()?; + assert!( + actual.ion_eq(&expected), + "Actual sequence\n{actual:?}\nwas not IonEq to expected sequence\n{expected:?}" + ); + Ok(()) + } + + #[test] + fn read_length_prefixed_eexp_with_star_parameter_arg_group_nested_eexp() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + let expected_text = r#" + [ + "first", + 1, + ["first", "last"], + 3, + "last", + ] + "#; + + let expected = Element::read_all(expected_text)?; + + let macro_address = MacroTable::FIRST_USER_MACRO_ID as u8; + let flex_uint_macro_address = (macro_address * 2) + 1; + 
#[rustfmt::skip] + let data = vec![ + // === Invoke length prefixed macro === + 0xF5, + // === Macro address === + flex_uint_macro_address, + // === Length prefix === + 0x11, // FlexUInt 8 + // === Argument bitmap: arg group === + 0b10, + // === Arg group === + 0x0D, // FlexUInt: Byte length 6 + 0x61, 0x01, // Int 1 + macro_address, // Nested invocation of same macro (not length prefixed) + 0b00, // Bitmap: Empty group + 0x61, 0x03, // Int 3 + ]; + + let mut reader = Reader::new(v1_1::Binary, data)?; + reader.register_template_src(macro_source)?; + let actual = reader.read_all_elements()?; + assert!( + actual.ion_eq(&expected), + "Actual sequence\n{actual:?}\nwas not IonEq to expected sequence\n{expected:?}" + ); + Ok(()) + } + + #[test] + fn read_length_prefixed_eexp_with_star_parameter_empty() -> IonResult<()> { + let macro_source = r#" + (macro wrap_in_list (values*) ["first", values, "last"]) + "#; + + let expected_text = r#" + [ + "first", + "last", + ] + "#; + + let expected = Element::read_all(expected_text)?; + + let macro_address = MacroTable::FIRST_USER_MACRO_ID as u8; + let flex_uint_macro_address = (macro_address * 2) + 1; + #[rustfmt::skip] + let data = vec![ + // === Invoke length prefixed macro === + 0xF5, + // === Macro address === + flex_uint_macro_address, + // === Length prefix === + 0x03, // FlexUInt 1 + // === Argument bitmap === + 0b00, // empty group + ]; + + let mut reader = Reader::new(v1_1::Binary, data)?; + reader.register_template_src(macro_source)?; + let actual = reader.read_all_elements()?; + assert!( + actual.ion_eq(&expected), + "Actual sequence\n{actual:?}\nwas not IonEq to expected sequence\n{expected:?}" + ); + Ok(()) + } } diff --git a/src/lazy/binary/raw/v1_1/reader.rs b/src/lazy/binary/raw/v1_1/reader.rs index d372158e..4a0fc05e 100644 --- a/src/lazy/binary/raw/v1_1/reader.rs +++ b/src/lazy/binary/raw/v1_1/reader.rs @@ -1,14 +1,14 @@ #![allow(non_camel_case_types)] use crate::lazy::any_encoding::IonEncoding; -use 
crate::lazy::binary::raw::v1_1::immutable_buffer::ImmutableBuffer; -use crate::lazy::decoder::{Decoder, LazyRawReader, RawValueExpr, RawVersionMarker}; +use crate::lazy::binary::raw::v1_1::immutable_buffer::{ImmutableBuffer, ParseResult}; +use crate::lazy::binary::raw::v1_1::ION_1_1_OPCODES; +use crate::lazy::decoder::{LazyRawReader, RawValueExpr}; use crate::lazy::encoder::private::Sealed; use crate::lazy::encoding::BinaryEncoding_1_1; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; -use crate::result::IonFailure; -use crate::{Encoding, HasRange, IonResult}; +use crate::{Encoding, IonResult}; pub struct LazyRawBinaryReader_1_1<'data> { input: &'data [u8], @@ -19,7 +19,7 @@ pub struct LazyRawBinaryReader_1_1<'data> { } impl<'data> LazyRawBinaryReader_1_1<'data> { - fn new(input: &'data [u8]) -> Self { + pub fn new(input: &'data [u8]) -> Self { Self::new_with_offset(input, 0) } @@ -43,12 +43,6 @@ impl<'data> LazyRawBinaryReader_1_1<'data> { 'data: 'top, { let (marker, buffer_after_ivm) = buffer.read_ivm()?; - let (major, minor) = marker.version(); - if (major, minor) != (1, 1) { - return IonResult::decoding_error(format!( - "unsupported version of Ion: v{major}.{minor}; only 1.1 is supported by this reader", - )); - } self.local_offset = buffer_after_ivm.offset() - self.stream_offset; Ok(LazyRawStreamItem::::VersionMarker( marker, @@ -58,20 +52,21 @@ impl<'data> LazyRawBinaryReader_1_1<'data> { fn read_value_expr<'top>( &'top mut self, buffer: ImmutableBuffer<'top>, - ) -> IonResult> + ) -> ParseResult<'top, LazyRawStreamItem<'top, BinaryEncoding_1_1>> where 'data: 'top, { - let item = match buffer.peek_sequence_value_expr()? 
{ + let (maybe_expr, remaining) = buffer.read_sequence_value_expr()?; + let item = match maybe_expr { Some(RawValueExpr::ValueLiteral(lazy_value)) => RawStreamItem::Value(lazy_value), - Some(RawValueExpr::EExp(eexpr)) => RawStreamItem::EExpression(eexpr), + Some(RawValueExpr::EExp(eexpr)) => RawStreamItem::EExp(eexpr), None => self.end_of_stream(buffer.offset()), }; - let item_range = item.range(); - self.local_offset = item_range.end - self.stream_offset; - Ok(item) + self.local_offset = remaining.offset() - self.stream_offset; + Ok((item, remaining)) } + #[inline(always)] pub fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, @@ -79,27 +74,20 @@ impl<'data> LazyRawBinaryReader_1_1<'data> { where 'data: 'top, { - let mut buffer = ImmutableBuffer::new_with_offset( - context, - self.input.get(self.local_offset..).unwrap(), - self.position(), - ); - - if buffer.is_empty() { + let data = &self.input[self.local_offset..]; + let Some(&first_byte) = data.first() else { + return Ok(self.end_of_stream(self.position())); + }; + let mut buffer = ImmutableBuffer::new_with_offset(context, data, self.position()); + let mut opcode = ION_1_1_OPCODES[first_byte as usize]; + if opcode.is_nop() && !buffer.opcode_after_nop(&mut opcode)? 
{ return Ok(self.end_of_stream(buffer.offset())); } - - let type_descriptor = buffer.peek_opcode()?; - if type_descriptor.is_nop() { - (_, buffer) = buffer.consume_nop_padding(type_descriptor)?; - if buffer.is_empty() { - return Ok(self.end_of_stream(buffer.offset())); - } - } - if type_descriptor.is_ivm_start() { + if opcode.is_ivm_start() { return self.read_ivm(buffer); } - self.read_value_expr(buffer) + let (item, _remaining) = self.read_value_expr(buffer)?; + Ok(item) } } @@ -113,11 +101,20 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_1> for LazyRawBinaryReader_1_1 fn resume_at_offset( data: &'data [u8], offset: usize, - _saved_state: ::ReaderSavedState, + // This argument is ignored by all raw readers except LazyRawAnyReader + _encoding_hint: IonEncoding, ) -> Self { Self::new_with_offset(data, offset) } + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + ( + &self.input[self.local_offset..], + self.position(), + self.encoding(), + ) + } + fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, diff --git a/src/lazy/binary/raw/v1_1/sequence.rs b/src/lazy/binary/raw/v1_1/sequence.rs index fc58f9b9..37d1f104 100644 --- a/src/lazy/binary/raw/v1_1/sequence.rs +++ b/src/lazy/binary/raw/v1_1/sequence.rs @@ -1,13 +1,14 @@ #![allow(non_camel_case_types)] +use std::fmt::{Debug, Formatter}; + use crate::lazy::binary::raw::v1_1::annotations_iterator::RawBinaryAnnotationsIterator_1_1; use crate::lazy::binary::raw::v1_1::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::v1_1::value::LazyRawBinaryValue_1_1; use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{Decoder, LazyRawContainer, LazyRawSequence, LazyRawValueExpr}; use crate::lazy::encoding::BinaryEncoding_1_1; -use crate::{HasRange, IonResult, IonType}; -use std::fmt::{Debug, Formatter}; +use crate::{try_or_some_err, IonResult, IonType}; #[derive(Debug, Copy, Clone)] pub struct LazyRawBinaryList_1_1<'top> { @@ -20,7 +21,7 @@ pub struct 
LazyRawBinarySExp_1_1<'top> { } impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinaryList_1_1<'top> { - fn from_value(value: LazyRawBinaryValue_1_1<'top>) -> Self { + fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { LazyRawBinaryList_1_1 { sequence: LazyRawBinarySequence_1_1 { value }, } @@ -50,7 +51,7 @@ impl<'top> LazyRawSequence<'top, BinaryEncoding_1_1> for LazyRawBinaryList_1_1<' } impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinarySExp_1_1<'top> { - fn from_value(value: LazyRawBinaryValue_1_1<'top>) -> Self { + fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { LazyRawBinarySExp_1_1 { sequence: LazyRawBinarySequence_1_1 { value }, } @@ -81,7 +82,7 @@ impl<'top> LazyRawSequence<'top, BinaryEncoding_1_1> for LazyRawBinarySExp_1_1<' #[derive(Copy, Clone)] pub struct LazyRawBinarySequence_1_1<'top> { - pub(crate) value: LazyRawBinaryValue_1_1<'top>, + pub(crate) value: &'top LazyRawBinaryValue_1_1<'top>, } impl<'top> LazyRawBinarySequence_1_1<'top> { @@ -90,9 +91,7 @@ impl<'top> LazyRawBinarySequence_1_1<'top> { } pub fn iter(&self) -> RawBinarySequenceIterator_1_1<'top> { - // Get as much of the sequence's body as is available in the input buffer. 
- // Reading a child value may fail as `Incomplete` - let buffer_slice = self.value.available_body(); + let buffer_slice = self.value.value_body_buffer(); RawBinarySequenceIterator_1_1::new(buffer_slice) } } @@ -131,16 +130,12 @@ impl<'a> Debug for LazyRawBinarySequence_1_1<'a> { } pub struct RawBinarySequenceIterator_1_1<'top> { - source: ImmutableBuffer<'top>, - bytes_to_skip: usize, + input: ImmutableBuffer<'top>, } impl<'top> RawBinarySequenceIterator_1_1<'top> { pub(crate) fn new(input: ImmutableBuffer<'top>) -> RawBinarySequenceIterator_1_1<'top> { - RawBinarySequenceIterator_1_1 { - source: input, - bytes_to_skip: 0, - } + RawBinarySequenceIterator_1_1 { input } } } @@ -148,13 +143,11 @@ impl<'top> Iterator for RawBinarySequenceIterator_1_1<'top> { type Item = IonResult>; fn next(&mut self) -> Option { - self.source = self.source.consume(self.bytes_to_skip); - let item = match self.source.peek_sequence_value_expr() { - Ok(Some(expr)) => expr, - Ok(None) => return None, - Err(e) => return Some(Err(e)), - }; - self.bytes_to_skip = item.range().len(); - Some(Ok(item)) + let (maybe_item, remaining_input) = try_or_some_err!(self.input.read_sequence_value_expr()); + if let Some(item) = maybe_item { + self.input = remaining_input; + return Some(Ok(item)); + } + None } } diff --git a/src/lazy/binary/raw/v1_1/struct.rs b/src/lazy/binary/raw/v1_1/struct.rs index 207d3eb7..549b2fb2 100644 --- a/src/lazy/binary/raw/v1_1/struct.rs +++ b/src/lazy/binary/raw/v1_1/struct.rs @@ -46,7 +46,7 @@ impl<'top> HasRange for LazyRawBinaryFieldName_1_1<'top> { } } -impl<'top> LazyRawFieldName<'top> for LazyRawBinaryFieldName_1_1<'top> { +impl<'top> LazyRawFieldName<'top, BinaryEncoding_1_1> for LazyRawBinaryFieldName_1_1<'top> { fn read(&self) -> IonResult> { Ok(self.field_name) } @@ -54,7 +54,7 @@ impl<'top> LazyRawFieldName<'top> for LazyRawBinaryFieldName_1_1<'top> { #[derive(Copy, Clone)] pub struct LazyRawBinaryStruct_1_1<'top> { - pub(crate) value: LazyRawBinaryValue_1_1<'top>, 
+ pub(crate) value: &'top LazyRawBinaryValue_1_1<'top>, } impl<'a, 'top> IntoIterator for &'a LazyRawBinaryStruct_1_1<'top> { @@ -85,9 +85,7 @@ impl<'top> LazyRawBinaryStruct_1_1<'top> { } pub fn iter(&self) -> RawBinaryStructIterator_1_1<'top> { - // Get as much of the struct's body as is available in the input buffer. - // Reading a child value may fail as `Incomplete` - let buffer_slice = self.value.available_body(); + let buffer_slice = self.value.value_body_buffer(); RawBinaryStructIterator_1_1::new( self.value.encoded_value.header.ion_type_code, buffer_slice, @@ -96,7 +94,7 @@ impl<'top> LazyRawBinaryStruct_1_1<'top> { } impl<'top> LazyContainerPrivate<'top, BinaryEncoding_1_1> for LazyRawBinaryStruct_1_1<'top> { - fn from_value(value: LazyRawBinaryValue_1_1<'top>) -> Self { + fn from_value(value: &'top LazyRawBinaryValue_1_1<'top>) -> Self { LazyRawBinaryStruct_1_1 { value } } } @@ -197,7 +195,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { fn peek_value( buffer: ImmutableBuffer<'top>, ) -> IonResult<(Option>, ImmutableBuffer<'top>)> { - let opcode = buffer.peek_opcode()?; + let opcode = buffer.expect_opcode()?; if opcode.is_nop() { let after_nops = buffer.consume_nop_padding(opcode)?.1; if after_nops.is_empty() { @@ -208,7 +206,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { } else { buffer .read_value(opcode) - .map(|v| (Some(v), v.input.consume(v.encoded_value.total_length))) + .map(|(v, remaining)| (Some(v), remaining)) } } @@ -243,7 +241,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { buffer = after; continue; // No value for this field, loop to try next field. 
} - (Some(value), after) => (value, after), + (Some(value), after) => (&*after.context().allocator().alloc_with(|| value), after), }; let bytes_to_skip = after_value.offset() - self.source.offset(); diff --git a/src/lazy/binary/raw/v1_1/type_code.rs b/src/lazy/binary/raw/v1_1/type_code.rs index ff91f24f..65ca7e09 100644 --- a/src/lazy/binary/raw/v1_1/type_code.rs +++ b/src/lazy/binary/raw/v1_1/type_code.rs @@ -41,9 +41,10 @@ pub enum OpcodeType { // 0xF1 delimited list start // 0xF2 delimited s-expression start // 0xF3 delimited struct start - LargeInteger, // 0xF6 - Integer preceded by FlexUInt length - Blob, // 0xFE - - Clob, // 0xFF - + EExpressionWithLengthPrefix, // 0xF5 + LargeInteger, // 0xF6 - Integer preceded by FlexUInt length + Blob, // 0xFE - + Clob, // 0xFF - // 0xF8 Long decimal TimestampLong, // 0xF8 - Long-form Timestamp // 0xF9 - Long string diff --git a/src/lazy/binary/raw/v1_1/type_descriptor.rs b/src/lazy/binary/raw/v1_1/type_descriptor.rs index f68c129e..2a9cde32 100644 --- a/src/lazy/binary/raw/v1_1/type_descriptor.rs +++ b/src/lazy/binary/raw/v1_1/type_descriptor.rs @@ -76,6 +76,7 @@ impl Opcode { (0xE, 0xA) => (NullNull, low_nibble, Some(IonType::Null)), (0xE, 0xB) => (TypedNull, low_nibble, Some(IonType::Null)), (0xE, 0xC..=0xD) => (Nop, low_nibble, None), + (0xF, 0x5) => (EExpressionWithLengthPrefix, low_nibble, None), (0xF, 0x6) => (LargeInteger, low_nibble, Some(IonType::Int)), (0xF, 0x7) => (Decimal, 0xFF, Some(IonType::Decimal)), (0xF, 0x8) => (TimestampLong, low_nibble, Some(IonType::Timestamp)), @@ -96,6 +97,10 @@ impl Opcode { } } + pub fn ion_type(&self) -> Option { + self.ion_type + } + pub fn is_null(&self) -> bool { self.opcode_type == OpcodeType::NullNull || self.opcode_type == OpcodeType::TypedNull } @@ -108,7 +113,7 @@ impl Opcode { use OpcodeType::*; matches!( self.opcode_type, - EExpressionWithAddress | EExpressionAddressFollows + EExpressionWithAddress | EExpressionAddressFollows | EExpressionWithLengthPrefix ) } diff 
--git a/src/lazy/binary/raw/v1_1/value.rs b/src/lazy/binary/raw/v1_1/value.rs index 66418e6c..383ebc35 100644 --- a/src/lazy/binary/raw/v1_1/value.rs +++ b/src/lazy/binary/raw/v1_1/value.rs @@ -3,8 +3,13 @@ use std::fmt::Debug; use std::ops::Range; +use crate::lazy::binary::raw::v1_1::r#struct::LazyRawBinaryStruct_1_1; +use crate::lazy::binary::raw::v1_1::sequence::{LazyRawBinaryList_1_1, LazyRawBinarySExp_1_1}; +use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::{HasRange, HasSpan, RawVersionMarker}; +use crate::lazy::expanded::EncodingContextRef; use crate::lazy::span::Span; +use crate::lazy::str_ref::StrRef; use crate::{ lazy::{ binary::{ @@ -25,7 +30,8 @@ use crate::{ }, result::IonFailure, types::{HasMinute, SymbolId, Timestamp, TimestampBuilder}, - IonError, IonResult, IonType, RawSymbolRef, + Decimal, Int, IonEncoding, IonError, IonResult, IonType, LazyExpandedList, LazyExpandedSExp, + LazyExpandedStruct, LazyList, LazySExp, LazyStruct, RawSymbolRef, SymbolRef, ValueRef, }; use num_traits::PrimInt; @@ -75,9 +81,13 @@ impl<'top> HasRange for LazyRawBinaryVersionMarker_1_1<'top> { } impl<'top> RawVersionMarker<'top> for LazyRawBinaryVersionMarker_1_1<'top> { - fn version(&self) -> (u8, u8) { + fn major_minor(&self) -> (u8, u8) { (self.major, self.minor) } + + fn stream_encoding_before_marker(&self) -> IonEncoding { + IonEncoding::Binary_1_1 + } } #[derive(Debug, Copy, Clone)] @@ -86,7 +96,7 @@ pub struct LazyRawBinaryValue_1_1<'top> { pub(crate) input: ImmutableBuffer<'top>, } -impl<'top> HasSpan<'top> for LazyRawBinaryValue_1_1<'top> { +impl<'top> HasSpan<'top> for &'top LazyRawBinaryValue_1_1<'top> { fn span(&self) -> Span<'top> { let range = self.range(); let local_range = (range.start - self.input.offset())..(range.end - self.input.offset()); @@ -95,19 +105,19 @@ impl<'top> HasSpan<'top> for LazyRawBinaryValue_1_1<'top> { } } -impl<'top> HasRange for LazyRawBinaryValue_1_1<'top> { +impl<'top> HasRange for &'top LazyRawBinaryValue_1_1<'top> { 
fn range(&self) -> Range { self.encoded_value.annotated_value_range() } } -impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for LazyRawBinaryValue_1_1<'top> { +impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for &'top LazyRawBinaryValue_1_1<'top> { fn ion_type(&self) -> IonType { - self.ion_type() + self.encoded_value.ion_type() } fn is_null(&self) -> bool { - self.is_null() + self.encoded_value.header().is_null() } fn has_annotations(&self) -> bool { @@ -115,11 +125,107 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for LazyRawBinaryValue_1_1<'to } fn annotations(&self) -> ::AnnotationsIterator<'top> { - self.annotations() + RawBinaryAnnotationsIterator_1_1::new( + self.annotations_sequence(), + self.encoded_value.annotations_encoding, + ) } fn read(&self) -> IonResult> { - self.read() + if self.is_null() { + let ion_type = if self.encoded_value.header.ion_type_code == OpcodeType::TypedNull { + let body = self.value_body(); + ION_1_1_TYPED_NULL_TYPES[body[0] as usize] + } else { + IonType::Null + }; + return Ok(RawValueRef::Null(ion_type)); + } + + match self.ion_type() { + IonType::Null => unreachable!("all null types handled above"), + IonType::Bool => Ok(RawValueRef::Bool(self.read_bool()?)), + IonType::Int => Ok(RawValueRef::Int(self.read_int()?)), + IonType::Float => Ok(RawValueRef::Float(self.read_float()?)), + IonType::Decimal => Ok(RawValueRef::Decimal(self.read_decimal()?)), + IonType::Timestamp => Ok(RawValueRef::Timestamp(self.read_timestamp()?)), + IonType::Symbol => Ok(RawValueRef::Symbol(self.read_symbol()?)), + IonType::String => Ok(RawValueRef::String(self.read_string()?)), + IonType::Clob => Ok(RawValueRef::Clob(self.read_clob()?)), + IonType::Blob => Ok(RawValueRef::Blob(self.read_blob()?)), + IonType::List => Ok(RawValueRef::List(self.read_list()?)), + IonType::SExp => Ok(RawValueRef::SExp(self.read_sexp()?)), + IonType::Struct => Ok(RawValueRef::Struct(self.read_struct()?)), + } + } + + /// This is a fast path for reading values that 
we know need to be resolved. + /// + /// When a `LazyValue` wrapping a raw binary value calls `read()`, it's clear that the `RawValueRef` will + /// need to be resolved into a `ValueRef` before it is returned to the application. If `LazyValue::read` + /// (indirectly) calls `RawLazyValue::read`, the raw level will dispatch on the Ion type of the current + /// value to find the correct decoder. It will then return the `RawValueRef` to the `LazyValue`, which + /// will dispatch on the Ion type again to resolve it into a `ValueRef`. These two identical dispatch steps + /// over 13 Ion types happen far enough away from each other in the code that they cannot be consolidated + /// into a single dispatch by the compiler. + /// + /// This method exists to perform the read and resolve steps in the same locale, allowing the compiler + /// to optimize it more effectively. + #[inline(always)] + fn read_resolved( + &self, + context: EncodingContextRef<'top>, + ) -> IonResult> { + if self.is_null() { + return Ok(ValueRef::Null(self.ion_type())); + } + // Anecdotally, string and integer values are very common in Ion streams. This `match` creates + // an inlineable fast path for them while other types go through the general case impl. + // NOTE: We can and should change the subset of types this optimizes for when we have data to + // better inform our decision. 
+ return match self.ion_type() { + IonType::String => Ok(ValueRef::String(self.read_string()?)), + IonType::Int => Ok(ValueRef::Int(self.read_int()?)), + _ => read_resolved_general_case(self, context), + }; + + // The 'general case' function that we fall back to for nulls and less common types + fn read_resolved_general_case<'a>( + value: &'a LazyRawBinaryValue_1_1<'a>, + context: EncodingContextRef<'a>, + ) -> IonResult> { + if value.is_null() { + return Ok(ValueRef::Null(value.ion_type())); + } + + let value_ref = + match value.ion_type() { + IonType::Bool => ValueRef::Bool(value.read_bool()?), + IonType::Int => ValueRef::Int(value.read_int()?), + IonType::Float => ValueRef::Float(value.read_float()?), + IonType::Decimal => ValueRef::Decimal(value.read_decimal()?), + IonType::Timestamp => ValueRef::Timestamp(value.read_timestamp()?), + IonType::String => ValueRef::String(value.read_string()?), + IonType::Symbol => { + let raw_symbol: RawSymbolRef = value.read_symbol()?; + let symbol: SymbolRef = raw_symbol.resolve(context)?; + ValueRef::Symbol(symbol) + } + IonType::Blob => ValueRef::Blob(value.read_blob()?), + IonType::Clob => ValueRef::Clob(value.read_clob()?), + IonType::List => ValueRef::List(LazyList::from( + LazyExpandedList::from_literal(context, value.read_list()?), + )), + IonType::SExp => ValueRef::SExp(LazySExp::from( + LazyExpandedSExp::from_literal(context, value.read_sexp()?), + )), + IonType::Struct => ValueRef::Struct(LazyStruct::from( + LazyExpandedStruct::from_literal(context, value.read_struct()?), + )), + IonType::Null => unreachable!("already handled"), + }; + Ok(value_ref) + } } fn annotations_span(&self) -> Span<'top> { @@ -142,17 +248,17 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for LazyRawBinaryValue_1_1<'to impl<'top> LazyRawBinaryValue_1_1<'top> { /// Indicates the Ion data type of this value. Calling this method does not require additional /// parsing of the input stream. 
- pub fn ion_type(&self) -> IonType { - self.encoded_value.ion_type() + pub fn ion_type(&'top self) -> IonType { + <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::ion_type(&self) } - pub fn is_null(&self) -> bool { - self.encoded_value.header().is_null() + pub fn is_null(&'top self) -> bool { + <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::is_null(&self) } /// Returns `true` if this value has a non-empty annotations sequence; otherwise, returns `false`. - fn has_annotations(&self) -> bool { - self.encoded_value.has_annotations() + fn has_annotations(&'top self) -> bool { + <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::has_annotations(&self) } /// Returns an `ImmutableBuffer` that contains the bytes comprising this value's encoded @@ -160,81 +266,46 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { fn annotations_sequence(&self) -> ImmutableBuffer<'top> { let annotations_header_length = self.encoded_value.annotations_header_length as usize; let sequence_length = self.encoded_value.annotations_sequence_length as usize; - let sequence = self - .input - .slice(annotations_header_length - sequence_length, sequence_length); + let sequence = self.input.slice(annotations_header_length, sequence_length); sequence } /// Returns an iterator over this value's unresolved annotation symbols. - pub fn annotations(&self) -> RawBinaryAnnotationsIterator_1_1<'top> { - RawBinaryAnnotationsIterator_1_1::new( - self.annotations_sequence(), - self.encoded_value.annotations_encoding, - ) + pub fn annotations(&'top self) -> RawBinaryAnnotationsIterator_1_1<'top> { + <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::annotations(&self) } /// Reads this value's data, returning it as a [`RawValueRef`]. 
If this value is a container, /// calling this method will not read additional data; the `RawValueRef` will provide a /// [`LazyRawBinarySequence_1_1`](crate::lazy::binary::raw::v1_1::sequence::LazyRawBinarySequence_1_1) - /// or [`LazyStruct`](crate::lazy::struct::LazyStruct) that can be traversed to access the container's contents. - pub fn read(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { - if self.is_null() { - let ion_type = if self.encoded_value.header.ion_type_code == OpcodeType::TypedNull { - let body = self.value_body()?; - ION_1_1_TYPED_NULL_TYPES[body[0] as usize] - } else { - IonType::Null - }; - return Ok(RawValueRef::Null(ion_type)); - } - - match self.ion_type() { - IonType::Null => unreachable!("all null types handled above"), - IonType::Bool => self.read_bool(), - IonType::Int => self.read_int(), - IonType::Float => self.read_float(), - IonType::Decimal => self.read_decimal(), - IonType::Timestamp => self.read_timestamp(), - IonType::Symbol => self.read_symbol(), - IonType::String => self.read_string(), - IonType::Clob => self.read_clob(), - IonType::Blob => self.read_blob(), - IonType::List => self.read_list(), - IonType::SExp => self.read_sexp(), - IonType::Struct => self.read_struct(), - } + /// or [`LazyStruct`] that can be traversed to access the container's contents. + pub fn read(&'top self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + <&'top Self as LazyRawValue<'top, BinaryEncoding_1_1>>::read(&self) } /// Returns the encoded byte slice representing this value's data. - pub(crate) fn value_body(&self) -> IonResult<&'top [u8]> { + /// For this raw value to have been created, lexing had to indicate that the complete value + /// was available. Because of that invariant, this method will always succeed. 
+ #[inline] + pub(crate) fn value_body(&self) -> &'top [u8] { let value_total_length = self.encoded_value.total_length(); - if self.input.len() < value_total_length { - return IonResult::incomplete( - "only part of the requested value is available in the buffer", - self.input.offset(), - ); - } let value_body_length = self.encoded_value.value_body_length(); let value_offset = value_total_length - value_body_length; - Ok(self.input.bytes_range(value_offset, value_body_length)) + self.input.bytes_range(value_offset, value_body_length) } - /// Returns an [`ImmutableBuffer`] containing whatever bytes of this value's body are currently - /// available. This method is used to construct lazy containers, which are not required to be - /// fully buffered before reading begins. - pub(crate) fn available_body(&self) -> ImmutableBuffer<'top> { + /// Returns an [`ImmutableBuffer`] representing this value's data. + /// For this raw value to have been created, lexing had to indicate that the complete value + /// was available. Because of that invariant, this method will always succeed. + pub(crate) fn value_body_buffer(&self) -> ImmutableBuffer<'top> { let value_total_length = self.encoded_value.total_length(); let value_body_length = self.encoded_value.value_body_length(); let value_offset = value_total_length - value_body_length; - - let bytes_needed = std::cmp::min(self.input.len() - value_offset, value_body_length); - let buffer_slice = self.input.slice(value_offset, bytes_needed); - buffer_slice + self.input.slice(value_offset, value_body_length) } /// Helper method called by [`Self::read`]. Reads the current value as a bool. 
- fn read_bool(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_bool(&'top self) -> IonResult { debug_assert!(self.encoded_value.ion_type() == IonType::Bool); let header = &self.encoded_value.header(); let representation = header.type_code(); @@ -243,46 +314,32 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { (OpcodeType::Boolean, 0xF) => false, _ => unreachable!("found a boolean value with an illegal length code."), }; - Ok(RawValueRef::Bool(value)) + Ok(value) } - /// Helper method called by [`Self::read`]. Reads the current value as an int. - fn read_int(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + #[inline(always)] + fn read_int(&self) -> IonResult { debug_assert!(self.encoded_value.ion_type() == IonType::Int); - - let header = &self.encoded_value.header(); - let representation = header.type_code(); - let value = match (representation, header.low_nibble as usize) { - (OpcodeType::Integer, 0x0) => 0.into(), - (OpcodeType::Integer, n) => { - // We have n bytes following that make up our integer. - self.available_body().read_fixed_int(n)?.0.into() - } - (OpcodeType::LargeInteger, 0x6) => { - // We have a FlexUInt size, then big int. - let value_bytes = self.value_body()?; - FixedInt::read(value_bytes, value_bytes.len(), 0)?.into() - } - _ => unreachable!("integer encoding with illegal length_code found"), - }; - Ok(RawValueRef::Int(value)) + debug_assert!(!self.is_null()); + let body_bytes = self.value_body(); + Ok(*FixedInt::read(body_bytes, body_bytes.len(), self.input.offset())?.value()) } /// Helper method called by [`Self::read`]. Reads the current value as a float. 
- fn read_float(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_float(&'top self) -> IonResult { debug_assert!(self.encoded_value.ion_type() == IonType::Float); let value = match self.encoded_value.value_body_length { 8 => { let mut buffer = [0; 8]; - let val_bytes = self.available_body().bytes_range(0, 8); + let val_bytes = self.value_body_buffer().bytes_range(0, 8); buffer[..8].copy_from_slice(val_bytes); f64::from_le_bytes(buffer) } 4 => { let mut buffer = [0; 4]; - let val_bytes = self.available_body().bytes_range(0, 4); + let val_bytes = self.value_body_buffer().bytes_range(0, 4); buffer[..4].copy_from_slice(val_bytes); f32::from_le_bytes(buffer).into() @@ -291,11 +348,11 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { 0 => 0.0f64, _ => unreachable!("found a float value with illegal byte size"), }; - Ok(RawValueRef::Float(value)) + Ok(value) } /// Helper method called by [`Self::read`]. Reads the current value as a decimal. - fn read_decimal(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_decimal(&'top self) -> IonResult { use crate::types::decimal::*; debug_assert!(self.encoded_value.ion_type() == IonType::Decimal); @@ -304,7 +361,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { } else { use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt; - let value_bytes = self.value_body()?; + let value_bytes = self.value_body(); let exponent = FlexInt::read(value_bytes, 0)?; let coefficient_size = self.encoded_value.value_body_length - exponent.size_in_bytes(); let coefficient = FixedInt::read( @@ -321,16 +378,16 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { } }; - Ok(RawValueRef::Decimal(decimal)) + Ok(decimal) } // Helper method called by [`Self::read_timestamp_short`]. Reads the time information from a // timestamp with Unknown or UTC offset. 
fn read_timestamp_short_no_offset_after_minute( - &self, + &'top self, value_bytes: &[u8], ts_builder: TimestampBuilder, - ) -> ValueParseResult<'top, BinaryEncoding_1_1> { + ) -> IonResult { const SECONDS_MASK_16BIT: u16 = 0x03_F0; const MILLISECONDS_MASK_16BIT: u16 = 0x0F_FC; const MICROSECONDS_MASK_32BIT: u32 = 0x3F_FF_FC_00; @@ -347,7 +404,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { ts_builder.build()? }; - return Ok(RawValueRef::Timestamp(timestamp)); + return Ok(timestamp); } // Read Second @@ -363,7 +420,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { ts_builder.build()? }; - return Ok(RawValueRef::Timestamp(timestamp)); + return Ok(timestamp); } // Millisecond Precision @@ -377,7 +434,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { ts_builder.build()? }; - return Ok(RawValueRef::Timestamp(timestamp)); + return Ok(timestamp); } // Microsecond Precision @@ -391,7 +448,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { ts_builder.build()? }; - return Ok(RawValueRef::Timestamp(timestamp)); + return Ok(timestamp); } // Nanosecond Precision @@ -404,7 +461,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { ts_builder.build()? }; - return Ok(RawValueRef::Timestamp(timestamp)); + return Ok(timestamp); } unreachable!("invalid length code for short-form timestamp"); @@ -413,10 +470,10 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Helper method callsed by [`Self::read_timestamp_short`]. Reads the time information from a // timestamp with a provided offset. 
fn read_timestamp_short_offset_after_minute( - &self, + &'top self, value_bytes: &[u8], ts_builder: TimestampBuilder, - ) -> ValueParseResult<'top, BinaryEncoding_1_1> { + ) -> IonResult { const OFFSET_MASK_16BIT: u16 = 0x03_F8; const MILLISECOND_MASK_16BIT: u16 = 0x03_FF; const MICROSECOND_MASK_32BIT: u32 = 0x0F_FF_00; @@ -433,7 +490,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Hour and Minutes at known offset if length_code == 8 { let ts_builder = ts_builder.with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } // Read seconds @@ -443,7 +500,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Seconds precision at known offset. if length_code == 9 { let ts_builder = ts_builder.with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } // Opcodes 7A, 7B, and 7C, differ in subsecond precision. @@ -455,7 +512,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { .with_milliseconds(millisecond.into()) .with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } else if length_code == 0xB { // Read microseconds let microsecond = u32::from_le_bytes(value_bytes[4..=7].try_into().unwrap()) @@ -464,14 +521,14 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { .with_microseconds(microsecond) .with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } else if length_code == 0xC { // Read nanoseconds let nanoseconds = u32::from_le_bytes(value_bytes[5..=8].try_into().unwrap()) & NANOSECOND_MASK_32BIT; let ts_builder = ts_builder.with_nanoseconds(nanoseconds).with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } unreachable!(); @@ -479,18 +536,18 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Helper method called by [`Self::read_timestamp`]. Reads the time information from a // timestamp encoded in short form. 
- fn read_timestamp_short(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_timestamp_short(&'top self) -> IonResult { const MONTH_MASK_16BIT: u16 = 0x07_80; let length_code = self.encoded_value.header.low_nibble(); - let value_bytes = self.value_body()?; + let value_bytes = self.value_body(); // Year is biased offset by 1970, and is held in the lower 7 bits of the first byte. let ts_builder = Timestamp::with_year((value_bytes[0] & 0x7F) as u32 + 1970); // Year Precision. if length_code == 0 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } // Read month.. @@ -501,7 +558,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Month Precision if length_code == 1 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } // Read day. @@ -510,7 +567,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Day Precision if length_code == 2 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } // Hour and Minute @@ -532,7 +589,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { // Helper method called by [`Self::read_timestamp`]. Reads the time information from a // timestamp encoded in long form. 
- fn read_timestamp_long(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_timestamp_long(&'top self) -> IonResult { use crate::lazy::encoder::binary::v1_1::fixed_uint::FixedUInt; use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt; use crate::types::decimal::{coefficient::Coefficient, *}; @@ -545,7 +602,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { const SECOND_MASK_16BIT: u16 = 0x0F_C0; const OFFSET_MASK_16BIT: u16 = 0x3F_FC; - let value_bytes = self.value_body()?; + let value_bytes = self.value_body(); let value_length = self.encoded_value.value_body_length; if value_length < 2 || value_length == 4 || value_length == 5 { @@ -555,7 +612,7 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { let year = u16::from_le_bytes(value_bytes[0..=1].try_into().unwrap()) & YEAR_MASK_16BIT; let ts_builder = Timestamp::with_year(year.into()); if value_length == 2 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } let month = u16::from_le_bytes(value_bytes[1..=2].try_into().unwrap()) @@ -563,12 +620,12 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { let day = value_bytes[2].extract_bitmask(DAY_MASK_8BIT); let ts_builder = ts_builder.with_month(month.into()); if value_length == 3 && day == 0 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } let ts_builder = ts_builder.with_day(day as u32); if value_length == 3 { - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } let hour = u16::from_le_bytes(value_bytes[2..=3].try_into().unwrap()) @@ -588,9 +645,9 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { if value_length == 6 { if let Some(offset) = offset { let ts_builder = ts_builder.with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } let second = u16::from_le_bytes(value_bytes[5..=6].try_into().unwrap()) @@ -600,9 
+657,9 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { if value_length == 7 { if let Some(offset) = offset { let ts_builder = ts_builder.with_offset(offset); - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } - return Ok(RawValueRef::Timestamp(ts_builder.build()?)); + return ts_builder.build(); } let scale = FlexUInt::read(&value_bytes[7..], 0)?; @@ -617,14 +674,14 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { let ts_builder = ts_builder.with_fractional_seconds(frac_sec); if let Some(offset) = offset { let ts_builder = ts_builder.with_offset(offset); - Ok(RawValueRef::Timestamp(ts_builder.build()?)) + ts_builder.build() } else { - Ok(RawValueRef::Timestamp(ts_builder.build()?)) + ts_builder.build() } } /// Helper method called by [`Self::read`]. Reads the current value as a timestamp. - fn read_timestamp(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_timestamp(&'top self) -> IonResult { debug_assert!(self.encoded_value.ion_type() == IonType::Timestamp); match self.encoded_value.header.type_code() { @@ -634,12 +691,24 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { } } + #[inline] + fn read_string(&self) -> IonResult> { + debug_assert!(self.encoded_value.ion_type() == IonType::String); + debug_assert!(!self.is_null()); + let raw_bytes = self.value_body(); + let text = std::str::from_utf8(raw_bytes) + .map_err(|_| IonError::decoding_error("found string with invalid UTF-8 data"))?; + Ok(StrRef::from(text)) + } + /// Helper method called by [`Self::read_symbol`]. Reads the current value as a symbol ID. 
- fn read_symbol_id(&self) -> IonResult { + fn read_symbol_id(&'top self) -> IonResult { let biases: [usize; 3] = [0, 256, 65792]; let length_code = self.encoded_value.header.low_nibble; if (1..=3).contains(&length_code) { - let (id, _) = self.available_body().read_fixed_uint(length_code.into())?; + let (id, _) = self + .value_body_buffer() + .read_fixed_uint(length_code.into())?; let id = usize::try_from(id.value())?; Ok(id + biases[(length_code - 1) as usize]) } else { @@ -648,71 +717,58 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { } /// Helper method called by [`Self::read`]. Reads the current value as a symbol. - fn read_symbol(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_symbol(&'top self) -> IonResult> { debug_assert!(self.encoded_value.ion_type() == IonType::Symbol); let type_code = self.encoded_value.header.ion_type_code; if type_code == OpcodeType::InlineSymbol { - let raw_bytes = self.value_body()?; + let raw_bytes = self.value_body(); let text = std::str::from_utf8(raw_bytes) .map_err(|_| IonError::decoding_error("found symbol with invalid UTF-8 data"))?; - Ok(RawValueRef::Symbol(RawSymbolRef::from(text))) + Ok(RawSymbolRef::from(text)) } else if type_code == OpcodeType::SymbolAddress { let symbol_id = self.read_symbol_id()?; - Ok(RawValueRef::Symbol(RawSymbolRef::SymbolId(symbol_id))) + Ok(RawSymbolRef::SymbolId(symbol_id)) } else { unreachable!("invalid Opcode type found for symbol"); } } - /// Helper method called by [`Self::read`]. Reads the current value as a string. - fn read_string(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { - use crate::lazy::str_ref::StrRef; - - debug_assert!(self.encoded_value.ion_type() == IonType::String); - let raw_bytes = self.value_body()?; - let text = std::str::from_utf8(raw_bytes) - .map_err(|_| IonError::decoding_error("found string with invalid UTF-8 data"))?; - Ok(RawValueRef::String(StrRef::from(text))) - } - /// Helper method called by [`Self::read`]. 
Reads the current value as a blob. - fn read_blob(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_blob(&self) -> IonResult> { debug_assert!(self.encoded_value.ion_type() == IonType::Blob); - let raw_bytes = self.value_body()?; - Ok(RawValueRef::Blob(raw_bytes.into())) + let raw_bytes = self.value_body(); + Ok(raw_bytes.into()) } /// Helper method called by [`Self::read`]. Reads the current value as a clob. - fn read_clob(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_clob(&'top self) -> IonResult> { debug_assert!(self.encoded_value.ion_type() == IonType::Clob); - let raw_bytes = self.value_body()?; - Ok(RawValueRef::Clob(raw_bytes.into())) + let raw_bytes = self.value_body(); + Ok(raw_bytes.into()) } /// Helper method called by [`Self::read`]. Reads the current value as an S-expression. - fn read_sexp(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_sexp(&'top self) -> IonResult> { use crate::lazy::binary::raw::v1_1::sequence::LazyRawBinarySExp_1_1; use crate::lazy::decoder::private::LazyContainerPrivate; debug_assert!(self.encoded_value.ion_type() == IonType::SExp); - Ok(RawValueRef::SExp(LazyRawBinarySExp_1_1::from_value(*self))) + Ok(LazyRawBinarySExp_1_1::from_value(self)) } /// Helper method called by [`Self::read`]. Reads the current value as a list. - fn read_list(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_list(&'top self) -> IonResult> { use crate::lazy::binary::raw::v1_1::sequence::LazyRawBinaryList_1_1; use crate::lazy::decoder::private::LazyContainerPrivate; debug_assert!(self.encoded_value.ion_type() == IonType::List); - Ok(RawValueRef::List(LazyRawBinaryList_1_1::from_value(*self))) + Ok(LazyRawBinaryList_1_1::from_value(self)) } /// Helper method called by [`Self::read`]. Reads the current value as a struct. 
- fn read_struct(&self) -> ValueParseResult<'top, BinaryEncoding_1_1> { + fn read_struct(&'top self) -> IonResult> { use crate::lazy::binary::raw::v1_1::r#struct::LazyRawBinaryStruct_1_1; use crate::lazy::decoder::private::LazyContainerPrivate; - Ok(RawValueRef::Struct(LazyRawBinaryStruct_1_1::from_value( - *self, - ))) + Ok(LazyRawBinaryStruct_1_1::from_value(self)) } } diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index a4210c96..24b15ffa 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -17,7 +17,7 @@ use crate::lazy::span::Span; use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; use crate::types::SymbolId; -use crate::{Decimal, Int, IonError, IonResult, IonType, RawSymbolRef, Timestamp}; +use crate::{Decimal, Int, IonEncoding, IonError, IonResult, IonType, RawSymbolRef, Timestamp}; use std::fmt::{Debug, Formatter}; use std::ops::Range; use std::{fmt, mem}; @@ -52,9 +52,13 @@ impl<'top> HasRange for LazyRawBinaryVersionMarker_1_0<'top> { } impl<'top> RawVersionMarker<'top> for LazyRawBinaryVersionMarker_1_0<'top> { - fn version(&self) -> (u8, u8) { + fn major_minor(&self) -> (u8, u8) { (self.major, self.minor) } + + fn stream_encoding_before_marker(&self) -> IonEncoding { + IonEncoding::Binary_1_0 + } } /// A value that has been identified in the input stream but whose data has not yet been read. 
@@ -312,12 +316,7 @@ impl<'top> LazyRawBinaryValue_1_0<'top> { let offset_and_length = self .encoded_value .annotations_sequence_offset() - .map(|offset| { - ( - offset, - self.encoded_value.annotations_sequence_length().unwrap(), - ) - }); + .map(|offset| (offset, self.encoded_value.annotations_sequence_length())); let (sequence_offset, sequence_length) = match offset_and_length { None => { // If there are no annotations, return an empty slice positioned on the type diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index 3658b332..be6c1830 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -1,16 +1,25 @@ use std::fmt::Debug; +use std::io::Write; use std::ops::Range; -use crate::lazy::any_encoding::IonEncoding; -use crate::lazy::encoding::{BinaryEncoding_1_0, RawValueLiteral, TextEncoding_1_0}; +use crate::lazy::any_encoding::{IonEncoding, IonVersion}; +use crate::lazy::encoder::text::v1_0::writer::LazyRawTextWriter_1_0; +use crate::lazy::encoder::text::v1_1::writer::LazyRawTextWriter_1_1; +use crate::lazy::encoder::write_as_ion::{WriteableEExp, WriteableRawValue}; +use crate::lazy::encoding::{ + BinaryEncoding, BinaryEncoding_1_0, RawValueLiteral, TextEncoding_1_0, +}; use crate::lazy::expanded::macro_evaluator::RawEExpression; -use crate::lazy::expanded::EncodingContextRef; +use crate::lazy::expanded::{EncodingContext, EncodingContextRef}; use crate::lazy::raw_stream_item::LazyRawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; use crate::read_config::ReadConfig; use crate::result::IonFailure; -use crate::{Catalog, IonResult, IonType, RawSymbolRef}; +use crate::{ + v1_0, v1_1, Catalog, Encoding, IonResult, IonType, LazyExpandedFieldName, LazyRawWriter, + RawSymbolRef, ValueRef, +}; pub trait HasSpan<'top>: HasRange { fn span(&self) -> Span<'top>; @@ -18,6 +27,15 @@ pub trait HasSpan<'top>: HasRange { pub trait HasRange { fn range(&self) -> Range; + + /// Returns the number of bytes this encoded item occupies. 
+ /// + /// This method is equivalent to calling `.range().len()`, but types have the option to + /// override its implementation if the length can be found more quickly without computing the + /// range first. + fn byte_length(&self) -> usize { + self.range().len() + } } /// A family of types that collectively comprise the lazy reader API for an Ion serialization @@ -30,11 +48,6 @@ pub trait HasRange { pub trait Decoder: 'static + Sized + Debug + Clone + Copy { /// A lazy reader that yields [`Self::Value`]s representing the top level values in its input. type Reader<'data>: LazyRawReader<'data, Self>; - /// Additional data (beyond the offset) that the reader will need in order to resume reading - /// from a different point in the stream. - // At the moment this feature is only used by `LazyAnyRawReader`, which needs to remember what - // encoding the stream was using during earlier read operations. - type ReaderSavedState: Copy + Default; /// A value (at any depth) in the input. This can be further inspected to access either its /// scalar data or, if it is a container, to view it as [`Self::List`], [`Self::SExp`] or /// [`Self::Struct`]. @@ -46,7 +59,7 @@ pub trait Decoder: 'static + Sized + Debug + Clone + Copy { /// A struct whose fields may be accessed iteratively or by field name. type Struct<'top>: LazyRawStruct<'top, Self>; /// A symbol token representing the name of a field within a struct. - type FieldName<'top>: LazyRawFieldName<'top>; + type FieldName<'top>: LazyRawFieldName<'top, Self>; /// An iterator over the annotations on the input stream's values. type AnnotationsIterator<'top>: Iterator>>; /// An e-expression invoking a macro. (Ion 1.1+) @@ -60,13 +73,73 @@ pub trait Decoder: 'static + Sized + Debug + Clone + Copy { } pub trait RawVersionMarker<'top>: Debug + Copy + Clone + HasSpan<'top> { + /// Returns the major version of the Ion encoding to which the stream is switching. 
fn major(&self) -> u8 { - self.version().0 + self.major_minor().0 } + + /// Returns the minor version of the Ion encoding to which the stream is switching. fn minor(&self) -> u8 { - self.version().1 + self.major_minor().1 + } + + /// Returns a tuple representing the `(major, minor)` version pair for the Ion encoding + /// to which the stream is switching. + fn major_minor(&self) -> (u8, u8); + + /// If this marker is encoded in binary Ion, returns `true`. Otherwise, returns `false`. + /// + /// Ion streams can switch versions (for example: from v1.0 to v1.1 or vice-versa), but they + /// cannot change formats (for example: from binary to text or vice-versa). Therefore the value + /// returned by this method will be true for the stream prior to the IVM _and_ for the stream + /// that follows the IVM. + fn is_binary(&self) -> bool { + self.stream_encoding_before_marker().is_binary() + } + + /// If this marker is encoded in text Ion, returns `true`. Otherwise, returns `false`. + /// + /// Ion streams can switch versions (for example: from v1.0 to v1.1 or vice-versa), but they + /// cannot change formats (for example: from binary to text or vice-versa). Therefore, the value + /// returned by this method will be true for the stream prior to the IVM _and_ for the stream + /// that follows the IVM. + fn is_text(&self) -> bool { + self.stream_encoding_before_marker().is_text() + } + + /// The `IonVersion` that was used to encode this IVM. + fn stream_version_before_marker(&self) -> IonVersion { + self.stream_encoding_before_marker().version() + } + + /// If this marker's `(major, minor)` version pair represents a supported Ion version, + /// returns `Ok(ion_version)`. Otherwise, returns a decoding error. To access the marker's + /// version without confirming it is supported, see [`major_minor`](Self::major_minor). 
+ fn stream_version_after_marker(&self) -> IonResult { + match self.major_minor() { + (1, 0) => Ok(IonVersion::v1_0), + (1, 1) => Ok(IonVersion::v1_1), + (major, minor) => { + IonResult::decoding_error(format!("Ion version {major}.{minor} is not supported")) + } + } + } + + fn stream_encoding_before_marker(&self) -> IonEncoding; + + /// If this marker's `(major, minor)` version pair represents a supported Ion version, + /// returns `Ok(ion_encoding)`. Otherwise, returns a decoding error. To access the marker's + /// encoding information without confirming it is supported, see [`major_minor`](Self::major_minor) and + /// [`is_binary`](Self::is_binary)/[`is_text`](Self::is_text). + fn stream_encoding_after_marker(&self) -> IonResult { + let encoding = match (self.is_binary(), self.stream_version_after_marker()?) { + (true, IonVersion::v1_0) => IonEncoding::Binary_1_0, + (false, IonVersion::v1_0) => IonEncoding::Text_1_0, + (true, IonVersion::v1_1) => IonEncoding::Binary_1_1, + (false, IonVersion::v1_1) => IonEncoding::Text_1_1, + }; + Ok(encoding) } - fn version(&self) -> (u8, u8); } /// An expression found in value position in either serialized Ion or a template. @@ -272,9 +345,10 @@ impl<'top, D: Decoder> HasRange for LazyRawFieldExpr<'top, D> { // internal code that is defined in terms of `LazyRawField` to call the private `into_value()` // function while also preventing users from seeing or depending on it. 
pub(crate) mod private { + use crate::lazy::expanded::macro_evaluator::{MacroExpr, RawEExpression}; use crate::lazy::expanded::r#struct::UnexpandedField; use crate::lazy::expanded::EncodingContextRef; - use crate::IonResult; + use crate::{try_next, try_or_some_err, IonResult, LazyExpandedValue, LazyRawFieldName}; use super::{Decoder, LazyRawFieldExpr, LazyRawStruct}; @@ -309,16 +383,24 @@ pub(crate) mod private { type Item = IonResult>; fn next(&mut self) -> Option { - let field: LazyRawFieldExpr<'top, D> = match self.raw_fields.next() { - Some(Ok(field)) => field, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; + let field: LazyRawFieldExpr<'top, D> = try_next!(self.raw_fields.next()); use LazyRawFieldExpr::*; let unexpanded_field = match field { - NameValue(name, value) => UnexpandedField::RawNameValue(self.context, name, value), - NameEExp(name, eexp) => UnexpandedField::RawNameEExp(self.context, name, eexp), - EExp(eexp) => UnexpandedField::RawEExp(self.context, eexp), + NameValue(name, value) => UnexpandedField::NameValue( + name.resolve(self.context), + LazyExpandedValue::from_literal(self.context, value), + ), + NameEExp(name, raw_eexp) => { + let eexp = try_or_some_err!(raw_eexp.resolve(self.context)); + UnexpandedField::NameMacro( + name.resolve(self.context), + MacroExpr::from_eexp(eexp), + ) + } + EExp(raw_eexp) => { + let eexp = try_or_some_err!(raw_eexp.resolve(self.context)); + UnexpandedField::Macro(MacroExpr::from_eexp(eexp)) + } }; Some(Ok(unexpanded_field)) } @@ -342,12 +424,23 @@ pub(crate) mod private { } pub trait LazyRawReader<'data, D: Decoder>: Sized { + /// Constructs a new raw reader using decoder `D` that will read from `data`. + /// `data` must be the beginning of the stream. To continue reading from the middle of a + /// stream, see [`resume_at_offset`](Self::resume_at_offset). 
fn new(data: &'data [u8]) -> Self { - Self::resume_at_offset(data, 0, D::ReaderSavedState::default()) + Self::resume_at_offset(data, 0, IonEncoding::default()) } - fn resume_at_offset(data: &'data [u8], offset: usize, saved_state: D::ReaderSavedState) - -> Self; + /// Constructs a new raw reader using decoder `D` that will read from `data`. + /// + /// Automatically detecting the stream's encoding is only possible when `offset` is zero. + /// If offset is not zero, the caller must supply an `encoding_hint` indicating the expected + /// encoding. Encoding-specific raw readers will ignore this hint--the stream's encoding must be + /// the one that they support--but the `LazyRawAnyReader` will use it. + fn resume_at_offset(data: &'data [u8], offset: usize, encoding_hint: IonEncoding) -> Self; + + /// Deconstructs this reader, returning a tuple of `(remaining_data, stream_offset, encoding)`. + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding); fn next<'top>( &'top mut self, @@ -356,18 +449,112 @@ pub trait LazyRawReader<'data, D: Decoder>: Sized { where 'data: 'top; - fn save_state(&self) -> D::ReaderSavedState { - D::ReaderSavedState::default() - } - /// The stream byte offset at which the reader will begin parsing the next item to return. /// This position is not necessarily the first byte of the next value; it may be (e.g.) a NOP, /// a comment, or whitespace that the reader will traverse as part of matching the next item. fn position(&self) -> usize; + /// The Ion encoding of the stream that the reader has been processing. + /// + /// Note that: + /// * Before any items have been read from the stream, the encoding defaults + /// to [`IonEncoding::Text_1_0`]. + /// * When an IVM is encountered, the Ion version reported afterward can be different but the + /// format (text vs binary) will remain the same. fn encoding(&self) -> IonEncoding; } +/// Allows writers to specify which Ion encodings they can losslessly transcribe from. 
+/// +/// TODO: At the moment, this implementation does not process encoding directives in the +/// input stream, which means it only works for very simple use cases. A better solution +/// would be to take a `&mut SystemReader<_>` that can maintain the encoding context while +/// also only paying attention to stream literals. +pub trait TranscribeRaw { + fn transcribe<'a, R: LazyRawReader<'a, E>>(&mut self, reader: &mut R) -> IonResult<()> + where + Self: 'a; +} + +impl TranscribeRaw for LazyRawTextWriter_1_1 { + fn transcribe<'a, R: LazyRawReader<'a, v1_1::Binary>>( + &mut self, + reader: &mut R, + ) -> IonResult<()> + where + Self: 'a, + { + transcribe_raw_binary_to_text(reader, self) + } +} + +impl TranscribeRaw for LazyRawTextWriter_1_1 { + fn transcribe<'a, R: LazyRawReader<'a, v1_0::Binary>>( + &mut self, + reader: &mut R, + ) -> IonResult<()> + where + Self: 'a, + { + transcribe_raw_binary_to_text(reader, self) + } +} + +impl TranscribeRaw for LazyRawTextWriter_1_0 { + fn transcribe<'a, R: LazyRawReader<'a, v1_0::Binary>>( + &mut self, + reader: &mut R, + ) -> IonResult<()> + where + Self: 'a, + { + transcribe_raw_binary_to_text(reader, self) + } +} + +fn transcribe_raw_binary_to_text< + 'a, + W: Write + 'a, + InputEncoding: BinaryEncoding<'a>, + Reader: LazyRawReader<'a, InputEncoding>, + Writer: LazyRawWriter, +>( + reader: &mut Reader, + writer: &mut Writer, +) -> IonResult<()> { + const FLUSH_EVERY_N: usize = 100; + let encoding_context = EncodingContext::for_ion_version(IonVersion::v1_1); + let context_ref = encoding_context.get_ref(); + let mut item_number: usize = 0; + loop { + let item = reader.next(context_ref)?; + use crate::RawStreamItem::*; + match item { + VersionMarker(_m) if item_number == 0 => { + // The writer automatically emits an IVM at the head of the output. + } + VersionMarker(_m) => { + // If the reader surfaces another IVM, write a matching one in the output. + writer.write_version_marker()? 
+ } + Value(v) => { + writer.write(WriteableRawValue::new(v))?; + } + EExp(e) => { + writer.write(WriteableEExp::new(e))?; + } + EndOfStream(_) => { + writer.flush()?; + return Ok(()); + } + } + item_number += 1; + if item_number % FLUSH_EVERY_N == 0 { + writer.flush()?; + } + } +} + pub trait LazyRawContainer<'top, D: Decoder> { fn as_value(&self) -> D::Value<'top>; } @@ -380,6 +567,9 @@ pub trait LazyRawValue<'top, D: Decoder>: fn has_annotations(&self) -> bool; fn annotations(&self) -> D::AnnotationsIterator<'top>; fn read(&self) -> IonResult>; + fn read_resolved(&self, context: EncodingContextRef<'top>) -> IonResult> { + self.read()?.resolve(context) + } fn annotations_span(&self) -> Span<'top>; @@ -410,6 +600,12 @@ pub trait LazyRawStruct<'top, D: Decoder>: fn iter(&self) -> Self::Iterator; } -pub trait LazyRawFieldName<'top>: HasSpan<'top> + Copy + Debug + Clone { +pub trait LazyRawFieldName<'top, D: Decoder = Self>>: + HasSpan<'top> + Copy + Debug + Clone +{ fn read(&self) -> IonResult>; + + fn resolve(&self, context: EncodingContextRef<'top>) -> LazyExpandedFieldName<'top, D> { + LazyExpandedFieldName::RawName(context, *self) + } } diff --git a/src/lazy/encoder/binary/v1_0/writer.rs b/src/lazy/encoder/binary/v1_0/writer.rs index a0995229..fe40d098 100644 --- a/src/lazy/encoder/binary/v1_0/writer.rs +++ b/src/lazy/encoder/binary/v1_0/writer.rs @@ -13,7 +13,7 @@ use crate::lazy::encoder::LazyRawWriter; use crate::lazy::encoding::Encoding; use crate::unsafe_helpers::{mut_ref_to_ptr, ptr_to_mut_ref}; use crate::write_config::{WriteConfig, WriteConfigKind}; -use crate::IonResult; +use crate::{IonEncoding, IonResult}; /// A "raw"-level streaming binary Ion writer. This writer does not provide symbol table /// management; symbol-related operations (e.g. setting field IDs and annotations or writing symbol @@ -125,19 +125,28 @@ impl LazyRawWriter for LazyRawBinaryWriter_1_0 { } } + fn output(&self) -> &W { + &self.output + } + delegate! 
{ to self { fn flush(&mut self) -> IonResult<()>; } } - fn output(&self) -> &W { - &self.output - } - fn output_mut(&mut self) -> &mut W { &mut self.output } + + fn write_version_marker(&mut self) -> IonResult<()> { + self.output.write_all(&[0xE0, 0x01, 0x00, 0xEA])?; + Ok(()) + } + + fn encoding(&self) -> IonEncoding { + IonEncoding::Binary_1_0 + } } impl MakeValueWriter for LazyRawBinaryWriter_1_0 { diff --git a/src/lazy/encoder/binary/v1_1/fixed_int.rs b/src/lazy/encoder/binary/v1_1/fixed_int.rs index dc3a0916..caa6862d 100644 --- a/src/lazy/encoder/binary/v1_1/fixed_int.rs +++ b/src/lazy/encoder/binary/v1_1/fixed_int.rs @@ -1,7 +1,5 @@ use std::io::Write; -use ice_code::ice as cold_path; - use crate::decimal::coefficient::Coefficient; use crate::result::IonFailure; use crate::{Int, IonResult}; @@ -18,8 +16,12 @@ pub(crate) const MAX_UINT_SIZE_IN_BYTES: usize = std::mem::size_of::(); impl FixedInt { fn new(size_in_bytes: usize, value: impl Into) -> Self { + Self::from_int(size_in_bytes, value.into()) + } + + pub(crate) const fn from_int(size_in_bytes: usize, value: Int) -> Self { Self { - value: value.into(), + value, size_in_bytes, } } @@ -35,13 +37,36 @@ impl FixedInt { if input.len() < size_in_bytes { return IonResult::incomplete("reading a FixedInt", offset); } + // By branching on particular values, we make the value of `size_in_bytes` in their + // corresponding arm `const`. This allows us to use `read_const` to optimize for those + // sizes. + let fixed_int = match size_in_bytes { + 0 => FixedInt::from_int(0, Int::ZERO), + 1 => Self::read_const::<1>(input.try_into().unwrap()), + 2 => Self::read_const::<2>(input.try_into().unwrap()), + n if n <= MAX_INT_SIZE_IN_BYTES => Self::read_general_case(input, n), + _ => { + return IonResult::decoding_error( + "found a FixedInt that was larger than the supported maximum", + ) + } + }; + Ok(fixed_int) + } - if size_in_bytes > MAX_INT_SIZE_IN_BYTES { - return cold_path! 
{{ - IonResult::decoding_error("found a FixedInt that was larger than the supported maximum") - }}; - } + /// When the size of the FixedInt is known, the generated assembly for parsing it is more + /// efficient. This `const` read method is useful for optimizing common cases. + #[inline] + pub(crate) fn read_const(input: [u8; N]) -> FixedInt { + let mut buffer = [0u8; MAX_INT_SIZE_IN_BYTES]; + *buffer.last_chunk_mut::().unwrap() = input; + let value = i128::from_le_bytes(buffer) + .checked_shr(128 - (N as u32 * 8)) + .unwrap_or(0i128); + FixedInt::new(N, value) + } + fn read_general_case(input: &[u8], size_in_bytes: usize) -> FixedInt { const BUFFER_SIZE: usize = MAX_INT_SIZE_IN_BYTES; let mut buffer = [0u8; BUFFER_SIZE]; // Copy the input into the buffer as the _most_ significant bits, read as i128, and then @@ -52,7 +77,7 @@ impl FixedInt { .checked_shr(128 - (size_in_bytes as u32 * 8)) .unwrap_or(0) .into(); - Ok(FixedInt::new(size_in_bytes, value)) + FixedInt::new(size_in_bytes, value) } #[inline] diff --git a/src/lazy/encoder/binary/v1_1/flex_uint.rs b/src/lazy/encoder/binary/v1_1/flex_uint.rs index 841a9664..fe0756e5 100644 --- a/src/lazy/encoder/binary/v1_1/flex_uint.rs +++ b/src/lazy/encoder/binary/v1_1/flex_uint.rs @@ -56,9 +56,14 @@ impl FlexUInt { // FlexUInt we find requires more than 8 bytes to represent, we'll fall back to the general // case. if input.len() < COMMON_CASE_INPUT_BYTES_NEEDED || input[0] == 0 { - // `read_flex_uint_slow` is marked #[cold] to discourage inlining it, which keeps - // this method small enough that the code for the common case can be inlined. - return Self::read_flex_primitive_as_uint(input, offset, "reading a FlexUInt", false); + // Calling `read_flex_primitive_as_uint_no_inline` keeps this method small enough that + // the code for the common case can be inlined. 
+ return Self::read_flex_primitive_as_uint_no_inline( + input, + offset, + "reading a FlexUInt", + false, + ); } let flex_uint = Self::read_small_flex_uint(input); @@ -90,6 +95,16 @@ impl FlexUInt { FlexUInt::new(num_encoded_bytes, value) } + #[inline(never)] + pub(crate) fn read_flex_primitive_as_uint_no_inline( + input: &[u8], + offset: usize, + label: &'static str, + support_sign_extension: bool, + ) -> IonResult { + Self::read_flex_primitive_as_uint(input, offset, label, support_sign_extension) + } + /// Helper method that reads a flex-encoded primitive from the buffer, returning it as a `FlexUInt`. /// If an error occurs while reading, its description will include the supplied `label`. /// diff --git a/src/lazy/encoder/binary/v1_1/writer.rs b/src/lazy/encoder/binary/v1_1/writer.rs index bb5afc9f..c7c5f56c 100644 --- a/src/lazy/encoder/binary/v1_1/writer.rs +++ b/src/lazy/encoder/binary/v1_1/writer.rs @@ -13,7 +13,7 @@ use crate::lazy::encoder::LazyRawWriter; use crate::lazy::encoding::Encoding; use crate::unsafe_helpers::{mut_ref_to_ptr, ptr_to_mut_ref}; use crate::write_config::{WriteConfig, WriteConfigKind}; -use crate::IonResult; +use crate::{IonEncoding, IonResult}; /// A "raw"-level streaming binary Ion 1.1 writer. 
This writer does not provide encoding module /// management; symbol- and macro- related operations require the caller to perform their own @@ -147,6 +147,15 @@ impl LazyRawWriter for LazyRawBinaryWriter_1_1 { fn output_mut(&mut self) -> &mut W { &mut self.output } + + fn write_version_marker(&mut self) -> IonResult<()> { + self.output.write_all(&[0xE0, 0x01, 0x01, 0xEA])?; + Ok(()) + } + + fn encoding(&self) -> IonEncoding { + IonEncoding::Binary_1_1 + } } impl MakeValueWriter for LazyRawBinaryWriter_1_1 { diff --git a/src/lazy/encoder/mod.rs b/src/lazy/encoder/mod.rs index b908e82a..386d5302 100644 --- a/src/lazy/encoder/mod.rs +++ b/src/lazy/encoder/mod.rs @@ -7,7 +7,7 @@ use value_writer::SequenceWriter; use crate::lazy::encoding::Encoding; use crate::write_config::WriteConfig; -use crate::IonResult; +use crate::{IonEncoding, IonResult}; pub mod annotate; pub mod annotation_seq; @@ -57,6 +57,7 @@ pub trait LazyRawWriter: SequenceWriter { fn new(output: W) -> IonResult where Self: Sized; + fn build(config: WriteConfig, output: W) -> IonResult where Self: Sized; @@ -65,6 +66,10 @@ pub trait LazyRawWriter: SequenceWriter { fn output(&self) -> &W; fn output_mut(&mut self) -> &mut W; + + fn write_version_marker(&mut self) -> IonResult<()>; + + fn encoding(&self) -> IonEncoding; } #[cfg(test)] diff --git a/src/lazy/encoder/text/v1_0/value_writer.rs b/src/lazy/encoder/text/v1_0/value_writer.rs index 30c97fce..0b9a7c99 100644 --- a/src/lazy/encoder/text/v1_0/value_writer.rs +++ b/src/lazy/encoder/text/v1_0/value_writer.rs @@ -1,3 +1,8 @@ +use std::fmt::Formatter; +use std::io::Write; + +use delegate::delegate; + use crate::lazy::encoder::annotation_seq::{AnnotationSeq, AnnotationsVec}; use crate::lazy::encoder::private::Sealed; use crate::lazy::encoder::text::v1_0::writer::LazyRawTextWriter_1_0; @@ -14,9 +19,6 @@ use crate::text::text_formatter::{FmtValueFormatter, IoValueFormatter}; use crate::text::whitespace_config::WhitespaceConfig; use crate::types::{ContainerType, 
ParentType}; use crate::{Decimal, Int, IonResult, IonType, RawSymbolRef, Timestamp}; -use delegate::delegate; -use std::fmt::Formatter; -use std::io::Write; pub struct TextValueWriter_1_0<'value, W: Write + 'value> { pub(crate) writer: &'value mut LazyRawTextWriter_1_0, diff --git a/src/lazy/encoder/text/v1_0/writer.rs b/src/lazy/encoder/text/v1_0/writer.rs index 1b312404..90db6a80 100644 --- a/src/lazy/encoder/text/v1_0/writer.rs +++ b/src/lazy/encoder/text/v1_0/writer.rs @@ -13,7 +13,7 @@ use crate::text::whitespace_config::{ }; use crate::types::ParentType; use crate::write_config::WriteConfigKind; -use crate::{IonResult, TextFormat, WriteConfig}; +use crate::{IonEncoding, IonResult, TextFormat, WriteConfig}; /// A raw text Ion 1.0 writer. pub struct LazyRawTextWriter_1_0 { @@ -113,6 +113,16 @@ impl LazyRawWriter for LazyRawTextWriter_1_0 { fn output_mut(&mut self) -> &mut W { &mut self.output } + + fn write_version_marker(&mut self) -> IonResult<()> { + let space_between = self.whitespace_config.space_between_top_level_values; + write!(self.output, "$ion_1_0{space_between}")?; + Ok(()) + } + + fn encoding(&self) -> IonEncoding { + IonEncoding::Text_1_0 + } } #[cfg(test)] diff --git a/src/lazy/encoder/text/v1_1/writer.rs b/src/lazy/encoder/text/v1_1/writer.rs index 94d24ebd..6154d741 100644 --- a/src/lazy/encoder/text/v1_1/writer.rs +++ b/src/lazy/encoder/text/v1_1/writer.rs @@ -10,7 +10,7 @@ use crate::text::whitespace_config::{ COMPACT_WHITESPACE_CONFIG, LINES_WHITESPACE_CONFIG, PRETTY_WHITESPACE_CONFIG, }; use crate::write_config::WriteConfigKind; -use crate::{IonResult, TextFormat, WriteConfig}; +use crate::{IonEncoding, IonResult, TextFormat, WriteConfig}; // Text Ion 1.1 is a syntactic superset of Ion 1.0. The types comprising this writer implementation // delegates nearly all of their functionality to the 1.0 text writer. 
@@ -90,15 +90,30 @@ impl LazyRawWriter for LazyRawTextWriter_1_1 { fn output_mut(&mut self) -> &mut W { self.writer_1_0.output_mut() } + + fn write_version_marker(&mut self) -> IonResult<()> { + let space_between = self + .writer_1_0 + .whitespace_config + .space_between_top_level_values; + write!(self.writer_1_0.output, "$ion_1_1{space_between}")?; + Ok(()) + } + + fn encoding(&self) -> IonEncoding { + IonEncoding::Text_1_1 + } } #[cfg(test)] mod tests { + use crate::lazy::any_encoding::IonVersion; use crate::lazy::decoder::{LazyRawReader, LazyRawSequence, LazyRawValue}; use crate::lazy::encoder::text::v1_1::writer::LazyRawTextWriter_1_1; use crate::lazy::encoder::value_writer::{SequenceWriter, StructWriter, ValueWriter}; use crate::lazy::encoder::write_as_ion::WriteAsSExp; use crate::lazy::encoder::LazyRawWriter; + use crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::macro_evaluator::RawEExpression; use crate::lazy::expanded::EncodingContext; use crate::lazy::text::raw::v1_1::reader::{LazyRawTextReader_1_1, MacroIdRef}; @@ -264,56 +279,46 @@ mod tests { println!("{encoded_text}"); let mut reader = LazyRawTextReader_1_1::new(encoded_text.as_bytes()); - let empty_context = EncodingContext::empty(); - let context = empty_context.get_ref(); + let mut context = EncodingContext::for_ion_version(IonVersion::v1_1); + let macro_foo = + TemplateCompiler::compile_from_text(context.get_ref(), "(macro foo (x*) null)")?; + context.macro_table.add_macro(macro_foo)?; + let context = context.get_ref(); let _marker = reader.next(context)?.expect_ivm()?; - let eexp = reader.next(context)?.expect_macro_invocation()?; + let eexp = reader.next(context)?.expect_eexp()?; assert_eq!(MacroIdRef::LocalName("foo"), eexp.id()); let mut args = eexp.raw_arguments(); - let int_arg = args.next().unwrap()?.expect_value()?.read()?.expect_int()?; - assert_eq!(int_arg, 1.into()); - let list_arg = args - .next() - .unwrap()? - .expect_value()? - .read()? 
- .expect_list()?; - let mut list_values = list_arg.iter(); - let value = list_values + let x = args.next().unwrap()?.expr().expect_arg_group()?; + let mut x_values = x.into_iter(); + let int_value = x_values .next() .unwrap()? - .expect_value()? + .expect_value() + .unwrap() .read()? .expect_i64()?; - assert_eq!(value, 2); - let value = list_values + assert_eq!(int_value, 1); + let list_value = x_values .next() .unwrap()? .expect_value()? .read()? - .expect_i64()?; - assert_eq!(value, 3); - let value = list_values - .next() - .unwrap()? - .expect_value()? - .read()? - .expect_i64()?; - assert_eq!(value, 4); - let string_arg = args + .expect_list()?; + assert_eq!(list_value.iter().count(), 3); + let string_value = x_values .next() .unwrap()? .expect_value()? .read()? .expect_string()?; - assert_eq!(string_arg.text(), "bar"); - let symbol_arg = args + assert_eq!(string_value, "bar"); + let symbol_value = x_values .next() .unwrap()? .expect_value()? .read()? .expect_symbol()?; - assert_eq!(symbol_arg, RawSymbolRef::Text("+++")); + assert_eq!(symbol_value, RawSymbolRef::Text("+++")); Ok(()) } diff --git a/src/lazy/encoder/write_as_ion.rs b/src/lazy/encoder/write_as_ion.rs index 93affe00..47b925a1 100644 --- a/src/lazy/encoder/write_as_ion.rs +++ b/src/lazy/encoder/write_as_ion.rs @@ -17,14 +17,18 @@ use std::io; use std::marker::PhantomData; -use crate::lazy::decoder::Decoder; +use crate::lazy::decoder::{Decoder, LazyRawValueExpr, RawValueExpr}; use crate::lazy::encoder::annotation_seq::AnnotationsVec; use crate::lazy::encoder::value_writer::{SequenceWriter, StructWriter, ValueWriter}; use crate::lazy::encoding::Encoding; +use crate::lazy::expanded::macro_evaluator::RawEExpression; +use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr}; use crate::lazy::value::LazyValue; use crate::lazy::value_ref::ValueRef; +use crate::v1_0::RawValueRef; use crate::{ - Blob, Clob, Decimal, Element, Int, IonResult, IonType, LazyList, LazySExp, LazyStruct, Null, + Blob, 
Clob, Decimal, Element, Int, IonResult, IonType, LazyList, LazyRawFieldExpr, + LazyRawFieldName, LazyRawSequence, LazyRawStruct, LazyRawValue, LazySExp, LazyStruct, Null, RawSymbolRef, Symbol, SymbolRef, Timestamp, Value, WriteConfig, }; @@ -286,6 +290,193 @@ impl<'a, D: Decoder> WriteAsIon for LazyValue<'a, D> { } } +impl<'a, D: Decoder> WriteAsIon for RawValueRef<'a, D> { + fn write_as_ion(&self, value_writer: V) -> IonResult<()> { + use RawValueRef::*; + match self { + Null(i) => value_writer.write_null(*i), + Bool(b) => value_writer.write_bool(*b), + Int(i) => value_writer.write_int(i), + Float(f) => value_writer.write_f64(*f), + Decimal(d) => value_writer.write_decimal(d), + Timestamp(t) => value_writer.write_timestamp(t), + Symbol(s) => value_writer.write_symbol(s), + String(s) => value_writer.write_string(s.text()), + Clob(c) => value_writer.write_clob(c.as_ref()), + Blob(b) => value_writer.write_blob(b.as_ref()), + List(l) => { + let mut list_writer = value_writer.list_writer()?; + for value_result in l.iter() { + list_writer.write(WriteableRawValueExpr::<'_, D>::new(value_result?))?; + } + list_writer.close() + } + SExp(s) => { + let mut sexp_writer = value_writer.sexp_writer()?; + for value_result in s.iter() { + sexp_writer.write(WriteableRawValueExpr::<'_, D>::new(value_result?))?; + } + sexp_writer.close() + } + Struct(s) => { + let mut struct_writer = value_writer.struct_writer()?; + for field_result in s.iter() { + let field: LazyRawFieldExpr = field_result?; + match field { + LazyRawFieldExpr::NameValue(name, value) => { + struct_writer.write(name.read()?, WriteableRawValue::new(value))?; + } + LazyRawFieldExpr::NameEExp(name, eexp) => { + struct_writer.write(name.read()?, WriteableEExp::new(eexp))?; + } + LazyRawFieldExpr::EExp(_eexp) => { + todo!("Writing e-expressions in field name position during transcription."); + } + } + } + struct_writer.close() + } + } + } +} + +/// Wrapper type for `LazyRawValue`s that implements `WriteAsIon`. 
+pub struct WriteableRawValue<'a, D: Decoder, RawValue: LazyRawValue<'a, D>> { + raw_value: RawValue, + spooky: PhantomData<&'a D>, +} + +impl<'a, D: Decoder, RawValue: LazyRawValue<'a, D>> WriteableRawValue<'a, D, RawValue> { + pub fn new(raw_value: RawValue) -> Self { + Self { + raw_value, + spooky: PhantomData, + } + } +} + +impl<'a, D: Decoder, RawValue: LazyRawValue<'a, D>> WriteAsIon + for WriteableRawValue<'a, D, RawValue> +{ + fn write_as_ion(&self, writer: V) -> IonResult<()> { + if self.raw_value.has_annotations() { + let mut annotations = AnnotationsVec::new(); + for annotation in self.raw_value.annotations() { + annotations.push(annotation?); + } + self.raw_value + .read()? + .write_as_ion(writer.with_annotations(annotations)?) + } else { + self.raw_value.read()?.write_as_ion(writer) + } + } +} + +/// Wrapper type for `RawEExpression`s that implements `WriteAsIon`. +pub struct WriteableEExp<'a, D: Decoder = RawEExp>, RawEExp: RawEExpression<'a, D> + 'a> { + raw_eexp: RawEExp, + spooky: PhantomData<&'a D>, +} + +impl<'a, D: Decoder = RawEExp>, RawEExp: RawEExpression<'a, D> + 'a> + WriteableEExp<'a, D, RawEExp> +{ + pub fn new(raw_eexp: RawEExp) -> Self { + Self { + raw_eexp, + spooky: PhantomData, + } + } +} + +impl<'a, D: Decoder = RawEExp>, RawEExp: RawEExpression<'a, D> + 'a> WriteAsIon + for WriteableEExp<'a, D, RawEExp> +{ + fn write_as_ion(&self, writer: V) -> IonResult<()> { + let id = self.raw_eexp.id(); + let mut eexp_writer = writer.eexp_writer(id)?; + for arg_result in self.raw_eexp.raw_arguments() { + let arg = arg_result?; + eexp_writer.write(WriteableEExpArg::<'_, D>::new(arg))?; + } + eexp_writer.close() + } +} + +/// Wrapper type for `EExpArg`s that implements `WriteAsIon`. 
+pub struct WriteableEExpArg<'a, D: Decoder> { + arg_expr: EExpArg<'a, D>, + spooky: PhantomData<&'a D>, +} + +impl<'a, D: Decoder> WriteableEExpArg<'a, D> { + pub fn new(arg_expr: EExpArg<'a, D>) -> Self { + Self { + arg_expr, + spooky: PhantomData, + } + } +} + +impl<'a, D: Decoder> WriteAsIon for WriteableEExpArg<'a, D> { + fn write_as_ion(&self, writer: V) -> IonResult<()> { + use EExpArgExpr::*; + match self.arg_expr.expr() { + // TODO: Untagged encodings + ValueLiteral(v) => WriteableRawValue::new(*v).write_as_ion(writer), + EExp(e) => WriteableEExp::new(*e).write_as_ion(writer), + ArgGroup(group) => WriteableEExpArgGroup::<'_, D>::new(*group).write_as_ion(writer), + } + } +} + +/// Wrapper type for e-expression argument groups (`RawEExpression::ArgGroup`) that implements `WriteAsIon`. +pub struct WriteableEExpArgGroup<'a, D: Decoder> { + arg_group: <::EExp<'a> as RawEExpression<'a, D>>::ArgGroup, + spooky: PhantomData<&'a D>, +} + +impl<'a, D: Decoder> WriteableEExpArgGroup<'a, D> { + pub fn new(arg_group: <::EExp<'a> as RawEExpression<'a, D>>::ArgGroup) -> Self { + Self { + arg_group, + spooky: PhantomData, + } + } +} + +impl<'a, D: Decoder> WriteAsIon for WriteableEExpArgGroup<'a, D> { + fn write_as_ion(&self, _writer: V) -> IonResult<()> { + todo!() + } +} + +/// Wrapper type for `LazyRawValueExpr`s that implements `WriteAsIon`. 
+pub struct WriteableRawValueExpr<'a, D: Decoder> { + raw_value_expr: LazyRawValueExpr<'a, D>, + spooky: PhantomData<&'a D>, +} + +impl<'a, D: Decoder> WriteableRawValueExpr<'a, D> { + pub fn new(raw_value_expr: LazyRawValueExpr<'a, D>) -> Self { + Self { + raw_value_expr, + spooky: PhantomData, + } + } +} + +impl<'a, D: Decoder> WriteAsIon for WriteableRawValueExpr<'a, D> { + fn write_as_ion(&self, writer: V) -> IonResult<()> { + use RawValueExpr::*; + match self.raw_value_expr { + ValueLiteral(v) => WriteableRawValue::new(v).write_as_ion(writer), + EExp(e) => WriteableEExp::new(e).write_as_ion(writer), + } + } +} + impl<'a, D: Decoder> WriteAsIon for ValueRef<'a, D> { fn write_as_ion(&self, value_writer: V) -> IonResult<()> { use ValueRef::*; diff --git a/src/lazy/encoder/writer.rs b/src/lazy/encoder/writer.rs index 2ca322fb..5b4bb3bc 100644 --- a/src/lazy/encoder/writer.rs +++ b/src/lazy/encoder/writer.rs @@ -23,14 +23,14 @@ use crate::{ SymbolTable, Timestamp, Value, }; -pub(crate) struct EncodingContext { +pub(crate) struct WriteContext { symbol_table: SymbolTable, num_pending_symbols: usize, symbol_creation_policy: SymbolCreationPolicy, supports_text_tokens: bool, } -impl EncodingContext { +impl WriteContext { pub fn new( symbol_table: SymbolTable, symbol_creation_policy: SymbolCreationPolicy, @@ -47,7 +47,7 @@ impl EncodingContext { /// An Ion writer that maintains a symbol table and creates new entries as needed. 
pub struct Writer { - encoding_context: EncodingContext, + write_context: WriteContext, data_writer: E::Writer>, directive_writer: E::Writer>, output: Output, @@ -67,14 +67,15 @@ impl Writer { // Erase the IVM that's created by default data_writer.output_mut().clear(); // TODO: LazyEncoder should define a method to construct a new symtab and/or macro table - let symbol_table = SymbolTable::new(); - let encoding_context = EncodingContext::new( + let ion_version = E::ion_version(); + let symbol_table = SymbolTable::new(ion_version); + let encoding_context = WriteContext::new( symbol_table, E::DEFAULT_SYMBOL_CREATION_POLICY, E::SUPPORTS_TEXT_TOKENS, ); let mut writer = Writer { - encoding_context, + write_context: encoding_context, data_writer, directive_writer, output, @@ -100,9 +101,9 @@ impl Writer { /// Writes bytes of previously encoded values to the output stream. pub fn flush(&mut self) -> IonResult<()> { - if self.encoding_context.num_pending_symbols > 0 { + if self.write_context.num_pending_symbols > 0 { self.write_lst_append()?; - self.encoding_context.num_pending_symbols = 0; + self.write_context.num_pending_symbols = 0; } self.directive_writer.flush()?; @@ -125,14 +126,12 @@ impl Writer { /// Helper method to encode an LST append containing pending symbols. fn write_lst_append(&mut self) -> IonResult<()> { let Self { - encoding_context, + write_context: encoding_context, directive_writer, .. } = self; - let num_existing_symbols = encoding_context.symbol_table.len(); let num_pending_symbols = encoding_context.num_pending_symbols; - let mut lst = directive_writer .value_writer() .with_annotations(system_symbol_ids::ION_SYMBOL_TABLE)? 
@@ -145,7 +144,7 @@ impl Writer { let pending_symbols = encoding_context .symbol_table - .symbols_tail(num_existing_symbols - num_pending_symbols) + .symbols_tail(num_pending_symbols) .iter() .map(Symbol::text); @@ -166,7 +165,7 @@ impl MakeValueWriter for Writer { ApplicationValueWriter { raw_value_writer, - encoding: &mut self.encoding_context, + encoding: &mut self.write_context, } } } @@ -181,12 +180,12 @@ impl SequenceWriter for Writer { } pub struct ApplicationValueWriter<'a, V: ValueWriter> { - encoding: &'a mut EncodingContext, + encoding: &'a mut WriteContext, raw_value_writer: V, } impl<'a, V: ValueWriter> ApplicationValueWriter<'a, V> { - pub(crate) fn new(encoding_context: &'a mut EncodingContext, raw_value_writer: V) -> Self { + pub(crate) fn new(encoding_context: &'a mut WriteContext, raw_value_writer: V) -> Self { Self { encoding: encoding_context, raw_value_writer, @@ -336,13 +335,13 @@ impl<'value, V: ValueWriter> ValueWriter for ApplicationValueWriter<'value, V> { } pub struct ApplicationStructWriter<'value, V: ValueWriter> { - encoding: &'value mut EncodingContext, + encoding: &'value mut WriteContext, raw_struct_writer: V::StructWriter, } impl<'value, V: ValueWriter> ApplicationStructWriter<'value, V> { pub(crate) fn new( - encoding_context: &'value mut EncodingContext, + encoding_context: &'value mut WriteContext, raw_struct_writer: V::StructWriter, ) -> Self { Self { @@ -409,13 +408,13 @@ impl<'value, V: ValueWriter> StructWriter for ApplicationStructWriter<'value, V> } pub struct ApplicationListWriter<'value, V: ValueWriter> { - encoding: &'value mut EncodingContext, + encoding: &'value mut WriteContext, raw_list_writer: V::ListWriter, } impl<'value, V: ValueWriter> ApplicationListWriter<'value, V> { pub(crate) fn new( - encoding_context: &'value mut EncodingContext, + encoding_context: &'value mut WriteContext, raw_list_writer: V::ListWriter, ) -> Self { Self { @@ -444,15 +443,12 @@ impl<'value, V: ValueWriter> SequenceWriter for 
ApplicationListWriter<'value, V> } pub struct ApplicationSExpWriter<'value, V: ValueWriter> { - encoding: &'value mut EncodingContext, + encoding: &'value mut WriteContext, raw_sexp_writer: V::SExpWriter, } impl<'value, V: ValueWriter> ApplicationSExpWriter<'value, V> { - pub(crate) fn new( - encoding: &'value mut EncodingContext, - raw_sexp_writer: V::SExpWriter, - ) -> Self { + pub(crate) fn new(encoding: &'value mut WriteContext, raw_sexp_writer: V::SExpWriter) -> Self { Self { encoding, raw_sexp_writer, @@ -478,15 +474,12 @@ impl<'value, V: ValueWriter> SequenceWriter for ApplicationSExpWriter<'value, V> } pub struct ApplicationEExpWriter<'value, V: ValueWriter> { - encoding: &'value mut EncodingContext, + encoding: &'value mut WriteContext, raw_eexp_writer: V::EExpWriter, } impl<'value, V: ValueWriter> ApplicationEExpWriter<'value, V> { - pub(crate) fn new( - encoding: &'value mut EncodingContext, - raw_eexp_writer: V::EExpWriter, - ) -> Self { + pub(crate) fn new(encoding: &'value mut WriteContext, raw_eexp_writer: V::EExpWriter) -> Self { Self { encoding, raw_eexp_writer, diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index ef0451f9..9744d62b 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -3,7 +3,7 @@ use std::fmt::Debug; use std::io; -use crate::lazy::any_encoding::{IonEncoding, LazyRawAnyValue}; +use crate::lazy::any_encoding::{IonEncoding, IonVersion, LazyRawAnyValue}; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::r#struct::{LazyRawBinaryFieldName_1_0, LazyRawBinaryStruct_1_0}; use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0; @@ -27,14 +27,14 @@ use crate::lazy::text::raw::reader::LazyRawTextReader_1_0; use crate::lazy::text::raw::sequence::{LazyRawTextList_1_0, LazyRawTextSExp_1_0}; use crate::lazy::text::raw::v1_1::reader::{ LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextReader_1_1, LazyRawTextSExp_1_1, - LazyRawTextStruct_1_1, 
RawTextEExpression_1_1, + LazyRawTextStruct_1_1, TextEExpression_1_1, }; use crate::lazy::text::value::{ LazyRawTextValue, LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker_1_0, LazyRawTextVersionMarker_1_1, RawTextAnnotationsIterator, }; -use crate::lazy::binary::raw::v1_1::e_expression::RawBinaryEExpression_1_1; +use crate::lazy::binary::raw::v1_1::e_expression::BinaryEExpression_1_1; use crate::{IonResult, TextFormat, WriteConfig}; /// Marker trait for types that represent an Ion encoding. @@ -63,7 +63,21 @@ pub trait Encoding: Encoder + Decoder { } fn encoding(&self) -> IonEncoding; + fn instance() -> Self; fn name() -> &'static str; + + fn is_binary() -> bool { + Self::instance().encoding().is_binary() + } + + fn is_text() -> bool { + Self::instance().encoding().is_text() + } + + fn ion_version() -> IonVersion { + Self::instance().encoding().version() + } + fn default_write_config() -> WriteConfig; } @@ -126,6 +140,10 @@ impl Encoding for BinaryEncoding_1_0 { IonEncoding::Binary_1_0 } + fn instance() -> Self { + BinaryEncoding_1_0 + } + fn name() -> &'static str { "binary Ion v1.0" } @@ -140,6 +158,10 @@ impl Encoding for BinaryEncoding_1_1 { IonEncoding::Binary_1_1 } + fn instance() -> Self { + BinaryEncoding_1_1 + } + fn name() -> &'static str { "binary Ion v1.1" } @@ -154,6 +176,10 @@ impl Encoding for TextEncoding_1_0 { IonEncoding::Text_1_0 } + fn instance() -> Self { + TextEncoding_1_0 + } + fn name() -> &'static str { "text Ion v1.0" } @@ -168,6 +194,10 @@ impl Encoding for TextEncoding_1_1 { IonEncoding::Text_1_1 } + fn instance() -> Self { + TextEncoding_1_1 + } + fn name() -> &'static str { "text Ion v1.1" } @@ -198,7 +228,6 @@ impl EncodingWithMacroSupport for TextEncoding_1_1 {} impl Decoder for BinaryEncoding_1_0 { type Reader<'data> = LazyRawBinaryReader_1_0<'data>; - type ReaderSavedState = (); type Value<'top> = LazyRawBinaryValue_1_0<'top>; type SExp<'top> = LazyRawBinarySExp_1_0<'top>; type List<'top> = 
LazyRawBinaryList_1_0<'top>; @@ -212,7 +241,6 @@ impl Decoder for BinaryEncoding_1_0 { impl Decoder for TextEncoding_1_0 { type Reader<'data> = LazyRawTextReader_1_0<'data>; - type ReaderSavedState = (); type Value<'top> = LazyRawTextValue_1_0<'top>; type SExp<'top> = LazyRawTextSExp_1_0<'top>; type List<'top> = LazyRawTextList_1_0<'top>; @@ -226,28 +254,25 @@ impl Decoder for TextEncoding_1_0 { impl Decoder for TextEncoding_1_1 { type Reader<'data> = LazyRawTextReader_1_1<'data>; - type ReaderSavedState = (); type Value<'top> = LazyRawTextValue_1_1<'top>; type SExp<'top> = LazyRawTextSExp_1_1<'top>; type List<'top> = LazyRawTextList_1_1<'top>; type Struct<'top> = LazyRawTextStruct_1_1<'top>; type FieldName<'top> = LazyRawTextFieldName_1_1<'top>; type AnnotationsIterator<'top> = RawTextAnnotationsIterator<'top>; - type EExp<'top> = RawTextEExpression_1_1<'top>; + type EExp<'top> = TextEExpression_1_1<'top>; type VersionMarker<'top> = LazyRawTextVersionMarker_1_1<'top>; } impl Decoder for BinaryEncoding_1_1 { type Reader<'data> = LazyRawBinaryReader_1_1<'data>; - type ReaderSavedState = (); - type Value<'top> = LazyRawBinaryValue_1_1<'top>; + type Value<'top> = &'top LazyRawBinaryValue_1_1<'top>; type SExp<'top> = LazyRawBinarySExp_1_1<'top>; type List<'top> = LazyRawBinaryList_1_1<'top>; type Struct<'top> = LazyRawBinaryStruct_1_1<'top>; type FieldName<'top> = LazyRawBinaryFieldName_1_1<'top>; type AnnotationsIterator<'top> = RawBinaryAnnotationsIterator_1_1<'top>; - // TODO: implement macros in 1.1 - type EExp<'top> = RawBinaryEExpression_1_1<'top>; + type EExp<'top> = &'top BinaryEExpression_1_1<'top>; type VersionMarker<'top> = LazyRawBinaryVersionMarker_1_1<'top>; } @@ -265,7 +290,7 @@ pub trait RawValueLiteral {} impl<'top, E: TextEncoding<'top>> RawValueLiteral for LazyRawTextValue<'top, E> {} impl<'top> RawValueLiteral for LazyRawBinaryValue_1_0<'top> {} -impl<'top> RawValueLiteral for LazyRawBinaryValue_1_1<'top> {} +impl<'top> RawValueLiteral for &'top 
LazyRawBinaryValue_1_1<'top> {} impl<'top> RawValueLiteral for LazyRawAnyValue<'top> {} #[cfg(test)] diff --git a/src/lazy/expanded/compiler.rs b/src/lazy/expanded/compiler.rs index 14667204..192b3c20 100644 --- a/src/lazy/expanded/compiler.rs +++ b/src/lazy/expanded/compiler.rs @@ -1,13 +1,14 @@ //! Compiles template definition language (TDL) expressions into a form suitable for fast incremental //! evaluation. -use std::collections::HashMap; use std::ops::Range; +use rustc_hash::FxHashMap; + use crate::lazy::decoder::Decoder; use crate::lazy::expanded::template::{ - ExprRange, MacroSignature, Parameter, ParameterEncoding, TemplateBody, TemplateBodyElement, - TemplateBodyMacroInvocation, TemplateBodyValueExpr, TemplateMacro, TemplateStructIndex, - TemplateValue, + ExprRange, MacroSignature, Parameter, ParameterCardinality, ParameterEncoding, + RestSyntaxPolicy, TemplateBody, TemplateBodyElement, TemplateBodyExpr, TemplateMacro, + TemplateStructIndex, TemplateValue, }; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::r#struct::LazyStruct; @@ -18,7 +19,64 @@ use crate::result::IonFailure; use crate::symbol_ref::AsSymbolRef; use crate::{v1_1, IonError, IonResult, IonType, Reader, SymbolRef}; -/// Validates a given TDL expression and compiles it into a [`TemplateMacro`] that can be added +/// Information inferred about a template's expansion at compile time. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct ExpansionAnalysis { + pub(crate) could_produce_system_value: bool, + pub(crate) must_produce_exactly_one_value: bool, + // A memoized combination of the above flags. 
+ pub(crate) can_be_lazily_evaluated_at_top_level: bool, + pub(crate) expansion_singleton: Option, +} + +impl ExpansionAnalysis { + pub fn could_produce_system_value(&self) -> bool { + self.could_produce_system_value + } + + pub fn must_produce_exactly_one_value(&self) -> bool { + self.must_produce_exactly_one_value + } + + pub fn can_be_lazily_evaluated_at_top_level(&self) -> bool { + self.can_be_lazily_evaluated_at_top_level + } + + pub fn expansion_singleton(&self) -> Option { + self.expansion_singleton + } +} + +/// When static analysis can detect that a template body will always expand to a single value, +/// information inferred about that value is stored in this type. When this template backs a +/// lazy value, having these fields available allows the lazy value to answer basic queries without +/// needing to fully evaluate the template. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct ExpansionSingleton { + pub(crate) is_null: bool, + pub(crate) ion_type: IonType, + pub(crate) num_annotations: u8, +} + +impl ExpansionSingleton { + pub fn is_null(&self) -> bool { + self.is_null + } + + pub fn ion_type(&self) -> IonType { + self.ion_type + } + + pub fn has_annotations(&self) -> bool { + self.num_annotations > 0 + } + + pub fn num_annotations(&self) -> usize { + self.num_annotations as usize + } +} + +/// Validates a given TDL expression and compiles it into a `TemplateMacro` that can be added /// to a [`MacroTable`](crate::lazy::expanded::macro_table::MacroTable). pub struct TemplateCompiler {} @@ -29,35 +87,39 @@ impl TemplateCompiler { /// ``` /// and compiles it into a [`TemplateMacro`]. /// - /// The [`TemplateMacro`] stores a sequence of [`TemplateBodyValueExpr`]s that need to be evaluated + /// The [`TemplateMacro`] stores a sequence of [`TemplateBodyExpr`]s that need to be evaluated /// in turn. Each step is either a value literal, a reference to one of the parameters (that is: /// a variable), or a macro invocation. 
/// - /// Expressions that contain other expressions (i.e. containers and macro invocations) each - /// store the range of subexpressions that they contain, allowing a reader to skip the entire - /// parent expression as desired. For example, in this macro: + /// A `TemplateBodyExpr` can be made up of more than one expression. Each `TemplateBodyExpr` + /// stores the number of expressions that it includes. For example, scalar value + /// literals are always a single expression and so will have `num_expressions=1`. However, + /// container value literals can have nested expressions, and macro invocations can take + /// expressions as arguments; in both these cases, `num_expressions` can be `1` or higher. + /// This arrangement--expressions storing their composite expression count--enables the reader + /// to skip the entire parent expression as desired. For example, in this macro: /// /// ```ion_1_1 /// (macro foo () /// // Template body expressions - /// [ // #0, contains expressions 1..=4 - /// 1, // #1 - /// (values // #2, contains expressions 3..=4 - /// 2 // #3 - /// 3 // #4 + /// [ // #0, num_expressions=5, range=0..5 + /// 1, // #1, num_expressions=1, range=1..2 + /// (values // #2, num_expressions=3, range=2..5 + /// 2 // #3, num_expressions=1, range=3..4 + /// 3 // #4, num_expressions=1, range=4..5 /// ) /// ] /// ) /// ``` /// - /// the step corresponding to `(values 2 3)` would store the range `3..=4`, indicating that - /// it contains template body expressions number `3` and `4`. A reader wishing to skip that call - /// to `values` could do so by moving ahead to expression number `5`. The outer - /// list (`[1, (values 2 3)]`) would store a `1..=4`, indicating that it contains the `1`, - /// the a macro invocation `values`, and the two arguments that belong to `values`. + /// the step corresponding to `(values 2 3)` would store the range `2..5`, indicating that + /// it includes not only template body expression #2, but #3 and #4 as well. 
A reader wishing to + /// skip that call to `values` could do so by moving ahead to expression number `5`. The outer + /// list (`[1, (values 2 3)]`) would store a `0..5`, indicating that it contains the `1`, + /// the macro invocation `values`, and the two arguments that belong to `values`. /// - /// The compiler recognizes the `(quote expr1 expr2 [...] exprN)` form, adding each subexpression - /// to the template without interpretation. `(quote ...)` does not appear in the compiled + /// The compiler recognizes the `(literal expr1 expr2 [...] exprN)` form, adding each subexpression + /// to the template without interpretation. `(literal ...)` does not appear in the compiled /// template as there is nothing more for it to do at expansion time. pub fn compile_from_text( context: EncodingContextRef, @@ -66,52 +128,197 @@ impl TemplateCompiler { // TODO: This is a rudimentary implementation that panics instead of performing thorough // validation. Where it does surface errors, the messages are too terse. let mut reader = Reader::new(v1_1::Text, expression.as_bytes())?; - let invocation = reader.expect_next()?.read()?.expect_sexp()?; - let mut values = invocation.iter(); + let macro_def_sexp = reader.expect_next()?.read()?.expect_sexp()?; - let macro_keyword = values.next().expect("macro ID")?.read()?.expect_symbol()?; - if macro_keyword != "macro" { - return IonResult::decoding_error( - "macro compilation expects a sexp starting with the keyword `macro`", - ); - } + Self::compile_from_sexp(context, macro_def_sexp) + } - // TODO: Enforce 'identifier' syntax subset of symbol - // TODO: Syntactic support address IDs like `(:14 ...)` - let template_name = match values.next().expect("template name")?.read()? { - ValueRef::Symbol(s) if s.text().is_none() => { - return IonResult::decoding_error("$0 is not a valid macro name") + /// Pulls the next value from the provided source and confirms that it is a symbol whose + /// text matches the `keyword` string. 
+ fn expect_keyword<'a, Encoding: Decoder>( + keyword: &str, + source: &mut impl Iterator>>, + ) -> IonResult<()> { + let value = match source.next() { + None => { + return IonResult::decoding_error(format!( + "expected keyword '{keyword}', but found nothing" + )) } - ValueRef::Symbol(s) => Some(s.text().unwrap().to_owned()), - ValueRef::Null(IonType::Symbol | IonType::Null) => None, - other => { + Some(Err(e)) => { return IonResult::decoding_error(format!( - "expected identifier as macro name but found: {other:?}" + "expected keyword '{keyword}', but encountered an error: {e:?}" )) } + Some(Ok(value)) => value, }; + match value.read()? { + ValueRef::Symbol(s) if s.text() == Some(keyword) => Ok(()), + value_ref => IonResult::decoding_error(format!( + "expected keyword '{keyword}', but found {value_ref:?}" + )), + } + } + + /// Confirms that the provided `value` is a symbol with known text. If so, returns `Ok(text)`. + /// If not, returns a decoding error containing the specified label. + fn expect_symbol_text<'a, Encoding: Decoder>( + label: &str, + value: LazyValue<'a, Encoding>, + ) -> IonResult<&'a str> { + match value.read()? { + ValueRef::Symbol(s) => { + if let Some(text) = s.text() { + Ok(text) + } else { + IonResult::decoding_error(format!( + "expected {label}, but found a symbol with no text" + )) + } + } + value_ref => { + IonResult::decoding_error(format!("expected {label}, but found a(n) {value_ref:?}")) + } + } + } + + /// Tries to pull the next `LazyValue` from the provided iterator. If the iterator is empty, + /// returns a `IonError::Decoding` that includes the specified label. 
+ fn expect_next<'a, Encoding: Decoder>( + label: &str, + source: &mut impl Iterator>>, + ) -> IonResult> { + match source.next() { + None => IonResult::decoding_error(format!("expected {label} but found nothing")), + Some(Err(e)) => IonResult::decoding_error(format!( + "expected {label} but encountered an error: {e:?}" + )), + Some(Ok(value)) => Ok(value), + } + } + + /// Tries to pull the next `LazyValue` from the provided iterator, confirming that it is + /// a symbol with text. + fn expect_name<'a, Encoding: Decoder>( + label: &str, + source: &mut impl Iterator>>, + ) -> IonResult<&'a str> { + let value = Self::expect_next(label, source)?; + Self::expect_symbol_text(label, value) + } - let params = values - .next() - .expect("parameters sexp")? - .read()? - .expect_sexp()?; + /// Tries to pull the next `LazyValue` from the provided iterator, confirming that it is + /// either a symbol with text, `null`, or `null.symbol`. + fn expect_nullable_name<'a, Encoding: Decoder>( + label: &str, + source: &mut impl Iterator>>, + ) -> IonResult> { + let value = Self::expect_next(label, source)?; + if value.is_null() && matches!(value.ion_type(), IonType::Null | IonType::Symbol) { + Ok(None) + } else { + Ok(Some(Self::expect_symbol_text(label, value)?)) + } + } + + /// Tries to pull the next `LazyValue` from the provided iterator, confirming that it is + /// an s-expression. + fn expect_sexp<'a, Encoding: Decoder>( + label: &str, + source: &mut impl Iterator>>, + ) -> IonResult> { + let value = Self::expect_next(label, source)?; + match value.read()? 
{ + ValueRef::SExp(clause) => Ok(clause), + other => { + IonResult::decoding_error(format!("expected {label}, but found a(n) {other:?}")) + } + } + } + + pub fn compile_from_sexp<'a, Encoding: Decoder>( + context: EncodingContextRef<'a>, + macro_def_sexp: LazySExp<'a, Encoding>, + ) -> Result { + let mut values = macro_def_sexp.iter(); + + Self::expect_keyword("macro", &mut values)?; + + // TODO: Enforce 'identifier' syntax subset of symbol + // TODO: Syntactic support for address IDs like `(14 ...)` + let template_name = + Self::expect_nullable_name("a macro name", &mut values)?.map(|name| name.to_owned()); + + // The `params` clause of the macro definition is an s-expression enumerating the parameters + // that the macro accepts. For example: `(flex_uint::x, y*, z?)`. + let params_clause = Self::expect_sexp("an s-expression defining parameters", &mut values)?; let mut compiled_params = Vec::new(); - for param_result in ¶ms { + // `param_items` is a peekable iterator over the Ion values in `params_clause`. Because it + // is peekable, we can look ahead at each step to see if there are more values. This is + // important because: + // * when adding a parameter, we need to look ahead to see if the next token is a + // cardinality modifier. + // * special syntax rules apply to `*` and `+` parameters in tail position. + let mut param_items = params_clause.iter().peekable(); + + let mut is_final_parameter = false; + while let Some(item) = param_items.next().transpose()? 
{ + is_final_parameter |= param_items.peek().is_none(); + let name = Self::expect_symbol_text("a parameter name", item)?.to_owned(); + + use ParameterCardinality::*; + let mut cardinality = ExactlyOne; + if let Some(next_item_result) = param_items.peek() { + let next_item = match next_item_result { + Ok(item_ref) => *item_ref, + // Because we are borrowing the peek()ed result, we must clone the error + Err(e) => return Err(e.clone()), + }; + let text = Self::expect_symbol_text("a cardinality modifier", next_item)?; + cardinality = match text { + "!" => ExactlyOne, + "?" => ZeroOrOne, + "*" => ZeroOrMore, + "+" => OneOrMore, + // The next item doesn't appear to be a cardinality specifier, it's probably a parameter. + // Finish processing this parameter, then move on to the next item. + _ => { + // We know there are more items in the signature, so this isn't the last parameter. + // Therefore, rest syntax is not allowed. + let compiled_param = Parameter::new( + name, + ParameterEncoding::Tagged, + cardinality, + RestSyntaxPolicy::NotAllowed, + ); + compiled_params.push(compiled_param); + continue; + } + }; + // If we reach this point, the item was a cardinality specifier and we're done + // processing it. We can discard the item and continue on to the next parameter. + let _cardinality_specifier = param_items.next().unwrap(); + is_final_parameter |= param_items.peek().is_none(); + } + + let rest_syntax_policy = if is_final_parameter && cardinality != ExactlyOne { + RestSyntaxPolicy::Allowed + } else { + RestSyntaxPolicy::NotAllowed + }; + let compiled_param = Parameter::new( - param_result? - .read()? - .expect_symbol()? 
- .text() - .unwrap() - .to_string(), + name, ParameterEncoding::Tagged, + cardinality, + rest_syntax_policy, ); compiled_params.push(compiled_param); } - let signature = MacroSignature::new(compiled_params); - let body = values.next().expect("template body")?; + let signature = MacroSignature::new(compiled_params)?; + let body = Self::expect_next("the template body", &mut values)?; + let expansion_analysis = Self::analyze_body_expr(body)?; let mut compiled_body = TemplateBody { expressions: Vec::new(), annotations_storage: Vec::new(), @@ -120,28 +327,90 @@ impl TemplateCompiler { context, &signature, &mut compiled_body, - /*is_quoted=*/ false, + /*is_literal=*/ false, body, )?; let template_macro = TemplateMacro { name: template_name, signature, body: compiled_body, + expansion_analysis, }; Ok(template_macro) } + /// The entry point for static analysis of a template body expression. + fn analyze_body_expr(body_expr: LazyValue) -> IonResult { + let could_produce_system_value = Self::body_expr_could_produce_system_values(body_expr); + let must_produce_exactly_one_value = + Self::body_expr_must_produce_exactly_one_value(body_expr); + let num_annotations = u8::try_from(body_expr.annotations().count()).map_err(|_| { + IonError::decoding_error("template body expression can only have up to 255 annotations") + })?; + let expansion_singleton = if must_produce_exactly_one_value { + Some(ExpansionSingleton { + ion_type: body_expr.ion_type(), + num_annotations, + is_null: body_expr.is_null(), + }) + } else { + None + }; + Ok(ExpansionAnalysis { + could_produce_system_value, + must_produce_exactly_one_value, + can_be_lazily_evaluated_at_top_level: must_produce_exactly_one_value + && !could_produce_system_value, + expansion_singleton, + }) + } + + /// Indicates whether the provided expression *could* produce a system value (e.g. a symbol table + /// or encoding directive) when expanded. 
+ /// + /// If the expression is guaranteed to never produce a system value, returns `false`. + /// If the expression *could* produce one, returns `true`. + /// + /// For the time being, this is a simple, lightweight heuristic. + fn body_expr_could_produce_system_values(body_expr: LazyValue) -> bool { + use IonType::*; + match body_expr.ion_type() { + // If the expression is an s-expression, it could expand to anything. If desired, we could + // inspect the macro it invokes to see if it's a `literal`, `make_string`, `make_struct`, etc. + // For now, we simply say "Producing a system value is possible." + SExp => true, + // If the value is a struct, it would need to be annotated with `$ion_symbol_table` + // to produce a system value. + Struct => { + matches!(body_expr.annotations().next(), Some(Ok(s)) if s.text() == Some("$ion_symbol_table")) + } + _ => false, + } + } + + /// Indicates whether the provided expression is guaranteed to produce exactly one Ion value + /// when expanded. + /// + /// If the expression will always produce a single value, returns `true`. + /// If the expression could potentially produce an empty stream or a stream with multiple + /// values, returns `false`. + fn body_expr_must_produce_exactly_one_value(body_expr: LazyValue) -> bool { + body_expr.ion_type() != IonType::SExp + } + /// Recursively visits all of the expressions in `lazy_value` and adds their corresponding - /// [`TemplateBodyValueExpr`] sequences to the `TemplateBody`. + /// [`TemplateBodyExpr`] sequences to the `TemplateBody`. /// - /// If `is_quoted` is true, nested symbols and s-expressions will not be interpreted. + /// If `is_literal` is true, nested symbols and s-expressions will not be interpreted. 
fn compile_value<'top, D: Decoder>( context: EncodingContextRef<'top>, signature: &MacroSignature, definition: &mut TemplateBody, - is_quoted: bool, + is_literal: bool, lazy_value: LazyValue<'top, D>, ) -> IonResult<()> { + // Add the value's annotations to the annotations storage vec and take note of the + // vec range that belongs to this value. let annotations_range_start = definition.annotations_storage.len(); for annotation_result in lazy_value.annotations() { let annotation = annotation_result?; @@ -150,6 +419,9 @@ impl TemplateCompiler { let annotations_range_end = definition.annotations_storage.len(); let annotations_range = annotations_range_start..annotations_range_end; + // Make a `TemplateValue` that represent's the value's unannotated data. Scalar `TemplateValue`s + // are very similar to their scalar `Value` counterparts, but its container types are more + // barebones. let value = match lazy_value.read()? { ValueRef::Null(ion_type) => TemplateValue::Null(ion_type), ValueRef::Bool(b) => TemplateValue::Bool(b), @@ -158,7 +430,7 @@ impl TemplateCompiler { ValueRef::Decimal(d) => TemplateValue::Decimal(d), ValueRef::Timestamp(t) => TemplateValue::Timestamp(t), ValueRef::String(s) => TemplateValue::String(s.to_owned()), - ValueRef::Symbol(s) if is_quoted => TemplateValue::Symbol(s.to_owned()), + ValueRef::Symbol(s) if is_literal => TemplateValue::Symbol(s.to_owned()), ValueRef::Symbol(s) => { return Self::compile_variable_reference( context, @@ -170,12 +442,14 @@ impl TemplateCompiler { } ValueRef::Blob(b) => TemplateValue::Blob(b.to_owned()), ValueRef::Clob(c) => TemplateValue::Clob(c.to_owned()), + // For the container types, compile the value's nested values/fields and take note + // of the total number of expressions that belong to this container. 
ValueRef::SExp(s) => { return Self::compile_sexp( context, signature, definition, - is_quoted, + is_literal, annotations_range.clone(), s, ); @@ -185,7 +459,7 @@ impl TemplateCompiler { context, signature, definition, - is_quoted, + is_literal, annotations_range.clone(), l, ) @@ -195,14 +469,18 @@ impl TemplateCompiler { context, signature, definition, - is_quoted, + is_literal, annotations_range.clone(), s, ) } }; + // At this point, we're only looking at scalars. + let scalar_expr_index = definition.expressions().len(); definition.push_element( TemplateBodyElement::with_value(value).with_annotations(annotations_range), + // Scalars are always a single expression. + ExprRange::new(scalar_expr_index..scalar_expr_index + 1), ); Ok(()) } @@ -212,26 +490,26 @@ impl TemplateCompiler { context: EncodingContextRef<'top>, signature: &MacroSignature, definition: &mut TemplateBody, - is_quoted: bool, + is_literal: bool, annotations_range: Range, lazy_list: LazyList<'top, D>, ) -> IonResult<()> { let list_element_index = definition.expressions.len(); - // Assume the list contains zero expressions to start, we'll update this at the end - let list_element = TemplateBodyElement::with_value(TemplateValue::List(ExprRange::empty())); - definition.push_element(list_element); - let list_children_start = definition.expressions.len(); + let list_element = TemplateBodyElement::with_value(TemplateValue::List); + // Use an empty range for now. When we finish reading the list, we'll overwrite the empty + // range with the correct one. 
+ definition.push_element(list_element, ExprRange::empty()); for value_result in &lazy_list { let value = value_result?; - Self::compile_value(context, signature, definition, is_quoted, value)?; + Self::compile_value(context, signature, definition, is_literal, value)?; } let list_children_end = definition.expressions.len(); // Update the list entry to reflect the number of child expressions it contains - let list_element = TemplateBodyElement::with_value(TemplateValue::List(ExprRange::new( - list_children_start..list_children_end, - ))) - .with_annotations(annotations_range); - definition.expressions[list_element_index] = TemplateBodyValueExpr::Element(list_element); + let list_element = TemplateBodyElement::with_value(TemplateValue::List) + .with_annotations(annotations_range); + let list_expr_range = ExprRange::new(list_element_index..list_children_end); + definition.expressions[list_element_index] = + TemplateBodyExpr::element(list_element, list_expr_range); Ok(()) } @@ -240,12 +518,12 @@ impl TemplateCompiler { context: EncodingContextRef<'top>, signature: &MacroSignature, definition: &mut TemplateBody, - is_quoted: bool, + is_literal: bool, annotations_range: Range, lazy_sexp: LazySExp<'top, D>, ) -> IonResult<()> { - if is_quoted { - // If `is_quoted` is true, this s-expression is nested somewhere inside a `(quote ...)` + if is_literal { + // If `is_literal` is true, this s-expression is nested somewhere inside a `(literal ...)` // macro invocation. The sexp and its child expressions can be added to the TemplateBody // without interpretation. Self::compile_quoted_sexp(context, signature, definition, annotations_range, lazy_sexp) @@ -255,8 +533,8 @@ impl TemplateCompiler { if !annotations_range.is_empty() { return IonResult::decoding_error("found annotations on a macro invocation"); } - // Peek at the first expression in the sexp. If it's the symbol `quoted`... - if Self::sexp_is_quote_macro(&lazy_sexp)? { + // Peek at the first expression in the sexp. 
If it's the symbol `literal`... + if Self::sexp_is_literal_macro(&lazy_sexp)? { // ...then we set `is_quoted` to true and compile all of its child expressions. Self::compile_quoted_elements(context, signature, definition, lazy_sexp) } else { @@ -288,7 +566,6 @@ impl TemplateCompiler { // Assume the macro contains zero argument expressions to start, we'll update // this at the end of the function. definition.push_macro_invocation(macro_address, ExprRange::empty()); - let arguments_start = definition.expressions.len(); for argument_result in expressions { let argument = argument_result?; Self::compile_value( @@ -298,12 +575,9 @@ impl TemplateCompiler { let arguments_end = definition.expressions.len(); // Update the macro step to reflect the macro's address and number of child expressions it // contains - let template_macro_invocation = TemplateBodyMacroInvocation::new( - macro_address, - ExprRange::new(arguments_start..arguments_end), - ); + let invocation_expr_range = ExprRange::new(macro_step_index..arguments_end); definition.expressions[macro_step_index] = - TemplateBodyValueExpr::MacroInvocation(template_macro_invocation); + TemplateBodyExpr::macro_invocation(macro_address, invocation_expr_range); Ok(()) } @@ -381,10 +655,9 @@ impl TemplateCompiler { lazy_sexp: LazySExp<'top, D>, ) -> IonResult<()> { let sexp_element_index = definition.expressions.len(); - // Assume the sexp contains zero expressions to start, we'll update this at the end - let sexp_element = TemplateBodyElement::with_value(TemplateValue::SExp(ExprRange::empty())); - definition.push_element(sexp_element); - let sexp_children_start = definition.expressions.len(); + let sexp_element = TemplateBodyElement::with_value(TemplateValue::SExp); + // Use an empty range for now; we'll overwrite it with the correct one later. 
+ definition.push_element(sexp_element, ExprRange::empty()); for value_result in &lazy_sexp { let value = value_result?; Self::compile_value( @@ -392,26 +665,28 @@ impl TemplateCompiler { )?; } let sexp_children_end = definition.expressions.len(); - let sexp_element = TemplateBodyElement::with_value(TemplateValue::SExp(ExprRange::new( - sexp_children_start..sexp_children_end, - ))) - .with_annotations(annotations_range); + let sexp_element = TemplateBodyElement::with_value(TemplateValue::SExp) + .with_annotations(annotations_range); + let sexp_expr_range = ExprRange::new(sexp_element_index..sexp_children_end); // Update the sexp entry to reflect the number of child expressions it contains - definition.expressions[sexp_element_index] = TemplateBodyValueExpr::Element(sexp_element); + definition.expressions[sexp_element_index] = + TemplateBodyExpr::element(sexp_element, sexp_expr_range); Ok(()) } - /// Returns `Ok(true)` if the first child value in the `LazySexp` is the symbol `quote`. - /// This method should only be called in an unquoted context. - fn sexp_is_quote_macro(sexp: &LazySExp) -> IonResult { + /// Returns `Ok(true)` if the first child value in the `LazySexp` is the symbol `literal`. + /// This method should only be called in a non-literal context. + fn sexp_is_literal_macro(sexp: &LazySExp) -> IonResult { let first_expr = sexp.iter().next(); match first_expr { - // If the sexp is empty and we're not in a quoted context, that's an error. - None => IonResult::decoding_error("found an empty s-expression in an unquoted context"), + // If the sexp is empty and we're not in a literal context, that's an error. 
+ None => { + IonResult::decoding_error("found an empty s-expression in a non-literal context") + } Some(Err(e)) => Err(e), Some(Ok(lazy_value)) => { let value = lazy_value.read()?; - Ok(value == ValueRef::Symbol("quote".as_symbol_ref())) + Ok(value == ValueRef::Symbol("literal".as_symbol_ref())) } } } @@ -421,30 +696,27 @@ impl TemplateCompiler { context: EncodingContextRef<'top>, signature: &MacroSignature, definition: &mut TemplateBody, - is_quoted: bool, + is_literal: bool, annotations_range: Range, lazy_struct: LazyStruct<'top, D>, ) -> IonResult<()> { let struct_element_index = definition.expressions.len(); - let struct_element = TemplateBodyElement::with_value( - // Assume the struct contains zero expressions to start, we'll update this entry with - // the actual range at the end of the method. - TemplateValue::Struct( - ExprRange::empty(), - // Creating a new HashMap does not allocate; we'll overwrite this value with an - // actual map of field names to indexes at the end of the method. - HashMap::new(), - ), - ); - definition.push_element(struct_element); + let struct_element = TemplateBodyElement::with_value(TemplateValue::Struct( + // Creating a new HashMap does not allocate; we'll overwrite this value with an + // actual map of field names to indexes at the end of the method. + FxHashMap::default(), + )); + // Use an empty range for now; we'll overwrite it with the correct range later. 
+ definition.push_element(struct_element, ExprRange::empty()); - let mut fields: TemplateStructIndex = HashMap::new(); - let struct_start = definition.expressions.len(); + let mut fields: TemplateStructIndex = FxHashMap::default(); for field_result in &lazy_struct { let field = field_result?; let name = field.name()?.to_owned(); + let name_expr_index = definition.expressions().len(); let name_element = TemplateBodyElement::with_value(TemplateValue::Symbol(name.clone())); - definition.push_element(name_element); + let name_expr_range = ExprRange::new(name_expr_index..name_expr_index + 1); + definition.push_element(name_element, name_expr_range); // If this field name has defined text (which is everything besides `$0` and equivalents), // add that text to the fields map. Future queries for `$0` will require a linear scan, // but that's a niche use case. If there is call for it, this approach can be revised. @@ -458,17 +730,15 @@ impl TemplateCompiler { } } - Self::compile_value(context, signature, definition, is_quoted, field.value())?; + Self::compile_value(context, signature, definition, is_literal, field.value())?; } let struct_end = definition.expressions.len(); // Update the struct entry to reflect the range of expansion steps it contains. 
- let struct_element = TemplateBodyElement::with_value(TemplateValue::Struct( - ExprRange::new(struct_start..struct_end), - fields, - )) - .with_annotations(annotations_range); + let struct_element = TemplateBodyElement::with_value(TemplateValue::Struct(fields)) + .with_annotations(annotations_range); + let struct_expr_range = ExprRange::new(struct_element_index..struct_end); definition.expressions[struct_element_index] = - TemplateBodyValueExpr::Element(struct_element); + TemplateBodyExpr::element(struct_element, struct_expr_range); Ok(()) } @@ -506,12 +776,11 @@ impl TemplateCompiler { #[cfg(test)] mod tests { - use std::collections::HashMap; + use rustc_hash::FxHashMap; use crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::template::{ - ExprRange, TemplateBodyMacroInvocation, TemplateBodyValueExpr, - TemplateBodyVariableReference, TemplateMacro, TemplateValue, + ExprRange, TemplateBodyExpr, TemplateMacro, TemplateValue, }; use crate::lazy::expanded::{EncodingContext, EncodingContextRef}; use crate::{Int, IntoAnnotations, IonResult, Symbol}; @@ -528,8 +797,8 @@ mod tests { .expressions() .get(index) .expect("no such expansion step") - .expect_element() - .unwrap_or_else(|_| panic!("expected value {expected:?}")); + .kind() + .require_element(); assert_eq!(actual.value(), &expected); Ok(()) } @@ -543,11 +812,11 @@ mod tests { expect_step( definition, index, - TemplateBodyValueExpr::MacroInvocation(TemplateBodyMacroInvocation::new( + TemplateBodyExpr::macro_invocation( expected_address, - // The arg range starts just after the macro invocation step and goes for `expected_num_arguments`. 
- ExprRange::new(index + 1..index + 1 + expected_num_arguments), - )), + // The invocation step itself through its last argument (end exclusive) + ExprRange::new(index..index + 1 + expected_num_arguments), + ), ) } @@ -559,23 +828,24 @@ mod tests { expect_step( definition, index, - TemplateBodyValueExpr::Variable(TemplateBodyVariableReference::new( + TemplateBodyExpr::variable( expected_signature_index as u16, - )), + ExprRange::new(index..index + 1), + ), ) } fn expect_step( definition: &TemplateMacro, index: usize, - expected: TemplateBodyValueExpr, + expected: TemplateBodyExpr, ) -> IonResult<()> { let step = definition .body() .expressions() .get(index) .expect("no such expansion step"); - assert_eq!(step, &expected); + assert_eq!(step, &expected, "(actual, expected)"); Ok(()) } @@ -589,8 +859,8 @@ mod tests { .expressions() .get(index) .expect("requested index does not exist") - .expect_element() - .unwrap(); + .kind() + .require_element(); let actual_annotations = definition .body .annotations_storage() @@ -626,7 +896,7 @@ mod tests { let template = TemplateCompiler::compile_from_text(context.get_ref(), expression)?; assert_eq!(template.name(), "foo"); - assert_eq!(template.signature().parameters().len(), 0); + assert_eq!(template.signature().len(), 0); expect_value(&template, 0, TemplateValue::Int(42.into()))?; Ok(()) } @@ -640,8 +910,8 @@ mod tests { let template = TemplateCompiler::compile_from_text(context.get_ref(), expression)?; assert_eq!(template.name(), "foo"); - assert_eq!(template.signature().parameters().len(), 0); - expect_value(&template, 0, TemplateValue::List(ExprRange::new(1..4)))?; + assert_eq!(template.signature().len(), 0); + expect_value(&template, 0, TemplateValue::List)?; expect_value(&template, 1, TemplateValue::Int(1.into()))?; expect_value(&template, 2, TemplateValue::Int(2.into()))?; expect_value(&template, 3, TemplateValue::Int(3.into()))?; @@ -657,7 +927,7 @@ mod tests { let template = TemplateCompiler::compile_from_text(context.get_ref(), 
expression)?; assert_eq!(template.name(), "foo"); - assert_eq!(template.signature().parameters().len(), 0); + assert_eq!(template.signature().len(), 0); expect_macro( &template, 0, @@ -678,22 +948,18 @@ mod tests { let expression = "(macro foo (x y z) [100, [200, a::b::300], x, {y: [true, false, z]}])"; let template = TemplateCompiler::compile_from_text(context.get_ref(), expression)?; - expect_value(&template, 0, TemplateValue::List(ExprRange::new(1..12)))?; + expect_value(&template, 0, TemplateValue::List)?; expect_value(&template, 1, TemplateValue::Int(Int::from(100)))?; - expect_value(&template, 2, TemplateValue::List(ExprRange::new(3..5)))?; + expect_value(&template, 2, TemplateValue::List)?; expect_value(&template, 3, TemplateValue::Int(Int::from(200)))?; expect_value(&template, 4, TemplateValue::Int(Int::from(300)))?; expect_annotations(&template, 4, ["a", "b"]); expect_variable(&template, 5, 0)?; - let mut struct_index = HashMap::new(); + let mut struct_index = FxHashMap::default(); struct_index.insert(Symbol::from("y"), vec![8]); - expect_value( - &template, - 6, - TemplateValue::Struct(ExprRange::new(7..12), struct_index), - )?; + expect_value(&template, 6, TemplateValue::Struct(struct_index))?; expect_value(&template, 7, TemplateValue::Symbol(Symbol::from("y")))?; - expect_value(&template, 8, TemplateValue::List(ExprRange::new(9..12)))?; + expect_value(&template, 8, TemplateValue::List)?; expect_value(&template, 9, TemplateValue::Bool(true))?; expect_value(&template, 10, TemplateValue::Bool(false))?; expect_variable(&template, 11, 2)?; @@ -709,13 +975,13 @@ mod tests { let template = TemplateCompiler::compile_from_text(context.get_ref(), expression)?; assert_eq!(template.name(), "identity"); - assert_eq!(template.signature().parameters().len(), 1); + assert_eq!(template.signature().len(), 1); expect_variable(&template, 0, 0)?; Ok(()) } #[test] - fn quote() -> IonResult<()> { + fn literal() -> IonResult<()> { let resources = TestResources::new(); let 
context = resources.context(); @@ -725,14 +991,14 @@ mod tests { (values // This `values` is a macro call that has a single argument: the variable `x` (values x) - // This `quote` call causes the inner `(values x)` to be an uninterpreted s-expression. - (quote + // This `literal` call causes the inner `(values x)` to be an uninterpreted s-expression. + (literal (values x)))) "#; let template = TemplateCompiler::compile_from_text(context.get_ref(), expression)?; assert_eq!(template.name(), "foo"); - assert_eq!(template.signature().parameters().len(), 1); + assert_eq!(template.signature().len(), 1); // Outer `values` expect_macro( &template, @@ -748,9 +1014,9 @@ mod tests { 1, )?; expect_variable(&template, 2, 0)?; - // Second argument: `(quote (values x))` - // Notice that the `quote` is not part of the compiled output, only its arguments - expect_value(&template, 3, TemplateValue::SExp(ExprRange::new(4..6)))?; + // Second argument: `(literal (values x))` + // Notice that the `literal` is not part of the compiled output, only its arguments + expect_value(&template, 3, TemplateValue::SExp)?; expect_value(&template, 4, TemplateValue::Symbol("values".into()))?; expect_value(&template, 5, TemplateValue::Symbol("x".into()))?; diff --git a/src/lazy/expanded/e_expression.rs b/src/lazy/expanded/e_expression.rs index cde06907..ca7aa546 100644 --- a/src/lazy/expanded/e_expression.rs +++ b/src/lazy/expanded/e_expression.rs @@ -1,14 +1,91 @@ //! Types and traits representing an e-expression in an Ion stream. 
#![allow(non_camel_case_types)] -use crate::lazy::decoder::{Decoder, LazyRawValueExpr}; +use std::fmt::{Debug, Formatter}; +use std::ops::Range; + +use crate::lazy::decoder::{Decoder, RawValueExpr}; use crate::lazy::encoding::TextEncoding_1_1; -use crate::lazy::expanded::macro_evaluator::{MacroExpr, RawEExpression, ValueExpr}; -use crate::lazy::expanded::macro_table::MacroRef; +use crate::lazy::expanded::compiler::{ExpansionAnalysis, ExpansionSingleton}; +use crate::lazy::expanded::macro_evaluator::{ + EExpArgGroupIterator, EExpressionArgGroup, MacroExpansion, MacroExpansionKind, MacroExpr, + MacroExprArgsIterator, MakeStringExpansion, RawEExpression, TemplateExpansion, ValueExpr, + ValuesExpansion, +}; +use crate::lazy::expanded::macro_table::{MacroKind, MacroRef}; +use crate::lazy::expanded::template::TemplateMacroRef; use crate::lazy::expanded::{EncodingContextRef, LazyExpandedValue}; +use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr}; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; -use crate::IonResult; -use std::fmt::{Debug, Formatter}; +use crate::{try_next, Environment, HasRange, HasSpan, IonResult, Span}; + +/// An `ArgGroup` is a collection of expressions found in e-expression argument position. +/// They can only appear in positions that correspond with variadic parameters. +#[derive(Copy, Clone)] +pub struct ArgGroup<'top, D: Decoder> { + context: EncodingContextRef<'top>, + raw_arg_group: as RawEExpression<'top, D>>::ArgGroup, + invoked_macro: MacroRef<'top>, +} + +impl<'top, D: Decoder> ArgGroup<'top, D> { + pub fn new( + raw_arg_group: as RawEExpression<'top, D>>::ArgGroup, + context: EncodingContextRef<'top>, + ) -> Self { + // While an `ArgGroup` is a distinct syntactic entity with its own role to play in the grammar, + // once it has been read off the wire it behaves identically to a call to `(:values ...)`. 
+ // Each expression in the group is expanded and the resulting stream is concatenated to that + // of the expression that preceded it. + // TODO: Fully qualified, unambiguous ID + const VALUES_MACRO_ID: MacroIdRef<'static> = MacroIdRef::LocalAddress(1); + let invoked_macro = context + .macro_table() + .macro_with_id(VALUES_MACRO_ID) + .expect("`values` must be available"); + Self { + context, + raw_arg_group, + invoked_macro, + } + } + pub fn context(&self) -> EncodingContextRef<'top> { + self.context + } + pub fn raw_arg_group(&self) -> as RawEExpression<'top, D>>::ArgGroup { + self.raw_arg_group + } + pub fn invoked_macro(&self) -> MacroRef<'top> { + self.invoked_macro + } + pub fn expressions(&self) -> ArgGroupIterator<'top, D> { + ArgGroupIterator::new(self.context, self.raw_arg_group()) + } + pub fn expand(&self, environment: Environment<'top, D>) -> IonResult> { + let context = self.context(); + let arguments = MacroExprArgsIterator::from_arg_group(self.expressions()); + let expansion_kind = MacroExpansionKind::Values(ValuesExpansion::new(arguments)); + Ok(MacroExpansion::new(context, environment, expansion_kind)) + } +} + +impl<'top, D: Decoder> HasRange for ArgGroup<'top, D> { + fn range(&self) -> Range { + self.raw_arg_group.range() + } +} + +impl<'top, D: Decoder> HasSpan<'top> for ArgGroup<'top, D> { + fn span(&self) -> Span<'top> { + self.raw_arg_group.span() + } +} + +impl<'top, D: Decoder> Debug for ArgGroup<'top, D> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ArgGroup {:?}", self.raw_arg_group) + } +} /// An e-expression (in Ion format `D`) that has been resolved in the current encoding context. 
#[derive(Copy, Clone)] @@ -28,6 +105,58 @@ impl<'top, D: Decoder> EExpression<'top, D> { pub fn invoked_macro(&self) -> MacroRef<'top> { self.invoked_macro } + + pub(crate) fn new_evaluation_environment(&self) -> IonResult> { + self.raw_invocation + .make_evaluation_environment(self.context) + } + + pub(crate) fn expand(&self) -> IonResult> { + let invoked_macro = self.invoked_macro; + let arguments = MacroExprArgsIterator::from_eexp(self.arguments()); + + let mut environment = Environment::empty(); + // Initialize a `MacroExpansionKind` with the state necessary to evaluate the requested + // macro. + let expansion_kind = match invoked_macro.kind() { + MacroKind::Void => MacroExpansionKind::Void, + MacroKind::Values => MacroExpansionKind::Values(ValuesExpansion::new(arguments)), + MacroKind::MakeString => { + MacroExpansionKind::MakeString(MakeStringExpansion::new(arguments)) + } + MacroKind::Template(template_body) => { + let template_ref = TemplateMacroRef::new(invoked_macro, template_body); + environment = self.new_evaluation_environment()?; + MacroExpansionKind::Template(TemplateExpansion::new(template_ref)) + } + }; + Ok(MacroExpansion::new( + self.context(), + environment, + expansion_kind, + )) + } + + pub(crate) fn expand_to_single_value(&self) -> IonResult> { + let environment = self.new_evaluation_environment()?; + MacroExpansion::expand_singleton(MacroExpansion::initialize( + environment, + MacroExpr::from_eexp(*self), + )?) + } + + pub fn expansion_analysis(&self) -> ExpansionAnalysis { + self.invoked_macro.expansion_analysis() + } + + pub fn expansion_singleton(&self) -> Option { + self.expansion_analysis().expansion_singleton() + } + /// Caller must guarantee that this e-expression invokes a template and that the template + /// has an `ExpansionSingleton`. If these prerequisites are not met, this method will panic. 
+ pub fn require_expansion_singleton(&self) -> ExpansionSingleton { + self.expansion_singleton().unwrap() + } } impl<'top, D: Decoder> Debug for EExpression<'top, D> { @@ -36,6 +165,18 @@ impl<'top, D: Decoder> Debug for EExpression<'top, D> { } } +impl<'top, D: Decoder> HasRange for EExpression<'top, D> { + fn range(&self) -> Range { + self.raw_invocation.range() + } +} + +impl<'top, D: Decoder> HasSpan<'top> for EExpression<'top, D> { + fn span(&self) -> Span<'top> { + self.raw_invocation.span() + } +} + impl<'top, D: Decoder> EExpression<'top, D> { pub fn new( context: EncodingContextRef<'top>, @@ -59,44 +200,116 @@ impl<'top, D: Decoder> EExpression<'top, D> { EExpressionArgsIterator { context: self.context, raw_args: self.raw_invocation.raw_arguments(), + num_args: self.invoked_macro.signature().len() as u32, + index: 0, } } } impl<'top, D: Decoder> From> for MacroExpr<'top, D> { fn from(value: EExpression<'top, D>) -> Self { - MacroExpr::EExp(value) + MacroExpr::from_eexp(value) } } +#[derive(Copy, Clone, Debug)] pub struct EExpressionArgsIterator<'top, D: Decoder> { context: EncodingContextRef<'top>, - raw_args: as RawEExpression<'top, D>>::RawArgumentsIterator<'top>, + raw_args: as RawEExpression<'top, D>>::RawArgumentsIterator, + // The number of argument expressions that the e-expr expects + num_args: u32, + // The index of the next argument to consider + index: u32, +} + +impl<'top, D: Decoder> EExpressionArgsIterator<'top, D> { + pub fn is_exhausted(&self) -> bool { + self.index == self.num_args + } } impl<'top, D: Decoder> Iterator for EExpressionArgsIterator<'top, D> { type Item = IonResult>; fn next(&mut self) -> Option { - let raw_arg: LazyRawValueExpr<'top, D> = match self.raw_args.next()? { - Ok(arg) => arg, - Err(e) => return Some(Err(e)), + let raw_arg: EExpArg<'top, D> = match self.raw_args.next()? 
{ + Ok(arg) => { + debug_assert!(self.index < self.num_args); + arg + } + Err(e) => { + debug_assert!(self.index == self.num_args); + return Some(Err(e)); + } }; + self.index += 1; - let expr = match raw_arg { - LazyRawValueExpr::::ValueLiteral(value) => { - ValueExpr::ValueLiteral(LazyExpandedValue::from_literal(self.context, value)) + let expr = match raw_arg.expr() { + EExpArgExpr::::ValueLiteral(value) => { + ValueExpr::ValueLiteral(LazyExpandedValue::from_literal(self.context, *value)) } - LazyRawValueExpr::::EExp(raw_invocation) => { + EExpArgExpr::::EExp(raw_invocation) => { let invocation = match raw_invocation.resolve(self.context) { Ok(invocation) => invocation, Err(e) => return Some(Err(e)), }; ValueExpr::MacroInvocation(invocation.into()) } + EExpArgExpr::::ArgGroup(group) => { + let arg_group = group.resolve(self.context); + ValueExpr::MacroInvocation(MacroExpr::from_eexp_arg_group(arg_group)) + } }; Some(Ok(expr)) } + + fn size_hint(&self) -> (usize, Option) { + self.raw_args.size_hint() + } } pub type TextEExpression_1_1<'top> = EExpression<'top, TextEncoding_1_1>; + +#[derive(Copy, Clone, Debug)] +pub struct ArgGroupIterator<'top, D: Decoder> { + context: EncodingContextRef<'top>, + expressions: <<::EExp<'top> as RawEExpression<'top, D>>::ArgGroup as EExpressionArgGroup<'top, D>>::Iterator, +} + +impl<'top, D: Decoder> ArgGroupIterator<'top, D> { + pub fn new( + context: EncodingContextRef<'top>, + arg_group: <::EExp<'top> as RawEExpression<'top, D>>::ArgGroup, + ) -> Self { + Self { + context, + expressions: arg_group.iter(), + } + } + + pub fn is_exhausted(&self) -> bool { + self.expressions.is_exhausted() + } +} + +impl<'top, D: Decoder> Iterator for ArgGroupIterator<'top, D> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + let expr = try_next!(self.expressions.next()); + match expr { + RawValueExpr::ValueLiteral(v) => Some(Ok(ValueExpr::ValueLiteral( + LazyExpandedValue::from_literal(self.context, v), + ))), + 
RawValueExpr::EExp(e) => { + let resolved_eexp = match e.resolve(self.context) { + Ok(eexp) => eexp, + Err(e) => return Some(Err(e)), + }; + Some(Ok(ValueExpr::MacroInvocation(MacroExpr::from_eexp( + resolved_eexp, + )))) + } + } + } +} diff --git a/src/lazy/expanded/encoding_module.rs b/src/lazy/expanded/encoding_module.rs new file mode 100644 index 00000000..7ba1c62b --- /dev/null +++ b/src/lazy/expanded/encoding_module.rs @@ -0,0 +1,40 @@ +use crate::lazy::expanded::macro_table::MacroTable; +use crate::SymbolTable; + +#[derive(Debug, Clone)] +pub struct EncodingModule { + name: String, + macro_table: MacroTable, + symbol_table: SymbolTable, +} + +impl EncodingModule { + pub fn new(name: String, macro_table: MacroTable, symbol_table: SymbolTable) -> Self { + Self { + name, + macro_table, + symbol_table, + } + } + pub fn name(&self) -> &str { + &self.name + } + pub fn macro_table(&self) -> &MacroTable { + &self.macro_table + } + pub fn macro_table_mut(&mut self) -> &mut MacroTable { + &mut self.macro_table + } + pub fn symbol_table(&self) -> &SymbolTable { + &self.symbol_table + } + pub fn symbol_table_mut(&mut self) -> &mut SymbolTable { + &mut self.symbol_table + } + pub fn set_macro_table(&mut self, macro_table: MacroTable) { + self.macro_table = macro_table; + } + pub fn set_symbol_table(&mut self, symbol_table: SymbolTable) { + self.symbol_table = symbol_table; + } +} diff --git a/src/lazy/expanded/macro_evaluator.rs b/src/lazy/expanded/macro_evaluator.rs index df6cec59..1e095cfc 100644 --- a/src/lazy/expanded/macro_evaluator.rs +++ b/src/lazy/expanded/macro_evaluator.rs @@ -13,122 +13,262 @@ #![allow(non_camel_case_types)] use std::fmt::{Debug, Formatter}; +use std::ops::Range; use bumpalo::collections::{String as BumpString, Vec as BumpVec}; use crate::lazy::decoder::{Decoder, HasSpan, LazyRawValueExpr}; -use crate::lazy::expanded::e_expression::{EExpression, EExpressionArgsIterator}; -use crate::lazy::expanded::macro_table::{MacroKind, MacroRef}; +use 
crate::lazy::expanded::e_expression::{ + ArgGroup, ArgGroupIterator, EExpression, EExpressionArgsIterator, +}; +use crate::lazy::expanded::macro_table::MacroRef; use crate::lazy::expanded::sequence::Environment; use crate::lazy::expanded::template::{ - TemplateBodyValueExpr, TemplateBodyVariableReference, TemplateElement, TemplateMacroInvocation, - TemplateMacroInvocationArgsIterator, TemplateMacroRef, TemplateValue, + ParameterEncoding, TemplateBodyExprKind, TemplateBodyVariableReference, TemplateElement, + TemplateMacroInvocation, TemplateMacroInvocationArgsIterator, TemplateMacroRef, }; -use crate::lazy::expanded::EncodingContextRef; -use crate::lazy::expanded::{ExpandedValueRef, LazyExpandedValue}; +use crate::lazy::expanded::LazyExpandedValue; +use crate::lazy::expanded::{EncodingContextRef, TemplateVariableReference}; use crate::lazy::str_ref::StrRef; +use crate::lazy::text::raw::v1_1::arg_group::EExpArg; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; use crate::result::IonFailure; -use crate::{IonError, IonResult, RawSymbolRef}; +use crate::{ExpandedValueSource, HasRange, IonError, IonResult, ValueRef}; + +pub trait EExpArgGroupIterator<'top, D: Decoder>: + Copy + Clone + Debug + Iterator>> +{ + /// Returns `true` if the iterator is known to be out of arguments to return. + /// Implementations are permitted to return `false` if they are uncertain whether more arguments + /// are available. After the iterator returns `None` for the first time, this method must return + /// `true` when called. 
+ fn is_exhausted(&self) -> bool; +} + +pub trait EExpressionArgGroup<'top, D: Decoder>: + HasSpan<'top> + + Debug + + Copy + + Clone + + IntoIterator>, IntoIter = Self::Iterator> +{ + type Iterator: EExpArgGroupIterator<'top, D>; + + fn encoding(&self) -> ParameterEncoding; + fn resolve(self, context: EncodingContextRef<'top>) -> ArgGroup<'top, D>; + + fn iter(self) -> Self::Iterator { + self.into_iter() + } +} /// The syntactic entity in format `D` that represents an e-expression. This expression has not /// yet been resolved in the current encoding context. pub trait RawEExpression<'top, D: Decoder = Self>>: HasSpan<'top> + Debug + Copy + Clone +where + Self: 'top, { /// An iterator that yields the macro invocation's arguments in order. - type RawArgumentsIterator<'a>: Iterator>> - where - Self: 'a; + type RawArgumentsIterator: Debug + Copy + Clone + Iterator>>; + + /// A type that represents an argument group--several expressions that form a stream + /// passed as a single argument. + type ArgGroup: EExpressionArgGroup<'top, D>; /// The macro name or address specified at the head of this macro invocation. - fn id(&self) -> MacroIdRef<'top>; + fn id(self) -> MacroIdRef<'top>; /// The arguments that follow the macro name or address in this macro invocation. - fn raw_arguments(&self) -> Self::RawArgumentsIterator<'top>; + fn raw_arguments(&self) -> Self::RawArgumentsIterator; /// Looks up the macro invoked by this E-expression in the given `EncodingContext`. /// If the lookup is successful, returns an `Ok` containing a resolved `EExpression` that holds /// a reference to the macro being invoked. /// If the ID cannot be found in the `EncodingContext`, returns `Err`. 
fn resolve(self, context: EncodingContextRef<'top>) -> IonResult> { - let invoked_macro = context - .macro_table() - .macro_with_id(self.id()) - .ok_or_else(|| { - IonError::decoding_error(format!("unrecognized macro ID {:?}", self.id())) - })?; + let invoked_macro = context.macro_table().macro_with_id(self.id()).ok_or_else( + #[inline(never)] + || IonError::decoding_error(format!("unrecognized macro ID {:?}", self.id())), + )?; Ok(EExpression::new(context, self, invoked_macro)) } + + /// Returns an array of resolved [`ValueExpr`] instances that can be evaluated and/or passed + /// as arguments to other macro invocations. + fn make_evaluation_environment( + &self, + context: EncodingContextRef<'top>, + ) -> IonResult> { + Environment::for_eexp(context, *self) + } } /// An invocation of a macro found in either the data stream or in the body of a template. /// This invocation has been resolved in the current encoding context, and holds a reference to /// the definition of the macro being invoked. #[derive(Copy, Clone, Debug)] -pub enum MacroExpr<'top, D: Decoder> { +pub struct MacroExpr<'top, D: Decoder> { + kind: MacroExprKind<'top, D>, + variable: Option>, +} + +impl<'top, D: Decoder> MacroExpr<'top, D> { + pub fn kind(&self) -> MacroExprKind<'top, D> { + self.kind + } +} + +impl<'top, D: Decoder> MacroExpr<'top, D> { + pub fn new(source: MacroExprKind<'top, D>) -> Self { + Self { + kind: source, + variable: None, + } + } + + pub fn via_variable(mut self, variable_ref: TemplateVariableReference<'top>) -> Self { + self.variable = Some(variable_ref); + self + } + + pub fn expand(&self, environment: Environment<'top, D>) -> IonResult> { + match self.kind { + MacroExprKind::TemplateMacro(t) => t.expand(environment), + MacroExprKind::EExp(e) => e.expand(), + MacroExprKind::EExpArgGroup(g) => g.expand(environment), + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum MacroExprKind<'top, D: Decoder> { /// A macro invocation found in the body of a template. 
TemplateMacro(TemplateMacroInvocation<'top>), /// A macro invocation found in the data stream. EExp(EExpression<'top, D>), + /// An e-expression argument group. (A `values` call with special encoding.) + EExpArgGroup(ArgGroup<'top, D>), } impl<'top, D: Decoder> MacroExpr<'top, D> { + pub fn from_template_macro(invocation: TemplateMacroInvocation<'top>) -> Self { + MacroExpr::new(MacroExprKind::TemplateMacro(invocation)) + } + + pub fn from_eexp(eexp: EExpression<'top, D>) -> Self { + MacroExpr::new(MacroExprKind::EExp(eexp)) + } + + pub fn from_eexp_arg_group(group: ArgGroup<'top, D>) -> Self { + MacroExpr::new(MacroExprKind::EExpArgGroup(group)) + } + + pub fn variable(&self) -> Option> { + self.variable + } + + pub fn source(&self) -> MacroExprKind<'top, D> { + self.kind + } + fn id(&self) -> MacroIdRef { - match &self { - MacroExpr::TemplateMacro(m) => m.id(), - MacroExpr::EExp(e) => e.id(), + use MacroExprKind::*; + match &self.kind { + TemplateMacro(m) => m.id(), + EExp(e) => e.id(), + EExpArgGroup(_) => MacroIdRef::LocalAddress(1), // `values` } } fn arguments(&self, environment: Environment<'top, D>) -> MacroExprArgsIterator<'top, D> { - let args_kind = match &self { - MacroExpr::TemplateMacro(m) => { - MacroExprArgsKind::<'top, D>::Macro(m.arguments(environment)) + use MacroExprKind::*; + let args_kind = match &self.kind { + TemplateMacro(m) => { + MacroExprArgsKind::<'top, D>::TemplateMacro(m.arguments(environment)) } - MacroExpr::EExp(e) => MacroExprArgsKind::<'top, D>::EExp(e.arguments()), + EExp(e) => MacroExprArgsKind::<'top, D>::EExp(e.arguments()), + EExpArgGroup(group) => MacroExprArgsKind::<'top, D>::ArgGroup(group.expressions()), }; MacroExprArgsIterator { source: args_kind } } - fn invoked_macro(&self) -> MacroRef<'top> { - match &self { - MacroExpr::TemplateMacro(m) => m.invoked_macro(), - MacroExpr::EExp(e) => e.invoked_macro(), + pub(crate) fn invoked_macro(&self) -> MacroRef<'top> { + use MacroExprKind::*; + match &self.kind { + 
TemplateMacro(m) => m.invoked_macro(), + EExp(e) => e.invoked_macro(), + EExpArgGroup(g) => g.invoked_macro(), } } - fn context(&self) -> EncodingContextRef<'top> { - match self { - MacroExpr::TemplateMacro(t) => t.context(), - MacroExpr::EExp(e) => e.context(), + pub(crate) fn context(&self) -> EncodingContextRef<'top> { + use MacroExprKind::*; + match self.kind { + TemplateMacro(t) => t.context(), + EExp(e) => e.context(), + EExpArgGroup(g) => g.context(), } } } +#[derive(Copy, Clone, Debug)] pub enum MacroExprArgsKind<'top, D: Decoder> { - Macro(TemplateMacroInvocationArgsIterator<'top, D>), + TemplateMacro(TemplateMacroInvocationArgsIterator<'top, D>), EExp(EExpressionArgsIterator<'top, D>), + ArgGroup(ArgGroupIterator<'top, D>), } +#[derive(Copy, Clone, Debug)] pub struct MacroExprArgsIterator<'top, D: Decoder> { source: MacroExprArgsKind<'top, D>, } +impl<'top, D: Decoder> MacroExprArgsIterator<'top, D> { + pub fn from_eexp(args: EExpressionArgsIterator<'top, D>) -> Self { + MacroExprArgsIterator { + source: MacroExprArgsKind::EExp(args), + } + } + + pub fn from_template_macro(args: TemplateMacroInvocationArgsIterator<'top, D>) -> Self { + MacroExprArgsIterator { + source: MacroExprArgsKind::TemplateMacro(args), + } + } + + pub fn from_arg_group(args: ArgGroupIterator<'top, D>) -> Self { + MacroExprArgsIterator { + source: MacroExprArgsKind::ArgGroup(args), + } + } + + fn is_exhausted(&self) -> bool { + match self.source { + MacroExprArgsKind::TemplateMacro(ref args) => args.is_exhausted(), + MacroExprArgsKind::EExp(ref args) => args.is_exhausted(), + MacroExprArgsKind::ArgGroup(ref args) => args.is_exhausted(), + } + } +} + impl<'top, D: Decoder> Iterator for MacroExprArgsIterator<'top, D> { type Item = IonResult>; + #[inline(always)] fn next(&mut self) -> Option { match &mut self.source { - MacroExprArgsKind::Macro(m) => m.next(), + MacroExprArgsKind::TemplateMacro(m) => m.next(), MacroExprArgsKind::EExp(e) => e.next(), + MacroExprArgsKind::ArgGroup(g) => 
g.next(), } } fn size_hint(&self) -> (usize, Option) { match &self.source { - MacroExprArgsKind::Macro(m) => m.size_hint(), + MacroExprArgsKind::TemplateMacro(m) => m.size_hint(), MacroExprArgsKind::EExp(e) => e.size_hint(), + MacroExprArgsKind::ArgGroup(g) => g.size_hint(), } } } @@ -152,14 +292,11 @@ impl<'top, D: Decoder> ArgExpr<'top, D> { /// environment. Returns an `ArgValueExpr` which is the value literal or macro invocation to /// which the variable referred. /// Otherwise, passes through the value literal or macro invocation. - pub(crate) fn resolve( - &self, - environment: Environment<'top, D>, - ) -> IonResult> { + pub(crate) fn resolve(&self, environment: Environment<'top, D>) -> ValueExpr<'top, D> { match self { - ArgExpr::ValueLiteral(value) => Ok(ValueExpr::ValueLiteral(*value)), - ArgExpr::Variable(variable) => environment.get_expected(variable.signature_index()), - ArgExpr::MacroInvocation(invocation) => Ok(ValueExpr::MacroInvocation(*invocation)), + ArgExpr::ValueLiteral(value) => ValueExpr::ValueLiteral(*value), + ArgExpr::Variable(variable) => environment.require_expr(variable.signature_index()), + ArgExpr::MacroInvocation(invocation) => ValueExpr::MacroInvocation(*invocation), } } } @@ -178,8 +315,35 @@ pub enum ValueExpr<'top, D: Decoder> { MacroInvocation(MacroExpr<'top, D>), } +impl<'top, D: Decoder> ValueExpr<'top, D> { + /// If this `ValueExpr` represents an entity encoded in the data stream, returns `Some(range)`. + /// If it represents a template value or a constructed value, returns `None`. 
+ pub fn range(&self) -> Option> { + match self { + ValueExpr::ValueLiteral(value) => { + use ExpandedValueSource::*; + match value.source { + EExp(_) => todo!(), + ValueLiteral(literal) => Some(literal.range()), + Template(_, _) => None, + Constructed(_, _) => None, + } + } + ValueExpr::MacroInvocation(e) => { + use MacroExprKind::*; + match e.source() { + TemplateMacro(_) => None, + EExp(e) => Some(e.range()), + EExpArgGroup(g) => Some(g.range()), + } + } + } + } +} + /// Indicates which of the supported macros this represents and stores the state necessary to /// continue evaluating that macro. +#[derive(Copy, Clone, Debug)] pub enum MacroExpansionKind<'top, D: Decoder> { Void, Values(ValuesExpansion<'top, D>), @@ -189,14 +353,73 @@ pub enum MacroExpansionKind<'top, D: Decoder> { /// A macro in the process of being evaluated. Stores both the state of the evaluation and the /// syntactic element that represented the macro invocation. +#[derive(Copy, Clone)] pub struct MacroExpansion<'top, D: Decoder> { + context: EncodingContextRef<'top>, kind: MacroExpansionKind<'top, D>, - invocation: MacroExpr<'top, D>, + environment: Environment<'top, D>, + is_complete: bool, } impl<'top, D: Decoder> MacroExpansion<'top, D> { - pub(crate) fn new(kind: MacroExpansionKind<'top, D>, invocation: MacroExpr<'top, D>) -> Self { - Self { kind, invocation } + pub fn context(&self) -> EncodingContextRef<'top> { + self.context + } + + /// Expands the current macro with the expectation that it will produce exactly one value. + #[inline(always)] + pub(crate) fn expand_singleton(mut self) -> IonResult> { + // We don't need to construct an evaluator because this is guaranteed to produce exactly + // one value. + match self.next_step()? { + // If the expansion produces anything other than a final value, there's a bug. 
+ MacroExpansionStep::FinalStep(Some(ValueExpr::ValueLiteral(value))) => Ok(value), + _ => unreachable!("e-expression-backed lazy values must yield a single value literal"), + } + } + + /// Construct a new `MacroExpansion` and populate its evaluation environment as needed. + pub(crate) fn initialize( + environment: Environment<'top, D>, + invocation_to_evaluate: MacroExpr<'top, D>, + ) -> IonResult> { + match invocation_to_evaluate.source() { + MacroExprKind::TemplateMacro(t) => t.expand(environment), + MacroExprKind::EExp(e) => e.expand(), + MacroExprKind::EExpArgGroup(g) => g.expand(environment), + } + } + + pub(crate) fn new( + context: EncodingContextRef<'top>, + environment: Environment<'top, D>, + kind: MacroExpansionKind<'top, D>, + ) -> Self { + Self { + environment, + kind, + context, + is_complete: false, + } + } + + /// Continues evaluating this macro until it: + /// * produces another value. + /// * encounters another macro or variable that needs to be expanded. + /// * is completed. + #[inline(always)] + pub fn next_step(&mut self) -> IonResult> { + use MacroExpansionKind::*; + let context = self.context; + let environment = self.environment; + // Delegate the call to `next()` based on the macro kind. 
+ match &mut self.kind { + Template(template_expansion) => template_expansion.next(context, environment), + Values(values_expansion) => values_expansion.next(context, environment), + MakeString(make_string_expansion) => make_string_expansion.next(context, environment), + // `void` is trivial and requires no delegation + Void => Ok(MacroExpansionStep::FinalStep(None)), + } } } @@ -207,34 +430,232 @@ impl<'top, D: Decoder> Debug for MacroExpansion<'top, D> { MacroExpansionKind::Values(_) => "values", MacroExpansionKind::MakeString(_) => "make_string", MacroExpansionKind::Template(t) => { - return write!(f, "", t.template.name()) + return if let Some(name) = t.template.name() { + write!(f, "", name) + } else { + write!(f, "") + } } }; - write!(f, "") + write!(f, "") } } -impl<'top, D: Decoder> MacroExpansion<'top, D> { - /// Continues evaluating this macro until it: - /// * produces another value. - /// * encounters another macro or variable that needs to be expanded. - /// * is completed. - fn next(&mut self, environment: Environment<'top, D>) -> IonResult>> { - use MacroExpansionKind::*; - let context = self.invocation.context(); - // Delegate the call to `next()` based on the macro kind. 
- match &mut self.kind { - MakeString(make_string_expansion) => make_string_expansion.next(context, environment), - Values(values_expansion) => values_expansion.next(context, environment), - // `void` is trivial and requires no delegation - Void => Ok(None), - Template(template_expansion) => template_expansion.next(context, environment), +pub enum MacroExpansionStep<'top, D: Decoder> { + Step(ValueExpr<'top, D>), + FinalStep(Option>), +} + +impl<'top, D: Decoder> MacroExpansionStep<'top, D> { + pub fn value_expr(&self) -> Option> { + match self { + MacroExpansionStep::Step(expr) => Some(*expr), + MacroExpansionStep::FinalStep(maybe_expr) => *maybe_expr, + } + } + + pub fn is_final(&self) -> bool { + matches!(self, MacroExpansionStep::FinalStep(_)) + } +} + +/// The internal bookkeeping representation used by a [`MacroEvaluator`]. +#[derive(Debug)] +pub enum EvaluatorState<'top, D: Decoder> { + /// The evaluator is empty; it does not currently have any expansions in progress. + Empty, + /// The evaluator has a single expansion in progress. It does not own any bump-allocated + /// resources. + Stackless(MacroExpansion<'top, D>), + /// The evaluator has several expansions in progress. It holds a bump-allocated `MacroStack` + /// that it pushes to and pops from. + Stacked(StackedMacroEvaluator<'top, D>), +} + +impl<'top, D: Decoder> Default for EvaluatorState<'top, D> { + fn default() -> Self { + Self::Empty + } +} + +/// A general-purpose macro evaluator that waits to allocate resources until it is clear that they +/// are necessary. 
+#[derive(Debug, Default)] +pub struct MacroEvaluator<'top, D: Decoder> { + root_environment: Environment<'top, D>, + state: EvaluatorState<'top, D>, +} + +impl<'top, D: Decoder> MacroEvaluator<'top, D> { + pub fn is_empty(&self) -> bool { + use EvaluatorState::*; + match self.state { + Empty => true, + Stacked(ref evaluator) => evaluator.macro_stack_depth() == 0, + _ => false, + } + } + + #[inline] + #[allow(clippy::should_implement_trait)] + // ^-- Clippy complains this looks like Iterator::next(). + pub fn next(&mut self) -> IonResult>> { + // This inlineable method checks whether the evaluator is empty to avoid a more expensive + // method invocation when possible. + if self.is_empty() { + return Ok(None); + } + self.next_general_case() + } + + /// The core macro evaluation logic. + #[inline] + fn next_general_case(&mut self) -> IonResult>> { + use EvaluatorState::*; + // This happens in a loop in case the next item produced is another macro to evaluate. + // In most cases, we never return to the top of the loop. + loop { + let expansion = match self.state { + // If the evaluator is empty, there's nothing to do. + Empty => return Ok(None), + // If the evaluator is processing several expansions (not common), we delegate the + // evaluation and bookkeeping to the `StackedMacroEvaluator` type. + Stacked(ref mut stack_evaluator) => return stack_evaluator.next(), + // If the evaluator is stackless, there's only one expansion in progress. + Stackless(ref mut expansion) => expansion, + }; + + // At this point, we have a reference to the only expansion in progress. + // + // If the next thing it produces is another macro, we would push it onto the stack. + // However, this would cause the stack to grow to a depth of 2 and require us to + // bump-allocate a proper stack. Instead, we take note of the environment this expansion + // was using... + let environment = expansion.environment; + // ...get the next step in the expansion... 
+ let step = expansion.next_step()?; + // ...and, if this was the last step in the expansion, pop it off the stack-of-one + // by setting the state back to `Empty`. + if step.is_final() { + self.state = Empty; + } + // Now the stack has a depth of zero or one. + match step.value_expr() { + // No more expressions means we're done. (It also means the stack is empty because + // it is not possible to return a non-final step that is `None`.) + None => return Ok(None), + // If it's a value, then regardless of stack depth we return that value. + Some(ValueExpr::ValueLiteral(value)) => return Ok(Some(value)), + // If it's another macro to evaluate, then we'll push it onto the stack and continue + // at the top of the loop looking for our next value-or-nothing. + Some(ValueExpr::MacroInvocation(invocation)) => { + // If the evaluator state is `Empty`, this sets it back to `Stackless`, which + // is an important optimization. We avoid bump-allocating in the lion's share + // of evaluations. + // If the state is `Stackless` (i.e. there's still an expansion in progress), + // this will upgrade the state to `Stacked` and allocate the necessary + // resources. + self.push(invocation.expand(environment)?) + // This "tail eval" optimization--eagerly popping completed expansions off the + // stack to keep it flat--avoids allocations in many evaluations, e.g.: + // (:void) + // (:values) + // (:values 1 2 3) + // (:values 1 2 3 /*POP*/ (:values 1 2 3)) + // (:values 1 2 3 /*POP*/ (:values 1 2 3 /*POP*/ (:values 1 2 3))) + // + // TODO: Use `invocation.invoked_macro().must_produce_exactly_one_value()` + // to see if we can avoid pushing the new invocation and instead + // eagerly evaluating it. 
+ } + } + } + } + + pub fn new() -> Self { + Self { + root_environment: Environment::empty(), + state: EvaluatorState::Empty, + } + } + + pub fn new_with_environment(environment: Environment<'top, D>) -> Self { + Self { + root_environment: environment, + state: EvaluatorState::Empty, + } + } + + pub fn for_eexp(eexp: EExpression<'top, D>) -> IonResult { + let macro_expr = MacroExpr::from_eexp(eexp); + Self::for_macro_expr(Environment::empty(), macro_expr) + } + + pub fn for_macro_expr( + environment: Environment<'top, D>, + macro_expr: MacroExpr<'top, D>, + ) -> IonResult { + let expansion = MacroExpansion::initialize(environment, macro_expr)?; + Ok(Self::for_expansion(expansion)) + } + + fn for_expansion(expansion: MacroExpansion<'top, D>) -> Self { + Self { + root_environment: expansion.environment, + state: EvaluatorState::Stackless(expansion), } } + + pub fn environment(&self) -> Environment<'top, D> { + use EvaluatorState::*; + match self.state { + Empty => self.root_environment, + Stackless(ref expansion) => expansion.environment, + Stacked(ref stack) => stack.environment(), + } + } + + #[inline] + pub fn push(&mut self, new_expansion: MacroExpansion<'top, D>) { + if self.is_empty() { + // Going from zero expansions to one expansion is cheap. + self.state = EvaluatorState::Stackless(new_expansion); + } else { + // Going from 1 to 2 or more is more expensive and less common, + // so we don't inline this case. 
+ self.push_general_case(new_expansion) + } + } + + #[inline(never)] + pub fn push_general_case(&mut self, new_expansion: MacroExpansion<'top, D>) { + match self.state { + // Going from zero expansions to one expansion + EvaluatorState::Empty => self.state = EvaluatorState::Stackless(new_expansion), + // Going from one expansion to two + EvaluatorState::Stackless(original_expansion) => { + let mut stacked_evaluator = StackedMacroEvaluator::new_with_environment( + new_expansion.context(), + self.root_environment, + ); + stacked_evaluator + .macro_stack + .extend_from_slice_copy(&[original_expansion, new_expansion]); + self.state = EvaluatorState::Stacked(stacked_evaluator) + } + // Going from 2+ up + EvaluatorState::Stacked(ref mut stacked_evaluator) => { + stacked_evaluator.macro_stack.push(new_expansion) + } + }; + } + + pub fn set_root_environment(&mut self, environment: Environment<'top, D>) { + self.root_environment = environment; + } } pub type MacroStack<'top, D> = BumpVec<'top, MacroExpansion<'top, D>>; -pub type EnvironmentStack<'top, D> = BumpVec<'top, Environment<'top, D>>; /// Evaluates macro invocations recursively, yielding a single expanded value at a time. /// @@ -245,115 +666,60 @@ pub type EnvironmentStack<'top, D> = BumpVec<'top, Environment<'top, D>>; /// x {eager, lazy} /// /// For incremental/lazy evaluation, push a macro invocation onto the stack using -/// [`MacroEvaluator::push`] and then use [`MacroEvaluator::next`] to evaluate the next value. +/// [`StackedMacroEvaluator::push`] and then use [`StackedMacroEvaluator::next`] to evaluate the next value. /// -/// For eager evaluation, use [`MacroEvaluator::evaluate`], which returns an iterator that will +/// For eager evaluation, use [`StackedMacroEvaluator::evaluate`], which returns an iterator that will /// yield the expanded values. 
-pub struct MacroEvaluator<'top, D: Decoder> { +#[derive(Debug)] +pub struct StackedMacroEvaluator<'top, D: Decoder> { // A stack with the most recent macro invocations at the top. This stack grows each time a macro // of any kind begins evaluation. macro_stack: MacroStack<'top, D>, - // A stack of _template_ macro invocation environments. This stack only grows when a template - // macro is invoked from any context. For example, given these template definitions: - // (macro foo (x) (values 1 2 x)) - // (macro bar (y) (foo y)) - // and this invocation: - // (:bar 3) - // A new environment [/*y=*/ 3] would be pushed for the invocation of `bar`, and another - // environment [/*x=y=*/ 3] would be pushed for the invocation of `foo` within `bar`. However, - // no environment would be created/pushed for the invocation of the `values` macro within `foo`. - // For any macro being evaluated, the current environment is always the one at the top of the - // environment stack. - env_stack: EnvironmentStack<'top, D>, + root_environment: Environment<'top, D>, } -impl<'top, D: Decoder> MacroEvaluator<'top, D> { - pub fn new(context: EncodingContextRef<'top>, environment: Environment<'top, D>) -> Self { - let macro_stack = BumpVec::new_in(context.allocator()); - let mut env_stack = BumpVec::new_in(context.allocator()); - env_stack.push(environment); +impl<'top, D: Decoder> StackedMacroEvaluator<'top, D> { + #[inline] + pub fn new(context: EncodingContextRef<'top>) -> Self { + const INITIAL_MACRO_STACK_CAPACITY: usize = 8; + let macro_stack = + BumpVec::with_capacity_in(INITIAL_MACRO_STACK_CAPACITY, context.allocator()); Self { macro_stack, - env_stack, + root_environment: Environment::empty(), } } + pub fn new_with_environment( + context: EncodingContextRef<'top>, + environment: Environment<'top, D>, + ) -> Self { + let mut evaluator = Self::new(context); + evaluator.root_environment = environment; + evaluator + } + /// Returns the number of macros that are currently being 
evaluated. pub fn macro_stack_depth(&self) -> usize { self.macro_stack.len() } - /// Returns the current environment (i.e. the one at the top of the macro stack.) + /// Returns the current environment (i.e. the one used by the top of the macro stack.) pub fn environment(&self) -> Environment<'top, D> { - // The stack is never completely empty; the 'root' evaluator is created with an empty - // environment at the base of the stack. - *self.env_stack.last().unwrap() - } - - /// Creates a new `Environment` for the given `invocation`. - /// - /// This helper function iterates over the argument expressions in the invocation. If an argument - /// expression is a value literal or macro invocation, it is added to the new environment as-is. - /// If an argument is a variable reference, it is substituted with the corresponding value literal - /// or macro invocation from the current environment and then added to the new environment. - fn make_new_evaluation_environment( - &mut self, - invocation: MacroExpr<'top, D>, - ) -> IonResult> { - // Get an allocator reference from the `env_stack` BumpVec. - let allocator = self.env_stack.bump(); - let args_iter = invocation.arguments(self.environment()); - // Use the iterator's size hint to determine an initial capacity to aim for. - let num_args_hint = args_iter.size_hint(); - let capacity_hint = num_args_hint.1.unwrap_or(num_args_hint.0); - let mut args = BumpVec::with_capacity_in(capacity_hint, allocator); - - for arg in args_iter { - args.push(arg?); - } - let environment = Environment::new(args); - Ok(environment) - } - - /// Initializes a [`MacroExpansion`] that contains the necessary state to incrementally evaluate - /// the provided macro invocation. - /// - /// Returns an error if the invocation is invalid due to missing or malformed arguments. 
- fn initialize_expansion( - &mut self, - invocation_to_evaluate: MacroExpr<'top, D>, - ) -> IonResult> { - // Initialize a `MacroExpansionKind` with the state necessary to evaluate the requested - // macro. - let expansion_kind = match invocation_to_evaluate.invoked_macro().kind() { - MacroKind::Void => MacroExpansionKind::Void, - MacroKind::Values => MacroExpansionKind::Values(ValuesExpansion { - arguments: invocation_to_evaluate.arguments(self.environment()), - initial_eval_stack_depth: self.macro_stack_depth(), - }), - MacroKind::MakeString => MacroExpansionKind::MakeString(MakeStringExpansion::new( - invocation_to_evaluate.arguments(self.environment()), - )), - MacroKind::Template(template) => { - let template_address = invocation_to_evaluate.invoked_macro().address(); - let template_ref = TemplateMacroRef::new(template_address, template); - let new_environment = - self.make_new_evaluation_environment(invocation_to_evaluate)?; - self.env_stack.push(new_environment); - MacroExpansionKind::Template(TemplateExpansion::new(template_ref)) - } - }; - Ok(MacroExpansion { - kind: expansion_kind, - invocation: invocation_to_evaluate, - }) + self.macro_stack + .last() + .map(|expansion| expansion.environment) + .unwrap_or(self.root_environment) } /// Given a syntactic element representing a macro invocation, attempt to resolve it with the /// current encoding context and push the resulting `MacroExpansion` onto the stack. 
pub fn push(&mut self, invocation: impl Into>) -> IonResult<()> { let macro_expr = invocation.into(); - let expansion = self.initialize_expansion(macro_expr)?; + let expansion = match MacroExpansion::initialize(self.environment(), macro_expr) { + Ok(expansion) => expansion, + Err(e) => return Err(e), + }; self.macro_stack.push(expansion); Ok(()) } @@ -389,60 +755,50 @@ impl<'top, D: Decoder> MacroEvaluator<'top, D> { &mut self, depth_to_exhaust: usize, ) -> IonResult>> { - debug_assert!( - self.macro_stack_depth() >= depth_to_exhaust, - "asked to exhaust a macro at an invalid depth" - ); - loop { - let environment = self.environment(); // Get the expansion at the top of the stack. let current_expansion = match self.macro_stack.last_mut() { - // NOTE: If the user specifies a `depth_to_exhaust` of 0, this is where the loop - // will end. Behaviorally, this is identical to a `depth_to_exhaust` of 1, - // which would return `Ok(None)` at the bottom of this method. It is always - // legal to call `next()` with a `depth_to_exhaust` of 0; however, it is - // illegal to call it with a `depth_to_exhaust` of 1 when the stack is empty. None => return Ok(None), Some(expansion) => expansion, }; // Ask that expansion to continue its evaluation by one step. + let step = match current_expansion.next_step() { + Ok(step) => step, + Err(e) => return Err(e), + }; + current_expansion.is_complete = step.is_final(); use ValueExpr::*; - match current_expansion.next(environment)? { - // If we get a value, return it to the caller. - Some(ValueLiteral(value)) => { - return Ok(Some(value)); - } - // If we get another macro, push it onto the stack and continue evaluation. + let maybe_output_value = match step.value_expr() { Some(MacroInvocation(invocation)) => { - // If we encounter another macro invocation, put it on top of the stack. self.push(invocation)?; continue; } - // If the current macro reports that its expansion is complete... 
- None => { - // Check to see if the completed value was a template. If so, discard its environment. - let completed_kind = &self.macro_stack.last().unwrap().kind; - if matches!(completed_kind, MacroExpansionKind::Template(_)) { - // NB: Here and below, we use `truncate()` instead of `pop()` so the value can - // be dropped in place without incurring a move. That move runs afoul of the - // aliasing requirements that `miri` looks for, though I'm unsure why. - // Once Polonius lands and we are able to remove the `unsafe` usages in - // the LazyExpandingReader, this will be unnecessary. - self.env_stack.truncate(self.env_stack.len() - 1); - } - self.macro_stack.truncate(self.macro_stack.len() - 1); + Some(ValueLiteral(value)) => Some(value), + None => None, + }; - // ...and see that was the macro the caller was interested in evaluating. - if self.macro_stack.len() < depth_to_exhaust { - // If so, there are no more values to yield, even though there may still - // be macros on the stack. - return Ok(None); - } - // Otherwise, the caller is interested in one of the previously invoked macros. - continue; - } + if current_expansion.is_complete { + self.pop_completed_macros(); + } + if self.macro_stack.len() < depth_to_exhaust { + return Ok(maybe_output_value); + } + if maybe_output_value.is_none() && !self.macro_stack.is_empty() { + continue; + } + return Ok(maybe_output_value); + } + } + + fn pop_completed_macros(&mut self) { + loop { + // Pop the top macro, which we know to be completed. + self.macro_stack.truncate(self.macro_stack.len() - 1); + // See if the new top macro is also complete and ready to be popped. + match self.macro_stack.last() { + Some(expansion) if expansion.is_complete => continue, + _ => break, } } } @@ -465,12 +821,12 @@ impl<'top, D: Decoder> MacroEvaluator<'top, D> { /// Yields the values produced by incrementally evaluating the macro that was at the top of the /// evaluator's stack when the iterator was created. 
pub struct EvaluatingIterator<'iter, 'top, D: Decoder> { - evaluator: &'iter mut MacroEvaluator<'top, D>, + evaluator: &'iter mut StackedMacroEvaluator<'top, D>, initial_stack_depth: usize, } impl<'iter, 'top, D: Decoder> EvaluatingIterator<'iter, 'top, D> { - pub fn new(evaluator: &'iter mut MacroEvaluator<'top, D>) -> Self { + pub fn new(evaluator: &'iter mut StackedMacroEvaluator<'top, D>) -> Self { let initial_stack_depth = evaluator.macro_stack_depth(); Self { evaluator, @@ -501,30 +857,38 @@ impl<'iter, 'top, D: Decoder> Iterator for EvaluatingIterator<'iter, 'top, D> { /// (:values 1) => 1 /// (:values 1 2 3) => 1 2 3 /// (:values 1 2 (:values 3 4)) => 1 2 3 4 +#[derive(Copy, Clone, Debug)] pub struct ValuesExpansion<'top, D: Decoder> { // Which argument the macro is in the process of expanding arguments: MacroExprArgsIterator<'top, D>, - // The stack depth where this `values` call lives. When the stack shrinks below this depth, - // evaluation is complete. - initial_eval_stack_depth: usize, } impl<'top, D: Decoder> ValuesExpansion<'top, D> { - pub fn new(arguments: MacroExprArgsIterator<'top, D>, initial_eval_stack_depth: usize) -> Self { - Self { - arguments, - initial_eval_stack_depth, - } + pub fn new(arguments: MacroExprArgsIterator<'top, D>) -> Self { + Self { arguments } } /// Yields the next [`ValueExpr`] in this macro's evaluation. + #[inline(always)] pub fn next( &mut self, _context: EncodingContextRef<'top>, _environment: Environment<'top, D>, - ) -> IonResult>> { + ) -> IonResult> { + let arg_result = self.arguments.next(); + let is_last_arg = self.arguments.is_exhausted(); // We visit the argument expressions in the invocation in order from left to right. - self.arguments.next().transpose() + match arg_result { + // This is known to be the last argument; there will be no other steps after it is returned. 
+ Some(Ok(expr)) if is_last_arg => Ok(MacroExpansionStep::FinalStep(Some(expr))), + // This is not known to be the last argument--if it _is_ the last one, the iterator was not + // able to tell that it was. We'll treat it as though it is not. False positives result + // in a future call to `next` to get a `FinalStep(None)` and can cause the evaluator + // to allocate more resources than necessary. + Some(Ok(expr)) => Ok(MacroExpansionStep::Step(expr)), + None => Ok(MacroExpansionStep::FinalStep(None)), + Some(Err(e)) => Err(e), + } } } @@ -548,17 +912,14 @@ impl<'top, D: Decoder> ValuesExpansion<'top, D> { /// (:make_string (:values "first" "_") $4) => "first_name" /// (:make_string) => "" /// (:make_string "foo" 7) => Error +#[derive(Copy, Clone, Debug)] pub struct MakeStringExpansion<'top, D: Decoder> { arguments: MacroExprArgsIterator<'top, D>, - is_complete: bool, } impl<'top, D: Decoder> MakeStringExpansion<'top, D> { pub fn new(arguments: MacroExprArgsIterator<'top, D>) -> Self { - Self { - arguments, - is_complete: false, - } + Self { arguments } } /// Yields the next [`ValueExpr`] in this `make_string` macro's evaluation. @@ -566,34 +927,31 @@ impl<'top, D: Decoder> MakeStringExpansion<'top, D> { &mut self, context: EncodingContextRef<'top>, environment: Environment<'top, D>, - ) -> IonResult>> { - // `make_string` always produces a single value. Once that value has been returned, it needs - // to report `Complete` on the following call to `next()`. - if self.is_complete { - return Ok(None); - } - + ) -> IonResult> { // Create a bump-allocated buffer to hold our constructed string - let mut buffer = BumpString::new_in(context.allocator()); + const INITIAL_CAPACITY: usize = 32; + let mut buffer = BumpString::with_capacity_in(INITIAL_CAPACITY, context.allocator()); // We need to eagerly evaluate all of the arguments to `make_string` to produce its next // (and only) value. 
However, because `&mut self` (the expansion state) lives in a stack // inside the evaluator, we cannot get a simultaneous mutable reference to the evaluator // itself. Instead, we use the bump allocator the make a transient macro evaluator // whose resources can be trivially reclaimed when the expansion is done. - let mut evaluator = MacroEvaluator::new(context, environment); + let mut evaluator = MacroEvaluator::<'top, D>::new(); for arg_result in &mut self.arguments { let arg_expr = arg_result?; match arg_expr { - ValueExpr::ValueLiteral(value) => { - Self::append_expanded_raw_text_value(context, &mut buffer, value.read()?)? + ValueExpr::ValueLiteral(expanded_value) => { + let text = expanded_value.read_resolved()?.expect_text()?; + buffer.push_str(text); } ValueExpr::MacroInvocation(invocation) => { - for value_result in evaluator.evaluate(invocation)? { - let value = value_result?; - let expanded = value.read()?; - Self::append_expanded_raw_text_value(context, &mut buffer, expanded)? + let expansion = MacroExpansion::initialize(environment, invocation)?; + evaluator.push(expansion); + while let Some(value) = evaluator.next()? 
{ + let text = value.read_resolved()?.expect_text()?; + buffer.push_str(text); } } } @@ -601,55 +959,25 @@ impl<'top, D: Decoder> MakeStringExpansion<'top, D> { // Convert our BumpString<'bump> into a &'bump str that we can wrap in an `ExpandedValueRef` let constructed_text = buffer.into_bump_str(); - let expanded_value_ref: &'top ExpandedValueRef<'top, D> = context + let value_ref: &'top ValueRef<'top, D> = context .allocator() - .alloc_with(|| ExpandedValueRef::String(StrRef::from(constructed_text))); + .alloc_with(|| ValueRef::String(StrRef::from(constructed_text))); static EMPTY_ANNOTATIONS: &[&str] = &[]; - self.is_complete = true; - Ok(Some(ValueExpr::ValueLiteral( - LazyExpandedValue::from_constructed(context, EMPTY_ANNOTATIONS, expanded_value_ref), + Ok(MacroExpansionStep::FinalStep(Some( + ValueExpr::ValueLiteral(LazyExpandedValue::from_constructed( + context, + EMPTY_ANNOTATIONS, + value_ref, + )), ))) } - - /// Appends a string fragment to the `BumpString` being constructed. - fn append_expanded_raw_text_value( - context: EncodingContextRef<'_>, - buffer: &mut BumpString, - value: ExpandedValueRef<'_, D>, - ) -> IonResult<()> { - match value { - ExpandedValueRef::String(text) => buffer.push_str(text.as_ref()), - ExpandedValueRef::Symbol(RawSymbolRef::Text(text)) => buffer.push_str(text.as_ref()), - ExpandedValueRef::Symbol(RawSymbolRef::SymbolId(sid)) => { - let symbol = context.symbol_table.symbol_for(sid).ok_or_else(|| { - IonError::decoding_error(format!( - "found unknown symbol ID {sid} in call to `make_string`" - )) - })?; - if let Some(text) = symbol.text() { - buffer.push_str(text); - } else { - return IonResult::decoding_error(format!( - "found a symbol ID {sid} with unknown text in call to `make_string`" - )); - } - } - other => { - return IonResult::decoding_error(format!( - "found a non-text parameter to `make_string`: {:?}", - other - )) - } - } - Ok(()) - } } // ===== Implementation of template macro expansion ===== /// The evaluation state 
of a template expansion. -#[derive(Clone, Debug)] +#[derive(Copy, Clone, Debug)] pub struct TemplateExpansion<'top> { // A reference to the template definition template: TemplateMacroRef<'top>, @@ -665,42 +993,44 @@ impl<'top> TemplateExpansion<'top> { } } - fn next<'data: 'top, D: Decoder>( + pub(crate) fn next<'data: 'top, D: Decoder>( &mut self, context: EncodingContextRef<'top>, environment: Environment<'top, D>, - ) -> IonResult>> { - let value_expr = match self.template.body().expressions().get(self.step_index) { - None => return Ok(None), + ) -> IonResult> { + let expressions = self.template.body().expressions(); + let value_expr = match expressions.get(self.step_index) { + None => return Ok(MacroExpansionStep::FinalStep(None)), Some(expr) => expr, }; - self.step_index += 1; - - let step = match value_expr { - TemplateBodyValueExpr::Element(e) => { - match e.value() { - TemplateValue::List(range) - | TemplateValue::SExp(range) - | TemplateValue::Struct(range, _) => self.step_index += range.len(), - _ => {} - } + + self.step_index += value_expr.num_expressions(); + let value_expr = match value_expr.kind() { + TemplateBodyExprKind::Element(e) => { ValueExpr::ValueLiteral(LazyExpandedValue::from_template( context, environment, - TemplateElement::new(self.template, e), + TemplateElement::new(self.template.macro_ref(), e, value_expr.expr_range()), )) } - TemplateBodyValueExpr::Variable(variable) => { - environment.get_expected(variable.signature_index())? 
+ TemplateBodyExprKind::Variable(variable) => { + environment.require_expr(variable.signature_index()) } - TemplateBodyValueExpr::MacroInvocation(raw_invocation) => { - let invocation = raw_invocation.resolve(self.template, context); - self.step_index += invocation.arg_expressions().len(); + TemplateBodyExprKind::MacroInvocation(raw_invocation) => { + let invocation = raw_invocation.resolve( + context, + self.template.address(), + value_expr.expr_range(), + ); ValueExpr::MacroInvocation(invocation.into()) } }; - Ok(Some(step)) + if self.step_index >= expressions.len() { + Ok(MacroExpansionStep::FinalStep(Some(value_expr))) + } else { + Ok(MacroExpansionStep::Step(value_expr)) + } } } @@ -738,7 +1068,7 @@ mod tests { expected: &str, ) -> IonResult<()> { let mut reader = Reader::new(v1_1::Text, invocation.as_bytes())?; - let _macro_address = reader.register_template(template_definition)?; + let _macro_address = reader.register_template_src(template_definition)?; let actual = reader.read_all_elements()?; let mut expected_reader = Reader::new(v1_1::Text, expected.as_bytes())?; let expected = expected_reader.read_all_elements()?; @@ -761,6 +1091,222 @@ mod tests { ) } + mod cardinality { + + mod bang { + use crate::lazy::expanded::macro_evaluator::tests::eval_template_invocation; + + #[test] + #[should_panic] + fn required_does_not_accept_empty_rest() { + eval_template_invocation( + "(macro foo (x) (make_string x x))", + r#" + (:foo) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + + #[test] + #[should_panic] + fn required_does_not_accept_empty_arg_group() { + eval_template_invocation( + "(macro foo (x) (make_string x x))", + r#" + (:foo (:)) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + + #[test] + #[should_panic] + fn required_does_not_accept_populated_arg_group() { + eval_template_invocation( + "(macro foo (x) (make_string x x))", + r#" + (:foo (:)) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + } + + mod 
optional { + use crate::lazy::expanded::macro_evaluator::tests::eval_template_invocation; + use crate::IonResult; + + #[test] + fn optional_accepts_empty_or_expr() -> IonResult<()> { + eval_template_invocation( + "(macro foo (x?) (make_string x x))", + r#" + (:foo) // x is implicitly empty + (:foo (:)) // x is explicitly empty + (:foo (: )) // x is explicitly empty with extra whitespace + (:foo "a") // x is "a" + (:foo (:foo a)) // x is `(:foo a)` + "#, + r#" + "" + "" + "" + "aa" + "aaaa" + "#, + ) + } + + #[test] + #[should_panic] + fn optional_does_not_accept_populated_arg_groups() { + eval_template_invocation( + "(macro foo (x?) (make_string x x))", + r#" + (:foo (: "a")) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + } + + mod star { + use crate::lazy::expanded::macro_evaluator::tests::eval_template_invocation; + use crate::IonResult; + + #[test] + fn star_accepts_groups() -> IonResult<()> { + eval_template_invocation( + "(macro foo (x y*) (make_string x y))", + r#" + (:foo "hello" (: " there " "friend!" )) + "#, + r#" + "hello there friend!" + "#, + ) + } + + #[test] + fn trailing_star_accepts_rest() -> IonResult<()> { + eval_template_invocation( + "(macro foo (x y*) (make_string x y))", + r#" + // x y1 y2 + (:foo "hello" " there " "friend!") + "#, + r#" + "hello there friend!" + "#, + ) + } + + #[test] + fn star_accepts_value_literal() -> IonResult<()> { + eval_template_invocation( + "(macro foo (x y* z*) (make_string x y z))", + r#" + // x y z + (:foo "hello" " there " "friend!") + "#, + r#" + "hello there friend!" 
+ "#, + ) + } + + #[test] + fn omit_trailing_star() -> IonResult<()> { + eval_template_invocation( + "(macro foo (x y*) (make_string x y))", + r#" + (:foo "hello") // pass one arg, `y` will be an empty stream + "#, + r#" + "hello" + "#, + ) + } + + #[test] + #[should_panic] + fn omit_only_last_trailing_star() { + eval_template_invocation( + "(macro foo (x y* z*) (make_string x y))", + r#" + (:foo "hello") // pass one arg, y and z cannot both be omitted + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + } + + mod plus { + use crate::lazy::expanded::macro_evaluator::tests::eval_template_invocation; + + #[test] + #[should_panic] + fn plus_does_not_accept_empty_arg_group() { + eval_template_invocation( + "(macro foo (x+) (make_string x x))", + r#" + (:foo (:)) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + + #[test] + #[should_panic] + fn plus_does_not_accept_empty_rest() { + eval_template_invocation( + "(macro foo (x+) (make_string x x))", + r#" + (:foo) + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + } + } + + #[test] + #[should_panic] + fn too_many_args() { + eval_template_invocation( + "(macro foo (x y) (make_string x y))", + r#" + (:foo "a" "b" "c") + "#, + r#" + // should raise an error + "#, + ) + .unwrap() + } + #[test] fn it_takes_all_kinds() -> IonResult<()> { eval_template_invocation( @@ -773,11 +1319,11 @@ mod tests { 1.0 2023T "1" - (quote '1') // TODO: Only treat identifiers as variables + (literal '1') // TODO: Only treat identifiers as variables {{MQ==}} {{"1"}} [1] - (quote (1)) // Prevent the sexp from being considered a macro invocation + (literal (1)) // Prevent the sexp from being considered a macro invocation {'1':1}))"#, r#" (:foo) @@ -938,7 +1484,7 @@ mod tests { 'threadId': thread_id, 'threadName': (make_string "scheduler-thread-" thread_name), 'loggerName': "com.example.organization.product.component.ClassName", - 'logLevel': (quote INFO), + 'logLevel': (literal INFO), 'format': "Request 
status: {} Client ID: {} Client Host: {} Client Region: {} Timestamp: {}", 'parameters': [ "SUCCESS", @@ -956,7 +1502,7 @@ mod tests { 418 "6" "1" - "18b4fa" + "abc-123" (:values "region 4" "2022-12-07T20:59:59.744000Z")) @@ -973,7 +1519,7 @@ mod tests { 'parameters': [ "SUCCESS", "example-client-1", - "aws-us-east-5f-18b4fa", + "aws-us-east-5f-abc-123", "region 4", "2022-12-07T20:59:59.744000Z", ] diff --git a/src/lazy/expanded/macro_table.rs b/src/lazy/expanded/macro_table.rs index 236c3276..390997e5 100644 --- a/src/lazy/expanded/macro_table.rs +++ b/src/lazy/expanded/macro_table.rs @@ -1,59 +1,152 @@ -use std::collections::HashMap; - -use crate::lazy::expanded::template::{TemplateMacro, TemplateMacroRef}; +use crate::lazy::expanded::compiler::{ExpansionAnalysis, ExpansionSingleton}; +use crate::lazy::expanded::template::{ + MacroSignature, Parameter, ParameterCardinality, ParameterEncoding, RestSyntaxPolicy, + TemplateBody, TemplateMacro, TemplateMacroRef, +}; use crate::lazy::text::raw::v1_1::reader::{MacroAddress, MacroIdRef}; use crate::result::IonFailure; -use crate::IonResult; +use crate::{IonResult, IonType}; +use delegate::delegate; +use std::borrow::Cow; +use std::collections::HashMap; + +#[derive(Debug, Clone, PartialEq)] +pub struct Macro { + name: Option, + signature: MacroSignature, + kind: MacroKind, + // Compile-time heuristics that allow the reader to evaluate e-expressions lazily or using fewer + // resources in many cases. + // + // For the time being, e-expressions that could produce multiple values cannot be lazily evaluated. + // This is because the reader gives out lazy value handles for each value in the stream. If it knows + // in advance that an expression will produce one value, it can give out a lazy value that is + // backed by that e-expression. + // + // At the top level, e-expressions that both: + // 1. Produce a single value + // and + // 2. Will not produce a system value + // can be lazily evaluated. 
+ // + // At other levels of nesting, the single-value expansion is the only requirement for lazy + // evaluation. + expansion_analysis: ExpansionAnalysis, +} + +impl Macro { + pub fn named( + name: impl Into, + signature: MacroSignature, + kind: MacroKind, + expansion_analysis: ExpansionAnalysis, + ) -> Self { + Self::new(Some(name.into()), signature, kind, expansion_analysis) + } + + pub fn anonymous( + signature: MacroSignature, + kind: MacroKind, + expansion_analysis: ExpansionAnalysis, + ) -> Self { + Self::new(None, signature, kind, expansion_analysis) + } + + pub fn new( + name: Option, + signature: MacroSignature, + kind: MacroKind, + expansion_analysis: ExpansionAnalysis, + ) -> Self { + Self { + name, + signature, + kind, + expansion_analysis, + } + } + + pub fn name(&self) -> Option<&str> { + self.name.as_deref() + } + pub fn signature(&self) -> &MacroSignature { + &self.signature + } + pub fn kind(&self) -> &MacroKind { + &self.kind + } + + pub fn expansion_analysis(&self) -> ExpansionAnalysis { + self.expansion_analysis + } + + pub fn can_be_lazily_evaluated_at_top_level(&self) -> bool { + self.expansion_analysis() + .can_be_lazily_evaluated_at_top_level() + } + + pub fn must_produce_exactly_one_value(&self) -> bool { + self.expansion_analysis().must_produce_exactly_one_value() + } +} /// The kinds of macros supported by -/// [`MacroEvaluator`](crate::lazy::expanded::macro_evaluator::MacroEvaluator). +/// [`MacroEvaluator`](crate::MacroEvaluator) /// This list parallels -/// [`MacroExpansionKind`](crate::lazy::expanded::macro_evaluator::MacroExpansionKind), +/// [`MacroExpansionKind`](crate::MacroExpansionKind), /// but its variants do not hold any associated state. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum MacroKind { Void, Values, MakeString, - Template(TemplateMacro), + Template(TemplateBody), } -impl MacroKind { - fn name(&self) -> &str { - match self { - MacroKind::Void => "void", - MacroKind::Values => "values", - MacroKind::MakeString => "make_string", - MacroKind::Template(template) => template.name(), - } - } -} - -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub struct MacroRef<'top> { address: MacroAddress, - kind: &'top MacroKind, + reference: &'top Macro, } impl<'top> MacroRef<'top> { - pub fn new(address: MacroAddress, kind: &'top MacroKind) -> Self { - Self { address, kind } + pub fn new(address: MacroAddress, reference: &'top Macro) -> Self { + Self { address, reference } } + + pub fn require_template(self) -> TemplateMacroRef<'top> { + if let MacroKind::Template(body) = &self.kind() { + return TemplateMacroRef::new(self, body); + } + unreachable!( + "caller required a template macro but found {:?}", + self.kind() + ) + } + + pub fn id_text(&'top self) -> Cow<'top, str> { + self.name() + .map(Cow::from) + .unwrap_or_else(move || Cow::from(format!("", self.address()))) + } + pub fn address(&self) -> MacroAddress { self.address } - pub fn kind(&self) -> &'top MacroKind { - self.kind + + pub fn reference(&self) -> &'top Macro { + self.reference } - pub fn expect_template(self) -> IonResult> { - if let MacroKind::Template(template) = &self.kind { - return Ok(TemplateMacroRef::new(self.address, template)); + delegate! 
{ + to self.reference { + pub fn name(&'top self) -> Option<&'top str>; + pub fn signature(self) -> &'top MacroSignature; + pub fn kind(&self) -> &'top MacroKind; + pub fn expansion_analysis(&self) -> ExpansionAnalysis; + pub fn can_be_lazily_evaluated_at_top_level(&self) -> bool; + pub fn must_produce_exactly_one_value(&self) -> bool; } - IonResult::decoding_error(format!( - "expected a template macro but found {:?}", - self.kind - )) } } @@ -61,7 +154,7 @@ impl<'top> MacroRef<'top> { /// its validity and allowing evaluation to begin. #[derive(Debug, Clone)] pub struct MacroTable { - macros_by_address: Vec, + macros_by_address: Vec, // Maps names to an address that can be used to query the Vec above. macros_by_name: HashMap, } @@ -73,11 +166,74 @@ impl Default for MacroTable { } impl MacroTable { + pub const SYSTEM_MACRO_KINDS: &'static [MacroKind] = + &[MacroKind::Void, MacroKind::Values, MacroKind::MakeString]; + pub const NUM_SYSTEM_MACROS: usize = Self::SYSTEM_MACRO_KINDS.len(); + // When a user defines new macros, this is the first ID that will be assigned. This value + // is expected to change as development continues. It is currently used in several unit tests. + pub const FIRST_USER_MACRO_ID: usize = 3; + pub fn new() -> Self { - let macros_by_id = vec![MacroKind::Void, MacroKind::Values, MacroKind::MakeString]; + let macros_by_id = vec![ + Macro::named( + "void", + MacroSignature::new(vec![]).unwrap(), + MacroKind::Void, + ExpansionAnalysis { + could_produce_system_value: false, + must_produce_exactly_one_value: false, + // This is false because lazy evaluation requires giving out a LazyValue as a + // handle to eventually read the expression. We cannot give out a `LazyValue` + // for e-expressions that will produce 0 or 2+ values. 
+ can_be_lazily_evaluated_at_top_level: false, + expansion_singleton: None, + }, + ), + Macro::named( + "values", + MacroSignature::new(vec![Parameter::new( + "expr_group", + ParameterEncoding::Tagged, + ParameterCardinality::ZeroOrMore, + RestSyntaxPolicy::Allowed, + )]) + .unwrap(), + MacroKind::Values, + ExpansionAnalysis { + could_produce_system_value: true, + must_produce_exactly_one_value: false, + can_be_lazily_evaluated_at_top_level: false, + expansion_singleton: None, + }, + ), + Macro::named( + "make_string", + MacroSignature::new(vec![Parameter::new( + "expr_group", + ParameterEncoding::Tagged, + ParameterCardinality::ZeroOrMore, + RestSyntaxPolicy::Allowed, + )]) + .unwrap(), + MacroKind::MakeString, + ExpansionAnalysis { + could_produce_system_value: false, + must_produce_exactly_one_value: true, + can_be_lazily_evaluated_at_top_level: true, + expansion_singleton: Some(ExpansionSingleton { + is_null: false, + ion_type: IonType::String, + num_annotations: 0, + }), + }, + ), + ]; let mut macros_by_name = HashMap::default(); - for (id, kind) in macros_by_id.iter().enumerate() { - macros_by_name.insert(kind.name().to_string(), id); + for (id, mac) in macros_by_id.iter().enumerate() { + if let Some(name) = mac.name() { + macros_by_name.insert(name.to_owned(), id); + } + // Anonymous macros are not entered into the macros_by_name lookup table } Self { macros_by_address: macros_by_id, @@ -89,7 +245,12 @@ impl MacroTable { self.macros_by_address.len() } - pub fn macro_with_id(&'_ self, id: MacroIdRef<'_>) -> Option> { + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn macro_with_id<'a, 'b, I: Into>>(&'a self, id: I) -> Option> { + let id = id.into(); match id { MacroIdRef::LocalName(name) => self.macro_with_name(name), MacroIdRef::LocalAddress(address) => self.macro_at_address(address), @@ -97,28 +258,38 @@ impl MacroTable { } pub fn macro_at_address(&self, address: usize) -> Option> { - let kind = self.macros_by_address.get(address)?; - 
Some(MacroRef { address, kind }) + let reference = self.macros_by_address.get(address)?; + Some(MacroRef { address, reference }) } pub fn address_for_name(&self, name: &str) -> Option { self.macros_by_name.get(name).copied() } - pub fn macro_with_name(&self, name: &str) -> Option> { + pub fn macro_with_name<'a>(&'a self, name: &str) -> Option> { let address = *self.macros_by_name.get(name)?; - let kind = self.macros_by_address.get(address)?; - Some(MacroRef { address, kind }) + let reference = self.macros_by_address.get(address)?; + Some(MacroRef { address, reference }) } pub fn add_macro(&mut self, template: TemplateMacro) -> IonResult { - let name = template.name(); - if self.macros_by_name.contains_key(name) { - return IonResult::decoding_error(format!("macro named '{name}' already exists")); - } let id = self.macros_by_address.len(); - self.macros_by_name.insert(name.to_owned(), id); - self.macros_by_address.push(MacroKind::Template(template)); + // If the macro has a name, make sure that name is not already in use and then add it. + if let Some(name) = template.name.as_deref() { + if self.macros_by_name.contains_key(name) { + return IonResult::decoding_error(format!("macro named '{name}' already exists")); + } + self.macros_by_name.insert(name.to_owned(), id); + } + + let new_macro = Macro::new( + template.name, + template.signature, + MacroKind::Template(template.body), + template.expansion_analysis, + ); + + self.macros_by_address.push(new_macro); Ok(id) } } diff --git a/src/lazy/expanded/mod.rs b/src/lazy/expanded/mod.rs index ad98a22f..543f08e4 100644 --- a/src/lazy/expanded/mod.rs +++ b/src/lazy/expanded/mod.rs @@ -33,40 +33,44 @@ //! that are ignored by the reader do not incur the cost of symbol table resolution. 
use std::cell::{Cell, UnsafeCell}; +use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::iter::empty; -use std::ops::Deref; +use std::ops::{Deref, Range}; use bumpalo::Bump as BumpAllocator; use sequence::{LazyExpandedList, LazyExpandedSExp}; use crate::element::iterators::SymbolsIterator; -use crate::lazy::any_encoding::IonEncoding; +use crate::lazy::any_encoding::{IonEncoding, IonVersion}; use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::{Decoder, LazyRawValue}; use crate::lazy::encoding::RawValueLiteral; use crate::lazy::expanded::compiler::TemplateCompiler; -use crate::lazy::expanded::macro_evaluator::{MacroEvaluator, RawEExpression}; -use crate::lazy::expanded::macro_table::MacroTable; +use crate::lazy::expanded::e_expression::EExpression; +use crate::lazy::expanded::encoding_module::EncodingModule; +use crate::lazy::expanded::macro_evaluator::{ + MacroEvaluator, MacroExpansion, MacroExpr, RawEExpression, +}; +use crate::lazy::expanded::macro_table::{Macro, MacroTable}; use crate::lazy::expanded::r#struct::LazyExpandedStruct; use crate::lazy::expanded::sequence::Environment; -use crate::lazy::expanded::template::{ - TemplateElement, TemplateMacro, TemplateMacroRef, TemplateValue, -}; +use crate::lazy::expanded::template::{TemplateElement, TemplateMacro, TemplateValue}; use crate::lazy::r#struct::LazyStruct; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::sequence::{LazyList, LazySExp}; use crate::lazy::str_ref::StrRef; use crate::lazy::streaming_raw_reader::{IonInput, StreamingRawReader}; -use crate::lazy::system_reader::{PendingLst, SystemReader}; +use crate::lazy::system_reader::{PendingContextChanges, SystemReader}; use crate::lazy::system_stream_item::SystemStreamItem; use crate::lazy::text::raw::v1_1::reader::MacroAddress; use crate::lazy::value::LazyValue; use crate::raw_symbol_ref::AsRawSymbolRef; use crate::result::IonFailure; use crate::{ - AnyEncoding, Catalog, Decimal, Int, IonResult, IonType, 
RawSymbolRef, SymbolTable, Timestamp, + Catalog, Decimal, HasRange, HasSpan, Int, IonResult, IonType, RawSymbolRef, RawVersionMarker, + Span, SymbolTable, Timestamp, ValueRef, }; // All of these modules (and most of their types) are currently `pub` as the lazy reader is gated @@ -74,6 +78,7 @@ use crate::{ // stabilizes. pub mod compiler; pub mod e_expression; +pub mod encoding_module; pub mod macro_evaluator; pub mod macro_table; pub mod sequence; @@ -81,9 +86,6 @@ pub mod r#struct; pub mod template; /// A collection of resources that can be used to encode or decode Ion values. -/// The `'top` lifetime associated with the [`EncodingContextRef`] reflects the fact that it can only -/// be used as long as the reader is positioned on the same top level expression (i.e. the symbol and -/// macro tables are guaranteed not to change). // It should be possible to loosen this definition of `'top` to include several top level values // as long as the macro and symbol tables do not change between them, though this would require // carefully designing the API to emphasize that the sequence of values is either the set that @@ -92,6 +94,7 @@ pub mod template; // would need to be proved out first. 
#[derive(Debug)] pub struct EncodingContext { + pub(crate) modules: HashMap, pub(crate) macro_table: MacroTable, pub(crate) symbol_table: SymbolTable, pub(crate) allocator: BumpAllocator, @@ -104,19 +107,48 @@ impl EncodingContext { allocator: BumpAllocator, ) -> Self { Self { + modules: HashMap::new(), macro_table, symbol_table, allocator, } } + pub fn for_ion_version(version: IonVersion) -> Self { + Self::new( + MacroTable::new(), + SymbolTable::new(version), + BumpAllocator::new(), + ) + } + pub fn empty() -> Self { - Self::new(MacroTable::new(), SymbolTable::new(), BumpAllocator::new()) + Self::new( + MacroTable::new(), + SymbolTable::new(IonVersion::default()), + BumpAllocator::new(), + ) } pub fn get_ref(&self) -> EncodingContextRef { EncodingContextRef { context: self } } + + pub fn macro_table(&self) -> &MacroTable { + &self.macro_table + } + + pub fn macro_table_mut(&mut self) -> &mut MacroTable { + &mut self.macro_table + } + + pub fn symbol_table(&self) -> &SymbolTable { + &self.symbol_table + } + + pub fn allocator(&self) -> &BumpAllocator { + &self.allocator + } } #[derive(Debug, Copy, Clone)] @@ -230,7 +262,7 @@ pub struct ExpandingReader { // // Holds information found in symbol tables and encoding directives (TODO) that can be applied // to the encoding context the next time the reader is between top-level expressions. - pending_lst: UnsafeCell, + pending_context_changes: UnsafeCell, encoding_context: UnsafeCell, catalog: Box, } @@ -244,15 +276,19 @@ impl ExpandingReader { raw_reader: raw_reader.into(), evaluator_ptr: None.into(), encoding_context: EncodingContext::empty().into(), - pending_lst: PendingLst::new().into(), + pending_context_changes: PendingContextChanges::new().into(), catalog, } } // TODO: This method is temporary. It will be removed when the ability to read 1.1 encoding // directives from the input stream is available. Until then, template creation is manual. 
- pub fn register_template(&mut self, template_definition: &str) -> IonResult { + pub fn register_template_src(&mut self, template_definition: &str) -> IonResult { let template_macro: TemplateMacro = self.compile_template(template_definition)?; + self.register_template(template_macro) + } + + pub fn register_template(&mut self, template_macro: TemplateMacro) -> IonResult { self.add_macro(template_macro) } @@ -288,24 +324,26 @@ impl ExpandingReader { context.allocator.reset(); } - pub fn pending_lst(&self) -> &PendingLst { + pub fn pending_context_changes(&self) -> &PendingContextChanges { // If the user is able to call this method, the PendingLst is not being modified and it's // safe to immutably reference. - unsafe { &*self.pending_lst.get() } + unsafe { &*self.pending_context_changes.get() } } - pub fn pending_lst_mut(&mut self) -> &mut PendingLst { + pub fn pending_lst_mut(&mut self) -> &mut PendingContextChanges { // SAFETY: If the caller has a `&mut` reference to `self`, it is the only mutable reference // that can modify `self.pending_lst`. - unsafe { &mut *self.pending_lst.get() } + unsafe { &mut *self.pending_context_changes.get() } } + #[inline] fn ptr_to_mut_ref<'a, T>(ptr: *mut ()) -> &'a mut T { let typed_ptr: *mut T = ptr.cast(); unsafe { &mut *typed_ptr } } /// Dereferences a raw pointer storing the address of the active MacroEvaluator. + #[inline] fn ptr_to_evaluator<'top>(evaluator_ptr: *mut ()) -> &'top mut MacroEvaluator<'top, Encoding> { Self::ptr_to_mut_ref(evaluator_ptr) } @@ -321,49 +359,108 @@ impl ExpandingReader { Self::ref_as_ptr(evaluator) } - /// Updates the encoding context with the information stored in the `PendingLst`. - // TODO: This only works on Ion 1.0 symbol tables for now, hence the name `PendingLst` - fn apply_pending_lst(pending_lst: &mut PendingLst, symbol_table: &mut SymbolTable) { + /// Updates the encoding context with the information stored in the `PendingContextChanges`. 
+ fn apply_pending_context_changes( + pending_changes: &mut PendingContextChanges, + symbol_table: &mut SymbolTable, + macro_table: &mut MacroTable, + ) { + if let Some(new_version) = pending_changes.switch_to_version.take() { + symbol_table.reset_to_version(new_version); + pending_changes.has_changes = false; + // If we're switching to a new version, the last stream item was a version marker + // and there are no other pending changes. The `take()` above clears the `switch_to_version`. + return; + } + + if let Some(mut module) = pending_changes.take_new_active_module() { + std::mem::swap(symbol_table, module.symbol_table_mut()); + std::mem::swap(macro_table, module.macro_table_mut()); + return; + } + // If the symbol table's `imports` field had a value of `$ion_symbol_table`, then we're // appending the symbols it defined to the end of our existing local symbol table. // Otherwise, we need to clear the existing table before appending the new symbols. - if !pending_lst.is_lst_append { + if !pending_changes.is_lst_append { // We're setting the symbols list, not appending to it. symbol_table.reset(); } // `drain()` empties the pending `imported_symbols` and `symbols` lists - for symbol in pending_lst.imported_symbols.drain(..) { + for symbol in pending_changes.imported_symbols.drain(..) { symbol_table.add_symbol(symbol); } - for symbol in pending_lst.symbols.drain(..) { + for symbol in pending_changes.symbols.drain(..) 
{ symbol_table.add_symbol(symbol); } - pending_lst.is_lst_append = false; - pending_lst.has_changes = false; + pending_changes.is_lst_append = false; + pending_changes.has_changes = false; + } + + #[inline] + fn interpret_value<'top>( + &self, + value: LazyExpandedValue<'top, Encoding>, + ) -> IonResult> { + if value.has_annotations() && matches!(value.ion_type(), IonType::Struct | IonType::SExp) { + self.fully_interpret_value(value) + } else { + Ok(SystemStreamItem::Value(LazyValue::new(value))) + } } /// Inspects a `LazyExpandedValue` to determine whether it is a symbol table or an /// application-level value. Returns it as the appropriate variant of `SystemStreamItem`. - fn interpret_value<'top>( + fn fully_interpret_value<'top>( &self, value: LazyExpandedValue<'top, Encoding>, ) -> IonResult> { // If this value is a symbol table... if SystemReader::<_, Input>::is_symbol_table_struct(&value)? { // ...traverse it and record any new symbols in our `pending_lst`. - let pending_lst = unsafe { &mut *self.pending_lst.get() }; - SystemReader::<_, Input>::process_symbol_table(pending_lst, &*self.catalog, &value)?; - pending_lst.has_changes = true; + let pending_changes = unsafe { &mut *self.pending_context_changes.get() }; + SystemReader::<_, Input>::process_symbol_table( + pending_changes, + &*self.catalog, + &value, + )?; + pending_changes.has_changes = true; let lazy_struct = LazyStruct { expanded_struct: value.read()?.expect_struct().unwrap(), }; return Ok(SystemStreamItem::SymbolTable(lazy_struct)); + } else if self.detected_encoding().version() == IonVersion::v1_1 + && SystemReader::<_, Input>::is_encoding_directive_sexp(&value)? 
+ { + let pending_changes = unsafe { &mut *self.pending_context_changes.get() }; + SystemReader::<_, Input>::process_encoding_directive(pending_changes, value)?; + pending_changes.has_changes = true; + let lazy_sexp = LazySExp { + expanded_sexp: value.read()?.expect_sexp().unwrap(), + }; + return Ok(SystemStreamItem::EncodingDirective(lazy_sexp)); } // Otherwise, it's an application value. let lazy_value = LazyValue::new(value); return Ok(SystemStreamItem::Value(lazy_value)); } + fn interpret_ivm<'top>( + &self, + marker: ::VersionMarker<'top>, + ) -> IonResult> { + let new_version = marker.stream_version_after_marker()?; + // If this is the first item in the stream or we're changing versions, we need to ensure + // the encoding context is set up for this version. + if marker.range().start == 0 || new_version != marker.stream_version_before_marker() { + // SAFETY: Version markers do not hold a reference to the symbol table. + let pending_changes = unsafe { &mut *self.pending_context_changes.get() }; + pending_changes.switch_to_version = Some(new_version); + pending_changes.has_changes = true; + } + Ok(SystemStreamItem::VersionMarker(marker)) + } + /// This method is invoked just before the reader begins reading the next top-level expression /// from the data stream. It is NOT invoked between multiple top level _values_ coming from a /// single expression. @@ -382,13 +479,16 @@ impl ExpandingReader { // If the pending LST has changes to apply, do so. // SAFETY: Nothing else holds a reference to the `PendingLst`'s contents, so we can use the // `UnsafeCell` to get a mutable reference to it. - let pending_lst: &mut PendingLst = unsafe { &mut *self.pending_lst.get() }; + let pending_lst: &mut PendingContextChanges = + unsafe { &mut *self.pending_context_changes.get() }; if pending_lst.has_changes { // SAFETY: Nothing else holds a reference to the `EncodingContext`'s contents, so we can use the // `UnsafeCell` to get a mutable reference to its symbol table. 
let symbol_table: &mut SymbolTable = &mut unsafe { &mut *self.encoding_context.get() }.symbol_table; - Self::apply_pending_lst(pending_lst, symbol_table); + let macro_table: &mut MacroTable = + &mut unsafe { &mut *self.encoding_context.get() }.macro_table; + Self::apply_pending_context_changes(pending_lst, symbol_table, macro_table); } } @@ -397,21 +497,23 @@ impl ExpandingReader { /// This method will consume and process as many system-level values as possible until it /// encounters an application-level value or the end of the stream. pub fn next_value(&mut self) -> IonResult>> { + use SystemStreamItem::*; loop { - match self.next_item()? { - SystemStreamItem::VersionMarker(_marker) => { - // TODO: Handle version changes 1.0 <-> 1.1 - } - SystemStreamItem::SymbolTable(_) => { - // The symbol table is processed by `next_item` before it is returned. There's - // nothing to be done here. - } - SystemStreamItem::Value(value) => return Ok(Some(value)), - SystemStreamItem::EndOfStream(_) => return Ok(None), - } + match self.next_item() { + Ok(Value(value)) => return Ok(Some(value)), + Ok(EndOfStream(_)) => return Ok(None), + Ok(_) => {} + Err(e) => return Err(e), + }; } } + pub fn detected_encoding(&self) -> IonEncoding { + // SAFETY: We have an immutable reference to `self`, so it's legal for us to have an immutable + // reference to one of its fields. + unsafe { &*self.raw_reader.get() }.encoding() + } + /// Returns the next [`SystemStreamItem`] either by continuing to evaluate a macro invocation /// in progress or by pulling another expression from the input stream. pub fn next_item(&self) -> IonResult> { @@ -422,8 +524,21 @@ impl ExpandingReader { // If there's already an active macro evaluator, that means the reader is still in the process // of expanding a macro invocation it previously encountered. See if it has a value to give us. - if let Some(stream_item) = self.next_from_evaluator()? 
{ - return Ok(stream_item); + if let Some(ptr) = self.evaluator_ptr.get() { + // If there's already an evaluator, dereference the pointer. + let evaluator = Self::ptr_to_evaluator(ptr); + match evaluator.next() { + Ok(Some(value)) => { + if evaluator.is_empty() { + // If the evaluator is empty, unset the pointer so we know not to query it + // further. + self.evaluator_ptr.set(None); + } + return self.interpret_value(value); + } + Ok(None) => {} + Err(e) => return Err(e), + } } // Otherwise, we're now between top level expressions. Take this opportunity to apply any @@ -441,37 +556,55 @@ impl ExpandingReader { use crate::lazy::raw_stream_item::RawStreamItem::*; let raw_reader = unsafe { &mut *self.raw_reader.get() }; match raw_reader.next(context_ref)? { - VersionMarker(marker) => return Ok(SystemStreamItem::VersionMarker(marker)), + VersionMarker(marker) => { + return self.interpret_ivm(marker); + } // We got our value; return it. Value(raw_value) => { let value = LazyExpandedValue::from_literal(context_ref, raw_value); return self.interpret_value(value); } // It's another macro invocation, we'll start evaluating it. - EExpression(e_exp) => { + EExp(e_exp) => { let context = self.context(); - let resolved_e_exp = e_exp.resolve(context_ref)?; + let resolved_e_exp = match e_exp.resolve(context_ref) { + Ok(resolved) => resolved, + Err(e) => return Err(e), + }; + + // If this e-expression invokes a template with a non-system, singleton expansion, we can use the + // e-expression to back a LazyExpandedValue. It will only be evaluated if the user calls `read()`. + if let Some(value) = LazyExpandedValue::try_from_e_expression(resolved_e_exp) { + // Because the expansion is guaranteed not to be a system value, we do not need to interpret it. 
+ return Ok(SystemStreamItem::Value(LazyValue::new(value))); + } + let new_evaluator = MacroEvaluator::for_eexp(resolved_e_exp)?; // Get the current evaluator or make a new one let evaluator = match self.evaluator_ptr.get() { - // If there's already an evaluator, dereference the pointer. - Some(ptr) => Self::ptr_to_evaluator(ptr), - // If there's not, make a new one. - None => context - .allocator - // E-expressions always have an empty environment - .alloc_with(move || { - MacroEvaluator::new(context_ref, Environment::empty()) - }), + // If there's already an evaluator in the bump, it's empty. Overwrite it with our new one. + Some(ptr) => { + let bump_evaluator_ref = Self::ptr_to_evaluator(ptr); + *bump_evaluator_ref = new_evaluator; + bump_evaluator_ref + } + // If there's not an evaluator in the bump, make a new one. + None => context.allocator.alloc_with(|| new_evaluator), }; - // Push the invocation onto the evaluation stack. - evaluator.push(resolved_e_exp)?; - self.evaluator_ptr - .set(Some(Self::evaluator_to_ptr(evaluator))); // Try to get a value by starting to evaluate the e-expression. - if let Some(value) = self.next_from_evaluator()? { + let next_value = match evaluator.next() { + Ok(value) => value, + Err(e) => return Err(e), + }; + if let Some(value) = next_value { + // If we get a value and the evaluator isn't empty yet, save its pointer + // so we can try to get more out of it when `next_at_or_above_depth` is called again. + if !evaluator.is_empty() { + self.evaluator_ptr + .set(Some(Self::evaluator_to_ptr(evaluator))); + } // If we get a value, return it. - return Ok(value); + return self.interpret_value(value); } else { // If the expression was equivalent to `(:void)`, return to the top of // the loop and get the next expression. @@ -484,40 +617,6 @@ impl ExpandingReader { }; } } - - /// If there is not an evaluation in process, returns `Ok(None)`. - /// If there is an evaluation in process but it does not yield another value, returns `Ok(None)`. 
- /// If there is an evaluation in process and it yields another value, returns `Ok(Some(value))`. - /// Otherwise, returns `Err`. - fn next_from_evaluator(&self) -> IonResult>> { - let evaluator_ptr = match self.evaluator_ptr.get() { - // There's not currently an evaluator. - None => return Ok(None), - // There's an evaluator in the process of expanding a macro. - Some(ptr) => ptr, - }; - let evaluator = Self::ptr_to_evaluator(evaluator_ptr); - - match evaluator.next() { - Ok(Some(value)) => { - // See if this value was a symbol table that needs interpretation. - self.interpret_value(value).map(Some) - } - Ok(None) => { - // While the evaluator had macros in its stack, they did not produce any more - // values. The stack is now empty. - Ok(None) - } - Err(e) => Err(e), - } - } -} - -impl ExpandingReader { - pub fn detected_encoding(&self) -> IonEncoding { - let raw_reader = unsafe { &*self.raw_reader.get() }; - raw_reader.encoding() - } } /// The source of data backing a [`LazyExpandedValue`]. @@ -525,6 +624,8 @@ impl ExpandingReader { pub enum ExpandedValueSource<'top, D: Decoder> { /// This value was a literal in the input stream. ValueLiteral(D::Value<'top>), + /// This value is backed by an e-expression invoking a macro known to produce a single value. + EExp(EExpression<'top, D>), /// This value was part of a template definition. Template(Environment<'top, D>, TemplateElement<'top>), /// This value was the computed result of a macro invocation like `(:make_string `...)`. @@ -533,14 +634,15 @@ pub enum ExpandedValueSource<'top, D: Decoder> { // it to `Never` and the compiler can eliminate this code path where applicable. // Constructed data stored in the bump allocator. Holding references instead of the data // itself allows this type (and those that contain it) to impl `Copy`. 
- &'top [&'top str], // Annotations (if any) - &'top ExpandedValueRef<'top, D>, // Value + &'top [&'top str], // Annotations (if any) + &'top ValueRef<'top, D>, // Value ), } impl<'top, Encoding: Decoder> Debug for ExpandedValueSource<'top, Encoding> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match &self { + ExpandedValueSource::EExp(eexp) => write!(f, "{eexp:?}"), ExpandedValueSource::ValueLiteral(v) => write!(f, "{v:?}"), ExpandedValueSource::Template(_, template_element) => { write!(f, "{:?}", template_element.value()) @@ -563,24 +665,24 @@ impl<'top, V: RawValueLiteral, Encoding: Decoder = V>> From /// A variable found in the body of a template macro. #[derive(Debug, Copy, Clone)] pub struct TemplateVariableReference<'top> { - template: TemplateMacroRef<'top>, + macro_ref: &'top Macro, signature_index: u16, } impl<'top> TemplateVariableReference<'top> { - pub fn new(template: TemplateMacroRef<'top>, signature_index: u16) -> Self { + pub fn new(macro_ref: &'top Macro, signature_index: u16) -> Self { Self { - template, + macro_ref, signature_index, } } - fn name(&self) -> &'top str { - self.template.signature.parameters()[self.signature_index()].name() + fn name(&self) -> &str { + self.macro_ref.signature().parameters()[self.signature_index()].name() } - fn host_template(&self) -> TemplateMacroRef<'top> { - self.template + fn host_macro(&self) -> &'top Macro { + self.macro_ref } fn signature_index(&self) -> usize { @@ -605,6 +707,23 @@ impl<'top, Encoding: Decoder> Debug for LazyExpandedValue<'top, Encoding> { } impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { + // If the provided e-expression can be resolved to a template macro that is eligible to back + // a lazy value without first being evaluated, returns `Some(lazy_expanded_value)`. + // To be eligible, the body of the template macro must be an Ion value literal that is not + // a system value. 
+ pub(crate) fn try_from_e_expression(eexp: EExpression<'top, Encoding>) -> Option { + let analysis = eexp.expansion_analysis(); + if !analysis.can_be_lazily_evaluated_at_top_level() { + return None; + } + + Some(Self { + context: eexp.context, + source: ExpandedValueSource::EExp(eexp), + variable: None, + }) + } + pub(crate) fn from_literal( context: EncodingContextRef<'top>, value: Encoding::Value<'top>, @@ -631,7 +750,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { pub(crate) fn from_constructed( context: EncodingContextRef<'top>, annotations: &'top [&'top str], - value: &'top ExpandedValueRef<'top, Encoding>, + value: &'top ValueRef<'top, Encoding>, ) -> Self { Self { context, @@ -651,6 +770,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { ValueLiteral(value) => value.ion_type(), Template(_, element) => element.value().ion_type(), Constructed(_annotations, value) => value.ion_type(), + EExp(eexp) => eexp.require_expansion_singleton().ion_type(), } } @@ -660,8 +780,9 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { ValueLiteral(value) => value.is_null(), Template(_, element) => element.value().is_null(), Constructed(_, value) => { - matches!(value, ExpandedValueRef::Null(_)) + matches!(value, ValueRef::Null(_)) } + EExp(eexp) => eexp.require_expansion_singleton().is_null(), } } @@ -671,6 +792,7 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { ValueLiteral(value) => value.has_annotations(), Template(_, element) => !element.annotations().is_empty(), Constructed(annotations, _) => !annotations.is_empty(), + EExp(eexp) => eexp.require_expansion_singleton().has_annotations(), } } @@ -690,9 +812,21 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { empty(), ))) } + EExp(eexp) => { + let annotations_range = 0..eexp.require_expansion_singleton().num_annotations(); + let annotations = &eexp + .invoked_macro + .require_template() + .body() + 
.annotations_storage()[annotations_range]; + ExpandedAnnotationsIterator::new(ExpandedAnnotationsSource::Template( + SymbolsIterator::new(annotations), + )) + } } } + #[inline] pub fn read(&self) -> IonResult> { use ExpandedValueSource::*; match &self.source { @@ -702,10 +836,37 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { *environment, element, )), - Constructed(_annotations, value) => Ok((*value).clone()), + Constructed(_annotations, value) => Ok((**value).as_expanded()), + EExp(ref eexp) => eexp.expand_to_single_value()?.read(), } } + #[inline(always)] + pub fn read_resolved(&self) -> IonResult> { + use ExpandedValueSource::*; + match &self.source { + ValueLiteral(value) => value.read_resolved(self.context), + Template(environment, element) => { + Ok(ValueRef::from_template(self.context, *environment, element)) + } + Constructed(_annotations, value) => Ok(**value), + EExp(ref eexp) => self.read_resolved_singleton_eexp(eexp), + } + } + + #[inline(never)] + fn read_resolved_singleton_eexp( + &self, + eexp: &EExpression<'top, Encoding>, + ) -> IonResult> { + let new_expansion = MacroExpansion::initialize( + // The parent environment of an e-expression is always empty. 
+ Environment::empty(), + MacroExpr::from_eexp(*eexp), + )?; + new_expansion.expand_singleton()?.read_resolved() + } + pub fn context(&self) -> EncodingContextRef<'top> { self.context } @@ -713,6 +874,27 @@ impl<'top, Encoding: Decoder> LazyExpandedValue<'top, Encoding> { pub fn source(&self) -> ExpandedValueSource<'top, Encoding> { self.source } + + pub fn expect_value_literal(&self) -> IonResult> { + if let ExpandedValueSource::ValueLiteral(literal) = self.source { + return Ok(literal); + } + IonResult::decoding_error("expected LazyExpandedValue to be a literal") + } + + pub fn range(&self) -> Option> { + if let ExpandedValueSource::ValueLiteral(value) = &self.source { + return Some(value.range()); + } + None + } + + pub fn span(&self) -> Option> { + if let ExpandedValueSource::ValueLiteral(value) = &self.source { + return Some(value.span()); + } + None + } } impl<'top, Encoding: Decoder> From> @@ -775,12 +957,7 @@ impl<'top, Encoding: Decoder> Iterator for ExpandedAnnotationsIterator<'top, Enc } } -// TODO: This type does not implement `Copy` because some of its variants can own heap resources. -// (Specifically: Int, Decimal, String, Symbol, Blob, Clob.) If we plumb the bump allocator all -// the way down to the raw readers, then the situations that require allocation can -// hold a 'top reference to a bump allocation instead of a static reference to a heap allocation. -// This will enable us to remove several calls to `clone()`, which can be much slower than copies. 
-#[derive(Clone)] +#[derive(Copy, Clone)] pub enum ExpandedValueRef<'top, Encoding: Decoder> { Null(IonType), Bool(bool), @@ -1001,26 +1178,20 @@ impl<'top, Encoding: Decoder> ExpandedValueRef<'top, Encoding> { Symbol(s) => ExpandedValueRef::Symbol(s.as_raw_symbol_token_ref()), Blob(b) => ExpandedValueRef::Blob(BytesRef::from(b.as_ref())), Clob(c) => ExpandedValueRef::Clob(BytesRef::from(c.as_ref())), - List(s) => ExpandedValueRef::List(LazyExpandedList::from_template( + List => ExpandedValueRef::List(LazyExpandedList::from_template( context, environment, - element.template(), - element.annotations_range(), - *s, + *element, )), - SExp(s) => ExpandedValueRef::SExp(LazyExpandedSExp::from_template( + SExp => ExpandedValueRef::SExp(LazyExpandedSExp::from_template( context, environment, - element.template(), - element.annotations_range(), - *s, + *element, )), - Struct(s, index) => ExpandedValueRef::Struct(LazyExpandedStruct::from_template( + Struct(index) => ExpandedValueRef::Struct(LazyExpandedStruct::from_template( context, environment, - element.template(), - element.annotations_range(), - *s, + element, index, )), } diff --git a/src/lazy/expanded/sequence.rs b/src/lazy/expanded/sequence.rs index 38b7f810..187562a9 100644 --- a/src/lazy/expanded/sequence.rs +++ b/src/lazy/expanded/sequence.rs @@ -1,16 +1,11 @@ -use bumpalo::collections::Vec as BumpVec; - use crate::element::iterators::SymbolsIterator; use crate::lazy::decoder::{Decoder, LazyRawSequence, LazyRawValueExpr, RawValueExpr}; use crate::lazy::expanded::macro_evaluator::{MacroEvaluator, RawEExpression, ValueExpr}; -use crate::lazy::expanded::template::{ - AnnotationsRange, ExprRange, TemplateMacroRef, TemplateSequenceIterator, -}; +use crate::lazy::expanded::template::{TemplateElement, TemplateSequenceIterator}; use crate::lazy::expanded::{ EncodingContextRef, ExpandedAnnotationsIterator, ExpandedAnnotationsSource, LazyExpandedValue, }; -use crate::result::IonFailure; -use crate::{IonError, IonResult, 
IonType}; +use crate::{try_or_some_err, IonResult, IonType}; /// A sequence of not-yet-evaluated expressions passed as arguments to a macro invocation. /// @@ -25,7 +20,7 @@ use crate::{IonError, IonResult, IonType}; /// ```ion_1_1 /// (:foo 1 2 (:values 3)) /// ``` -/// The `Environment` would contain the expressions `1`, `2` and `3`, corresponding to parameters +/// The `Environment` would contain the expressions `1`, `2` and `(:values 3)`, corresponding to parameters /// `x`, `y`, and `z` respectively. #[derive(Copy, Clone, Debug)] pub struct Environment<'top, D: Decoder> { @@ -33,35 +28,50 @@ pub struct Environment<'top, D: Decoder> { } impl<'top, D: Decoder> Environment<'top, D> { - pub(crate) fn new(args: BumpVec<'top, ValueExpr<'top, D>>) -> Self { - Environment { - expressions: args.into_bump_slice(), - } + pub(crate) fn new(args: &'top [ValueExpr<'top, D>]) -> Self { + Environment { expressions: args } } /// Returns the expression for the corresponding signature index -- the variable's offset within - /// the template's signature. If the requested index is out of bounds, returns `Err`. - pub fn get_expected(&self, signature_index: usize) -> IonResult> { - self.expressions() - .get(signature_index) - .copied() - // The TemplateCompiler should detect any invalid variable references prior to evaluation - .ok_or_else(|| { - IonError::decoding_error(format!( - "reference to variable with signature index {} not valid", - signature_index - )) - }) + /// the template's signature. + /// + /// Panics if the requested index is out of bounds, as the rules of the template compiler + /// should make that impossible. + #[inline] + pub fn require_expr(&self, signature_index: usize) -> ValueExpr<'top, D> { + if let Some(expr) = self.expressions().get(signature_index).copied() { + return expr; + } + unreachable!("found a macro signature index reference that was out of bounds") } /// Returns an empty environment without performing any allocations. 
This is used for evaluating /// e-expressions, which never have named parameters. - pub fn empty() -> Environment<'top, D> { + pub const fn empty() -> Environment<'top, D> { Environment { expressions: &[] } } pub fn expressions(&self) -> &'top [ValueExpr<'top, D>] { self.expressions } + pub fn for_eexp(context: EncodingContextRef<'top>, eexp: D::EExp<'top>) -> IonResult { + use bumpalo::collections::Vec as BumpVec; + let allocator = context.allocator(); + let raw_args = eexp.raw_arguments(); + let capacity_hint = raw_args.size_hint().0; + let mut env_exprs = BumpVec::with_capacity_in(capacity_hint, allocator); + // Populate the environment by parsing the arguments from input + for expr in raw_args { + env_exprs.push(expr?.resolve(context)?); + } + + Ok(Environment::new(env_exprs.into_bump_slice())) + } +} + +impl<'top, D: Decoder> Default for Environment<'top, D> { + fn default() -> Self { + Self::empty() + } } /// The data source for a [`LazyExpandedList`]. @@ -70,12 +80,7 @@ pub enum ExpandedListSource<'top, D: Decoder> { /// The list was a value literal in the input stream. ValueLiteral(D::List<'top>), /// The list was part of a template definition. 
- Template( - Environment<'top, D>, - TemplateMacroRef<'top>, - AnnotationsRange, - ExprRange, - ), + Template(Environment<'top, D>, TemplateElement<'top>), // TODO: Constructed } @@ -99,12 +104,9 @@ impl<'top, D: Decoder> LazyExpandedList<'top, D> { pub fn from_template( context: EncodingContextRef<'top>, environment: Environment<'top, D>, - template: TemplateMacroRef<'top>, - annotations_range: AnnotationsRange, - step_range: ExprRange, + element: TemplateElement<'top>, ) -> LazyExpandedList<'top, D> { - let source = - ExpandedListSource::Template(environment, template, annotations_range, step_range); + let source = ExpandedListSource::Template(environment, element); Self { source, context } } @@ -121,12 +123,8 @@ impl<'top, D: Decoder> LazyExpandedList<'top, D> { ExpandedListSource::ValueLiteral(value) => ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::ValueLiteral(value.annotations()), }, - ExpandedListSource::Template(_environment, template, annotations, _sequence) => { - let annotations = template - .body - .annotations_storage() - .get(annotations.ops_range()) - .unwrap(); + ExpandedListSource::Template(_environment, element) => { + let annotations = element.annotations(); ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::Template(SymbolsIterator::new(annotations)), } @@ -137,17 +135,16 @@ impl<'top, D: Decoder> LazyExpandedList<'top, D> { pub fn iter(&self) -> ExpandedListIterator<'top, D> { let source = match &self.source { ExpandedListSource::ValueLiteral(list) => { - let evaluator = MacroEvaluator::new(self.context, Environment::empty()); - ExpandedListIteratorSource::ValueLiteral(evaluator, list.iter()) + ExpandedListIteratorSource::ValueLiteral(MacroEvaluator::new(), list.iter()) } - ExpandedListSource::Template(environment, template, _annotations, steps) => { - let steps = template.body.expressions().get(steps.ops_range()).unwrap(); - let evaluator = MacroEvaluator::new(self.context, *environment); + 
ExpandedListSource::Template(environment, element) => { + let nested_expressions = element.nested_expressions(); + let evaluator = MacroEvaluator::new_with_environment(*environment); ExpandedListIteratorSource::Template(TemplateSequenceIterator::new( self.context, evaluator, - *template, - steps, + element.template(), + nested_expressions, )) } }; @@ -195,12 +192,7 @@ pub enum ExpandedSExpSource<'top, D: Decoder> { /// The SExp was a value literal in the input stream. ValueLiteral(D::SExp<'top>), /// The SExp was part of a template definition. - Template( - Environment<'top, D>, - TemplateMacroRef<'top>, - AnnotationsRange, - ExprRange, - ), + Template(Environment<'top, D>, TemplateElement<'top>), } /// An s-expression that may have come from either a value literal in the input stream or from @@ -225,12 +217,8 @@ impl<'top, D: Decoder> LazyExpandedSExp<'top, D> { ExpandedSExpSource::ValueLiteral(value) => ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::ValueLiteral(value.annotations()), }, - ExpandedSExpSource::Template(_environment, template, annotations, _sequence) => { - let annotations = template - .body - .annotations_storage() - .get(annotations.ops_range()) - .unwrap(); + ExpandedSExpSource::Template(_environment, element) => { + let annotations = element.annotations(); ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::Template(SymbolsIterator::new(annotations)), } @@ -241,17 +229,16 @@ impl<'top, D: Decoder> LazyExpandedSExp<'top, D> { pub fn iter(&self) -> ExpandedSExpIterator<'top, D> { let source = match &self.source { ExpandedSExpSource::ValueLiteral(sexp) => { - let evaluator = MacroEvaluator::new(self.context, Environment::empty()); - ExpandedSExpIteratorSource::ValueLiteral(evaluator, sexp.iter()) + ExpandedSExpIteratorSource::ValueLiteral(MacroEvaluator::new(), sexp.iter()) } - ExpandedSExpSource::Template(environment, template, _annotations, steps) => { - let steps = 
template.body.expressions().get(steps.ops_range()).unwrap(); - let evaluator = MacroEvaluator::new(self.context, *environment); + ExpandedSExpSource::Template(environment, element) => { + let nested_expressions = element.nested_expressions(); + let evaluator = MacroEvaluator::new_with_environment(*environment); ExpandedSExpIteratorSource::Template(TemplateSequenceIterator::new( self.context, evaluator, - *template, - steps, + element.template(), + nested_expressions, )) } }; @@ -272,11 +259,9 @@ impl<'top, D: Decoder> LazyExpandedSExp<'top, D> { pub fn from_template( context: EncodingContextRef<'top>, environment: Environment<'top, D>, - template: TemplateMacroRef<'top>, - annotations: AnnotationsRange, - expressions: ExprRange, + element: TemplateElement<'top>, ) -> LazyExpandedSExp<'top, D> { - let source = ExpandedSExpSource::Template(environment, template, annotations, expressions); + let source = ExpandedSExpSource::Template(environment, element); Self { source, context } } } @@ -321,7 +306,7 @@ fn expand_next_sequence_value<'top, D: Decoder>( ) -> Option>> { loop { // If the evaluator's stack is not empty, it's still expanding a macro. - if evaluator.macro_stack_depth() > 0 { + if !evaluator.is_empty() { let value = evaluator.next().transpose(); if value.is_some() { // The `Some` may contain a value or an error; either way, that's the next return value. 
@@ -341,10 +326,7 @@ fn expand_next_sequence_value<'top, D: Decoder>( Ok(resolved) => resolved, Err(e) => return Some(Err(e)), }; - let begin_expansion_result = evaluator.push(resolved_invocation); - if let Err(e) = begin_expansion_result { - return Some(Err(e)); - } + evaluator.push(try_or_some_err!(resolved_invocation.expand())); continue; } Some(Err(e)) => return Some(Err(e)), diff --git a/src/lazy/expanded/struct.rs b/src/lazy/expanded/struct.rs index ab09c810..b6454d9e 100644 --- a/src/lazy/expanded/struct.rs +++ b/src/lazy/expanded/struct.rs @@ -1,23 +1,21 @@ -use std::ops::ControlFlow; - use crate::element::iterators::SymbolsIterator; use crate::lazy::decoder::private::{LazyRawStructPrivate, RawStructUnexpandedFieldsIterator}; use crate::lazy::decoder::{Decoder, LazyRawFieldName, LazyRawStruct}; use crate::lazy::expanded::macro_evaluator::{ - MacroEvaluator, MacroExpr, RawEExpression, ValueExpr, + MacroEvaluator, MacroExpansion, MacroExpr, ValueExpr, }; use crate::lazy::expanded::sequence::Environment; use crate::lazy::expanded::template::{ - AnnotationsRange, ExprRange, TemplateBodyValueExpr, TemplateElement, TemplateMacroInvocation, - TemplateMacroRef, TemplateStructIndex, TemplateStructUnexpandedFieldsIterator, + TemplateBodyExprKind, TemplateElement, TemplateMacroRef, TemplateStructIndex, + TemplateStructUnexpandedFieldsIterator, }; use crate::lazy::expanded::{ EncodingContextRef, ExpandedAnnotationsIterator, ExpandedAnnotationsSource, ExpandedValueRef, - LazyExpandedValue, TemplateVariableReference, + LazyExpandedValue, }; use crate::result::IonFailure; use crate::symbol_ref::AsSymbolRef; -use crate::{IonError, IonResult, RawSymbolRef, SymbolRef}; +use crate::{try_or_some_err, IonError, IonResult, RawSymbolRef, SymbolRef}; /// A unified type embodying all possible field representations coming from both input data /// (i.e. raw structs of some encoding) and template bodies. 
@@ -27,18 +25,9 @@ use crate::{IonError, IonResult, RawSymbolRef, SymbolRef}; // and expands the field as part of its iteration process. #[derive(Debug, Clone, Copy)] pub enum UnexpandedField<'top, D: Decoder> { - RawNameValue(EncodingContextRef<'top>, D::FieldName<'top>, D::Value<'top>), - RawNameEExp(EncodingContextRef<'top>, D::FieldName<'top>, D::EExp<'top>), - RawEExp(EncodingContextRef<'top>, D::EExp<'top>), - TemplateNameValue(SymbolRef<'top>, TemplateElement<'top>), - TemplateNameMacro(SymbolRef<'top>, TemplateMacroInvocation<'top>), - TemplateNameVariable( - SymbolRef<'top>, - // The variable name and the expression to which it referred. - // The expression may be either a raw value or a template element, so it's represented - // as a `ValueExpr`, which can accommodate both. - (TemplateVariableReference<'top>, ValueExpr<'top, D>), - ), + NameValue(LazyExpandedFieldName<'top, D>, LazyExpandedValue<'top, D>), + NameMacro(LazyExpandedFieldName<'top, D>, MacroExpr<'top, D>), + Macro(MacroExpr<'top, D>), } #[derive(Debug, Clone, Copy)] @@ -126,14 +115,21 @@ pub enum ExpandedStructSource<'top, D: Decoder> { ValueLiteral(D::Struct<'top>), Template( Environment<'top, D>, - TemplateMacroRef<'top>, - AnnotationsRange, - ExprRange, + TemplateElement<'top>, &'top TemplateStructIndex, ), // TODO: Constructed } +impl<'top, D: Decoder> ExpandedStructSource<'top, D> { + fn environment(&self) -> Environment<'top, D> { + match self { + ExpandedStructSource::ValueLiteral(_) => Environment::empty(), + ExpandedStructSource::Template(environment, _, _) => *environment, + } + } +} + #[derive(Copy, Clone)] pub struct LazyExpandedStruct<'top, D: Decoder> { pub(crate) context: EncodingContextRef<'top>, @@ -162,13 +158,10 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { pub fn from_template( context: EncodingContextRef<'top>, environment: Environment<'top, D>, - template: TemplateMacroRef<'top>, - annotations: AnnotationsRange, - expressions: ExprRange, + element: 
&TemplateElement<'top>, index: &'top TemplateStructIndex, ) -> LazyExpandedStruct<'top, D> { - let source = - ExpandedStructSource::Template(environment, template, annotations, expressions, index); + let source = ExpandedStructSource::Template(environment, *element, index); Self { source, context } } @@ -177,18 +170,8 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { ExpandedStructSource::ValueLiteral(value) => ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::ValueLiteral(value.annotations()), }, - ExpandedStructSource::Template( - _environment, - template, - annotations, - _expressions, - _index, - ) => { - let annotations = template - .body - .annotations_storage() - .get(annotations.ops_range()) - .unwrap(); + ExpandedStructSource::Template(_environment, element, _index) => { + let annotations = element.annotations(); ExpandedAnnotationsIterator { source: ExpandedAnnotationsSource::Template(SymbolsIterator::new(annotations)), } @@ -197,28 +180,31 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { } pub fn iter(&self) -> ExpandedStructIterator<'top, D> { + let evaluator = self + .context + .allocator() + .alloc_with(|| MacroEvaluator::new()); let source = match &self.source { ExpandedStructSource::ValueLiteral(raw_struct) => { ExpandedStructIteratorSource::ValueLiteral( - MacroEvaluator::new(self.context, Environment::empty()), + evaluator, raw_struct.unexpanded_fields(self.context), ) } - ExpandedStructSource::Template( - environment, - template, - _annotations, - expressions, - _index, - ) => { - let evaluator = MacroEvaluator::new(self.context, *environment); + ExpandedStructSource::Template(environment, element, _index) => { + evaluator.set_root_environment(*environment); + let template = element.template(); ExpandedStructIteratorSource::Template( evaluator, TemplateStructUnexpandedFieldsIterator::new( self.context, *environment, - *template, - &template.body.expressions[expressions.ops_range()], + template, + template + .body() + 
.expressions + .get(element.expr_range().tail()) + .unwrap(), ), ) } @@ -230,10 +216,7 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { } fn environment(&self) -> Environment<'top, D> { - match &self.source { - ExpandedStructSource::ValueLiteral(_) => Environment::empty(), - ExpandedStructSource::Template(environment, _, _, _, _) => *environment, - } + self.source.environment() } pub fn bump_iter(&self) -> &'top mut ExpandedStructIterator<'top, D> { @@ -256,7 +239,7 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { } // If we're reading from a struct in a template, consult its field index to see if one or // more fields with the requested name exist. - ExpandedStructSource::Template(environment, template, _, _, index) => { + ExpandedStructSource::Template(environment, element, index) => { let Some(value_expr_addresses) = index.get(name) else { // If the field name is not in the index, it's not in the struct. return Ok(None); @@ -267,20 +250,28 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { // offer an efficient implementation of 'get last' because that could require // fully evaluating one or more macros to find the last value. let first_result_address = value_expr_addresses[0]; - let first_result_expr = - template.body.expressions.get(first_result_address).unwrap(); - match first_result_expr { + let first_result_expr = element + .template() + .body() + .expressions() + .get(first_result_address) + .unwrap(); + match first_result_expr.kind() { // If the expression is a value literal, wrap it in a LazyExpandedValue and return it. 
- TemplateBodyValueExpr::Element(element) => { + TemplateBodyExprKind::Element(body_element) => { let value = LazyExpandedValue::from_template( self.context, *environment, - TemplateElement::new(*template, element), + TemplateElement::new( + element.template().macro_ref(), + body_element, + first_result_expr.expr_range(), + ), ); Ok(Some(value)) } // If the expression is a variable, resolve it in the current environment. - TemplateBodyValueExpr::Variable(variable_ref) => { + TemplateBodyExprKind::Variable(variable_ref) => { let value_expr = environment .expressions() .get(variable_ref.signature_index()) @@ -291,16 +282,22 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { // If the variable maps to a macro invocation, evaluate it until we get // the first value back. ValueExpr::MacroInvocation(invocation) => { - let mut evaluator = MacroEvaluator::new(self.context, *environment); - evaluator.push(*invocation)?; + let mut evaluator = + MacroEvaluator::for_macro_expr(*environment, *invocation)?; evaluator.next() } } } - TemplateBodyValueExpr::MacroInvocation(body_invocation) => { - let invocation = body_invocation.resolve(*template, self.context); - let mut evaluator = MacroEvaluator::new(self.context, *environment); - evaluator.push(invocation)?; + TemplateBodyExprKind::MacroInvocation(body_invocation) => { + let invocation = body_invocation.resolve( + self.context, + element.template().address(), + first_result_expr.expr_range(), + ); + let mut evaluator = MacroEvaluator::new_with_environment(*environment); + let expansion = + MacroExpansion::initialize(*environment, invocation.into())?; + evaluator.push(expansion); evaluator.next() } } @@ -327,18 +324,39 @@ pub enum ExpandedStructIteratorSource<'top, D: Decoder> { ValueLiteral( // Giving the struct iterator its own evaluator means that we can abandon the iterator // at any time without impacting the evaluation state of its parent container. 
- MacroEvaluator<'top, D>, + &'top mut MacroEvaluator<'top, D>, RawStructUnexpandedFieldsIterator<'top, D>, ), // The struct we're iterating over is a value in a TDL template. It may contain macro // invocations that need to be evaluated. Template( - MacroEvaluator<'top, D>, + &'top mut MacroEvaluator<'top, D>, TemplateStructUnexpandedFieldsIterator<'top, D>, ), // TODO: Constructed } +impl<'top, D: Decoder> ExpandedStructIteratorSource<'top, D> { + fn next_field(&mut self) -> Option>> { + // Get the next unexpanded field from our source's iterator. + match self { + ExpandedStructIteratorSource::Template(_, template_iterator) => { + template_iterator.next() + } + ExpandedStructIteratorSource::ValueLiteral(_, raw_struct_iter) => { + raw_struct_iter.next() + } + } + } + + fn evaluator(&mut self) -> &mut MacroEvaluator<'top, D> { + match self { + ExpandedStructIteratorSource::Template(evaluator, _) => evaluator, + ExpandedStructIteratorSource::ValueLiteral(evaluator, _) => evaluator, + } + } +} + pub struct ExpandedStructIterator<'top, D: Decoder> { // Each variant of 'source' below holds its own encoding context reference source: ExpandedStructIteratorSource<'top, D>, @@ -369,38 +387,15 @@ enum ExpandedStructIteratorState<'top, D: Decoder> { // bar: 2, // This variant holds a pointer to that struct's iterator living in the // EncodingContext's bump allocator. 
- InliningAStruct( - LazyExpandedStruct<'top, D>, - &'top mut ExpandedStructIterator<'top, D>, - ), + InliningAStruct(&'top mut ExpandedStructIterator<'top, D>), } impl<'top, D: Decoder> Iterator for ExpandedStructIterator<'top, D> { type Item = IonResult>; + #[inline] fn next(&mut self) -> Option { - let Self { - ref mut source, - ref mut state, - } = *self; - match source { - ExpandedStructIteratorSource::Template(tdl_macro_evaluator, template_iterator) => { - Self::next_field_from( - template_iterator.context(), - state, - tdl_macro_evaluator, - template_iterator, - ) - } - ExpandedStructIteratorSource::ValueLiteral(e_exp_evaluator, raw_struct_iter) => { - Self::next_field_from( - raw_struct_iter.context(), - state, - e_exp_evaluator, - raw_struct_iter, - ) - } - } + self.next_field() } } @@ -414,43 +409,53 @@ impl<'top, D: Decoder> ExpandedStructIterator<'top, D> { /// Pulls the next expanded field from the raw source struct. The field returned may correspond /// to a `(name, value literal)` pair in the raw struct, or it may be the product of a macro /// evaluation. - fn next_field_from< - // The lifetime of this method invocation. - 'a, - // An iterator over the struct we're expanding. It may be the fields iterator from a - // LazyRawStruct, or it could be a `TemplateStructRawFieldsIterator`. - I: Iterator>>, - >( - context: EncodingContextRef<'top>, - state: &'a mut ExpandedStructIteratorState<'top, D>, - evaluator: &'a mut MacroEvaluator<'top, D>, - iter: &'a mut I, - ) -> Option>> { - // This method begins by pulling raw field expressions from the source iterator. - // If the expression is a (name, value literal) pair, we can wrap it in an LazyExpandedField - // and return it immediately. However, if it is a (name, macro) pair or (macro) expression, - // then an unknown amount of evaluation will need to happen before we can return our next - // field. 
+ fn next_field(&mut self) -> Option>> { + // Temporarily destructure 'Self' to get simultaneous mutable references to its fields. + let Self { + ref mut source, + ref mut state, + } = *self; + loop { - use ControlFlow::{Break, Continue}; use ExpandedStructIteratorState::*; match state { // This is the initial state. We're reading a raw field expression from our source // iterator. ReadingFieldFromSource => { - // We'll see what kind of expression it is. - match Self::next_from_iterator(context, state, evaluator, iter) { - // The iterator found a (name, value literal) pair. - Break(maybe_result) => return maybe_result, - // The iterator found a (name, macro) pair or a macro; further evaluation - // is needed to yield a (name, value) pair. - Continue(_) => continue, - } + use UnexpandedField::*; + match try_or_some_err!(source.next_field()?) { + NameValue(name, value) => { + return Some(Ok(LazyExpandedField::new(name, value))) + } + NameMacro(name, invocation) => { + match Self::begin_expanding_field_macro( + state, + source.evaluator(), + name, + invocation, + ) { + Some(field_result) => return Some(field_result), + None => continue, + } + } + Macro(invocation) => { + // The next expression from the iterator was a macro. We expect it to expand to a + // single struct whose fields will be merged into the one we're iterating over. For example: + // {a: 1, (:make_struct b 2 c 3), d: 4} + // expands to: + // {a: 1, b: 2, c: 3, d: 4} + try_or_some_err!(Self::begin_inlining_struct_from_macro( + state, + source.evaluator(), + invocation, + )) + } + }; } // The iterator previously encountered a macro in field-name position. That macro // yielded a struct, and now we're merging that expanded struct's fields into our // own one at a time. - InliningAStruct(_struct, struct_iter) => { + InliningAStruct(struct_iter) => { if let Some(inlined_field) = struct_iter.next() { // We pulled another field from the struct we're inlining. 
return Some(inlined_field); @@ -464,14 +469,21 @@ impl<'top, D: Decoder> ExpandedStructIterator<'top, D> { // macro in field value position, emitting (name, value) pairs for each value // in the expansion, one at a time. ExpandingValueExpr(field_name) => { - match evaluator.next() { - Err(e) => return Some(Err(e)), - Ok(Some(next_value)) => { + // Get the next expression from our source's macro evaluator. + let evaluator = source.evaluator(); + match try_or_some_err!(evaluator.next()) { + Some(next_value) => { + let field_name = *field_name; + if evaluator.is_empty() { + // The evaluator is empty, so we should return to reading from + // source. + *state = ReadingFieldFromSource; + } // We got another value from the macro we're evaluating. Emit // it as another field using the same field_name. - return Some(Ok(LazyExpandedField::new(*field_name, next_value))); + return Some(Ok(LazyExpandedField::new(field_name, next_value))); } - Ok(None) => { + None => { // The macro in the value position is no longer emitting values. Switch // back to reading from the source. *state = ReadingFieldFromSource; @@ -484,116 +496,30 @@ impl<'top, D: Decoder> ExpandedStructIterator<'top, D> { /// Pulls a single unexpanded field expression from the source iterator and sets `state` according to /// the expression's kind. - fn next_from_iterator>>>( - context: EncodingContextRef<'top>, + fn begin_expanding_field_macro( state: &mut ExpandedStructIteratorState<'top, D>, evaluator: &mut MacroEvaluator<'top, D>, - iter: &mut I, - ) -> ControlFlow>>> { - // Because this helper function is always being invoked from within a loop, it uses - // the `ControlFlow` enum to signal whether its return value should cause the loop to - // terminate (`ControlFlow::Break`) or continue (`ControlFlow::Continue`). - use ControlFlow::*; - - // If the iterator is empty, we're done. 
- let unexpanded_field = match iter.next() { - Some(Ok(field_expr)) => field_expr, - Some(Err(error)) => { - return Break(Some(Err::, IonError>(error))) - } - None => return Break(None), - }; - - use UnexpandedField::*; - match unexpanded_field { - RawNameValue(context, name, value) => { - Break(Some(Ok(LazyExpandedField::from_raw_field( - context, - name, - LazyExpandedValue::from_literal(context, value), - )))) - } - TemplateNameValue(name, value) => Break(Some(Ok(LazyExpandedField::from_template( - value.template(), - name, - LazyExpandedValue::from_template(context, evaluator.environment(), value), - )))), - // (name, macro invocation) pair. For example: `foo: (:bar)` - RawNameEExp(context, raw_name, raw_eexp) => { - let eexp = match raw_eexp.resolve(context) { - Ok(eexp) => eexp, - Err(e) => return Break(Some(Err(e))), - }; - if let Err(e) = evaluator.push(eexp) { - return Break(Some(Err(e))); - } - let name = LazyExpandedFieldName::RawName(context, raw_name); - *state = ExpandedStructIteratorState::ExpandingValueExpr(name); - // We've pushed the macro invocation onto the evaluator's stack, but further evaluation - // is needed to get our next field. - Continue(()) - } - RawEExp(context, eexp) => { - let invocation = match eexp.resolve(context) { - Ok(invocation) => invocation, - Err(e) => return Break(Some(Err(e))), - }; - // The next expression from the iterator was a macro. We expect it to expand to a - // single struct whose fields will be merged into the one we're iterating over. For example: - // {a: 1, (:make_struct b 2 c 3), d: 4} - // expands to: - // {a: 1, b: 2, c: 3, d: 4} - match Self::begin_inlining_struct_from_macro(state, evaluator, invocation.into()) { - // If the macro expanded to a struct as expected, continue the evaluation - // until we get a field to return. - Ok(_) => Continue(()), - // If something went wrong, surface the error. 
- Err(e) => Break(Some(Err(e))), - } - } - TemplateNameMacro(name_symbol, invocation) => { - if let Err(e) = evaluator.push(invocation) { - return Break(Some(Err(e))); - } - let name = - LazyExpandedFieldName::TemplateName(invocation.host_template(), name_symbol); - *state = ExpandedStructIteratorState::ExpandingValueExpr(name); - // We've pushed the macro invocation onto the evaluator's stack, but further evaluation - // is needed to get our next field. - Continue(()) - } - TemplateNameVariable(name_symbol, (variable_ref, value_expr)) => { - use ValueExpr::*; - let name = LazyExpandedFieldName::TemplateName(variable_ref.template, name_symbol); - match value_expr { - ValueLiteral(value) => { - return Break(Some(Ok(LazyExpandedField::from_template( - variable_ref.template, - name_symbol, - value.via_variable(variable_ref), - )))) - } - MacroInvocation(MacroExpr::EExp(eexp)) => { - if let Err(e) = evaluator.push(eexp) { - return Break(Some(Err(e))); - } - *state = ExpandedStructIteratorState::ExpandingValueExpr(name); - // We've pushed the macro invocation onto the evaluator's stack, but further evaluation - // is needed to get our next field. - Continue(()) - } - MacroInvocation(MacroExpr::TemplateMacro(invocation)) => { - if let Err(e) = evaluator.push(invocation) { - return Break(Some(Err(e))); - } - *state = ExpandedStructIteratorState::ExpandingValueExpr(name); - // We've pushed the macro invocation onto the evaluator's stack, but further evaluation - // is needed to get our next field. - Continue(()) - } - } - } + field_name: LazyExpandedFieldName<'top, D>, + invocation: MacroExpr<'top, D>, + ) -> Option>> { + let environment = evaluator.environment(); + let expansion = try_or_some_err!(MacroExpansion::initialize(environment, invocation)); + // If the macro is guaranteed to expand to exactly one value, we can evaluate it + // in place. 
+ if invocation + .invoked_macro() + .expansion_analysis() + .must_produce_exactly_one_value() + { + let value = try_or_some_err!(expansion.expand_singleton()); + return Some(Ok(LazyExpandedField::new(field_name, value))); } + // Otherwise, we'll add it to the evaluator's stack and return to the top of the loop. + evaluator.push(expansion); + *state = ExpandedStructIteratorState::ExpandingValueExpr(field_name); + // We've pushed the macro invocation onto the evaluator's stack, but further evaluation + // is needed to get our next field. + None } /// Pulls the next value from the evaluator, confirms that it's a struct, and then switches @@ -603,10 +529,11 @@ impl<'top, D: Decoder> ExpandedStructIterator<'top, D> { evaluator: &mut MacroEvaluator<'top, D>, invocation: MacroExpr<'top, D>, ) -> IonResult<()> { - let mut evaluation = evaluator.evaluate(invocation)?; - let expanded_value = match evaluation.next() { - Some(Ok(item)) => item, - Some(Err(e)) => return Err(e), + let environment = evaluator.environment(); + let expansion = MacroExpansion::initialize(environment, invocation)?; + evaluator.push(expansion); + let expanded_value = match evaluator.next()? { + Some(item) => item, None => return IonResult::decoding_error(format!("macros in field name position must produce a single struct; '{:?}' produced nothing", invocation)), }; let struct_ = match expanded_value.read()? 
{ @@ -619,7 +546,7 @@ impl<'top, D: Decoder> ExpandedStructIterator<'top, D> { } }; let iter: &'top mut ExpandedStructIterator<'top, D> = struct_.bump_iter(); - *state = ExpandedStructIteratorState::InliningAStruct(struct_, iter); + *state = ExpandedStructIteratorState::InliningAStruct(iter); Ok(()) } } diff --git a/src/lazy/expanded/template.rs b/src/lazy/expanded/template.rs index 68d51373..50cbd598 100644 --- a/src/lazy/expanded/template.rs +++ b/src/lazy/expanded/template.rs @@ -1,69 +1,193 @@ -use std::collections::HashMap; use std::fmt; use std::fmt::{Debug, Formatter}; use std::ops::{Deref, Range}; +use bumpalo::collections::Vec as BumpVec; +use rustc_hash::FxHashMap; + +use crate::{Bytes, Decimal, Int, IonResult, IonType, LazyExpandedFieldName, Str, Symbol, SymbolRef, Timestamp, try_or_some_err, Value}; +use crate::lazy::binary::raw::v1_1::immutable_buffer::ArgGroupingBitmap; use crate::lazy::decoder::Decoder; -use crate::lazy::expanded::macro_evaluator::{MacroEvaluator, MacroExpr, ValueExpr}; -use crate::lazy::expanded::macro_table::MacroRef; -use crate::lazy::expanded::r#struct::UnexpandedField; -use crate::lazy::expanded::sequence::Environment; use crate::lazy::expanded::{ EncodingContextRef, ExpandedValueSource, LazyExpandedValue, TemplateVariableReference, }; +use crate::lazy::expanded::compiler::ExpansionAnalysis; +use crate::lazy::expanded::macro_evaluator::{MacroEvaluator, MacroExpansion, MacroExpansionKind, MacroExpr, MacroExprArgsIterator, MakeStringExpansion, TemplateExpansion, ValueExpr, ValuesExpansion}; +use crate::lazy::expanded::macro_table::{Macro, MacroKind, MacroRef}; +use crate::lazy::expanded::r#struct::UnexpandedField; +use crate::lazy::expanded::sequence::Environment; use crate::lazy::text::raw::v1_1::reader::{MacroAddress, MacroIdRef}; use crate::result::IonFailure; -use crate::{Bytes, Decimal, Int, IonResult, IonType, Str, Symbol, SymbolRef, Timestamp, Value}; /// A parameter in a user-defined macro's signature. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct Parameter { name: String, encoding: ParameterEncoding, - // TODO: Grouping + cardinality: ParameterCardinality, + rest_syntax_policy: RestSyntaxPolicy, } impl Parameter { - pub fn new(name: String, encoding: ParameterEncoding) -> Self { - Self { name, encoding } + pub fn new(name: impl Into, encoding: ParameterEncoding, cardinality: ParameterCardinality, rest_syntax_policy: RestSyntaxPolicy) -> Self { + Self { name: name.into(), encoding, cardinality, rest_syntax_policy } } pub fn name(&self) -> &str { self.name.as_str() } - pub fn encoding(&self) -> &ParameterEncoding { - &self.encoding + pub fn encoding(&self) -> ParameterEncoding { + self.encoding + } + pub fn cardinality(&self) -> ParameterCardinality { + self.cardinality + } + pub fn rest_syntax_policy(&self) -> RestSyntaxPolicy { + self.rest_syntax_policy + } + /// Returns true if this parameter is of any cardinality other than `ExactlyOne` (`!`). + pub fn is_variadic(&self) -> bool { + !matches!(self.cardinality, ParameterCardinality::ExactlyOne) } } /// The encoding used to serialize and deserialize the associated parameter. -#[derive(Debug, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum ParameterEncoding { /// A 'tagged' type is one whose binary encoding begins with an opcode (sometimes called a 'tag'.) Tagged, // TODO: tagless types, including fixed-width types and macros } +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum ParameterCardinality { + ExactlyOne, // ! + ZeroOrOne, // ? + ZeroOrMore, // * + OneOrMore, // + +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum RestSyntaxPolicy { + NotAllowed, + Allowed +} + /// The sequence of parameters for which callers must pass expressions when invoking the macro. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct MacroSignature { parameters: Vec, + num_variadic_params: usize, } impl MacroSignature { - fn with_parameter(mut self, name: impl Into, encoding: ParameterEncoding) -> Self { - self.parameters.push(Parameter { - name: name.into(), - encoding, - }); - self + fn with_parameter(mut self, name: impl Into, encoding: ParameterEncoding, cardinality: ParameterCardinality) -> IonResult { + // We're adding a new parameter, so the previous "final position" parameter is no longer in the final position. + // Disable rest syntax for that parameter. + if let Some(final_position_param) = self.parameters.last_mut() { + final_position_param.rest_syntax_policy = RestSyntaxPolicy::NotAllowed; + } + let rest_syntax_policy = if cardinality == ParameterCardinality::ExactlyOne { + RestSyntaxPolicy::NotAllowed + } else { + self.num_variadic_params += 1; + if self.num_variadic_params > ArgGroupingBitmap::MAX_VARIADIC_PARAMS { + return IonResult::decoding_error(format!( + "macro found with {} variadic parameters; the max supported is {}", + self.num_variadic_params, + ArgGroupingBitmap::MAX_VARIADIC_PARAMS, + )); + }; + RestSyntaxPolicy::Allowed + }; + let param = Parameter::new(name.into(), encoding, cardinality, rest_syntax_policy); + self.parameters.push(param); + Ok(self) } + /// Constructs a new instance of a signature with no arguments (the signature of a "constant" template). 
+ fn constant() -> Self { + Self::new(Vec::new()).unwrap() + } + + pub fn len(&self) -> usize { + self.parameters().len() + } pub fn parameters(&self) -> &[Parameter] { - &self.parameters + self.parameters.as_slice() + } + pub fn new(parameters: Vec) -> IonResult { + let num_variadic_params = parameters.iter().filter(|p| p.cardinality != ParameterCardinality::ExactlyOne).count(); + if num_variadic_params > ArgGroupingBitmap::MAX_VARIADIC_PARAMS { + return IonResult::decoding_error(format!( + "macro found with {num_variadic_params} variadic parameters; the max supported is {}", + ArgGroupingBitmap::MAX_VARIADIC_PARAMS + )); + }; + Ok(Self { parameters, num_variadic_params }) + } + pub fn num_variadic_params(&self) -> usize { + self.num_variadic_params } - pub fn new(parameters: Vec) -> Self { - Self { parameters } + pub fn bitmap_size_in_bytes(&self) -> usize { + const BITS_PER_VARIADIC_PARAM: usize = 2; + const BITS_PER_BYTE: usize = 8; + ((self.num_variadic_params * BITS_PER_VARIADIC_PARAM) + 7) / 8 + } +} + +#[cfg(test)] +mod macro_signature_tests { + use crate::IonResult; + use crate::lazy::expanded::template::{MacroSignature, ParameterCardinality, ParameterEncoding}; + + #[test] + fn bitmap_sizes() -> IonResult<()> { + let signature = MacroSignature::constant(); + assert_eq!(signature.num_variadic_params(), 0); + assert_eq!(signature.bitmap_size_in_bytes(), 0); + + let signature = MacroSignature::new(Vec::new())? + .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ExactlyOne)?; + assert_eq!(signature.num_variadic_params(), 0); + assert_eq!(signature.bitmap_size_in_bytes(), 0); + + let signature = MacroSignature::new(Vec::new())? + .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)?; + assert_eq!(signature.num_variadic_params(), 1); + assert_eq!(signature.bitmap_size_in_bytes(), 1); + + let signature = MacroSignature::new(Vec::new())? 
+ .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("bar", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)?; + assert_eq!(signature.num_variadic_params(), 2); + assert_eq!(signature.bitmap_size_in_bytes(), 1); + + let signature = MacroSignature::new(Vec::new())? + .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("bar", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrMore)? + .with_parameter("baz", ParameterEncoding::Tagged, ParameterCardinality::OneOrMore)?; + assert_eq!(signature.num_variadic_params(), 3); + assert_eq!(signature.bitmap_size_in_bytes(), 1); + + let signature = MacroSignature::new(Vec::new())? + .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("bar", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("baz", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("quux", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)?; + assert_eq!(signature.num_variadic_params(), 4); + assert_eq!(signature.bitmap_size_in_bytes(), 1); + + let signature = MacroSignature::new(Vec::new())? + .with_parameter("foo", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("bar", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("baz", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("quux", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)? + .with_parameter("quuz", ParameterEncoding::Tagged, ParameterCardinality::ZeroOrOne)?; + assert_eq!(signature.num_variadic_params(), 5); + assert_eq!(signature.bitmap_size_in_bytes(), 2); + + Ok(()) } } @@ -76,11 +200,12 @@ impl MacroSignature { /// ``` /// This simplifies the use of machine-authored macros, which are always invoked by their address /// in the macro table rather than by a human-friendly name. 
-#[derive(Clone)] +#[derive(Clone, PartialEq)] pub struct TemplateMacro { pub(crate) name: Option, pub(crate) signature: MacroSignature, pub(crate) body: TemplateBody, + pub(crate) expansion_analysis: ExpansionAnalysis, } impl Debug for TemplateMacro { @@ -97,7 +222,7 @@ impl Debug for TemplateMacro { let indentation = &mut String::from(" "); let mut index = 0usize; while let Some(expr) = self.body().expressions().get(index) { - index += TemplateBodyValueExpr::fmt_expr(f, indentation, self, expr)?; + index += TemplateBodyExprKind::fmt_expr(f, indentation, self, expr)?; } Ok(()) @@ -119,26 +244,28 @@ impl TemplateMacro { /// A reference to a template macro definition paired with the macro table address at which it was found. #[derive(Copy, Clone, Debug)] pub struct TemplateMacroRef<'top> { - // This field is only stored as a source of information for debugging. (For example, when showing - // a macro evaluator stack trace.) - address: MacroAddress, - template: &'top TemplateMacro, + macro_ref: MacroRef<'top>, + template_body: &'top TemplateBody, } impl<'top> TemplateMacroRef<'top> { - pub fn new(address: MacroAddress, template: &'top TemplateMacro) -> Self { - Self { address, template } + pub fn new(macro_ref: MacroRef<'top>, template_body: &'top TemplateBody) -> Self { + Self { macro_ref, template_body } } - pub fn address(&self) -> MacroAddress { - self.address + pub fn body(&self) -> &'top TemplateBody { + self.template_body + } + + pub fn macro_ref(&self) -> MacroRef<'top> { + self.macro_ref } } impl<'top> Deref for TemplateMacroRef<'top> { - type Target = &'top TemplateMacro; + type Target = MacroRef<'top>; fn deref(&self) -> &Self::Target { - &self.template + &self.macro_ref } } @@ -147,7 +274,7 @@ pub struct TemplateSequenceIterator<'top, D: Decoder> { context: EncodingContextRef<'top>, template: TemplateMacroRef<'top>, evaluator: MacroEvaluator<'top, D>, - value_expressions: &'top [TemplateBodyValueExpr], + value_expressions: &'top [TemplateBodyExpr], index: 
usize, } @@ -156,13 +283,13 @@ impl<'top, D: Decoder> TemplateSequenceIterator<'top, D> { context: EncodingContextRef<'top>, evaluator: MacroEvaluator<'top, D>, template: TemplateMacroRef<'top>, - value_expressions: &'top [TemplateBodyValueExpr], + nested_expressions: &'top [TemplateBodyExpr], ) -> Self { Self { context, template, evaluator, - value_expressions, + value_expressions: nested_expressions, index: 0, } } @@ -174,33 +301,29 @@ impl<'top, D: Decoder> Iterator for TemplateSequenceIterator<'top, D> { fn next(&mut self) -> Option { loop { // If the evaluator's stack is not empty, give it the opportunity to yield a value. - if self.evaluator.macro_stack_depth() > 0 { - match self.evaluator.next().transpose() { - Some(value) => return Some(value), - None => { - // The stack did not produce values and is empty, pull - // the next expression from `self.value_expressions` - } - } + if let Some(value) = try_or_some_err!(self.evaluator.next()) { + return Some(Ok(value)); } - // We didn't get a value from the evaluator, so pull the next expansion step. - let step = self.value_expressions.get(self.index)?; - self.index += 1; - return match step { - TemplateBodyValueExpr::Element(element) => { + // The stack did not produce values and is empty, pull the next expression from `self.value_expressions` + // and start expanding it. 
+ let current_expr = self.value_expressions.get(self.index)?; + let environment = self.evaluator.environment(); + self.index += current_expr.num_expressions(); + break match current_expr.kind() { + TemplateBodyExprKind::Element(element) => { let value = LazyExpandedValue { context: self.context, source: ExpandedValueSource::Template( - self.evaluator.environment(), - TemplateElement::new(self.template, element), + environment, + TemplateElement::new(self.template.macro_ref, element, current_expr.expr_range()), ), variable: None, }; Some(Ok(value)) } - TemplateBodyValueExpr::MacroInvocation(body_invocation) => { - // ...it's a TDL macro invocation. Push it onto the evaluator's stack and return - // to the top of the loop. + TemplateBodyExprKind::MacroInvocation(body_invocation) => { + // ...it's a TDL macro invocation. Resolve the invocation to get a reference to the + // macro being invoked. let invoked_macro = self .context .macro_table() @@ -208,33 +331,33 @@ impl<'top, D: Decoder> Iterator for TemplateSequenceIterator<'top, D> { .unwrap(); let invocation = TemplateMacroInvocation::new( self.context, - self.template, + self.template.address(), invoked_macro, - self.template - .body - .expressions() - .get(body_invocation.arg_expr_range().ops_range()) - .unwrap(), + ExprRange::new(current_expr.expr_range().tail()) ); - self.index += invocation.arg_expressions.len(); - match self.evaluator.push(invocation) { - Ok(_) => continue, - Err(e) => Some(Err(e)), + // If the macro is guaranteed to expand to exactly one value, we can evaluate it + // in place. + let new_expansion = try_or_some_err!(MacroExpansion::initialize(environment, invocation.into())); + if invoked_macro.expansion_analysis().must_produce_exactly_one_value() { + Some(new_expansion.expand_singleton()) + } else { + // Otherwise, add it to the evaluator's stack and return to the top of the loop. 
+ self.evaluator.push(new_expansion); + continue; } } - TemplateBodyValueExpr::Variable(variable_ref) => { - let arg_expr = self - .evaluator - .environment() - .expressions() - .get(variable_ref.signature_index()) - .unwrap(); + TemplateBodyExprKind::Variable(variable_ref) => { + let arg_expr = self.evaluator.environment().require_expr(variable_ref.signature_index()); match arg_expr { - ValueExpr::ValueLiteral(value) => Some(Ok(*value)), + ValueExpr::ValueLiteral(value) => Some(Ok(value)), ValueExpr::MacroInvocation(invocation) => { - match self.evaluator.push(*invocation) { - Ok(_) => continue, - Err(e) => Some(Err(e)), + let new_expansion = try_or_some_err!(MacroExpansion::initialize(environment, invocation)); + if invocation.invoked_macro().expansion_analysis().must_produce_exactly_one_value() { + Some(new_expansion.expand_singleton()) + } else { + // Otherwise, add it to the evaluator's stack and return to the top of the loop. + self.evaluator.push(new_expansion); + continue; } } } @@ -251,7 +374,7 @@ pub struct TemplateStructUnexpandedFieldsIterator<'top, D: Decoder> { context: EncodingContextRef<'top>, environment: Environment<'top, D>, template: TemplateMacroRef<'top>, - expressions: &'top [TemplateBodyValueExpr], + nested_expressions: &'top [TemplateBodyExpr], index: usize, } @@ -266,13 +389,13 @@ impl<'top, D: Decoder> TemplateStructUnexpandedFieldsIterator<'top, D> { context: EncodingContextRef<'top>, environment: Environment<'top, D>, template: TemplateMacroRef<'top>, - expressions: &'top [TemplateBodyValueExpr], + nested_expressions: &'top [TemplateBodyExpr], ) -> Self { Self { context, environment, template, - expressions, + nested_expressions, index: 0, } } @@ -284,10 +407,10 @@ impl<'top, D: Decoder> Iterator for TemplateStructUnexpandedFieldsIterator<'top, fn next(&mut self) -> Option { let name_expr_address = self.index; let name_element = self - .expressions + .nested_expressions .get(name_expr_address)? 
- .expect_element() - .expect("field name must be a literal"); + .kind() + .require_element(); let name: SymbolRef = match &name_element.value { TemplateValue::Symbol(s) => s.into(), TemplateValue::String(s) => s.text().into(), @@ -295,26 +418,17 @@ impl<'top, D: Decoder> Iterator for TemplateStructUnexpandedFieldsIterator<'top, }; let value_expr_address = name_expr_address + 1; let value_expr = self - .expressions + .nested_expressions .get(value_expr_address) .expect("template struct had field name with no value"); - let unexpanded_field = match value_expr { - TemplateBodyValueExpr::Element(element) => { - match element.value() { - TemplateValue::List(range) - | TemplateValue::SExp(range) - | TemplateValue::Struct(range, _) => self.index += range.len(), - _ => { - // Otherwise, the value is a scalar and is exactly one expression. We already - // accounted for the first expression, so there's nothing else to do here. - } - }; - UnexpandedField::TemplateNameValue( - name, - TemplateElement::new(self.template, element), + let unexpanded_field = match value_expr.kind() { + TemplateBodyExprKind::Element(element) => { + UnexpandedField::NameValue( + LazyExpandedFieldName::TemplateName(self.template, name), + LazyExpandedValue::from_template(self.context, self.environment, TemplateElement::new(self.template.macro_ref(), element, value_expr.expr_range())), ) } - TemplateBodyValueExpr::MacroInvocation(body_invocation) => { + TemplateBodyExprKind::MacroInvocation(body_invocation) => { let invoked_macro = self .context .macro_table() @@ -322,37 +436,40 @@ impl<'top, D: Decoder> Iterator for TemplateStructUnexpandedFieldsIterator<'top, .unwrap(); let invocation = TemplateMacroInvocation::new( self.context, - self.template, + self.template.address(), invoked_macro, - self.template - .body - .expressions() - .get(body_invocation.arg_expr_range().ops_range()) - .unwrap(), + ExprRange::new(value_expr.expr_range().tail()) ); - self.index += invocation.arg_expressions.len(); - 
UnexpandedField::TemplateNameMacro(name, invocation) + UnexpandedField::NameMacro( + LazyExpandedFieldName::TemplateName(self.template, name), + MacroExpr::from_template_macro(invocation) + ) } - TemplateBodyValueExpr::Variable(variable) => { + TemplateBodyExprKind::Variable(variable) => { let arg_expr = self .environment - .get_expected(variable.signature_index()) - .expect("reference to non-existent parameter"); - let variable_ref = variable.resolve(self.template); - UnexpandedField::TemplateNameVariable(name, (variable_ref, arg_expr)) + .require_expr(variable.signature_index()); + let variable_ref = variable.resolve(self.template.macro_ref.reference()); + let field_name = LazyExpandedFieldName::TemplateName(self.template, name); + let field = match arg_expr { + ValueExpr::ValueLiteral(value) => UnexpandedField::NameValue(field_name, value.via_variable(variable_ref)), + ValueExpr::MacroInvocation(invocation) => UnexpandedField::NameMacro(field_name, invocation) + }; + field } }; - self.index += 2; + self.index += /* name expr count -> */ 1 + value_expr.num_expressions(); Some(Ok(unexpanded_field)) } } /// Stores a sequence of expansion steps that need to be evaluated in turn. /// -/// See [`TemplateBodyValueExpr`] for details. +/// See [`TemplateBodyExprKind`] for details. #[derive(Debug, Clone, PartialEq)] pub struct TemplateBody { - pub(crate) expressions: Vec, + // A vector of expressions that will be visited in turn during expansion. + pub(crate) expressions: Vec, // All of the elements stored in the Vec above share the Vec below for storing their annotations. // This allows us to avoid allocating a `Vec` for every value in the template, saving // a small amount of time and memory during compilation. 
Each values hold an index range @@ -361,35 +478,77 @@ pub struct TemplateBody { } impl TemplateBody { - pub fn expressions(&self) -> &[TemplateBodyValueExpr] { - &self.expressions + pub fn expressions(&self) -> &[TemplateBodyExpr] { + self.expressions.as_slice() } pub fn annotations_storage(&self) -> &[Symbol] { &self.annotations_storage } - pub fn push_element(&mut self, element: TemplateBodyElement) { + pub fn push_element(&mut self, element: TemplateBodyElement, expr_range: ExprRange) { self.expressions - .push(TemplateBodyValueExpr::Element(element)) + .push(TemplateBodyExpr::element(element, expr_range)) } pub fn push_variable(&mut self, signature_index: u16) { - self.expressions.push(TemplateBodyValueExpr::Variable( - TemplateBodyVariableReference::new(signature_index), - )) + let index = self.expressions.len(); + self.expressions.push(TemplateBodyExpr::variable(signature_index, ExprRange::new(index..index+1))) } pub fn push_macro_invocation(&mut self, invoked_macro_address: usize, expr_range: ExprRange) { self.expressions - .push(TemplateBodyValueExpr::MacroInvocation( - TemplateBodyMacroInvocation::new(invoked_macro_address, expr_range), - )) + .push(TemplateBodyExpr::macro_invocation(invoked_macro_address, expr_range)) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct TemplateBodyExpr { + kind: TemplateBodyExprKind, + expr_range: ExprRange, +} + +impl TemplateBodyExpr { + pub fn new(kind: TemplateBodyExprKind, expr_range: ExprRange) -> Self { + Self { kind, expr_range } + } + + pub fn element(element: TemplateBodyElement, expr_range: ExprRange) -> Self { + Self { + kind: TemplateBodyExprKind::Element(element), + expr_range + } + } + + pub fn variable(signature_index: u16, expr_range: ExprRange) -> Self { + Self { + kind: TemplateBodyExprKind::Variable(TemplateBodyVariableReference::new(signature_index)), + expr_range + } + } + + pub fn macro_invocation(invoked_macro_address: MacroAddress, expr_range: ExprRange) -> Self { + Self { + kind: 
TemplateBodyExprKind::MacroInvocation(TemplateBodyMacroInvocation::new(invoked_macro_address)), + expr_range + } + } + + pub fn kind(&self) -> &TemplateBodyExprKind { + &self.kind + } + + pub fn num_expressions(&self) -> usize { + self.expr_range.len() + } + + pub fn expr_range(&self) -> ExprRange { + self.expr_range } } /// An expression appearing in value position in a template body. #[derive(Debug, Clone, PartialEq)] -pub enum TemplateBodyValueExpr { +pub enum TemplateBodyExprKind { /// A potentially annotated value literal. Element(TemplateBodyElement), /// A reference to a variable that needs to be expanded. @@ -398,25 +557,17 @@ pub enum TemplateBodyValueExpr { MacroInvocation(TemplateBodyMacroInvocation), } -impl TemplateBodyValueExpr { - /// Returns `Ok(&element)` if this expression is an annotated value. Otherwise, returns - /// `Err(IonError)`. - pub fn expect_element(&self) -> IonResult<&TemplateBodyElement> { - match self { - TemplateBodyValueExpr::Element(e) => Ok(e), - TemplateBodyValueExpr::Variable(variable_reference) => { - let index = variable_reference.signature_index(); - IonResult::decoding_error(format!( - "expected an element, found reference variable with signature index '{index}'" - )) - } - TemplateBodyValueExpr::MacroInvocation(invocation) => { - let address = invocation.macro_address(); - IonResult::decoding_error(format!( - "expected an element, found macro at address {address}" - )) - } +impl TemplateBodyExprKind { + /// Confirms that this value expression is a value literal and panics if it is not. + /// + /// When this method is called, it is because the rules of the template compiler have + /// dictated that an element in this position be a value literal. 
+ #[inline] + pub fn require_element(&self) -> &TemplateBodyElement { + if let TemplateBodyExprKind::Element(e) = self { + return e; } + unreachable!("The compiled template contained a non-element in element position"); } /// This helper method is invoked by the `Debug` implementation of `TemplateMacro`, which provides @@ -428,17 +579,17 @@ impl TemplateBodyValueExpr { f: &mut Formatter<'_>, indentation: &mut String, host_template: &TemplateMacro, - expr: &TemplateBodyValueExpr, + expr: &TemplateBodyExpr, ) -> Result { - match &expr { - TemplateBodyValueExpr::Element(e) => { - Self::fmt_element(f, indentation, host_template, e) + match &expr.kind() { + TemplateBodyExprKind::Element(e) => { + Self::fmt_element(f, indentation, host_template, e, expr.expr_range()) } - TemplateBodyValueExpr::Variable(v) => { + TemplateBodyExprKind::Variable(v) => { Self::fmt_variable(f, indentation, host_template, v) } - TemplateBodyValueExpr::MacroInvocation(m) => { - Self::fmt_invocation(f, indentation, host_template, m) + TemplateBodyExprKind::MacroInvocation(m) => { + Self::fmt_invocation(f, indentation, host_template, m, expr.expr_range()) } } } @@ -451,6 +602,7 @@ impl TemplateBodyValueExpr { indentation: &mut String, host_template: &TemplateMacro, element: &TemplateBodyElement, + expr_range: ExprRange, ) -> Result { let annotations_range = element.annotations_range.ops_range(); let annotations = host_template @@ -464,17 +616,17 @@ impl TemplateBodyValueExpr { } use TemplateValue::*; match element.value() { - List(l) => { + List => { writeln!(f, "list")?; - return Self::fmt_sequence_body(f, indentation, host_template, *l); + return Self::fmt_sequence_body(f, indentation, host_template, expr_range); } - SExp(s) => { + SExp => { writeln!(f, "sexp")?; - return Self::fmt_sequence_body(f, indentation, host_template, *s); + return Self::fmt_sequence_body(f, indentation, host_template, expr_range); } - Struct(s, _) => { + Struct(_) => { writeln!(f, "struct")?; - return 
Self::fmt_struct(f, indentation, host_template, *s); + return Self::fmt_struct(f, indentation, host_template, expr_range); } Null(n) => writeln!(f, "{}", Value::Null(*n)), Bool(b) => writeln!(f, "{b}"), @@ -526,7 +678,7 @@ impl TemplateBodyValueExpr { indentation.push_str(" "); let mut expr_index: usize = 0; while expr_index < expressions.len() { - let TemplateBodyValueExpr::Element(name_element) = &expressions[expr_index] else { + let TemplateBodyExprKind::Element(name_element) = &expressions[expr_index].kind() else { unreachable!( "non-element field name in template struct: {:?}", &expressions[expr_index] @@ -558,6 +710,7 @@ impl TemplateBodyValueExpr { indentation: &mut String, host_template: &TemplateMacro, invocation: &TemplateBodyMacroInvocation, + expr_range: ExprRange, ) -> Result { writeln!( f, @@ -567,7 +720,7 @@ impl TemplateBodyValueExpr { let args = host_template .body .expressions - .get(invocation.arg_expr_range.ops_range()) + .get(expr_range.tail()) .unwrap(); indentation.push_str(" "); @@ -609,49 +762,36 @@ impl TemplateBodyValueExpr { #[derive(Debug, Copy, Clone, PartialEq)] pub struct TemplateBodyMacroInvocation { invoked_macro_address: MacroAddress, - arg_expr_range: ExprRange, } impl TemplateBodyMacroInvocation { - pub fn new(invoked_macro_address: MacroAddress, arg_expr_range: ExprRange) -> Self { + pub fn new(invoked_macro_address: MacroAddress) -> Self { Self { invoked_macro_address, - arg_expr_range, } } pub fn macro_address(&self) -> MacroAddress { self.invoked_macro_address } - pub fn arg_expr_range(&self) -> ExprRange { - self.arg_expr_range - } /// Finds the definition of the macro being invoked in the provided `context`'s macro table. /// /// It is a logic error for this method to be called with an [`EncodingContextRef`] that does not /// contain the necessary information; doing so will cause this method to panic. 
- pub(crate) fn resolve<'top>( + pub(crate) fn resolve( self, - host_template: TemplateMacroRef<'top>, - context: EncodingContextRef<'top>, - ) -> TemplateMacroInvocation<'top> { + context: EncodingContextRef, + host_template_address: MacroAddress, + expr_range: ExprRange + ) -> TemplateMacroInvocation { let invoked_macro = context .macro_table() .macro_at_address(self.invoked_macro_address) .unwrap(); - let arg_expressions = host_template - .body - .expressions() - .get(self.arg_expr_range.ops_range()) - .unwrap(); + let arg_expr_range = ExprRange::new(expr_range.tail()); - TemplateMacroInvocation { - context, - host_template, - invoked_macro, - arg_expressions, - } + TemplateMacroInvocation::new(context, host_template_address, invoked_macro, arg_expr_range) } } @@ -660,14 +800,13 @@ impl TemplateBodyMacroInvocation { #[derive(Copy, Clone)] pub struct TemplateMacroInvocation<'top> { context: EncodingContextRef<'top>, - // The definition of the template in which this macro invocation appears. This is useful as - // debugging information / viewing in stack traces. 
- host_template: TemplateMacroRef<'top>, + // We store the address of the host template (8 bytes) rather than a full TemplateMacroRef (24) + host_template_address: MacroAddress, // The macro being invoked invoked_macro: MacroRef<'top>, // The range of value expressions in the host template's body that are arguments to the // macro being invoked - arg_expressions: &'top [TemplateBodyValueExpr], + arg_expressions_range: ExprRange, } impl<'top> Debug for TemplateMacroInvocation<'top> { @@ -683,15 +822,15 @@ impl<'top> Debug for TemplateMacroInvocation<'top> { impl<'top> TemplateMacroInvocation<'top> { pub fn new( context: EncodingContextRef<'top>, - host_template: TemplateMacroRef<'top>, + host_template_address: MacroAddress, invoked_macro: MacroRef<'top>, - arg_expressions: &'top [TemplateBodyValueExpr], + arg_expressions_range: ExprRange, ) -> Self { Self { context, - host_template, + host_template_address, invoked_macro, - arg_expressions, + arg_expressions_range, } } @@ -702,13 +841,32 @@ impl<'top> TemplateMacroInvocation<'top> { &self, environment: Environment<'top, D>, ) -> TemplateMacroInvocationArgsIterator<'top, D> { - TemplateMacroInvocationArgsIterator::new(environment, *self) + TemplateMacroInvocationArgsIterator::new(self.context, environment, self.arg_expressions(), self.host_macro_ref()) + } + pub fn host_template_address(&self) -> MacroAddress { + self.host_template_address + } + + /// Helper method to access the definition of the host template. Useful for debugging, + /// but not required for macro expansion. + pub fn host_macro_ref(&self) -> MacroRef<'top> { + self.context().macro_table().macro_at_address(self.host_template_address).unwrap() } + + /// Helper method to access the definition of the host template. Useful for debugging, + /// but not required for macro expansion. 
pub fn host_template(&self) -> TemplateMacroRef<'top> { - self.host_template + // We only store the macro address (8 bytes) instead of the full `TemplateMacroRef` (24 bytes) + // for size savings. Because the address was copied from a resolved `TemplateMacroRef` in the + // constructor and the encoding context is frozen for the duration of `'top`, we can safely + // assume that the address maps to a template macro in the current encoding context. This + // allows us to call `unwrap()` freely. + let macro_ref = self.host_macro_ref(); + macro_ref.require_template() } - pub fn arg_expressions(&self) -> &'top [TemplateBodyValueExpr] { - self.arg_expressions + + pub fn arg_expressions(&self) -> &'top [TemplateBodyExpr] { + self.host_template().body().expressions().get(self.arg_expressions_range.ops_range()).unwrap() } pub fn invoked_macro(&self) -> MacroRef<'top> { self.invoked_macro @@ -716,77 +874,111 @@ impl<'top> TemplateMacroInvocation<'top> { pub fn context(&self) -> EncodingContextRef<'top> { self.context } + + pub fn new_evaluation_environment(&self, parent_environment: Environment<'top, D>) -> IonResult> { + let arguments = self.arguments(parent_environment); + let allocator = self.context().allocator(); + // Use the iterator's size hint to determine an initial capacity to aim for. + let num_args_hint = arguments.size_hint(); + let capacity_hint = num_args_hint.1.unwrap_or(num_args_hint.0); + let mut env_exprs = BumpVec::with_capacity_in(capacity_hint, allocator); + for arg in arguments { + env_exprs.push(arg?); + } + Ok(Environment::new(env_exprs.into_bump_slice())) + } + + pub fn expand(&self, mut environment: Environment<'top, D>) -> IonResult> { + // Initialize a `MacroExpansionKind` with the state necessary to evaluate the requested + // macro. 
+ let macro_ref: MacroRef<'top> = self.invoked_macro(); + let arguments = MacroExprArgsIterator::from_template_macro(self.arguments(environment)); + let expansion_kind = match macro_ref.kind() { + MacroKind::Void => MacroExpansionKind::Void, + MacroKind::Values => MacroExpansionKind::Values(ValuesExpansion::new(arguments)), + MacroKind::MakeString => { + MacroExpansionKind::MakeString(MakeStringExpansion::new(arguments)) + } + MacroKind::Template(template_body) => { + let template_ref = TemplateMacroRef::new(macro_ref, template_body); + environment = self.new_evaluation_environment(environment)?; + MacroExpansionKind::Template(TemplateExpansion::new(template_ref)) + } + }; + Ok(MacroExpansion::new(self.context(), environment, expansion_kind)) + } } impl<'top, D: Decoder> From> for MacroExpr<'top, D> { fn from(value: TemplateMacroInvocation<'top>) -> Self { - MacroExpr::TemplateMacro(value) + MacroExpr::from_template_macro(value) } } /// Steps over the argument expressions passed to a macro invocation found in a template body. 
+#[derive(Copy, Clone, Debug)] pub struct TemplateMacroInvocationArgsIterator<'top, D: Decoder> { + context: EncodingContextRef<'top>, environment: Environment<'top, D>, - invocation: TemplateMacroInvocation<'top>, + host_template: MacroRef<'top>, + // The range of value expressions in the host template's body that are arguments to the + // macro being invoked + arg_expressions: &'top [TemplateBodyExpr], arg_index: usize, } impl<'top, D: Decoder> TemplateMacroInvocationArgsIterator<'top, D> { - pub fn new( - environment: Environment<'top, D>, - invocation: TemplateMacroInvocation<'top>, - ) -> Self { + pub fn new(context: EncodingContextRef<'top>, environment: Environment<'top, D>, arg_expressions: &'top [TemplateBodyExpr], host_template: MacroRef<'top>) -> Self { Self { environment, - invocation, - arg_index: 0, + context, + arg_expressions, + host_template, + arg_index: 0 } } + + pub fn is_exhausted(&self) -> bool { + let current = self.arg_index; + let max = self.arg_expressions.len(); + current == max + } } impl<'top, D: Decoder> Iterator for TemplateMacroInvocationArgsIterator<'top, D> { type Item = IonResult>; + #[inline(always)] fn next(&mut self) -> Option { - let arg = self.invocation.arg_expressions().get(self.arg_index)?; - self.arg_index += 1; - let arg_expr = match arg { - TemplateBodyValueExpr::Element(e) => { - // If it's a container, skip over its contents when this iterator resumes - match e.value() { - TemplateValue::List(range) - | TemplateValue::SExp(range) - | TemplateValue::Struct(range, _) => { - self.arg_index += range.len(); - } - _ => { - // If it's a scalar, it has already been accounted for. 
- } - }; + let arg = self.arg_expressions.get(self.arg_index)?; + let arg_expr = match arg.kind() { + TemplateBodyExprKind::Element(e) => { ValueExpr::ValueLiteral(LazyExpandedValue::from_template( - self.invocation.context, + self.context, self.environment, - TemplateElement::new(self.invocation.host_template(), e), + TemplateElement::new(self.host_template, e, arg.expr_range()), )) } - TemplateBodyValueExpr::Variable(variable_ref) => match self - .environment - .get_expected(variable_ref.signature_index()) - { - Ok(expr) => expr, - Err(e) => return Some(Err(e)), + TemplateBodyExprKind::Variable(variable_ref) => { + self + .environment + .require_expr(variable_ref.signature_index()) }, - TemplateBodyValueExpr::MacroInvocation(body_invocation) => { + TemplateBodyExprKind::MacroInvocation(body_invocation) => { let invocation = body_invocation - .resolve(self.invocation.host_template(), self.invocation.context); - // Skip over all of the expressions that belong to this invocation. - self.arg_index += invocation.arg_expressions.len(); + .resolve(self.context, self.host_template.address(), arg.expr_range()); ValueExpr::MacroInvocation(invocation.into()) } }; + self.arg_index += arg.num_expressions(); Some(Ok(arg_expr)) } + + fn size_hint(&self) -> (usize, Option) { + let num_args = self.arg_expressions.len(); + (num_args, Some(num_args)) + } } /// A reference to a variable in a template body. @@ -813,12 +1005,12 @@ impl TemplateBodyVariableReference { /// about the template definition to be retrieved later. 
pub(crate) fn resolve<'top>( &self, - template: TemplateMacroRef<'top>, + host_macro: &'top Macro, ) -> TemplateVariableReference<'top> { - TemplateVariableReference { - template, - signature_index: self.signature_index, - } + TemplateVariableReference::new( + host_macro, + self.signature_index, + ) } } @@ -830,31 +1022,37 @@ impl TemplateBodyVariableReference { pub struct TemplateElement<'top> { // This type holds a reference to the host template macro, which contains some shared resources // like a `Vec` of annotation definitions. - template: TemplateMacroRef<'top>, + template: MacroRef<'top>, element: &'top TemplateBodyElement, + expr_range: ExprRange, } impl<'top> TemplateElement<'top> { - pub fn new(template: TemplateMacroRef<'top>, element: &'top TemplateBodyElement) -> Self { - Self { template, element } + pub fn new(template: MacroRef<'top>, element: &'top TemplateBodyElement, expr_range: ExprRange) -> Self { + Self { template, element, expr_range } } pub fn annotations(&self) -> &'top [Symbol] { self.template + .require_template() .body() .annotations_storage() .get(self.element.annotations_range().ops_range()) .unwrap() } - pub fn annotations_range(&self) -> AnnotationsRange { self.element.annotations_range } - pub fn value(&self) -> &'top TemplateValue { &self.element.value } pub fn template(&self) -> TemplateMacroRef<'top> { - self.template + self.template.require_template() + } + pub fn expr_range(&self) -> ExprRange { + self.expr_range + } + pub fn nested_expressions(&self) -> &'top [TemplateBodyExpr] { + self.template().body().expressions().get(self.expr_range.tail()).unwrap() } } @@ -911,10 +1109,10 @@ pub enum TemplateValue { Clob(Bytes), Blob(Bytes), // The range of ensuing `TemplateBodyValueExpr`s that belong to this container. - List(ExprRange), - SExp(ExprRange), + List, + SExp, // A 'closed' struct quasi-literal. All field names are known at compile time. 
- Struct(ExprRange, TemplateStructIndex), + Struct(TemplateStructIndex), // TODO: Implementation of a `make_struct` macro requires an 'open' struct whose fields will // often not be known at compile time. } @@ -922,7 +1120,7 @@ pub enum TemplateValue { /// A mapping of struct field names to one or more template body addresses that have that /// field name. This type is used to allow field lookups within a template struct to happen in /// constant rather than linear time. -pub type TemplateStructIndex = HashMap>; +pub type TemplateStructIndex = FxHashMap>; impl TemplateValue { pub fn is_null(&self) -> bool { @@ -944,9 +1142,9 @@ impl TemplateValue { String(_) => IonType::String, Clob(_) => IonType::Clob, Blob(_) => IonType::Blob, - List(_) => IonType::List, - SExp(_) => IonType::SExp, - Struct(_, _) => IonType::Struct, + List => IonType::List, + SExp => IonType::SExp, + Struct(_) => IonType::Struct, } } } @@ -1004,6 +1202,10 @@ impl SmallRange { self.start as usize..self.end as usize } + pub fn tail(&self) -> Range { + self.start as usize + 1 .. 
self.end as usize + } + pub fn len(&self) -> usize { (self.end - self.start) as usize } diff --git a/src/lazy/never.rs b/src/lazy/never.rs index 80e11dec..68c86805 100644 --- a/src/lazy/never.rs +++ b/src/lazy/never.rs @@ -1,4 +1,5 @@ use std::fmt::Debug; +use std::marker::PhantomData; use std::ops::Range; use crate::lazy::decoder::{Decoder, HasRange, HasSpan, LazyRawValueExpr}; @@ -8,8 +9,14 @@ use crate::lazy::encoder::value_writer::{ delegate_value_writer_to_self, AnnotatableWriter, ValueWriter, }; use crate::lazy::encoder::value_writer::{EExpWriter, SequenceWriter, StructWriter}; -use crate::lazy::expanded::macro_evaluator::{MacroExpr, RawEExpression}; +use crate::lazy::expanded::e_expression::ArgGroup; +use crate::lazy::expanded::macro_evaluator::{ + EExpArgGroupIterator, EExpressionArgGroup, RawEExpression, +}; +use crate::lazy::expanded::template::ParameterEncoding; +use crate::lazy::expanded::EncodingContextRef; use crate::lazy::span::Span; +use crate::lazy::text::raw::v1_1::arg_group::EExpArg; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; use crate::raw_symbol_ref::AsRawSymbolRef; use crate::{Decimal, Int, IonResult, IonType, Timestamp}; @@ -32,27 +39,6 @@ impl HasRange for Never { } } -// Ion 1.0 uses `Never` as a placeholder type for MacroInvocation. -// The compiler should optimize these methods away. -impl<'top, D: Decoder = Self>> RawEExpression<'top, D> for Never { - // These use Box to avoid defining yet another placeholder type. 
- type RawArgumentsIterator<'a> = Box>>>; - - fn id(&self) -> MacroIdRef<'top> { - unreachable!("macro in Ion 1.0 (method: id)") - } - - fn raw_arguments(&self) -> Self::RawArgumentsIterator<'_> { - unreachable!("macro in Ion 1.0 (method: arguments)") - } -} - -impl<'top, D: Decoder> From for MacroExpr<'top, D> { - fn from(_value: Never) -> Self { - unreachable!("macro in Ion 1.0 (method: into)") - } -} - impl SequenceWriter for Never { type Resources = (); @@ -74,7 +60,8 @@ impl StructWriter for Never { } impl MakeValueWriter for Never { - type ValueWriter<'a> = Never where Self: 'a; + type ValueWriter<'a> = Never + where Self: 'a; fn make_value_writer(&mut self) -> Self::ValueWriter<'_> { unreachable!("MakeValueWriter::value_writer in Never") @@ -84,7 +71,8 @@ impl MakeValueWriter for Never { impl EExpWriter for Never {} impl AnnotatableWriter for Never { - type AnnotatedValueWriter<'a> = Never where Self: 'a; + type AnnotatedValueWriter<'a> = Never + where Self: 'a; fn with_annotations<'a>( self, @@ -105,3 +93,94 @@ impl ValueWriter for Never { delegate_value_writer_to_self!(); } + +impl<'top, D: Decoder = Self>> RawEExpression<'top, D> for Never { + type RawArgumentsIterator = NeverEExpArgIterator<'top, D>; // Placeholder + + type ArgGroup = NeverArgGroup<'top, D>; + + fn id(self) -> MacroIdRef<'top> { + unreachable!("::id") + } + + fn raw_arguments(&self) -> Self::RawArgumentsIterator { + unreachable!("::raw_arguments") + } +} + +#[derive(Copy, Clone, Debug)] +pub struct NeverEExpArgIterator<'top, D: Decoder> { + spooky: PhantomData<&'top D>, + never: Never, +} + +impl<'top, D: Decoder> Iterator for NeverEExpArgIterator<'top, D> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + unreachable!("::next"); + } +} + +#[derive(Copy, Clone, Debug)] +pub struct NeverArgGroup<'top, D: Decoder> { + spooky: PhantomData<&'top D>, + never: Never, +} + +impl<'top, D: Decoder> IntoIterator for NeverArgGroup<'top, D> { + type Item = IonResult>; + type IntoIter = 
NeverArgGroupIterator<'top, D>; + + fn into_iter(self) -> Self::IntoIter { + unreachable!("::into_iter") + } +} + +#[derive(Copy, Clone, Debug)] +pub struct NeverArgGroupIterator<'top, D: Decoder> { + spooky: PhantomData<&'top D>, + never: Never, +} + +impl<'top, D: Decoder> Iterator for NeverArgGroupIterator<'top, D> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + unreachable!("::next") + } +} + +impl<'top, D: Decoder> EExpArgGroupIterator<'top, D> for NeverArgGroupIterator<'top, D> { + fn is_exhausted(&self) -> bool { + unreachable!("::is_exhausted") + } +} + +impl<'top, D: Decoder> HasRange for NeverArgGroup<'top, D> { + fn range(&self) -> Range { + unreachable!("::range") + } +} + +impl<'top, D: Decoder> HasSpan<'top> for NeverArgGroup<'top, D> { + fn span(&self) -> Span<'top> { + unreachable!("::span") + } +} + +impl<'top, D: Decoder> EExpressionArgGroup<'top, D> for NeverArgGroup<'top, D> { + type Iterator = NeverArgGroupIterator<'top, D>; + + fn encoding(&self) -> ParameterEncoding { + unreachable!("::encoding") + } + + fn resolve(self, _context: EncodingContextRef<'top>) -> ArgGroup<'top, D> { + unreachable!("::resolve") + } + + fn iter(self) -> Self::Iterator { + unreachable!("::iter") + } +} diff --git a/src/lazy/raw_stream_item.rs b/src/lazy/raw_stream_item.rs index 98ec6ecf..a55d6a56 100644 --- a/src/lazy/raw_stream_item.rs +++ b/src/lazy/raw_stream_item.rs @@ -16,7 +16,7 @@ pub enum RawStreamItem { /// for [`LazyRawBinaryValue`](crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0). Value(V), /// An Ion 1.1+ macro invocation. Ion 1.0 readers will never return a macro invocation. 
- EExpression(E), + EExp(E), /// The end of the stream EndOfStream(EndPosition), } @@ -32,7 +32,7 @@ impl<'top> LazyRawStreamItem<'top, AnyEncoding> { match self { LazyRawStreamItem::::VersionMarker(m) => m.encoding(), LazyRawStreamItem::::Value(v) => v.encoding(), - LazyRawStreamItem::::EExpression(e) => e.encoding(), + LazyRawStreamItem::::EExp(e) => e.encoding(), LazyRawStreamItem::::EndOfStream(eos) => eos.encoding(), } } @@ -46,7 +46,7 @@ impl HasRange match self { VersionMarker(marker) => marker.range(), Value(value) => value.range(), - EExpression(eexp) => eexp.range(), + EExp(eexp) => eexp.range(), EndOfStream(eos) => eos.range(), } } @@ -60,7 +60,7 @@ impl<'top, M: Debug + HasSpan<'top>, V: Debug + HasSpan<'top>, E: Debug + HasSpa match self { VersionMarker(marker) => marker.span(), Value(value) => value.span(), - EExpression(eexp) => eexp.span(), + EExp(eexp) => eexp.span(), EndOfStream(eos) => eos.span(), } } @@ -104,15 +104,15 @@ impl RawStreamItem { } pub fn as_macro_invocation(&self) -> Option<&E> { - if let Self::EExpression(m) = self { + if let Self::EExp(m) = self { Some(m) } else { None } } - pub fn expect_macro_invocation(self) -> IonResult { - if let Self::EExpression(m) = self { + pub fn expect_eexp(self) -> IonResult { + if let Self::EExp(m) = self { Ok(m) } else { IonResult::decoding_error(format!("expected a macro invocation, found {:?}", self)) diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 277082e4..a82d7d71 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -1,9 +1,14 @@ +use std::fmt::{Debug, Formatter}; + use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::Decoder; +use crate::lazy::expanded::EncodingContextRef; use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; -use crate::{Decimal, Int, IonResult, IonType, RawSymbolRef, Timestamp}; -use std::fmt::{Debug, Formatter}; +use crate::{ + Decimal, Int, IonResult, IonType, LazyExpandedList, LazyExpandedSExp, 
LazyExpandedStruct, + LazyList, LazySExp, LazyStruct, RawSymbolRef, Timestamp, ValueRef, +}; /// As RawValueRef represents a reference to an unresolved value read from the data stream. /// If the value is a symbol, it only contains the information found in the data stream (a symbol ID @@ -70,6 +75,31 @@ impl<'top, D: Decoder> Debug for RawValueRef<'top, D> { } impl<'top, D: Decoder> RawValueRef<'top, D> { + pub fn resolve(self, context: EncodingContextRef<'top>) -> IonResult> { + let value_ref = match self { + RawValueRef::Null(ion_type) => ValueRef::Null(ion_type), + RawValueRef::Bool(b) => ValueRef::Bool(b), + RawValueRef::Int(i) => ValueRef::Int(i), + RawValueRef::Float(f) => ValueRef::Float(f), + RawValueRef::Decimal(d) => ValueRef::Decimal(d), + RawValueRef::Timestamp(t) => ValueRef::Timestamp(t), + RawValueRef::String(s) => ValueRef::String(s), + RawValueRef::Symbol(s) => ValueRef::Symbol(s.resolve(context)?), + RawValueRef::Blob(b) => ValueRef::Blob(b), + RawValueRef::Clob(c) => ValueRef::Clob(c), + RawValueRef::SExp(s) => { + ValueRef::SExp(LazySExp::from(LazyExpandedSExp::from_literal(context, s))) + } + RawValueRef::List(l) => { + ValueRef::List(LazyList::from(LazyExpandedList::from_literal(context, l))) + } + RawValueRef::Struct(s) => ValueRef::Struct(LazyStruct::from( + LazyExpandedStruct::from_literal(context, s), + )), + }; + Ok(value_ref) + } + pub fn expect_null(self) -> IonResult { if let RawValueRef::Null(ion_type) = self { Ok(ion_type) diff --git a/src/lazy/reader.rs b/src/lazy/reader.rs index 2978bc97..e9eee12a 100644 --- a/src/lazy/reader.rs +++ b/src/lazy/reader.rs @@ -5,7 +5,6 @@ use crate::element::Element; use crate::lazy::decoder::Decoder; use crate::lazy::streaming_raw_reader::IonInput; use crate::lazy::system_reader::SystemReader; -use crate::lazy::text::raw::v1_1::reader::MacroAddress; use crate::lazy::value::LazyValue; use crate::read_config::ReadConfig; use crate::result::IonFailure; @@ -126,15 +125,21 @@ impl Reader { } } +use 
crate::lazy::{ + expanded::template::TemplateMacro, + text::raw::v1_1::reader::MacroAddress, +}; + impl Reader { - // Temporary method for defining/testing templates. This method does not confirm that the - // reader's encoding supports macros--that check will happen when encoding directives are - // supported. // TODO: Remove this when the reader can understand 1.1 encoding directives. - pub fn register_template(&mut self, template_definition: &str) -> IonResult { + pub fn register_template_src(&mut self, template_definition: &str) -> IonResult { self.system_reader .expanding_reader - .register_template(template_definition) + .register_template_src(template_definition) + } + + pub fn register_template(&mut self, template_macro: TemplateMacro) -> IonResult { + self.system_reader.expanding_reader.register_template(template_macro) } } @@ -183,6 +188,7 @@ mod tests { use crate::lazy::value_ref::ValueRef; use crate::write_config::WriteConfig; use crate::{ion_list, ion_sexp, ion_struct, v1_0, AnyEncoding, Int, IonResult, IonType}; + use crate::lazy::text::raw::v1_1::reader::MacroAddress; use super::*; @@ -296,7 +302,7 @@ mod tests { // Construct a reader for the encoded data. let mut reader = Reader::new(AnyEncoding, binary_ion.as_slice())?; // Register the template definition, getting the same ID we used earlier. - let actual_address = reader.register_template(macro_source)?; + let actual_address = reader.register_template_src(macro_source)?; assert_eq!( macro_address, actual_address, "Assigned macro address did not match expected address." 
diff --git a/src/lazy/sequence.rs b/src/lazy/sequence.rs index 3943309c..630a502c 100644 --- a/src/lazy/sequence.rs +++ b/src/lazy/sequence.rs @@ -7,7 +7,10 @@ use crate::lazy::expanded::sequence::{ ExpandedListIterator, ExpandedSExpIterator, LazyExpandedList, LazyExpandedSExp, }; use crate::lazy::value::{AnnotationsIterator, LazyValue}; -use crate::{Annotations, Element, IntoAnnotatedElement, Sequence, Value}; +use crate::{ + try_next, Annotations, Element, ExpandedListSource, ExpandedSExpSource, IntoAnnotatedElement, + LazyExpandedValue, LazyRawContainer, Sequence, Value, +}; use crate::{IonError, IonResult}; /// A list in a binary Ion stream whose header has been parsed but whose body @@ -59,6 +62,10 @@ pub struct LazyList<'top, D: Decoder> { pub type LazyBinarySequence<'top, 'data> = LazyList<'top, BinaryEncoding_1_0>; impl<'top, D: Decoder> LazyList<'top, D> { + pub(crate) fn new(expanded_list: LazyExpandedList<'top, D>) -> Self { + Self { expanded_list } + } + /// Returns an iterator over the values in this sequence. See: [`LazyValue`]. pub fn iter(&self) -> ListIterator<'top, D> { ListIterator { @@ -76,6 +83,18 @@ impl<'top, D: Decoder> LazyList<'top, D> { self.expanded_list } + pub fn as_value(&self) -> LazyValue<'top, D> { + let expanded_value = match self.expanded_list.source { + ExpandedListSource::ValueLiteral(v) => { + LazyExpandedValue::from_literal(self.expanded_list.context, v.as_value()) + } + ExpandedListSource::Template(env, element) => { + LazyExpandedValue::from_template(self.expanded_list.context, env, element) + } + }; + LazyValue::new(expanded_value) + } + /// Returns an iterator over the annotations on this value. If this value has no annotations, /// the resulting iterator will be empty. 
/// @@ -165,12 +184,7 @@ impl<'top, D: Decoder> Iterator for ListIterator<'top, D> { type Item = IonResult>; fn next(&mut self) -> Option { - let expanded_value = match self.expanded_list_iter.next() { - Some(Ok(expanded_value)) => expanded_value, - Some(Err(e)) => return Some(Err(e)), - None => return None, - }; - + let expanded_value = try_next!(self.expanded_list_iter.next()); let lazy_value = LazyValue { expanded_value }; Some(Ok(lazy_value)) } @@ -208,6 +222,10 @@ impl<'top, D: Decoder> Debug for LazySExp<'top, D> { } impl<'top, D: Decoder> LazySExp<'top, D> { + pub(crate) fn new(expanded_sexp: LazyExpandedSExp<'top, D>) -> Self { + Self { expanded_sexp } + } + #[cfg(feature = "experimental-tooling-apis")] pub fn expanded(&self) -> LazyExpandedSExp<'top, D> { self.expanded_sexp @@ -218,6 +236,18 @@ impl<'top, D: Decoder> LazySExp<'top, D> { self.expanded_sexp } + pub fn as_value(&self) -> LazyValue<'top, D> { + let expanded_value = match self.expanded_sexp.source { + ExpandedSExpSource::ValueLiteral(v) => { + LazyExpandedValue::from_literal(self.expanded_sexp.context, v.as_value()) + } + ExpandedSExpSource::Template(env, element) => { + LazyExpandedValue::from_template(self.expanded_sexp.context, env, element) + } + }; + LazyValue::new(expanded_value) + } + /// Returns an iterator over the values in this sequence. See: [`LazyValue`]. 
pub fn iter(&self) -> SExpIterator<'top, D> { SExpIterator { diff --git a/src/lazy/str_ref.rs b/src/lazy/str_ref.rs index a7427348..4c687613 100644 --- a/src/lazy/str_ref.rs +++ b/src/lazy/str_ref.rs @@ -20,7 +20,7 @@ impl<'data> StrRef<'data> { Str::from(self) } - pub fn text(&self) -> &str { + pub fn text(&self) -> &'data str { self.text } } diff --git a/src/lazy/streaming_raw_reader.rs b/src/lazy/streaming_raw_reader.rs index e9a4a41a..e59a4bf3 100644 --- a/src/lazy/streaming_raw_reader.rs +++ b/src/lazy/streaming_raw_reader.rs @@ -2,26 +2,25 @@ use std::cell::UnsafeCell; use std::fs::File; use std::io; use std::io::{BufReader, Read, StdinLock}; +use std::marker::PhantomData; use crate::lazy::any_encoding::IonEncoding; use crate::lazy::decoder::{Decoder, LazyRawReader}; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::LazyRawStreamItem; -use crate::{AnyEncoding, IonError, IonResult, LazyRawValue}; +use crate::{IonError, IonResult, LazyRawValue}; /// Wraps an implementation of [`IonDataSource`] and reads one top level value at a time from the input. pub struct StreamingRawReader { - // The Ion encoding that this reader recognizes. - encoding: Encoding, + // The type of decoder we're using. This type determines which `LazyRawReader` implementation + // is constructed for each slice of the input buffer. + decoder: PhantomData, + // The Ion encoding that this reader has been processing. // The StreamingRawReader works by reading the next value from the bytes currently available // in the buffer using a (non-streaming) raw reader. If the buffer is exhausted, it will read - // more data into the buffer and create a new raw reader. If any state needs to be preserved - // when moving from the old raw reader to the new one, that data's type will be set as the - // `Encoding`'s `ReaderSavedState`. 
- // At present, the only encoding that uses this is `AnyEncoding`, which needs to pass a record - // of the stream's detected encoding from raw reader to raw reader. For all other encodings, - // this is a zero-sized type and its associated operations are no-ops. - saved_state: Encoding::ReaderSavedState, + // more data into the buffer and create a new raw reader. If the raw reader uses `AnyEncoding`, + // the detected Ion encoding will be carried over from raw reader instance to raw reader instance. + detected_encoding: IonEncoding, // The absolute position of the reader within the overall stream. This is the index of the first // byte that has not yet been read. stream_position: usize, @@ -47,11 +46,12 @@ pub struct StreamingRawReader { const DEFAULT_IO_BUFFER_SIZE: usize = 4 * 1024; impl StreamingRawReader { - pub fn new(encoding: Encoding, input: Input) -> StreamingRawReader { + pub fn new(_encoding: Encoding, input: Input) -> StreamingRawReader { StreamingRawReader { - encoding, + decoder: PhantomData, + // This will be overwritten when reading begins + detected_encoding: IonEncoding::default(), input: input.into_data_source().into(), - saved_state: Default::default(), stream_position: 0, } } @@ -97,20 +97,19 @@ impl StreamingRawReader { >>::resume_at_offset( available_bytes, self.stream_position, - self.saved_state, + self.encoding(), )); let slice_reader = unsafe { &mut *unsafe_cell_reader.get() }; let starting_position = slice_reader.position(); + let old_encoding = slice_reader.encoding(); let result = slice_reader.next(context); // We're done modifying `slice_reader`, but we need to read some of its fields. These // fields are _not_ the data to which `result` holds a reference. We have to circumvent // the borrow checker's limitation (described in a comment on the StreamingRawReader type) // by getting a second (read-only) reference to the reader. 
let slice_reader_ref = unsafe { &*unsafe_cell_reader.get() }; - let encoding = slice_reader_ref.encoding(); + let new_encoding = slice_reader_ref.encoding(); let end_position = slice_reader_ref.position(); - // For the RawAnyReader, remember what encoding we detected for next time. - self.saved_state = slice_reader_ref.save_state(); let bytes_read = end_position - starting_position; let input = unsafe { &mut *self.input.get() }; @@ -149,7 +148,7 @@ impl StreamingRawReader { // // To avoid this, we perform a final check for text readers who have emptied their // buffer: we do not consider the item complete unless the input source is exhausted. - if encoding.is_text() + if old_encoding.is_text() && bytes_read == available_bytes.len() && !input_source_exhausted { @@ -158,7 +157,7 @@ impl StreamingRawReader { // Text containers and e-expressions have closing delimiters that allow us // to tell that they're complete. Value(v) if v.ion_type().is_container() => {} - EExpression(_eexp) => {} + EExp(_eexp) => {} // IVMs (which look like symbols), scalar values, and the end of the // stream are all cases where the reader looking at a fixed slice of the // buffer may reach the wrong conclusion. @@ -180,16 +179,16 @@ impl StreamingRawReader { // Update the streaming reader's position to reflect the number of bytes we // just read. self.stream_position = end_position; + // If the item read was an IVM, this will be a new value. + self.detected_encoding = new_encoding; } return result; } } -} -impl StreamingRawReader { pub fn encoding(&self) -> IonEncoding { - self.saved_state + self.detected_encoding } } @@ -574,7 +573,7 @@ mod tests { let context = empty_context.get_ref(); let mut reader = StreamingRawReader::new(v1_0::Text, IonStream::new(input)); - assert_eq!(reader.next(context)?.expect_ivm()?.version(), (1, 0)); + assert_eq!(reader.next(context)?.expect_ivm()?.major_minor(), (1, 0)); assert_eq!( reader .next(context)? 
diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index a6affb02..21673ffc 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -9,7 +9,6 @@ use crate::lazy::encoding::BinaryEncoding_1_0; use crate::lazy::expanded::r#struct::{ ExpandedStructIterator, ExpandedStructSource, LazyExpandedField, LazyExpandedStruct, }; -use crate::lazy::expanded::template::TemplateElement; use crate::lazy::expanded::LazyExpandedValue; use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::lazy::value_ref::ValueRef; @@ -73,6 +72,10 @@ impl<'top, D: Decoder> Debug for LazyStruct<'top, D> { } impl<'top, D: Decoder> LazyStruct<'top, D> { + pub(crate) fn new(expanded_struct: LazyExpandedStruct<'top, D>) -> Self { + Self { expanded_struct } + } + /// Returns an iterator over this struct's fields. See [`LazyField`]. pub fn iter(&self) -> StructIterator<'top, D> { StructIterator { @@ -95,13 +98,7 @@ impl<'top, D: Decoder> LazyStruct<'top, D> { ExpandedStructSource::ValueLiteral(v) => { LazyExpandedValue::from_literal(self.expanded_struct.context, v.as_value()) } - ExpandedStructSource::Template(env, template_ref, _, fields_range, _) => { - let element = TemplateElement::new( - template_ref, - template_ref.body().expressions()[fields_range.start() - 1] - .expect_element() - .unwrap(), - ); + ExpandedStructSource::Template(env, element, _) => { LazyExpandedValue::from_template(self.expanded_struct.context, env, element) } }; @@ -312,7 +309,11 @@ impl<'top, D: Decoder> Iterator for StructIterator<'top, D> { type Item = IonResult>; fn next(&mut self) -> Option { - StructIterator::next_field(self).transpose() + match StructIterator::next_field(self) { + Ok(Some(field)) => Some(Ok(field)), + Ok(None) => None, + Err(e) => Some(Err(e)), + } } } diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index 93e17002..39286654 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -1,16 +1,22 @@ #![allow(non_camel_case_types)] -use 
crate::lazy::any_encoding::IonEncoding; +use crate::lazy::any_encoding::{IonEncoding, IonVersion}; use crate::lazy::decoder::Decoder; -use crate::lazy::expanded::{ExpandedValueRef, ExpandingReader, LazyExpandedValue}; +use crate::lazy::expanded::compiler::TemplateCompiler; +use crate::lazy::expanded::encoding_module::EncodingModule; +use crate::lazy::expanded::macro_table::MacroTable; +use crate::lazy::expanded::template::TemplateMacro; +use crate::lazy::expanded::{ExpandingReader, LazyExpandedValue}; +use crate::lazy::sequence::SExpIterator; use crate::lazy::streaming_raw_reader::{IonInput, StreamingRawReader}; use crate::lazy::system_stream_item::SystemStreamItem; +use crate::lazy::text::raw::v1_1::reader::MacroAddress; use crate::lazy::value::LazyValue; use crate::read_config::ReadConfig; use crate::result::IonFailure; use crate::{ - AnyEncoding, Catalog, Int, IonError, IonResult, IonType, LazyExpandedField, RawSymbolRef, - Symbol, SymbolTable, + AnyEncoding, Catalog, Int, IonError, IonResult, IonType, LazyField, LazySExp, LazyStruct, + RawSymbolRef, Symbol, SymbolTable, ValueRef, }; use std::ops::Deref; use std::sync::Arc; @@ -29,11 +35,11 @@ const SYMBOLS: RawSymbolRef = RawSymbolRef::SymbolId(7); /// /// Each time [`SystemReader::next_item`] is called, the reader will advance to the next top-level /// value in the input stream. Once positioned on a top-level value, users may visit nested values by -/// calling [`LazyValue::read`] and working with the resulting [`crate::lazy::value_ref::ValueRef`], +/// calling [`LazyValue::read`] and working with the resulting [`ValueRef`], /// which may contain either a scalar value or a lazy container that may itself be traversed. 
/// /// The values that the reader yields ([`LazyValue`], -/// [`LazyList`](crate::LazyList), [`LazySExp`](crate::LazySExp) and [`LazyStruct`](crate::LazyStruct)), are immutable references to the data stream, +/// [`LazyList`](crate::LazyList), [`LazySExp`] and [`LazyStruct`]), are immutable references to the data stream, /// and remain valid until [`SystemReader::next_item`] is called again to advance the reader to /// the next top level value. This means that these references can be stored, read, and re-read as /// long as the reader remains on the same top-level value. @@ -81,20 +87,26 @@ pub struct SystemReader { // If the reader encounters a symbol table in the stream, it will store all of the symbols that // the table defines in this structure so that they may be applied when the reader next advances. #[derive(Default)] -pub struct PendingLst { +pub struct PendingContextChanges { + pub(crate) switch_to_version: Option, pub(crate) has_changes: bool, pub(crate) is_lst_append: bool, - pub(crate) symbols: Vec, pub(crate) imported_symbols: Vec, + pub(crate) symbols: Vec, + // A new encoding modules defined in the current encoding directive. + // TODO: Support for defining several modules + pub(crate) new_active_module: Option, } -impl PendingLst { +impl PendingContextChanges { pub fn new() -> Self { Self { + switch_to_version: None, has_changes: false, is_lst_append: false, symbols: Vec::new(), imported_symbols: Vec::new(), + new_active_module: None, } } pub fn local_symbols(&self) -> &[Symbol] { @@ -103,6 +115,17 @@ impl PendingLst { pub fn imported_symbols(&self) -> &[Symbol] { &self.imported_symbols } + pub fn has_changes(&self) -> bool { + self.has_changes + } + pub fn new_active_module(&self) -> Option<&EncodingModule> { + self.new_active_module.as_ref() + } + /// If there's a new module defined, returns `Some(new_module)` and sets `self.new_module` + /// to `None`. If there is no new module defined, returns `None`. 
+ pub(crate) fn take_new_active_module(&mut self) -> Option { + self.new_active_module.take() + } } impl SystemReader { @@ -116,26 +139,60 @@ impl SystemReader { SystemReader { expanding_reader } } - // Returns `true` if the provided [`LazyRawValue`] is a struct whose first annotation is - // `$ion_symbol_table`. + pub fn register_template_src(&mut self, template_definition: &str) -> IonResult { + self.expanding_reader + .register_template_src(template_definition) + } + + pub fn register_template(&mut self, template_macro: TemplateMacro) -> IonResult { + self.expanding_reader.register_template(template_macro) + } + + /// Returns `true` if the provided `LazyRawValue` is a struct whose first annotation is + /// `$ion_symbol_table`. Caller is responsible for confirming the struct appeared at the top + /// level. pub(crate) fn is_symbol_table_struct( - lazy_value: &'_ LazyExpandedValue<'_, Encoding>, + expanded_value: &'_ LazyExpandedValue<'_, Encoding>, ) -> IonResult { - if lazy_value.ion_type() != IonType::Struct { + if expanded_value.ion_type() != IonType::Struct || !expanded_value.has_annotations() { return Ok(false); } + let lazy_value = LazyValue::new(*expanded_value); if let Some(symbol_ref) = lazy_value.annotations().next() { - return Ok(symbol_ref?.matches_sid_or_text(3, "$ion_symbol_table")); + return Ok(symbol_ref? == "$ion_symbol_table"); }; Ok(false) } + /// Returns `true` if the provided `LazyRawValue` is an s-expression whose first annotation + /// is `$ion_encoding`. Caller is responsible for confirming the sexp appeared at the top + /// level AND that this stream is encoded using Ion 1.1. + pub(crate) fn is_encoding_directive_sexp( + lazy_value: &'_ LazyExpandedValue<'_, Encoding>, + ) -> IonResult { + if lazy_value.ion_type() != IonType::SExp { + return Ok(false); + } + if !lazy_value.has_annotations() { + return Ok(false); + } + // At this point, we've confirmed it's an annotated s-expression. 
We need to see if its + // first annotation has the text `$ion_encoding`, which may involve a lookup in the + // encoding context. We'll promote this LazyExpandedValue to a LazyValue to enable that. + let lazy_value = LazyValue::new(*lazy_value); + let first_annotation = lazy_value + .annotations() + .next() + .expect("already confirmed that there are annotations")?; + Ok(first_annotation.text() == Some("$ion_encoding")) + } + pub fn symbol_table(&self) -> &SymbolTable { self.expanding_reader.context().symbol_table() } - pub fn pending_symtab_changes(&self) -> &PendingLst { - self.expanding_reader.pending_lst() + pub fn pending_context_changes(&self) -> &PendingContextChanges { + self.expanding_reader.pending_context_changes() } /// Returns the next top-level stream item (IVM, Symbol Table, Value, or Nothing) as a @@ -158,10 +215,205 @@ impl SystemReader { }) } + pub(crate) fn process_encoding_directive( + pending_changes: &mut PendingContextChanges, + directive: LazyExpandedValue<'_, Encoding>, + ) -> IonResult<()> { + // We've already confirmed this is an annotated sexp + let directive = directive.read()?.expect_sexp()?; + for step in directive.iter() { + Self::process_encoding_directive_operation(pending_changes, step?)?; + } + Ok(()) + } + + pub(crate) fn process_encoding_directive_operation( + pending_changes: &mut PendingContextChanges, + value: LazyExpandedValue, + ) -> IonResult<()> { + let operation_sexp = LazyValue::new(value).read()?.expect_sexp().map_err(|_| { + IonError::decoding_error(format!( + "found an encoding directive step that was not an s-expression: {value:?}" + )) + })?; + + let mut values = operation_sexp.iter(); + let first_value = + Self::expect_next_sexp_value("encoding directive operation name", &mut values)?; + let step_name_text = + Self::expect_symbol_text("encoding directive operation name", first_value)?; + + match step_name_text { + "module" => todo!("defining a new named module"), + "symbol_table" => { + let symbol_table = 
Self::process_symbol_table_definition(operation_sexp)?; + let new_encoding_module = match pending_changes.take_new_active_module() { + None => EncodingModule::new( + "$ion_encoding".to_owned(), + MacroTable::new(), + symbol_table, + ), + Some(mut module) => { + module.set_symbol_table(symbol_table); + module + } + }; + pending_changes.new_active_module = Some(new_encoding_module); + } + "macro_table" => { + let macro_table = Self::process_macro_table_definition(operation_sexp)?; + let new_encoding_module = match pending_changes.take_new_active_module() { + None => EncodingModule::new( + "$ion_encoding".to_owned(), + macro_table, + SymbolTable::new(IonVersion::v1_1), + ), + Some(mut module) => { + module.set_macro_table(macro_table); + module + } + }; + pending_changes.new_active_module = Some(new_encoding_module); + } + _ => { + return IonResult::decoding_error(format!( + "unsupported encoding directive step '{step_name_text}'" + )) + } + } + Ok(()) + } + + fn process_module_definition( + _pending_changes: &mut PendingContextChanges, + module: LazySExp, + ) -> IonResult<()> { + let mut args = module.iter(); + // We've already looked at and validated the `name` to get to this point. We can skip it. 
+ let _operation = args.next(); // 'module' + let module_name = Self::expect_next_sexp_value("a module name", &mut args)?; + let module_name_text = Self::expect_symbol_text("a module name", module_name)?; + let symbol_table_value = + Self::expect_next_sexp_value("a `symbol_table` operation", &mut args)?; + let symbol_table_operation = + Self::expect_sexp("a `symbol_table` operation", symbol_table_value)?; + let macro_table_value = + Self::expect_next_sexp_value("a `macro_table` operation", &mut args)?; + let macro_table_operation = + Self::expect_sexp("a `macro_table` operation", macro_table_value)?; + + let symbol_table = Self::process_symbol_table_definition(symbol_table_operation)?; + let macro_table = Self::process_macro_table_definition(macro_table_operation)?; + + // TODO: Register the new module in `pending_changes`. + let _encoding_module = + EncodingModule::new(module_name_text.to_owned(), macro_table, symbol_table); + + Ok(()) + } + + fn process_symbol_table_definition(operation: LazySExp) -> IonResult { + let mut args = operation.iter(); + let operation_name_value = + Self::expect_next_sexp_value("a `symbol_table` operation name", &mut args)?; + let operation_name = + Self::expect_symbol_text("the operation name `symbol_table`", operation_name_value)?; + if operation_name != "symbol_table" { + return IonResult::decoding_error(format!( + "expected a symbol table definition operation, but found: {operation:?}" + )); + } + let mut symbol_table = SymbolTable::new(IonVersion::v1_1); + for arg in args { + let symbol_list = arg?.read()?.expect_list()?; + for value in symbol_list { + match value?.read()? 
{ + ValueRef::String(s) => symbol_table.add_symbol_for_text(s.text()), + ValueRef::Symbol(s) => symbol_table.add_symbol(s.to_owned()), + other => { + return IonResult::decoding_error(format!( + "found a non-text value in symbols list: {other:?}" + )) + } + }; + } + } + Ok(symbol_table) + } + + fn process_macro_table_definition(operation: LazySExp) -> IonResult { + let mut args = operation.iter(); + let operation_name_value = + Self::expect_next_sexp_value("a `macro_table` operation name", &mut args)?; + let operation_name = + Self::expect_symbol_text("the operation name `macro_table`", operation_name_value)?; + if operation_name != "macro_table" { + return IonResult::decoding_error(format!( + "expected a macro table definition operation, but found: {operation:?}" + )); + } + let mut macro_table = MacroTable::new(); + for arg in args { + let arg = arg?; + let context = operation.expanded_sexp.context; + let macro_def_sexp = arg.read()?.expect_sexp().map_err(|_| { + IonError::decoding_error(format!( + "macro_table had a non-sexp parameter: {}", + arg.ion_type() + )) + })?; + let new_macro = TemplateCompiler::compile_from_sexp(context, macro_def_sexp)?; + macro_table.add_macro(new_macro)?; + } + Ok(macro_table) + } + + fn expect_next_sexp_value<'a>( + label: &str, + iter: &mut SExpIterator<'a, Encoding>, + ) -> IonResult> { + iter.next().transpose()?.ok_or_else(|| { + IonError::decoding_error(format!( + "expected {label} but found no more values in the s-expression" + )) + }) + } + + fn expect_sexp<'a>( + label: &str, + value: LazyValue<'a, Encoding>, + ) -> IonResult> { + value.read()?.expect_sexp().map_err(|_| { + IonError::decoding_error(format!( + "expected an s-expression representing {label} but found a {}", + value.ion_type() + )) + }) + } + + fn expect_symbol_text<'a>( + label: &str, + lazy_value: LazyValue<'a, Encoding>, + ) -> IonResult<&'a str> { + lazy_value + .read()? 
+ .expect_symbol() + .map_err(|_| { + IonError::decoding_error(format!( + "found {label} with non-symbol type: {}", + lazy_value.ion_type() + )) + })? + .text() + .ok_or_else(|| { + IonError::decoding_error(format!("found {label} that had undefined text ($0)")) + }) + } + // Traverses a symbol table, processing the `symbols` and `imports` fields as needed to // populate the `PendingLst`. pub(crate) fn process_symbol_table( - pending_lst: &mut PendingLst, + pending_lst: &mut PendingContextChanges, catalog: &dyn Catalog, symbol_table: &LazyExpandedValue<'_, Encoding>, ) -> IonResult<()> { @@ -170,28 +422,41 @@ impl SystemReader { // We're interested in the `imports` field and the `symbols` field. Both are optional; // however, it is illegal to specify either field more than once. - let mut imports_field: Option> = None; - let mut symbols_field: Option> = None; + let mut imports_field: Option> = None; + let mut symbols_field: Option> = None; + + let symbol_table = LazyStruct { + expanded_struct: symbol_table, + }; // Iterate through the fields of the symbol table struct, taking note of `imports` and `symbols` // if we encounter them. for field_result in symbol_table.iter() { let field = field_result?; - if field.name().read_raw()?.matches_sid_or_text(6, "imports") { - if imports_field.is_some() { - return IonResult::decoding_error( - "found symbol table with multiple 'imports' fields", - ); + let Some(name) = field.name()?.text() else { + // If the field is $0, we don't care about it. 
+ continue; + }; + match name { + "imports" => { + if imports_field.is_some() { + return IonResult::decoding_error( + "found symbol table with multiple 'imports' fields", + ); + } + imports_field = Some(field); } - imports_field = Some(field); - } else if field.name().read_raw()?.matches_sid_or_text(7, "symbols") { - if symbols_field.is_some() { - return IonResult::decoding_error( - "found symbol table with multiple 'symbols' fields", - ); + "symbols" => { + if symbols_field.is_some() { + return IonResult::decoding_error( + "found symbol table with multiple 'symbols' fields", + ); + } + symbols_field = Some(field); } - symbols_field = Some(field); - } + // Other fields are ignored + _ => {} + }; } if let Some(imports_field) = imports_field { @@ -207,13 +472,12 @@ impl SystemReader { } fn clear_pending_lst_if_needed( - pending_lst: &mut PendingLst, - imports_value: LazyExpandedValue<'_, Encoding>, + pending_lst: &mut PendingContextChanges, + imports_value: LazyValue<'_, Encoding>, ) -> IonResult<()> { match imports_value.read()? { // If this is an LST append, there's nothing to do. - ExpandedValueRef::Symbol(raw_symbol) - if raw_symbol.matches_sid_or_text(3, "$ion_symbol_table") => {} + ValueRef::Symbol(symbol) if symbol == "$ion_symbol_table" => {} // If this is NOT an LST append, it will eventually cause the SymbolTable to reset. // However, at this point in the processing the PendingLst may have symbols that have // not yet made it to the SymbolTable. This can happen when a single top-level e-expression @@ -240,12 +504,12 @@ impl SystemReader { // Store any strings defined in the `symbols` field in the `PendingLst` for future application. fn process_symbols( - pending_lst: &mut PendingLst, - symbols: LazyExpandedValue<'_, Encoding>, + pending_lst: &mut PendingContextChanges, + symbols: LazyValue<'_, Encoding>, ) -> IonResult<()> { - if let ExpandedValueRef::List(list) = symbols.read()? { + if let ValueRef::List(list) = symbols.read()? 
{ for symbol_text_result in list.iter() { - if let ExpandedValueRef::String(str_ref) = symbol_text_result?.read()? { + if let ValueRef::String(str_ref) = symbol_text_result?.read()? { pending_lst .symbols .push(Symbol::shared(Arc::from(str_ref.deref()))) @@ -263,31 +527,29 @@ impl SystemReader { // Check for `imports: $ion_symbol_table`. fn process_imports( - pending_lst: &mut PendingLst, + pending_lst: &mut PendingContextChanges, catalog: &dyn Catalog, - imports: LazyExpandedValue<'_, Encoding>, + imports: LazyValue<'_, Encoding>, ) -> IonResult<()> { match imports.read()? { - ExpandedValueRef::Symbol(symbol_ref) => { - if symbol_ref.matches_sid_or_text(3, "$ion_symbol_table") { - pending_lst.is_lst_append = true; - } - // Any other symbol is ignored + // Any symbol other than `$ion_symbol_table` is ignored. + ValueRef::Symbol(symbol_ref) if symbol_ref == "$ion_symbol_table" => { + pending_lst.is_lst_append = true; } - ExpandedValueRef::List(list) => { + ValueRef::List(list) => { for value in list.iter() { - let ExpandedValueRef::Struct(import) = value?.read()? else { + let ValueRef::Struct(import) = value?.read()? else { // If there's a value in the imports list that isn't a struct, it's malformed. // Ignore that value. continue; }; let name = match import.get("name")? { // If `name` is missing, a non-string, or the empty string, ignore this import. - Some(ExpandedValueRef::String(s)) if !s.is_empty() => s, + Some(ValueRef::String(s)) if !s.is_empty() => s, _ => continue, }; let version: usize = match import.get("version")? { - Some(ExpandedValueRef::Int(i)) if i > Int::ZERO => usize::try_from(i) + Some(ValueRef::Int(i)) if i > Int::ZERO => usize::try_from(i) .map_err(|_| IonError::decoding_error(format!("found a symbol table import (name='{name}') with a version number too high to support: {i}")), ), @@ -304,12 +566,13 @@ impl SystemReader { }; let max_id = match import.get("max_id")? 
{ - Some(ExpandedValueRef::Int(i)) if i >= Int::ZERO => usize::try_from(i) - .map_err(|_| { + Some(ValueRef::Int(i)) if i >= Int::ZERO => { + usize::try_from(i).map_err(|_| { IonError::decoding_error( "found a `max_id` beyond the range of usize", ) - })?, + })? + } // If the max_id is unspecified, negative, or an invalid data type, we'll import all of the symbols from the requested table. _ => shared_table.symbols().len(), }; @@ -350,7 +613,9 @@ mod tests { use crate::lazy::binary::test_utilities::to_binary_ion; use crate::lazy::decoder::RawVersionMarker; use crate::lazy::system_stream_item::SystemStreamItem; - use crate::{v1_0::Binary, AnyEncoding, Catalog, IonResult, SymbolRef}; + use crate::{ + v1_0, AnyEncoding, Catalog, IonResult, SequenceWriter, SymbolRef, ValueWriter, Writer, + }; use super::*; @@ -368,13 +633,16 @@ mod tests { hello "#, )?; - let mut system_reader = SystemReader::new(Binary, ion_data); + let mut system_reader = SystemReader::new(v1_0::Binary, ion_data); loop { match system_reader.next_item()? { SystemStreamItem::VersionMarker(marker) => { println!("ivm => v{}.{}", marker.major(), marker.minor()) } SystemStreamItem::SymbolTable(ref s) => println!("symtab => {:?}", s), + SystemStreamItem::EncodingDirective(ref s) => { + println!("encoding directive => {:?}", s) + } SystemStreamItem::Value(ref v) => println!("value => {:?}", v.read()?), SystemStreamItem::EndOfStream(_) => break, } @@ -393,7 +661,7 @@ mod tests { ) "#, )?; - let mut system_reader = SystemReader::new(Binary, ion_data); + let mut system_reader = SystemReader::new(v1_0::Binary, ion_data); loop { match system_reader.next_item()? { SystemStreamItem::Value(value) => { @@ -420,7 +688,7 @@ mod tests { } "#, )?; - let mut system_reader = SystemReader::new(Binary, ion_data); + let mut system_reader = SystemReader::new(v1_0::Binary, ion_data); loop { match system_reader.next_item()? 
{ SystemStreamItem::Value(value) => { @@ -438,6 +706,8 @@ mod tests { // === Shared Symbol Tables === + use crate::lazy::encoder::binary::v1_1::writer::LazyRawBinaryWriter_1_1; + use crate::lazy::encoder::value_writer::AnnotatableWriter; use crate::{MapCatalog, SharedSymbolTable}; fn system_reader_for(ion: I) -> SystemReader { @@ -521,7 +791,7 @@ mod tests { // The reader has analyzed the symtab struct and identified what symbols will be added when // it advances beyond it. assert_eq!( - reader.pending_symtab_changes().local_symbols()[0].text(), + reader.pending_context_changes().local_symbols()[0].text(), Some("potato salad") ); @@ -531,11 +801,11 @@ mod tests { assert_eq!(reader.symbol_table().text_for(10), Some("potato salad")); // We can peak at the symbols that will be added by the second LST before they are applied. assert_eq!( - reader.pending_symtab_changes().imported_symbols(), + reader.pending_context_changes().imported_symbols(), &[Symbol::from("foo"), Symbol::from("bar")] ); assert_eq!( - reader.pending_symtab_changes().local_symbols(), + reader.pending_context_changes().local_symbols(), &[Symbol::from("local_symbol")] ); // Now we advance to the application data, confirming that the symbol IDs align with the @@ -579,9 +849,9 @@ mod tests { .as_slice(), map_catalog, ); - assert_eq!(reader.next_item()?.expect_ivm()?.version(), (1, 0)); + assert_eq!(reader.next_item()?.expect_ivm()?.major_minor(), (1, 0)); let _symtab = reader.next_item()?.expect_symbol_table()?; - let pending_imported_symbols = reader.pending_symtab_changes().imported_symbols(); + let pending_imported_symbols = reader.pending_context_changes().imported_symbols(); // This symbol table imports the symbols 'name' and 'foo'. 
assert_eq!(pending_imported_symbols[0].text(), Some("name")); assert_eq!(pending_imported_symbols[1].text(), Some("foo")); @@ -693,4 +963,141 @@ mod tests { assert_eq!(reader.expect_next_value()?.read()?.expect_symbol()?, "quuz"); Ok(()) } + + #[test] + fn detect_encoding_directive_text() -> IonResult<()> { + let text = r#" + $ion_1_1 + $ion_encoding::((symbol_table ["foo", "bar", "baz"])) + "#; + + let mut reader = SystemReader::new(AnyEncoding, text); + assert_eq!(reader.next_item()?.expect_ivm()?.major_minor(), (1, 1)); + reader.next_item()?.expect_encoding_directive()?; + Ok(()) + } + + #[test] + fn detect_encoding_directive_binary() -> IonResult<()> { + let mut writer = LazyRawBinaryWriter_1_1::new(Vec::new())?; + let mut directive = writer + .value_writer() + .with_annotations("$ion_encoding")? + .sexp_writer()?; + let mut symbol_table = directive.sexp_writer()?; + symbol_table.write_symbol("symbol_table")?; + symbol_table.write_list(["foo", "bar", "baz"])?; + symbol_table.close()?; + directive.close()?; + let binary_ion = writer.close()?; + + let mut reader = SystemReader::new(AnyEncoding, binary_ion); + assert_eq!(reader.next_item()?.expect_ivm()?.major_minor(), (1, 1)); + reader.next_item()?.expect_encoding_directive()?; + Ok(()) + } + + #[test] + fn ignore_encoding_directive_text_1_0() -> IonResult<()> { + let text = r#" + $ion_1_0 + // In Ion 1.0, this is just an annotated s-expression. + $ion_encoding::((symbol_table ["foo", "bar", "baz"])) + "#; + + let mut reader = SystemReader::new(AnyEncoding, text); + assert_eq!(reader.next_item()?.expect_ivm()?.major_minor(), (1, 0)); + let sexp = reader.next_item()?.expect_value()?.read()?.expect_sexp()?; + assert!(sexp.annotations().are(["$ion_encoding"])?); + Ok(()) + } + + #[test] + fn ignore_encoding_directive_binary_1_0() -> IonResult<()> { + let mut writer = Writer::new(v1_0::Binary, Vec::new())?; + let mut directive = writer + .value_writer() + .with_annotations("$ion_encoding")? 
+ .sexp_writer()?; + let mut symbol_table = directive.sexp_writer()?; + symbol_table.write_symbol("symbol_table")?; + symbol_table.write_list(["foo", "bar", "baz"])?; + symbol_table.close()?; + directive.close()?; + let bytes = writer.close()?; + + let mut reader = SystemReader::new(AnyEncoding, bytes); + assert_eq!(reader.next_item()?.expect_ivm()?.major_minor(), (1, 0)); + let _ = reader.next_item()?.expect_symbol_table()?; + let sexp = reader.next_item()?.expect_value()?.read()?.expect_sexp()?; + assert!(sexp.annotations().are(["$ion_encoding"])?); + Ok(()) + } + + #[test] + fn read_encoding_directive_new_active_module() -> IonResult<()> { + let ion = r#" + $ion_1_1 + $ion_encoding::( + (symbol_table ["foo", "bar", "baz"]) + (macro_table + (macro seventeen () 17) + (macro twelve () 12))) + (:seventeen) + (:twelve) + "#; + let mut reader = SystemReader::new(AnyEncoding, ion); + // Before reading any data, the reader defaults to expecting the Text v1.0 encoding, + // the only encoding that doesn't have to start with an IVM. + assert_eq!(reader.detected_encoding(), IonEncoding::Text_1_0); + + // The first thing the reader encounters is an IVM. Verify that all of its accessors report + // the expected values. + let ivm = reader.next_item()?.expect_ivm()?; + assert_eq!(ivm.major_minor(), (1, 1)); + assert_eq!(ivm.stream_encoding_before_marker(), IonEncoding::Text_1_0); + assert_eq!(ivm.stream_encoding_after_marker()?, IonEncoding::Text_1_1); + assert!(ivm.is_text()); + assert!(!ivm.is_binary()); + + // After encountering the IVM, the reader will have changed its detected encoding to Text v1.1. + assert_eq!(reader.detected_encoding(), IonEncoding::Text_1_1); + + // The next stream item is an encoding directive that defines some symbols and some macros. 
+ let _directive = reader.next_item()?.expect_encoding_directive()?; + + // === Make sure it has the expected symbol definitions === + let pending_changes = reader + .pending_context_changes() + .new_active_module() + .expect("this directive defines a new active module"); + let new_symbol_table = pending_changes.symbol_table(); + assert_eq!( + new_symbol_table.symbols_tail(3), + &[ + Symbol::from("foo"), + Symbol::from("bar"), + Symbol::from("baz") + ] + ); + + // === Make sure it has the expected macro definitions ==== + let new_macro_table = pending_changes.macro_table(); + // There are currently 3 supported system macros: void, values, and make_string. + // This directive defines two more. + assert_eq!(new_macro_table.len(), 2 + MacroTable::NUM_SYSTEM_MACROS); + assert_eq!( + new_macro_table.macro_with_id(3), + new_macro_table.macro_with_name("seventeen") + ); + assert_eq!( + new_macro_table.macro_with_id(4), + new_macro_table.macro_with_name("twelve") + ); + + // Expand the e-expressions to make sure the macro definitions work as expected. + assert_eq!(reader.expect_next_value()?.read()?.expect_i64()?, 17); + assert_eq!(reader.expect_next_value()?.read()?.expect_i64()?, 12); + Ok(()) + } } diff --git a/src/lazy/system_stream_item.rs b/src/lazy/system_stream_item.rs index 95bf9507..c1fb4587 100644 --- a/src/lazy/system_stream_item.rs +++ b/src/lazy/system_stream_item.rs @@ -1,12 +1,11 @@ use std::fmt::{Debug, Formatter}; use crate::lazy::decoder::{Decoder, RawVersionMarker}; -use crate::lazy::expanded::ExpandedValueSource; use crate::lazy::r#struct::LazyStruct; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::value::LazyValue; use crate::result::IonFailure; -use crate::{IonError, IonResult}; +use crate::{IonError, IonResult, LazySExp}; /// System stream elements that a SystemReader may encounter. 
#[non_exhaustive] @@ -14,8 +13,10 @@ pub enum SystemStreamItem<'top, D: Decoder> { /// An Ion Version Marker (IVM) indicating the Ion major and minor version that were used to /// encode the values that follow. VersionMarker(D::VersionMarker<'top>), - /// An Ion symbol table encoded as a struct annotated with `$ion_symbol_table`. + /// An Ion 1.0-style symbol table encoded as a struct annotated with `$ion_symbol_table`. SymbolTable(LazyStruct<'top, D>), + /// An Ion 1.1 encoding directive; an s-expression annotated with `$ion_encoding`. + EncodingDirective(LazySExp<'top, D>), /// An application-level Ion value Value(LazyValue<'top, D>), /// The end of the stream @@ -29,16 +30,20 @@ impl<'top, D: Decoder> Debug for SystemStreamItem<'top, D> { write!(f, "version marker v{}.{}", marker.major(), marker.minor()) } SystemStreamItem::SymbolTable(_) => write!(f, "a symbol table"), + SystemStreamItem::EncodingDirective(_) => write!(f, "an encoding directive"), SystemStreamItem::Value(value) => write!(f, "{}", value.ion_type()), SystemStreamItem::EndOfStream(_) => write!(f, ""), } } } +// Clippy complains that `as_` methods should return a reference. In this case, all of the types +// are `Copy`, so returning a copy isn't a problem. +#[allow(clippy::wrong_self_convention)] impl<'top, D: Decoder> SystemStreamItem<'top, D> { /// If this item is an Ion version marker (IVM), returns `Some(version_marker)` indicating the /// version. Otherwise, returns `None`. - pub fn version_marker(&self) -> Option> { + pub fn as_version_marker(&self) -> Option> { if let Self::VersionMarker(marker) = self { Some(*marker) } else { @@ -46,16 +51,16 @@ impl<'top, D: Decoder> SystemStreamItem<'top, D> { } } - /// Like [`Self::version_marker`], but returns a [`crate::IonError::Decoding`] if this item + /// Like [`Self::as_version_marker`], but returns a [`crate::IonError::Decoding`] if this item /// is not an IVM. 
pub fn expect_ivm(self) -> IonResult> { - self.version_marker() + self.as_version_marker() .ok_or_else(|| IonError::decoding_error(format!("expected IVM, found {:?}", self))) } /// If this item is a application-level value, returns `Some(&LazyValue)`. Otherwise, /// returns `None`. - pub fn value(&self) -> Option> { + pub fn as_value(&self) -> Option> { if let Self::Value(value) = self { Some(*value) } else { @@ -63,7 +68,7 @@ impl<'top, D: Decoder> SystemStreamItem<'top, D> { } } - /// Like [`Self::value`], but returns a [`IonError::Decoding`] if this item is not + /// Like [`Self::as_value`], but returns a [`IonError::Decoding`] if this item is not /// an application-level value. pub fn expect_value(self) -> IonResult> { if let Self::Value(value) = self { @@ -92,25 +97,35 @@ impl<'top, D: Decoder> SystemStreamItem<'top, D> { } } + /// If this item is an encoding directive, returns `Some(lazy_sexp)`. Otherwise, returns `None`. + pub fn as_encoding_directive(self) -> Option> { + if let Self::EncodingDirective(sexp) = self { + Some(sexp) + } else { + None + } + } + + /// Like [`Self::as_encoding_directive`], but returns a [`IonError::Decoding`] if this item is + /// not an encoding directive. + pub fn expect_encoding_directive(self) -> IonResult> { + if let Self::EncodingDirective(sexp) = self { + Ok(sexp) + } else { + IonResult::decoding_error(format!("expected encoding directive, found {:?}", self)) + } + } + pub fn raw_stream_item(&self) -> Option> { - let item = match self { - SystemStreamItem::VersionMarker(marker) => RawStreamItem::VersionMarker(*marker), - SystemStreamItem::SymbolTable(symtab) => { - use ExpandedValueSource::*; - match symtab.as_value().expanded().source { - ValueLiteral(literal) => RawStreamItem::Value(literal), - Template(..) | Constructed(..) => return None, - } - } - SystemStreamItem::Value(value) => { - use ExpandedValueSource::*; - match value.expanded().source { - ValueLiteral(literal) => RawStreamItem::Value(literal), - Template(..) | Constructed(..) 
=> return None, - } + let value = match self { + SystemStreamItem::VersionMarker(marker) => { + return Some(RawStreamItem::VersionMarker(*marker)) } - SystemStreamItem::EndOfStream(end) => RawStreamItem::EndOfStream(*end), + SystemStreamItem::SymbolTable(symtab) => symtab.as_value(), + SystemStreamItem::EncodingDirective(directive) => directive.as_value(), + SystemStreamItem::Value(value) => *value, + SystemStreamItem::EndOfStream(end) => return Some(RawStreamItem::EndOfStream(*end)), }; - Some(item) + value.raw().map(RawStreamItem::Value) } } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 01a46153..ada9c4b7 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -30,20 +30,25 @@ use crate::lazy::text::matched::{ MatchedFloat, MatchedInt, MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, }; -use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; +use crate::lazy::text::parse_result::{fatal_parse_error, InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; use crate::lazy::text::raw::r#struct::{LazyRawTextFieldName_1_0, RawTextStructIterator_1_0}; use crate::lazy::text::raw::sequence::{RawTextListIterator_1_0, RawTextSExpIterator_1_0}; +use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr, TextEExpArgGroup}; use crate::lazy::text::raw::v1_1::reader::{ - EncodedTextMacroInvocation, LazyRawTextFieldName_1_1, MacroIdRef, RawTextEExpression_1_1, - RawTextListIterator_1_1, RawTextSExpIterator_1_1, RawTextStructIterator_1_1, - TextListSpanFinder_1_1, TextSExpSpanFinder_1_1, TextStructSpanFinder_1_1, + LazyRawTextFieldName_1_1, MacroIdRef, RawTextListIterator_1_1, RawTextSExpIterator_1_1, + RawTextStructIterator_1_1, TextEExpression_1_1, TextListSpanFinder_1_1, TextSExpSpanFinder_1_1, + TextStructSpanFinder_1_1, }; use crate::lazy::text::value::{ LazyRawTextValue, LazyRawTextValue_1_0, LazyRawTextValue_1_1, 
LazyRawTextVersionMarker, }; use crate::result::DecodingError; -use crate::{Encoding, IonError, IonResult, IonType, TimestampPrecision}; +use crate::{Encoding, HasRange, IonError, IonResult, IonType, RawSymbolRef, TimestampPrecision}; + +use crate::lazy::expanded::macro_table::Macro; +use crate::lazy::expanded::template::{Parameter, RestSyntaxPolicy}; +use bumpalo::collections::Vec as BumpVec; impl<'a> Debug for TextBufferView<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -136,6 +141,10 @@ impl<'top> TextBufferView<'top> { } } + pub fn context(&self) -> EncodingContextRef<'top> { + self.context + } + pub fn local_lifespan<'a>(self) -> TextBufferView<'a> where 'top: 'a, @@ -368,7 +377,7 @@ impl<'top> TextBufferView<'top> { ) -> IonParseResult<'top, Option>> { whitespace_and_then(alt(( Self::match_e_expression.map(|matched| Some(RawValueExpr::EExp(matched))), - value(None, tag(")")), + value(None, peek(tag(")"))), pair( opt(Self::match_annotations), // We need the s-expression parser to recognize the input `--3` as the operator `--` and the @@ -488,7 +497,7 @@ impl<'top> TextBufferView<'top> { /// range of input bytes where the field name is found, and the value. pub fn match_struct_field_name_and_e_expression_1_1( self, - ) -> IonParseResult<'top, (MatchedFieldName<'top>, RawTextEExpression_1_1<'top>)> { + ) -> IonParseResult<'top, (MatchedFieldName<'top>, TextEExpression_1_1<'top>)> { terminated( separated_pair( whitespace_and_then(Self::match_struct_field_name), @@ -594,7 +603,7 @@ impl<'top> TextBufferView<'top> { // We check for IVMs first because the rules for a symbol identifier will match them. 
alt(( Self::match_ivm::.map(RawStreamItem::VersionMarker), - Self::match_e_expression.map(RawStreamItem::EExpression), + Self::match_e_expression.map(RawStreamItem::EExp), Self::match_annotated_value_1_1 .map(LazyRawTextValue_1_1::from) .map(RawStreamItem::Value), @@ -1006,57 +1015,293 @@ impl<'top> TextBufferView<'top> { Ok((remaining, (matched, fields))) } - /// Matches an e-expression invoking a macro. - /// - /// If the input does not contain the entire e-expression, returns `IonError::Incomplete(_)`. - pub fn match_e_expression(self) -> IonParseResult<'top, RawTextEExpression_1_1<'top>> { - let (exp_body, _) = tag("(:")(self)?; - // TODO: Support macro ID kinds besides unqualified names - let (exp_body_after_id, (macro_id_bytes, _matched_symbol)) = - consumed(Self::match_identifier)(exp_body)?; - // Because the macro_id used identifier syntax, its bytes must be ASCII. We can safely unwrap. - let macro_name = macro_id_bytes.as_text().unwrap(); - let macro_id = MacroIdRef::LocalName(macro_name); - - // The rest of the e-expression uses s-expression syntax. Scan ahead to find the end of this - // expression. - let sexp_iter = RawTextSExpIterator_1_1::new(exp_body_after_id); + pub fn match_e_expression_arg_group( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { + alt(( + Self::parser_with_arg(Self::match_explicit_arg_group, parameter), + Self::parser_with_arg(Self::match_rest, parameter), + ))(self) + } + + /// Higher-order helper that takes a closure and an argument to pass and constructs a new + /// parser that calls the closure with the provided argument. 
+ pub fn parser_with_arg( + mut parser: impl FnMut(Self, &'top A) -> IonParseResult<'top, O>, + arg_to_pass: &'top A, + ) -> impl Parser> { + move |input: TextBufferView<'top>| parser(input, arg_to_pass) + } + + pub fn match_explicit_arg_group( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { + let (group_body, group_head) = alt(( + // A trivially empty arg group: `(:)` + terminated(tag("(:"), peek(tag(")"))), + // An arg group that is not trivially empty, though it may only contain whitespace: + // (: ) + // (: 1 2 3) + recognize(pair(tag("(:"), Self::match_whitespace)), + ))(self)?; + + // The rest of the group uses s-expression syntax. Scan ahead to find the end of this + // group. + let sexp_iter = RawTextSExpIterator_1_1::new(group_body); // The sexp iterator holds the body of the expression. When finding the input span it occupies, - // we tell the iterator how many bytes comprised the head of the expression: two bytes - // for `(:` plus the length of the macro ID. - let initial_bytes_skipped = 2 + macro_id_bytes.len(); + // we tell the iterator how many bytes comprised the head of the expression: `(:` followed + // by whitespace. + let initial_bytes_skipped = group_head.len(); let (span, child_expr_cache) = match TextSExpSpanFinder_1_1::new(self.context.allocator(), sexp_iter) .find_span(initial_bytes_skipped) { Ok((span, child_expr_cache)) => (span, child_expr_cache), - // If the complete container isn't available, return an incomplete. + // If the complete group isn't available, return an incomplete. Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), // If invalid syntax was encountered, return a failure to prevent nom from trying // other parser kinds. 
Err(e) => { return { let error = InvalidInputError::new(self) - .with_label(format!( - "matching an e-expression invoking macro {}", - macro_name - )) + .with_label("matching an e-expression argument group") .with_description(format!("{}", e)); Err(nom::Err::Failure(IonParseError::Invalid(error))) } } }; - // For the matched span, we use `self` again to include the opening `(:` + // For the matched span, we use `self` again to include the opening `(:` and whitespace. let matched = self.slice(0, span.len()); let remaining = self.slice_to_end(span.len()); - let macro_invocation = RawTextEExpression_1_1::new( - macro_id, - EncodedTextMacroInvocation::new(macro_id_bytes.len() as u16), - matched, - child_expr_cache, + let arg_group = TextEExpArgGroup::new(parameter, matched, child_expr_cache); + + Ok((remaining, arg_group)) + } + + /// Matches an e-expression invoking a macro. + /// + /// If the input does not contain the entire e-expression, returns `IonError::Incomplete(_)`. + pub fn match_e_expression(self) -> IonParseResult<'top, TextEExpression_1_1<'top>> { + let (eexp_body, _opening_tag) = tag("(:")(self)?; + // TODO: Support macro ID kinds besides unqualified names + let (exp_body_after_id, (macro_id_bytes, matched_symbol)) = + consumed(Self::match_identifier)(eexp_body)?; + + let id = match matched_symbol + .read(self.context.allocator(), macro_id_bytes) + .expect("matched identifier but failed to read its bytes") + { + RawSymbolRef::SymbolId(_) => unreachable!("matched a text identifier, returned a SID"), + RawSymbolRef::Text(text) => MacroIdRef::LocalName(text), + }; + + let mut remaining = exp_body_after_id; + let mut arg_expr_cache = BumpVec::new_in(self.context.allocator()); + + let macro_ref: &'top Macro = self + .context() + .macro_table() + .macro_with_id(id) + .ok_or_else(|| { + nom::Err::Failure(IonParseError::Invalid( + InvalidInputError::new(self) + .with_description(format!("could not find macro with id {:?}", id)), + )) + })? 
+ .reference(); + let signature_params: &'top [Parameter] = macro_ref.signature().parameters(); + for (index, param) in signature_params.iter().enumerate() { + let (input_after_match, maybe_arg) = remaining.match_argument_for(param)?; + remaining = input_after_match; + match maybe_arg { + Some(arg) => arg_expr_cache.push(arg), + None => { + for param in &signature_params[index..] { + if param.rest_syntax_policy() == RestSyntaxPolicy::NotAllowed { + return fatal_parse_error( + self, + format!( + "e-expression did not include an argument for param '{}'", + param.name() + ), + ); + } + } + break; + } + } + } + let (remaining, _end_of_eexp) = match whitespace_and_then(tag(")")).parse(remaining) { + Ok(result) => result, + Err(_e) => { + return fatal_parse_error( + remaining, + format!( + "signature has {} parameter(s), e-expression had an extra argument", + signature_params.len() + ), + ) + } + }; + + let matched_input = self.slice(0, remaining.offset() - self.offset()); + + let parameters = macro_ref.signature().parameters(); + if arg_expr_cache.len() < parameters.len() { + // If expressions were not provided for all arguments, it was due to rest syntax. + // Non-required expressions in trailing position can be omitted. + // If we reach this point, the rest syntax check in the argument parsing logic above + // has already verified that using rest syntax was legal. We can add empty argument + // groups for each missing expression. + const EMPTY_ARG_TEXT: &str = "(: /* no expression specified */ )"; + let last_explicit_arg_end = arg_expr_cache + .last() + .map(|arg| arg.expr().range().end) + .unwrap_or(remaining.offset); + for parameter in &parameters[arg_expr_cache.len()..] 
{ + let buffer = TextBufferView::new_with_offset( + self.context, + EMPTY_ARG_TEXT.as_bytes(), + last_explicit_arg_end, + ); + arg_expr_cache.push(EExpArg::new( + parameter, + EExpArgExpr::ArgGroup(TextEExpArgGroup::new(parameter, buffer, &[])), + )); + } + } + debug_assert!( + arg_expr_cache.len() == parameters.len(), + "every parameter must have an argument, explicit or implicit" ); - Ok((remaining, macro_invocation)) + Ok(( + remaining, + TextEExpression_1_1::new(id, matched_input, arg_expr_cache.into_bump_slice()), + )) + } + + pub fn match_argument_for( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, Option>> { + use crate::lazy::expanded::template::ParameterCardinality::*; + match parameter.cardinality() { + ExactlyOne => { + let (remaining, arg) = self.match_exactly_one(parameter)?; + Ok((remaining, Some(arg))) + } + ZeroOrOne => self.match_zero_or_one(parameter), + ZeroOrMore => self.match_zero_or_more(parameter), + OneOrMore => self.match_one_or_more(parameter), + } + } + + pub fn match_exactly_one( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, EExpArg<'top, TextEncoding_1_1>> { + let (remaining, maybe_expr) = whitespace_and_then( + Self::match_sexp_value_1_1.map(|expr| expr.map(EExpArgExpr::::from)), + ) + .parse(self)?; + match maybe_expr { + Some(expr) => Ok((remaining, EExpArg::new(parameter, expr))), + None => fatal_parse_error( + self, + format!( + "expected argument for required parameter '{}'", + parameter.name() + ), + ), + } + } + + pub fn match_empty_arg_group(self) -> IonMatchResult<'top> { + recognize(pair(tag("(:"), whitespace_and_then(tag(")"))))(self) + } + + pub fn match_zero_or_one( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, Option>> { + whitespace_and_then(alt(( + Self::match_empty_arg_group.map(|_| None), + // TODO: Match a non-empty arg group and turn it into a failure with a helpful error message + Self::match_sexp_value_1_1.map(|maybe_expr| { + maybe_expr.map(|expr| { + 
EExpArg::new(parameter, EExpArgExpr::::from(expr)) + }) + }), + ))) + .parse(self) + } + + pub fn match_zero_or_more( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, Option>> { + let (remaining, maybe_expr) = preceded( + Self::match_optional_comments_and_whitespace, + alt(( + Self::parser_with_arg(Self::match_e_expression_arg_group, parameter) + .map(|group| Some(EExpArg::new(parameter, EExpArgExpr::ArgGroup(group)))), + Self::match_sexp_value_1_1.map(|expr| { + expr.map(EExpArgExpr::from) + .map(|expr| EExpArg::new(parameter, expr)) + }), + value(None, peek(tag(")"))), + )), + )(self)?; + Ok((remaining, maybe_expr)) + } + + pub fn match_one_or_more( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, Option>> { + if self.match_empty_arg_group().is_ok() { + return Err(nom::Err::Failure(IonParseError::Invalid( + InvalidInputError::new(self).with_description(format!( + "parameter '{}' is one-or-more (`+`) and cannot accept an empty stream", + parameter.name() + )), + ))); + } + + self.match_zero_or_more(parameter) + } + + pub fn match_rest( + self, + parameter: &'top Parameter, + ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { + if parameter.rest_syntax_policy() == RestSyntaxPolicy::NotAllowed { + return Err(nom::Err::Error(IonParseError::Invalid( + InvalidInputError::new(self) + .with_description("parameter does not support rest syntax"), + ))); + } + let mut remaining = self; + let mut cache = BumpVec::new_in(self.context().allocator()); + loop { + let (remaining_after_expr, maybe_expr) = alt(( + value(None, whitespace_and_then(peek(tag(")")))), + Self::match_sexp_value_1_1, + )) + .parse(remaining)?; + if let Some(expr) = maybe_expr { + remaining = remaining_after_expr; + cache.push(expr); + } else { + return Ok(( + remaining, + TextEExpArgGroup::new(parameter, self, cache.into_bump_slice()), + )); + } + } } /// Matches and returns a boolean value. 
@@ -1540,7 +1785,7 @@ impl<'top> TextBufferView<'top> { } /// Matches an identifier (`foo`). - fn match_identifier(self) -> IonParseResult<'top, MatchedSymbol> { + pub(crate) fn match_identifier(self) -> IonParseResult<'top, MatchedSymbol> { let (remaining, identifier_text) = recognize(terminated( pair( Self::identifier_initial_character, @@ -2167,7 +2412,7 @@ impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { /// Takes a given parser and returns a new one that accepts any amount of leading whitespace before /// calling the original parser. -fn whitespace_and_then<'data, P, O>( +pub fn whitespace_and_then<'data, P, O>( parser: P, ) -> impl Parser, O, IonParseError<'data>> where @@ -2232,6 +2477,9 @@ where #[cfg(test)] mod tests { + use crate::lazy::any_encoding::IonVersion; + use crate::lazy::expanded::compiler::TemplateCompiler; + use crate::lazy::expanded::template::{ParameterCardinality, ParameterEncoding}; use crate::lazy::expanded::EncodingContext; use rstest::rstest; @@ -2240,7 +2488,7 @@ mod tests { /// Stores an input string that can be tested against a given parser. struct MatchTest { input: String, - context: EncodingContextRef<'static>, + context: EncodingContext, } impl MatchTest { @@ -2249,17 +2497,22 @@ mod tests { fn new(input: &str) -> Self { MatchTest { input: input.to_string(), - // This uses `leak` to get an `EncodingContextRef` with a `static` lifetime - // for the sake of unit test simplicity. 
- context: EncodingContextRef::new(Box::leak(Box::new(EncodingContext::empty()))), + context: EncodingContext::for_ion_version(IonVersion::v1_1), } } + fn register_macro(&mut self, text: &str) -> &mut Self { + let new_macro = + TemplateCompiler::compile_from_text(self.context.get_ref(), text).unwrap(); + self.context.macro_table.add_macro(new_macro).unwrap(); + self + } + fn try_match<'data, P, O>(&'data self, parser: P) -> IonParseResult<'data, usize> where P: Parser, O, IonParseError<'data>>, { - let buffer = TextBufferView::new(self.context, self.input.as_bytes()); + let buffer = TextBufferView::new(self.context.get_ref(), self.input.as_bytes()); match_length(parser).parse(buffer) } @@ -2353,6 +2606,21 @@ mod tests { }; } + macro_rules! matcher_tests_with_macro { + ($parser:ident $macro_src:literal $($expect:ident: [$($input:literal),+$(,)?]),+$(,)?) => { + mod $parser { + use super::*; + $( + #[test] + fn $expect() { + $(MatchTest::new($input.trim()).register_macro($macro_src).$expect(match_length(TextBufferView::$parser));) + + + } + )+ + } + }; + } + #[test] fn test_match_stop_char() { MatchTest::new(" ").expect_match(match_length(TextBufferView::match_stop_character)); @@ -2634,8 +2902,9 @@ mod tests { expect_incomplete: ["(", "(1 2 (3 4 5)"] } - matcher_tests! { + matcher_tests_with_macro! { match_sexp_1_1 + "(macro foo (x*) null)" expect_match: [ "()", "(1)", @@ -2667,8 +2936,9 @@ mod tests { ] } - matcher_tests! { + matcher_tests_with_macro! { match_list_1_1 + "(macro foo (x*) null)" expect_match: [ "[]", "[1]", "[1, 2]", "[[]]", "[([])]", "[1, (:foo 2 3)]" ], @@ -2680,8 +2950,9 @@ mod tests { ] } - matcher_tests! { + matcher_tests_with_macro! 
{ match_e_expression + "(macro foo (x*) null)" expect_match: [ "(:foo)", "(:foo 1)", @@ -2699,25 +2970,49 @@ mod tests { ] } + #[rstest] + #[case::empty("(:)")] + #[case::empty_with_extra_spacing("(: )")] + #[case::single_value("(: 1)")] + #[case::multiple_values("(: 1 2 3)")] + #[case::eexp("(: foo 1 2 3)")] + #[case::eexp_with_sexp("(: (foo 1 2 3))")] + #[case::eexp_with_mixed_values("(: 1 2 3 {quux: [1, 2, 3]} 4 bar::5 baz::6)")] + fn match_eexp_arg_group(#[case] input: &str) { + let parameter = Parameter::new( + "x", + ParameterEncoding::Tagged, + ParameterCardinality::ZeroOrMore, + RestSyntaxPolicy::NotAllowed, + ); + MatchTest::new(input) + .register_macro("(macro foo (x*) null)") + .expect_match(match_length(TextBufferView::parser_with_arg( + TextBufferView::match_explicit_arg_group, + &parameter, + ))) + } + #[rstest] #[case::simple_e_exp("(:foo)")] - #[case::e_exp_in_e_exp("(:foo (:bar 1))")] + #[case::e_exp_in_e_exp("(:foo (bar 1))")] #[case::e_exp_in_list("[a, b, (:foo 1)]")] #[case::e_exp_in_sexp("(a (:foo 1) c)")] - // #[case::e_exp_in_struct_field("{a:(:foo)}")] - // #[case::e_exp_in_struct_field_with_comma("{a:(:foo),}")] + #[case::e_exp_in_struct_field("{a:(:foo)}")] + #[case::e_exp_in_struct_field_with_comma("{a:(:foo),}")] #[case::e_exp_in_struct_field_with_comma_and_second_field("{a:(:foo), b:2}")] - // #[case::e_exp_in_struct_field_with_space_before("{ a:(:foo)}")] - // #[case::e_exp_in_struct_field_with_space_after("{a:(:foo) }")] + #[case::e_exp_in_struct_field_with_space_before("{ a:(:foo)}")] + #[case::e_exp_in_struct_field_with_space_after("{a:(:foo) }")] #[case::e_exp_in_list_in_struct_field("{ a: [(:foo)] }")] #[case::e_exp_in_sexp_in_struct_field("{ a: ((:foo)) }")] #[case::e_exp_in_sexp_in_list("[a, b, ((:foo 1))]")] #[case::e_exp_in_sexp_in_sexp("(a ((:foo 1)) c)")] #[case::e_exp_in_list_in_list("[a, b, [(:foo 1)]]")] #[case::e_exp_in_list_in_sexp("(a [(:foo 1)] c)")] - // TODO: Uncomment the above cases when fixing
https://github.com/amazon-ion/ion-rust/issues/653 fn test_match_macro_invocation_in_context(#[case] input: &str) { - MatchTest::new(input).expect_match(match_length(TextBufferView::match_top_level_item_1_1)); + MatchTest::new(input) + .register_macro("(macro foo (x*) null)") + .expect_match(match_length(TextBufferView::match_top_level_item_1_1)); } matcher_tests! { diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 75f868b6..a7430b88 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -1,13 +1,12 @@ #![allow(non_camel_case_types)] use crate::lazy::any_encoding::IonEncoding; -use crate::lazy::decoder::{Decoder, LazyRawReader, RawVersionMarker}; +use crate::lazy::decoder::LazyRawReader; use crate::lazy::encoding::TextEncoding_1_0; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::AddContext; -use crate::result::IonFailure; use crate::{Encoding, IonResult}; /// A text Ion 1.0 reader that yields [`LazyRawStreamItem`]s representing the top level values found @@ -76,18 +75,6 @@ impl<'data> LazyRawTextReader_1_0<'data> { buffer_after_item, )?; - if let RawStreamItem::VersionMarker(version_marker) = matched_item { - // TODO: It is not the raw reader's responsibility to report this error. It should - // surface the IVM to the caller, who can then either create a different reader - // for the reported version OR raise an error. - // See: https://github.com/amazon-ion/ion-rust/issues/644 - let (major, minor) = version_marker.version(); - if (major, minor) != (1, 0) { - return IonResult::decoding_error(format!( - "Ion version {major}.{minor} is not supported" - )); - } - } // Since we successfully matched the next value, we'll update the buffer // so a future call to `next()` will resume parsing the remaining input. 
self.local_offset = buffer_after_trailing_ws.offset() - self.stream_offset; @@ -99,11 +86,20 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_0> for LazyRawTextReader_1_0<'da fn resume_at_offset( data: &'data [u8], offset: usize, - _config: ::ReaderSavedState, + // This argument is ignored by all raw readers except LazyRawAnyReader + _encoding_hint: IonEncoding, ) -> Self { LazyRawTextReader_1_0::new_with_offset(data, offset) } + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + ( + &self.input[self.local_offset..], + self.position(), + self.encoding(), + ) + } + fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, @@ -129,7 +125,7 @@ mod tests { use crate::lazy::expanded::EncodingContext; use crate::lazy::raw_value_ref::RawValueRef; use crate::raw_symbol_ref::AsRawSymbolRef; - use crate::{Decimal, IonType, RawSymbolRef, Timestamp}; + use crate::{Decimal, IonType, RawSymbolRef, RawVersionMarker, Timestamp}; use super::*; @@ -304,7 +300,7 @@ mod tests { context: encoding_context.get_ref(), }; - assert_eq!(reader.next()?.expect_ivm()?.version(), (1, 0)); + assert_eq!(reader.next()?.expect_ivm()?.major_minor(), (1, 0)); // null reader.expect_next(RawValueRef::Null(IonType::Null)); diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs index 31ed46c0..263feb92 100644 --- a/src/lazy/text/raw/struct.rs +++ b/src/lazy/text/raw/struct.rs @@ -106,7 +106,7 @@ impl<'top> HasRange for LazyRawTextFieldName_1_0<'top> { } } -impl<'top> LazyRawFieldName<'top> for LazyRawTextFieldName_1_0<'top> { +impl<'top> LazyRawFieldName<'top, TextEncoding_1_0> for LazyRawTextFieldName_1_0<'top> { fn read(&self) -> IonResult> { self.matched.read() } diff --git a/src/lazy/text/raw/v1_1/arg_group.rs b/src/lazy/text/raw/v1_1/arg_group.rs new file mode 100644 index 00000000..e2f1b05d --- /dev/null +++ b/src/lazy/text/raw/v1_1/arg_group.rs @@ -0,0 +1,184 @@ +use std::ops::Range; + +use crate::lazy::decoder::{LazyRawValueExpr, RawValueExpr}; +use 
crate::lazy::encoding::TextEncoding_1_1; +use crate::lazy::expanded::e_expression::ArgGroup; +use crate::lazy::expanded::macro_evaluator::{ + EExpArgGroupIterator, EExpressionArgGroup, MacroExpr, RawEExpression, ValueExpr, +}; +use crate::lazy::expanded::template::{Parameter, ParameterEncoding}; +use crate::lazy::expanded::EncodingContextRef; +use crate::lazy::text::buffer::TextBufferView; +use crate::result::IonFailure; +use crate::{Decoder, HasRange, HasSpan, IonResult, LazyExpandedValue, Span}; + +#[derive(Copy, Clone, Debug)] +pub struct EExpArg<'top, D: Decoder> { + parameter: &'top Parameter, + expr: EExpArgExpr<'top, D>, +} + +impl<'top, D: Decoder> EExpArg<'top, D> { + pub fn new(parameter: &'top Parameter, expr: EExpArgExpr<'top, D>) -> Self { + Self { parameter, expr } + } + + pub fn encoding(&self) -> &'top Parameter { + self.parameter + } + + pub fn expr(&self) -> &EExpArgExpr<'top, D> { + &self.expr + } + + pub fn resolve(&self, context: EncodingContextRef<'top>) -> IonResult> { + let value_expr = match self.expr { + EExpArgExpr::ValueLiteral(value) => { + ValueExpr::ValueLiteral(LazyExpandedValue::from_literal(context, value)) + } + EExpArgExpr::EExp(eexp) => { + ValueExpr::MacroInvocation(MacroExpr::from_eexp(eexp.resolve(context)?)) + } + EExpArgExpr::ArgGroup(group) => { + ValueExpr::MacroInvocation(MacroExpr::from_eexp_arg_group(group.resolve(context))) + } + }; + Ok(value_expr) + } +} + +#[derive(Copy, Clone, Debug)] +pub enum EExpArgExpr<'top, D: Decoder> { + ValueLiteral(::Value<'top>), + EExp(::EExp<'top>), + ArgGroup(<::EExp<'top> as RawEExpression<'top, D>>::ArgGroup), +} + +impl<'top, D: Decoder> EExpArgExpr<'top, D> { + pub fn expect_value(&self) -> IonResult<::Value<'top>> { + let EExpArgExpr::ValueLiteral(value) = self else { + return IonResult::decoding_error(format!("expected a value literal, found {self:?}")); + }; + Ok(*value) + } + + pub fn expect_eexp(&self) -> IonResult<::EExp<'top>> { + let EExpArgExpr::EExp(eexp) = self else { + 
return IonResult::decoding_error(format!("expected an e-expression, found {self:?}")); + }; + Ok(*eexp) + } + + pub fn expect_arg_group( + &self, + ) -> IonResult<<::EExp<'top> as RawEExpression<'top, D>>::ArgGroup> { + let EExpArgExpr::ArgGroup(group) = self else { + return IonResult::decoding_error(format!("expected an arg group, found {self:?}")); + }; + Ok(*group) + } +} + +impl<'top, D: Decoder> From> for EExpArgExpr<'top, D> { + fn from(value: LazyRawValueExpr<'top, D>) -> Self { + match value { + RawValueExpr::ValueLiteral(v) => EExpArgExpr::ValueLiteral(v), + RawValueExpr::EExp(e) => EExpArgExpr::EExp(e), + } + } +} + +impl<'top, D: Decoder> HasRange for EExpArgExpr<'top, D> { + fn range(&self) -> Range { + match self { + EExpArgExpr::ValueLiteral(v) => v.range(), + EExpArgExpr::EExp(e) => e.range(), + EExpArgExpr::ArgGroup(a) => a.range(), + } + } +} + +#[derive(Copy, Clone, Debug)] +pub struct TextEExpArgGroup<'top> { + input: TextBufferView<'top>, + parameter: &'top Parameter, + // Notice that the expressions inside an arg group cannot themselves be arg groups, + // only value literals or e-expressions. 
+ expr_cache: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], +} + +impl<'top> TextEExpArgGroup<'top> { + pub fn new( + parameter: &'top Parameter, + input: TextBufferView<'top>, + child_expr_cache: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], + ) -> Self { + Self { + input, + parameter, + expr_cache: child_expr_cache, + } + } +} + +impl<'top> HasRange for TextEExpArgGroup<'top> { + fn range(&self) -> Range { + self.input.range() + } +} + +impl<'top> HasSpan<'top> for TextEExpArgGroup<'top> { + fn span(&self) -> Span<'top> { + Span::with_offset(self.input.offset(), self.input.bytes()) + } +} + +#[derive(Copy, Clone, Debug)] +pub struct TextEExpArgGroupIterator<'top> { + child_expr_cache: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], + index: usize, +} + +impl<'top> EExpArgGroupIterator<'top, TextEncoding_1_1> for TextEExpArgGroupIterator<'top> { + fn is_exhausted(&self) -> bool { + self.index == self.child_expr_cache.len() + } +} + +impl<'top> Iterator for TextEExpArgGroupIterator<'top> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + let child_expr = self.child_expr_cache.get(self.index)?; + self.index += 1; + Some(Ok(*child_expr)) + } +} + +impl<'top> IntoIterator for TextEExpArgGroup<'top> { + type Item = IonResult>; + type IntoIter = TextEExpArgGroupIterator<'top>; + + fn into_iter(self) -> Self::IntoIter { + TextEExpArgGroupIterator { + child_expr_cache: self.expr_cache, + index: 0, + } + } +} + +impl<'top> EExpressionArgGroup<'top, TextEncoding_1_1> for TextEExpArgGroup<'top> { + type Iterator = TextEExpArgGroupIterator<'top>; + + fn encoding(&self) -> ParameterEncoding { + self.parameter.encoding() + } + + fn resolve(self, context: EncodingContextRef<'top>) -> ArgGroup<'top, TextEncoding_1_1> { + ArgGroup::new(self, context) + } + + fn iter(self) -> Self::Iterator { + self.into_iter() + } +} diff --git a/src/lazy/text/raw/v1_1/mod.rs b/src/lazy/text/raw/v1_1/mod.rs index 1077754f..dfd9f19e 100644 --- a/src/lazy/text/raw/v1_1/mod.rs 
+++ b/src/lazy/text/raw/v1_1/mod.rs @@ -1 +1,2 @@ +pub mod arg_group; pub mod reader; diff --git a/src/lazy/text/raw/v1_1/reader.rs b/src/lazy/text/raw/v1_1/reader.rs index 164e3ccf..0158ba15 100644 --- a/src/lazy/text/raw/v1_1/reader.rs +++ b/src/lazy/text/raw/v1_1/reader.rs @@ -12,7 +12,6 @@ use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{ Decoder, HasRange, HasSpan, LazyRawContainer, LazyRawFieldExpr, LazyRawFieldName, LazyRawReader, LazyRawSequence, LazyRawStruct, LazyRawValue, LazyRawValueExpr, - RawVersionMarker, }; use crate::lazy::encoding::TextEncoding_1_1; use crate::lazy::expanded::macro_evaluator::RawEExpression; @@ -22,9 +21,9 @@ use crate::lazy::span::Span; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::matched::{MatchedFieldName, MatchedValue}; use crate::lazy::text::parse_result::{AddContext, ToIteratorOutput}; +use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, TextEExpArgGroup}; use crate::lazy::text::value::{LazyRawTextValue_1_1, RawTextAnnotationsIterator}; -use crate::result::IonFailure; -use crate::{Encoding, IonResult, IonType, RawSymbolRef}; +use crate::{v1_1, Encoding, IonResult, IonType, RawSymbolRef}; pub struct LazyRawTextReader_1_1<'data> { input: &'data [u8], @@ -38,7 +37,8 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_1> for LazyRawTextReader_1_1<'da fn resume_at_offset( data: &'data [u8], offset: usize, - _config: ::ReaderSavedState, + // This argument is ignored by all raw readers except LazyRawAnyReader + _encoding_hint: IonEncoding, ) -> Self { LazyRawTextReader_1_1 { input: data, @@ -50,6 +50,14 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_1> for LazyRawTextReader_1_1<'da } } + fn stream_data(&self) -> (&'data [u8], usize, IonEncoding) { + ( + &self.input[self.local_offset..], + self.position(), + self.encoding(), + ) + } + fn next<'top>( &'top mut self, context: EncodingContextRef<'top>, @@ -85,18 +93,6 @@ impl<'data> LazyRawReader<'data, TextEncoding_1_1> 
for LazyRawTextReader_1_1<'da buffer_after_item, )?; - if let RawStreamItem::VersionMarker(marker) = matched_item { - // TODO: It is not the raw reader's responsibility to report this error. It should - // surface the IVM to the caller, who can then either create a different reader - // for the reported version OR raise an error. - // See: https://github.com/amazon-ion/ion-rust/issues/644 - let (major, minor) = marker.version(); - if (major, minor) != (1, 1) { - return IonResult::decoding_error(format!( - "Ion version {major}.{minor} is not supported" - )); - } - } // Since we successfully matched the next value, we'll update the buffer // so a future call to `next()` will resume parsing the remaining input. self.local_offset = buffer_after_trailing_ws.offset() - self.stream_offset; @@ -147,56 +143,54 @@ impl<'data> From<&'data str> for MacroIdRef<'data> { } #[derive(Copy, Clone)] -pub struct RawTextEExpression_1_1<'top> { - pub(crate) encoded_expr: EncodedTextMacroInvocation, +pub struct TextEExpression_1_1<'top> { pub(crate) input: TextBufferView<'top>, pub(crate) id: MacroIdRef<'top>, - pub(crate) arg_expr_cache: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], + pub(crate) arg_cache: &'top [EExpArg<'top, TextEncoding_1_1>], } -impl<'top> HasSpan<'top> for RawTextEExpression_1_1<'top> { +impl<'top> HasSpan<'top> for TextEExpression_1_1<'top> { fn span(&self) -> Span<'top> { Span::with_offset(self.input.offset(), self.input.bytes()) } } -impl<'top> HasRange for RawTextEExpression_1_1<'top> { +impl<'top> HasRange for TextEExpression_1_1<'top> { fn range(&self) -> Range { self.input.range() } } -impl<'top> RawEExpression<'top, TextEncoding_1_1> for RawTextEExpression_1_1<'top> { - type RawArgumentsIterator<'a> = RawTextSequenceCacheIterator_1_1<'top> where Self: 'a; +impl<'top> RawEExpression<'top, TextEncoding_1_1> for TextEExpression_1_1<'top> { + type RawArgumentsIterator = TextEExpArgsIterator_1_1<'top>; + type ArgGroup = TextEExpArgGroup<'top>; - fn id(&self) 
-> MacroIdRef<'top> { + fn id(self) -> MacroIdRef<'top> { self.id } - fn raw_arguments(&self) -> Self::RawArgumentsIterator<'_> { - RawTextSequenceCacheIterator_1_1::new(self.arg_expr_cache) + fn raw_arguments(&self) -> Self::RawArgumentsIterator { + TextEExpArgsIterator_1_1::new(self.arg_cache) } } -impl<'data> Debug for RawTextEExpression_1_1<'data> { +impl<'data> Debug for TextEExpression_1_1<'data> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { // This is a text macro and the parser accepted it, so it's valid UTF-8. We can `unwrap()`. write!(f, "", self.input.as_text().unwrap()) } } -impl<'top> RawTextEExpression_1_1<'top> { +impl<'top> TextEExpression_1_1<'top> { pub(crate) fn new( id: MacroIdRef<'top>, - encoded_expr: EncodedTextMacroInvocation, input: TextBufferView<'top>, - child_expr_cache: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], + arg_cache: &'top [EExpArg<'top, TextEncoding_1_1>], ) -> Self { Self { - encoded_expr, input, id, - arg_expr_cache: child_expr_cache, + arg_cache, } } } @@ -361,6 +355,37 @@ impl<'top> Iterator for RawTextSequenceCacheIterator_1_1<'top> { } } +#[derive(Debug, Copy, Clone)] +pub struct TextEExpArgsIterator_1_1<'top> { + arg_exprs: &'top [EExpArg<'top, v1_1::Text>], + index: usize, +} + +impl<'top> TextEExpArgsIterator_1_1<'top> { + pub fn new(arg_exprs: &'top [EExpArg<'top, v1_1::Text>]) -> Self { + Self { + arg_exprs, + index: 0, + } + } +} + +impl<'top> Iterator for TextEExpArgsIterator_1_1<'top> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + let next_expr = self.arg_exprs.get(self.index)?; + self.index += 1; + Some(Ok(*next_expr)) + } + + fn size_hint(&self) -> (usize, Option) { + let num_args = self.arg_exprs.len(); + // Tells the macro evaluator how much space to allocate to hold these arguments + (num_args, Some(num_args)) + } +} + /// Wraps a [`RawTextSExpIterator_1_1`] (which parses the body of a sexp) and caches the child /// expressions the iterator yields along the way. 
Finally, returns a `Range` representing /// the span of input bytes that the sexp occupies. @@ -498,7 +523,7 @@ impl<'top> HasRange for LazyRawTextFieldName_1_1<'top> { } } -impl<'top> LazyRawFieldName<'top> for LazyRawTextFieldName_1_1<'top> { +impl<'top> LazyRawFieldName<'top, TextEncoding_1_1> for LazyRawTextFieldName_1_1<'top> { fn read(&self) -> IonResult> { self.matched.read() } @@ -741,8 +766,11 @@ impl<'top> TextStructSpanFinder_1_1<'top> { #[cfg(test)] mod tests { + use crate::lazy::any_encoding::IonVersion; + use crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::EncodingContext; use crate::lazy::raw_value_ref::RawValueRef; + use crate::RawVersionMarker; use super::*; @@ -776,12 +804,15 @@ mod tests { false "#; - let empty_context = EncodingContext::empty(); - let context = empty_context.get_ref(); + let mut context = EncodingContext::for_ion_version(IonVersion::v1_1); + let macro_quux = + TemplateCompiler::compile_from_text(context.get_ref(), "(macro quux (x) null)")?; + context.macro_table.add_macro(macro_quux)?; let reader = &mut LazyRawTextReader_1_1::new(data.as_bytes()); + let context = context.get_ref(); // $ion_1_1 - assert_eq!(reader.next(context)?.expect_ivm()?.version(), (1, 1)); + assert_eq!(reader.next(context)?.expect_ivm()?.major_minor(), (1, 1)); // "foo" expect_next(context, reader, RawValueRef::String("foo".into())); // bar @@ -803,7 +834,7 @@ mod tests { ); assert!(children.next().is_none()); // (:quux quuz) - let macro_invocation = reader.next(context)?.expect_macro_invocation()?; + let macro_invocation = reader.next(context)?.expect_eexp()?; assert_eq!(macro_invocation.id, MacroIdRef::LocalName("quux")); expect_next(context, reader, RawValueRef::Int(77.into())); expect_next(context, reader, RawValueRef::Bool(false)); diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index a400b613..c003c0c6 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -12,7 +12,7 @@ use 
crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::encoded_value::EncodedTextValue; -use crate::{IonResult, IonType, RawSymbolRef}; +use crate::{IonEncoding, IonResult, IonType, RawSymbolRef}; /// A value that has been identified in the text input stream but whose data has not yet been read. /// @@ -123,9 +123,13 @@ impl<'top, E: TextEncoding<'top>> HasRange for LazyRawTextVersionMarker<'top, E> } impl<'top, E: TextEncoding<'top>> RawVersionMarker<'top> for LazyRawTextVersionMarker<'top, E> { - fn version(&self) -> (u8, u8) { + fn major_minor(&self) -> (u8, u8) { (self.major, self.minor) } + + fn stream_encoding_before_marker(&self) -> IonEncoding { + IonEncoding::Text_1_0 + } } pub type LazyRawTextValue_1_0<'top> = LazyRawTextValue<'top, TextEncoding_1_0>; diff --git a/src/lazy/value.rs b/src/lazy/value.rs index ec38c90a..1b44e10f 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -1,8 +1,6 @@ use crate::lazy::decoder::Decoder; use crate::lazy::encoding::BinaryEncoding_1_0; -use crate::lazy::expanded::{ExpandedAnnotationsIterator, ExpandedValueRef, LazyExpandedValue}; -use crate::lazy::r#struct::LazyStruct; -use crate::lazy::sequence::{LazyList, LazySExp}; +use crate::lazy::expanded::{ExpandedAnnotationsIterator, LazyExpandedValue}; use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; use crate::symbol_ref::AsSymbolRef; @@ -260,50 +258,7 @@ impl<'top, D: Decoder> LazyValue<'top, D> { ///# fn main() -> IonResult<()> { Ok(()) } /// ``` pub fn read(&self) -> IonResult> { - use ExpandedValueRef::*; - - let value_ref = match self.expanded_value.read()? 
{ - Null(ion_type) => ValueRef::Null(ion_type), - Bool(b) => ValueRef::Bool(b), - Int(i) => ValueRef::Int(i), - Float(f) => ValueRef::Float(f), - Decimal(d) => ValueRef::Decimal(d), - Timestamp(t) => ValueRef::Timestamp(t), - String(s) => ValueRef::String(s), - Symbol(s) => { - let symbol = match s { - RawSymbolRef::SymbolId(sid) => self - .expanded_value - .context - .symbol_table() - .symbol_for(sid) - .ok_or_else(|| { - IonError::decoding_error(format!( - "found a symbol ID (${}) that was not in the symbol table", - sid - )) - })? - .into(), - RawSymbolRef::Text(text) => text.into(), - }; - ValueRef::Symbol(symbol) - } - Blob(b) => ValueRef::Blob(b), - Clob(c) => ValueRef::Clob(c), - SExp(s) => { - let lazy_sexp = LazySExp { expanded_sexp: s }; - ValueRef::SExp(lazy_sexp) - } - List(l) => { - let lazy_sequence = LazyList { expanded_list: l }; - ValueRef::List(lazy_sequence) - } - Struct(s) => { - let lazy_struct = LazyStruct { expanded_struct: s }; - ValueRef::Struct(lazy_struct) - } - }; - Ok(value_ref) + self.expanded_value.read_resolved() } } diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs index 815d5465..ecde5ee8 100644 --- a/src/lazy/value_ref.rs +++ b/src/lazy/value_ref.rs @@ -1,11 +1,16 @@ use crate::element::Value; use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::Decoder; +use crate::lazy::expanded::template::TemplateElement; +use crate::lazy::expanded::EncodingContextRef; use crate::lazy::r#struct::LazyStruct; use crate::lazy::sequence::{LazyList, LazySExp}; use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; -use crate::{Decimal, Element, Int, IonError, IonResult, IonType, SymbolRef, Timestamp}; +use crate::{ + Decimal, Element, Environment, ExpandedValueRef, Int, IonError, IonResult, IonType, + LazyExpandedList, LazyExpandedSExp, LazyExpandedStruct, SymbolRef, Timestamp, +}; use std::fmt::{Debug, Formatter}; /// A [ValueRef] represents a value that has been read from the input stream. 
Scalar variants contain @@ -182,7 +187,7 @@ impl<'top, D: Decoder> ValueRef<'top, D> { } } - pub fn expect_text(&self) -> IonResult<&'_ str> { + pub fn expect_text(self) -> IonResult<&'top str> { use ValueRef::*; match self { String(string) => Ok(string.text()), @@ -258,6 +263,64 @@ impl<'top, D: Decoder> ValueRef<'top, D> { ValueRef::Struct(_) => IonType::Struct, } } + + pub(crate) fn from_template( + context: EncodingContextRef<'top>, + environment: Environment<'top, D>, + element: &TemplateElement<'top>, + ) -> Self { + use crate::lazy::expanded::template::TemplateValue::*; + match element.value() { + Null(ion_type) => ValueRef::Null(*ion_type), + Bool(b) => ValueRef::Bool(*b), + Int(i) => ValueRef::Int(*i), + Float(f) => ValueRef::Float(*f), + Decimal(d) => ValueRef::Decimal(*d), + Timestamp(t) => ValueRef::Timestamp(*t), + String(s) => ValueRef::String(StrRef::from(s.text())), + Symbol(s) => ValueRef::Symbol(SymbolRef::from(s)), + Blob(b) => ValueRef::Blob(BytesRef::from(b.as_ref())), + Clob(c) => ValueRef::Clob(BytesRef::from(c.as_ref())), + List => ValueRef::List(LazyList::new(LazyExpandedList::from_template( + context, + environment, + *element, + ))), + SExp => ValueRef::SExp(LazySExp::new(LazyExpandedSExp::from_template( + context, + environment, + *element, + ))), + Struct(index) => ValueRef::Struct(LazyStruct::new(LazyExpandedStruct::from_template( + context, + environment, + element, + index, + ))), + } + } + + /// Downgrades the `ValueRef` to an `ExpandedValueRef` for use in contexts that expect the + /// lower-level representation. + /// TODO: Consolidate `ExpandedValue` and `LazyValue`. 
+ pub(crate) fn as_expanded(&self) -> ExpandedValueRef<'top, D> { + use ValueRef::*; + match self { + Null(ion_type) => ExpandedValueRef::Null(*ion_type), + Bool(b) => ExpandedValueRef::Bool(*b), + Int(i) => ExpandedValueRef::Int(*i), + Float(f) => ExpandedValueRef::Float(*f), + Decimal(d) => ExpandedValueRef::Decimal(*d), + Timestamp(t) => ExpandedValueRef::Timestamp(*t), + String(s) => ExpandedValueRef::String(*s), + Symbol(s) => ExpandedValueRef::Symbol((*s).into()), + Blob(b) => ExpandedValueRef::Blob(*b), + Clob(c) => ExpandedValueRef::Clob(*c), + SExp(s) => ExpandedValueRef::SExp(s.expanded_sexp), + List(l) => ExpandedValueRef::List(l.expanded_list), + Struct(s) => ExpandedValueRef::Struct(s.expanded_struct), + } + } } #[cfg(test)] diff --git a/src/lib.rs b/src/lib.rs index 0ecd9128..fb7ab133 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -192,8 +192,6 @@ mod write_config; pub use crate::lazy::any_encoding::AnyEncoding; pub use crate::lazy::decoder::{HasRange, HasSpan}; pub use crate::lazy::span::Span; -pub use crate::write_config::WriteConfig; - macro_rules! v1_x_reader_writer { ($visibility:vis) => { #[allow(unused_imports)] @@ -214,10 +212,29 @@ macro_rules! 
v1_x_reader_writer { lazy::sequence::{LazyList, LazySExp}, lazy::encoder::value_writer::{ValueWriter, StructWriter, SequenceWriter, EExpWriter}, lazy::any_encoding::IonEncoding, + lazy::expanded::compiler::TemplateCompiler, + lazy::expanded::template::TemplateMacro, + lazy::expanded::template::TemplateBodyExpr, + lazy::expanded::template::TemplateBodyExprKind, + lazy::expanded::macro_table::Macro, + lazy::expanded::macro_evaluator::MacroEvaluator, + lazy::expanded::macro_evaluator::MacroExpansionKind, + lazy::expanded::macro_table::MacroKind, + lazy::expanded::macro_table::MacroTable, + lazy::expanded::EncodingContext, + lazy::any_encoding::IonVersion, + lazy::binary::raw::reader::LazyRawBinaryReader_1_0, + lazy::binary::raw::v1_1::reader::LazyRawBinaryReader_1_1, + lazy::expanded::macro_evaluator::RawEExpression, + lazy::expanded::macro_evaluator::ValueExpr, + lazy::expanded::macro_evaluator::MacroExpr, + lazy::expanded::macro_evaluator::MacroExprKind, }; }; } +pub use crate::write_config::WriteConfig; + macro_rules! v1_0_reader_writer { ($visibility:vis) => { #[allow(unused_imports)] @@ -376,3 +393,36 @@ pub enum Format { Binary, // TODO: Json(TextKind) } + +/// Early returns `Some(Err(_))` if the provided expression returns an `Err(_)`. +/// +/// Acts as an ersatz `?` operator in methods that return `Option>`. +macro_rules! try_or_some_err { + ($expr:expr) => { + match $expr { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + }; +} + +pub(crate) use try_or_some_err; + +/// Tries to get the next value from an expression of type `Option>`, early returning if +/// the expression is `None` or `Some(Err(_))`. This is useful in the context of iterator +/// implementations that produce an `Option>` and so cannot easily use the `?` operator. +/// +/// If the expression evaluates to `None`, early returns `None`. +/// If the expression evaluates to `Some(Err(e))`, early returns `Some(Err(e))`. +/// If the expression evaluates to `Some(Ok(value))`, evaluates to `value`. 
+macro_rules! try_next { + ($expr:expr) => { + match $expr { + Some(Ok(v)) => v, + None => return None, + Some(Err(e)) => return Some(Err(e)), + } + }; +} + +pub(crate) use try_next; diff --git a/src/raw_symbol_ref.rs b/src/raw_symbol_ref.rs index 46df5f85..89829d43 100644 --- a/src/raw_symbol_ref.rs +++ b/src/raw_symbol_ref.rs @@ -1,4 +1,6 @@ -use crate::{Symbol, SymbolId, SymbolRef}; +use crate::lazy::expanded::EncodingContextRef; +use crate::result::IonFailure; +use crate::{IonError, IonResult, Symbol, SymbolId, SymbolRef}; /// Like RawSymbolToken, but the Text variant holds a borrowed reference instead of a String. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -17,6 +19,26 @@ impl<'a> RawSymbolRef<'a> { RawSymbolRef::Text(text) => symbol_text == *text, } } + + pub fn resolve(self, context: EncodingContextRef<'a>) -> IonResult<SymbolRef<'a>> { + let symbol = match self { + RawSymbolRef::SymbolId(sid) => context + .symbol_table() + .symbol_for(sid) + .ok_or_else( + #[inline(never)] + || { + IonError::decoding_error(format!( + "found a symbol ID (${}) that was not in the symbol table", + sid + )) + }, + )? + .into(), + RawSymbolRef::Text(text) => text.into(), + }; + Ok(symbol) + } } /// Implemented by types that can be viewed as a [RawSymbolRef] without allocations. diff --git a/src/symbol_table.rs b/src/symbol_table.rs index 53d7147a..3671c4c4 100644 --- a/src/symbol_table.rs +++ b/src/symbol_table.rs @@ -2,38 +2,53 @@ use std::collections::HashMap; use std::sync::Arc; use crate::constants::v1_0; +use crate::lazy::any_encoding::IonVersion; use crate::{Symbol, SymbolId}; /// Stores mappings from Symbol IDs to text and vice-versa. // SymbolTable instances always have at least system symbols; they are never empty.
#[allow(clippy::len_without_is_empty)] -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SymbolTable { + ion_version: IonVersion, symbols_by_id: Vec, ids_by_text: HashMap, } impl Default for SymbolTable { fn default() -> Self { - Self::new() + Self::new(IonVersion::v1_0) } } impl SymbolTable { /// Constructs a new symbol table pre-populated with the system symbols defined in the spec. - pub(crate) fn new() -> SymbolTable { + pub(crate) fn new(ion_version: IonVersion) -> SymbolTable { + // Enough to hold the 1.0 system table and several user symbols. + const INITIAL_SYMBOLS_CAPACITY: usize = 32; let mut symbol_table = SymbolTable { - symbols_by_id: Vec::with_capacity(v1_0::SYSTEM_SYMBOLS.len()), + ion_version, + symbols_by_id: Vec::with_capacity(INITIAL_SYMBOLS_CAPACITY), ids_by_text: HashMap::new(), }; symbol_table.initialize(); symbol_table } - // Interns the v1.0 system symbols + /// Adds system symbols to the table. pub(crate) fn initialize(&mut self) { self.add_placeholder(); // $0 - v1_0::SYSTEM_SYMBOLS[1..] + + // TODO: If it's Ion 1.1, there are no other symbols in the symbol table. Implementing this + // requires having first implemented reading and writing system symbols in their own + // address space. For now, Ion 1.1's default symbol table matches Ion 1.0's. + // let remaining_system_symbols = match self.ion_version { + // IonVersion::v1_0 => &v1_0::SYSTEM_SYMBOLS[1..], + // IonVersion::v1_1 => &[], + // }; + let remaining_system_symbols = &v1_0::SYSTEM_SYMBOLS[1..]; + + remaining_system_symbols .iter() .copied() .map(Option::unwrap) @@ -48,6 +63,11 @@ impl SymbolTable { self.initialize(); } + pub(crate) fn reset_to_version(&mut self, new_version: IonVersion) { + self.ion_version = new_version; + self.reset(); + } + /// adds `text` to the symbol table and returns the newly assigned [SymbolId]. 
pub(crate) fn add_symbol_for_text>(&mut self, text: A) -> SymbolId { let arc: Arc = Arc::from(text.as_ref()); @@ -122,15 +142,11 @@ impl SymbolTable { &self.symbols_by_id } - /// Returns a slice of references to the symbol text stored in the table starting at the given - /// symbol ID. If a symbol table append occurs during reading, this function can be used to - /// easily view the new symbols that has been added to the table. - /// - /// The symbol table can contain symbols with unknown text; see the documentation for - /// [Symbol] for more information. - // TODO: Is this necessary vs just taking a slice of the `symbols()` method above? - pub(crate) fn symbols_tail(&self, start: usize) -> &[Symbol] { - &self.symbols_by_id[start..] + /// Returns a slice of the last `n` symbols in the symbol table. The caller must confirm that + /// `last_n` is less than the size of the symbol table. + pub(crate) fn symbols_tail(&self, last_n: usize) -> &[Symbol] { + let num_symbols = self.symbols_by_id.len(); + &self.symbols_by_id[num_symbols - last_n..] } /// Returns the number of symbols defined in the table. diff --git a/src/types/symbol.rs b/src/types/symbol.rs index 76247752..ccbfdc29 100644 --- a/src/types/symbol.rs +++ b/src/types/symbol.rs @@ -13,6 +13,9 @@ pub(crate) enum SymbolText { // This Symbol refers to a string in the symbol table Shared(Arc), // This Symbol owns its own text + // TODO: Turn this into a Box. + // Symbols are read-only, so there's no chance we'll add data to the `String`. Using + // a `Box` shrinks this value from 24 bytes to 8 bytes. 
Owned(String), // This Symbol is equivalent to SID zero (`$0`) Unknown, diff --git a/tests/element_display.rs b/tests/element_display.rs index a1e7977a..a2aca08a 100644 --- a/tests/element_display.rs +++ b/tests/element_display.rs @@ -1,5 +1,4 @@ -#![cfg(feature = "experimental-reader")] -#![cfg(feature = "experimental-writer")] +#![cfg(feature = "experimental-reader-writer")] use crate::ion_tests::contains_path; use ion_rs::IonData; use ion_rs::{Element, IonResult, Sequence}; diff --git a/tests/ion_data_consistency.rs b/tests/ion_data_consistency.rs index aa8286c3..39aebff8 100644 --- a/tests/ion_data_consistency.rs +++ b/tests/ion_data_consistency.rs @@ -5,7 +5,7 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::fs::read; -use std::path::MAIN_SEPARATOR as PATH_SEPARATOR; +use std::path::MAIN_SEPARATOR_STR as PATH_SEPARATOR; use test_generator::test_resources; /// Determines if the given file name is in the paths list. This deals with platform @@ -15,7 +15,7 @@ fn contains_path(paths: &[&str], file_name: &str) -> bool { paths .iter() // TODO construct the paths in a not so hacky way - .map(|p| p.replace('/', &PATH_SEPARATOR.to_string())) + .map(|p| p.replace('/', PATH_SEPARATOR)) .any(|p| p == file_name) } diff --git a/tests/lazy_element_ion_tests.rs b/tests/ion_tests/lazy_element_ion_tests.rs similarity index 57% rename from tests/lazy_element_ion_tests.rs rename to tests/ion_tests/lazy_element_ion_tests.rs index cfb9a26d..81edd3f6 100644 --- a/tests/lazy_element_ion_tests.rs +++ b/tests/ion_tests/lazy_element_ion_tests.rs @@ -1,24 +1,19 @@ -#![cfg(feature = "experimental-lazy-reader")] -#![cfg(feature = "experimental-writer")] -mod ion_tests; +#![cfg(feature = "experimental-reader-writer")] +use crate::good_round_trip; use crate::ion_tests::{ bad, equivs, non_equivs, ElementApi, SkipList, ELEMENT_EQUIVS_SKIP_LIST, ELEMENT_GLOBAL_SKIP_LIST, ELEMENT_ROUND_TRIP_SKIP_LIST, }; -use ion_rs::lazy::reader::Reader; -use ion_rs::IonResult; +use ion_rs::Reader; +use 
ion_rs::{AnyEncoding, IonResult}; use ion_rs::{Format, TextFormat}; use test_generator::test_resources; struct LazyReaderElementApi; impl ElementApi for LazyReaderElementApi { - type ElementReader<'a> = Reader<&'a [u8]>; - - fn make_reader(data: &[u8]) -> IonResult> { - Ok(Reader::new(data)) - } + type ElementReader<'a> = Reader; fn global_skip_list() -> SkipList { ELEMENT_GLOBAL_SKIP_LIST @@ -39,22 +34,26 @@ impl ElementApi for LazyReaderElementApi { fn non_equivs_skip_list() -> SkipList { &[] } + + fn make_reader(data: &[u8]) -> IonResult> { + Reader::new(AnyEncoding, data) + } } good_round_trip! { use LazyReaderElementApi; - fn binary_compact(Format::Binary, Format::Text(TextKind::Compact)); - fn binary_lines(Format::Binary, Format::Text(TextKind::Lines)); - fn binary_pretty(Format::Binary, Format::Text(TextKind::Pretty)); - fn compact_binary(Format::Text(TextKind::Compact), Format::Binary); - fn compact_lines(Format::Text(TextKind::Compact), Format::Text(TextKind::Lines)); - fn compact_pretty(Format::Text(TextKind::Compact), Format::Text(TextKind::Pretty)); - fn lines_binary(Format::Text(TextKind::Lines), Format::Binary); - fn lines_compact(Format::Text(TextKind::Lines), Format::Text(TextKind::Compact)); - fn lines_pretty(Format::Text(TextKind::Lines), Format::Text(TextKind::Pretty)); - fn pretty_binary(Format::Text(TextKind::Pretty), Format::Binary); - fn pretty_compact(Format::Text(TextKind::Pretty), Format::Text(TextKind::Compact)); - fn pretty_lines(Format::Text(TextKind::Pretty), Format::Text(TextKind::Lines)); + fn binary_compact(Format::Binary, Format::Text(TextFormat::Compact)); + fn binary_lines(Format::Binary, Format::Text(TextFormat::Lines)); + fn binary_pretty(Format::Binary, Format::Text(TextFormat::Pretty)); + fn compact_binary(Format::Text(TextFormat::Compact), Format::Binary); + fn compact_lines(Format::Text(TextFormat::Compact), Format::Text(TextFormat::Lines)); + fn compact_pretty(Format::Text(TextFormat::Compact), 
Format::Text(TextFormat::Pretty)); + fn lines_binary(Format::Text(TextFormat::Lines), Format::Binary); + fn lines_compact(Format::Text(TextFormat::Lines), Format::Text(TextFormat::Compact)); + fn lines_pretty(Format::Text(TextFormat::Lines), Format::Text(TextFormat::Pretty)); + fn pretty_binary(Format::Text(TextFormat::Pretty), Format::Binary); + fn pretty_compact(Format::Text(TextFormat::Pretty), Format::Text(TextFormat::Compact)); + fn pretty_lines(Format::Text(TextFormat::Pretty), Format::Text(TextFormat::Lines)); } #[test_resources("ion-tests/iontestdata_1_0/bad/**/*.ion")] diff --git a/tests/ion_tests/mod.rs b/tests/ion_tests/mod.rs index 84effa66..713931be 100644 --- a/tests/ion_tests/mod.rs +++ b/tests/ion_tests/mod.rs @@ -1,20 +1,19 @@ // Copyright Amazon.com, Inc. or its affiliates. -#![cfg(feature = "experimental-reader")] -#![cfg(feature = "experimental-writer")] +#![cfg(feature = "experimental-reader-writer")] #![allow(dead_code)] use std::fs::read; -use std::path::MAIN_SEPARATOR as PATH_SEPARATOR; +use std::path::MAIN_SEPARATOR_STR as PATH_SEPARATOR; -use ion_rs::lazy::encoder::value_writer::SequenceWriter; -use ion_rs::lazy::encoder::writer::Writer; -use ion_rs::lazy::encoding::{BinaryEncoding_1_0, TextEncoding_1_0}; -use ion_rs::WriteConfig; +use ion_rs::v1_0; +use ion_rs::Writer; use ion_rs::{ Element, ElementReader, ElementWriter, Format, IonData, IonError, IonResult, SExp, Sequence, Symbol, Value, }; +pub mod lazy_element_ion_tests; + /// Concatenates two slices of string slices together. 
#[inline] pub fn concat<'a>(left: &[&'a str], right: &[&'a str]) -> Vec<&'a str> { @@ -28,7 +27,7 @@ pub fn contains_path(paths: &[&str], file_name: &str) -> bool { paths .iter() // TODO construct the paths in a not so hacky way - .map(|p| p.replace('/', &PATH_SEPARATOR.to_string())) + .map(|p| p.replace('/', PATH_SEPARATOR)) .any(|p| p == file_name) } @@ -51,8 +50,7 @@ pub fn serialize(format: Format, elements: &Sequence) -> IonResult> { let mut buffer = Vec::with_capacity(2048); match format { Format::Text(kind) => { - let write_config = WriteConfig::::new(kind); - let mut writer = Writer::new(write_config, buffer)?; + let mut writer = Writer::new(v1_0::Text, buffer)?; writer.write_elements(elements)?; buffer = writer.close()?; println!( @@ -61,7 +59,7 @@ pub fn serialize(format: Format, elements: &Sequence) -> IonResult> { ); } Format::Binary => { - let mut binary_writer = Writer::::new(buffer)?; + let mut binary_writer = Writer::new(v1_0::Binary, buffer)?; binary_writer.write_elements(elements)?; buffer = binary_writer.close()?; } @@ -318,7 +316,7 @@ pub trait ElementApi { #[macro_export] macro_rules! 
good_round_trip { (use $ElementApiImpl:ident; $(fn $test_name:ident($format1:expr, $format2:expr);)+) => { - mod good_round_trip { + mod good_round_trip_tests { use super::*; $( #[test_resources("ion-tests/iontestdata_1_0/good/**/*.ion")] //#[test_resources("ion-tests/iontestdata_1_1/good/**/*.ion")] diff --git a/tests/ion_tests_1_1.rs b/tests/ion_tests_1_1.rs index 6be4d472..939464b0 100644 --- a/tests/ion_tests_1_1.rs +++ b/tests/ion_tests_1_1.rs @@ -1,20 +1,20 @@ -#![cfg(feature = "experimental-lazy-reader")] +#![cfg(feature = "experimental-reader-writer")] /// TODO: When the Ion 1.1 binary reader is complete, update this module to include binary tests mod ion_tests; use crate::ion_tests::{bad, equivs, non_equivs, ElementApi, SkipList}; -use ion_rs::lazy::reader::TextReader_1_1; use ion_rs::IonResult; +use ion_rs::{v1_1, Reader}; use test_generator::test_resources; struct LazyReaderElementApi; impl ElementApi for LazyReaderElementApi { - type ElementReader<'a> = TextReader_1_1<&'a [u8]>; + type ElementReader<'a> = Reader; fn make_reader(data: &[u8]) -> IonResult> { - Ok(TextReader_1_1::new(data).unwrap()) + Ok(Reader::new(v1_1::Text, data).unwrap()) } fn global_skip_list() -> SkipList {