Skip to content

Commit

Permalink
Adds read support for system symbols as values, annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
zslayton committed Oct 26, 2024
1 parent 2042f29 commit 042d714
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 102 deletions.
4 changes: 4 additions & 0 deletions src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ pub(crate) mod v1_1 {
"make_field", // $65
];

/// Named addresses for individual Ion 1.1 system symbols.
// NOTE(review): the module and both constants are private, so as written they are only
// visible inside this module. If other code is meant to use them, `pub(crate)` visibility
// is presumably needed; confirm the intent.
mod system_symbol_ids {
// presumably the address of the `add_symbols` system symbol — TODO confirm against SYSTEM_SYMBOLS
const ADD_SYMBOLS: usize = 45;
// presumably the address of the `add_macros` system symbol — TODO confirm against SYSTEM_SYMBOLS
const ADD_MACROS: usize = 47;
}
pub(crate) static SYSTEM_SYMBOL_TEXT_TO_ID: phf::Map<&str, usize> = phf_map! {
"$ion" => 1,
"$ion_1_0" => 2,
Expand Down
56 changes: 48 additions & 8 deletions src/lazy/binary/raw/v1_1/immutable_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1123,19 +1123,20 @@ pub struct EncodedAnnotations {
mod tests {
use rstest::rstest;

use super::*;
use crate::ion_data::IonEq;
use crate::lazy::any_encoding::IonVersion;
use crate::lazy::binary::raw::v1_1::e_expression::BinaryEExpArgsIterator_1_1;
use crate::lazy::binary::raw::v1_1::RawBinaryAnnotationsIterator_1_1;
use crate::lazy::expanded::compiler::TemplateCompiler;
use crate::lazy::expanded::macro_evaluator::{EExpressionArgGroup, RawEExpression};
use crate::lazy::expanded::macro_table::MacroTable;
use crate::lazy::expanded::EncodingContext;
use crate::lazy::text::raw::v1_1::reader::{MacroAddress, MacroIdRef};
use crate::v1_0::RawValueRef;
use crate::RawSymbolRef;
use crate::{AnyEncoding, Element, ElementReader, Reader, SequenceWriter, Writer};

use super::*;

#[rstest]
#[case::no_args(0, &[0b00u8], &[])]
#[case::one_empty_arg(1, &[0b00u8], &[ArgGrouping::Empty])]
Expand Down Expand Up @@ -1212,16 +1213,48 @@ mod tests {
}

#[rstest]
#[case::single_address(&[0xE4, 0x07], 1, 1)]
#[case::two_addresses(&[0xE5, 0x07, 0x09], 1, 2)]
#[case::three_addresses(&[0xE6, 0x07, 0x07, 0x09, 0x0B], 2, 3)]
#[case::single_flex_sym(&[0xE7, 0x07], 1, 1)]
#[case::two_flex_syms(&[0xE8, 0x07, 0x09], 1, 2)]
#[case::three_flex_syms(&[0xE9, 0x07, 0x07, 0x09, 0x0B], 2, 3)]
#[case::single_address(AnnotationsEncoding::SymbolAddress, &[0xE4, 0x07], 1, 1, &[
RawSymbolRef::SymbolId(3)
])]
#[case::two_addresses(AnnotationsEncoding::SymbolAddress, &[0xE5, 0x07, 0x09], 1, 2, &[
RawSymbolRef::SymbolId(3),
RawSymbolRef::SymbolId(4)
])]
#[case::three_addresses(AnnotationsEncoding::SymbolAddress, &[0xE6, 0x07, 0x07, 0x09, 0x0B], 2, 3, &[
RawSymbolRef::SymbolId(3),
RawSymbolRef::SymbolId(4),
RawSymbolRef::SymbolId(5)
])]
#[case::single_flex_sym(AnnotationsEncoding::FlexSym, &[0xE7, 0x07], 1, 1, &[
RawSymbolRef::SymbolId(3)
])]
#[case::two_flex_syms(AnnotationsEncoding::FlexSym, &[0xE8, 0x07, 0x09], 1, 2, &[
RawSymbolRef::SymbolId(3),
RawSymbolRef::SymbolId(4),
])]
#[case::three_flex_syms(AnnotationsEncoding::FlexSym, &[0xE9, 0x07, 0x07, 0x09, 0x0B], 2, 3, &[
RawSymbolRef::SymbolId(3),
RawSymbolRef::SymbolId(4),
RawSymbolRef::SymbolId(5)
])]
#[case::one_flex_syms_with_system_symbol(AnnotationsEncoding::FlexSym, &[0xE7, 0x01, 0x6A], 1, 2, &[
RawSymbolRef::Text("$ion_encoding"),
])]
#[case::two_flex_syms_with_system_symbols(AnnotationsEncoding::FlexSym, &[0xE8, 0x01, 0x60, 0x01, 0x6A], 1, 4, &[
RawSymbolRef::SymbolId(0),
RawSymbolRef::Text("$ion_encoding"),
])]
#[case::three_flex_syms_with_system_symbols(AnnotationsEncoding::FlexSym, &[0xE9, 0x0D, 0x01, 0x60, 0x01, 0x6A, 0x01, 0xA1], 2, 6, &[
RawSymbolRef::SymbolId(0),
RawSymbolRef::Text("$ion_encoding"),
RawSymbolRef::Text("make_field"),
])]
fn read_annotations_sequence(
#[case] encoding: AnnotationsEncoding,
#[case] input: &[u8],
#[case] expected_header_length: usize,
#[case] expected_sequence_length: usize,
#[case] expected_annotations: &[RawSymbolRef],
) -> IonResult<()> {
let context = EncodingContext::empty();
let buffer = BinaryBuffer::new(context.get_ref(), input);
Expand All @@ -1237,6 +1270,13 @@ mod tests {
"sequence length actual {} != expected {}",
sequence.sequence_length as usize, expected_sequence_length
);
// Read the actual sequence
let annotations_iter = RawBinaryAnnotationsIterator_1_1::new(
buffer.consume(sequence.header_length as usize),
encoding,
);
let actual_annotations = annotations_iter.collect::<IonResult<Vec<_>>>()?;
assert_eq!(actual_annotations, expected_annotations);
assert!(remaining.is_empty(), "remaining input was not empty");
Ok(())
}
Expand Down
128 changes: 60 additions & 68 deletions src/lazy/binary/raw/v1_1/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,65 +345,41 @@ mod tests {

// Symbol ID: 65,793
0xE3, 0x01, 0x00, 0x00,

// System symbols
0xEE, 0x0A, // $ion_encoding
0xEE, 0x0E, // macro_table
0xEE, 0x15, // empty text
0xEE, 0x41, // make_field
];
let empty_context = EncodingContext::empty();
let context = empty_context.get_ref();
let mut reader = LazyRawBinaryReader_1_1::new(&data);
let _ivm = reader.next(context)?.expect_ivm()?;

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
"".into()
);

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
"fourteen bytes".into()
);

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
"variable length encoding".into()
);

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
RawSymbolRef::SymbolId(1)
);

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
RawSymbolRef::SymbolId(257)
);
let expected_symbols: &[RawSymbolRef] = &[
RawSymbolRef::Text(""),
RawSymbolRef::Text("fourteen bytes"),
RawSymbolRef::Text("variable length encoding"),
RawSymbolRef::SymbolId(1),
RawSymbolRef::SymbolId(257),
RawSymbolRef::SymbolId(65_793),
RawSymbolRef::Text("$ion_encoding"),
RawSymbolRef::Text("macro_table"),
RawSymbolRef::Text(""),
RawSymbolRef::Text("make_field"),
];

assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
RawSymbolRef::SymbolId(65793)
);
for expected_symbol in expected_symbols {
assert_eq!(
reader
.next(context)?
.expect_value()?
.read()?
.expect_symbol()?,
expected_symbol.clone()
);
}

Ok(())
}
Expand Down Expand Up @@ -512,10 +488,14 @@ mod tests {
#[case("3.141592653", &[0x76, 0xEF, 0x4D, 0xE6, 0x40, 0xBB, 0x00])]
#[case("3.141592653590", &[0x77, 0xE9, 0x16, 0x9F, 0x83, 0x75, 0xDB, 0x02])]
#[case("3.14159265358979323", &[0x79, 0xDF, 0xFB, 0xA0, 0x9E, 0xF6, 0x2F, 0x1E, 0x5C, 0x04])]
#[case("3.1415926535897932384626", &[0x7B, 0xD5, 0x72, 0x49, 0x64, 0xCC, 0xAF, 0xEF, 0x8F, 0x0F, 0xA7, 0x06])]
#[case("3.141592653589793238462643383", &[0x7D, 0xCB, 0xB7, 0x3C, 0x92, 0x86, 0x40, 0x9F, 0x1B, 0x01, 0x1F, 0xAA, 0x26, 0x0A])]
#[case("3.14159265358979323846264338327950", &[0x7F, 0xC1, 0x8E, 0x29, 0xE5, 0xE3, 0x56, 0xD5, 0xDF, 0xC5, 0x10, 0x8F, 0x55, 0x3F, 0x7D, 0x0F])]
#[case("3.141592653589793238462643383279503", &[0xF7, 0x21, 0xBF, 0x8F, 0x9F, 0xF3, 0xE6, 0x64, 0x55, 0xBE, 0xBA, 0xA7, 0x96, 0x57, 0x79, 0xE4, 0x9A, 0x00])]
#[case("3.1415926535897932384626", &[0x7B, 0xD5, 0x72, 0x49, 0x64, 0xCC, 0xAF, 0xEF, 0x8F, 0x0F, 0xA7, 0x06]
)]
#[case("3.141592653589793238462643383", &[0x7D, 0xCB, 0xB7, 0x3C, 0x92, 0x86, 0x40, 0x9F, 0x1B, 0x01, 0x1F, 0xAA, 0x26, 0x0A]
)]
#[case("3.14159265358979323846264338327950", &[0x7F, 0xC1, 0x8E, 0x29, 0xE5, 0xE3, 0x56, 0xD5, 0xDF, 0xC5, 0x10, 0x8F, 0x55, 0x3F, 0x7D, 0x0F]
)]
#[case("3.141592653589793238462643383279503", &[0xF7, 0x21, 0xBF, 0x8F, 0x9F, 0xF3, 0xE6, 0x64, 0x55, 0xBE, 0xBA, 0xA7, 0x96, 0x57, 0x79, 0xE4, 0x9A, 0x00]
)]
fn decimals(#[case] expected_txt: &str, #[case] ion_data: &[u8]) -> IonResult<()> {
use crate::lazy::decoder::{LazyRawReader, LazyRawValue};
use crate::lazy::text::raw::v1_1::reader::LazyRawTextReader_1_1;
Expand Down Expand Up @@ -556,10 +536,14 @@ mod tests {
#[case("1.0", &[0xF7, 0x05, 0xFF, 0x0A])]
#[case("1.28", &[0xF7, 0x07, 0xFD, 0x80, 0x00])]
#[case("3.141592653590", &[0xF7, 0x0F, 0xE9, 0x16, 0x9F, 0x83, 0x75, 0xDB, 0x02])]
#[case("3.14159265358979323", &[0xF7, 0x13, 0xDF, 0xFB, 0xA0, 0x9E, 0xF6, 0x2F, 0x1E, 0x5C, 0x04])]
#[case("3.1415926535897932384626", &[0xF7, 0x17, 0xD5, 0x72, 0x49, 0x64, 0xCC, 0xAF, 0xEF, 0x8F, 0x0F, 0xA7, 0x06])]
#[case("3.141592653589793238462643383", &[0xF7, 0x1B, 0xCB, 0xB7, 0x3C, 0x92, 0x86, 0x40, 0x9F, 0x1B, 0x01, 0x1F, 0xAA, 0x26, 0x0A])]
#[case("3.14159265358979323846264338327950", &[0xF7, 0x1F, 0xC1, 0x8E, 0x29, 0xE5, 0xE3, 0x56, 0xD5, 0xDF, 0xC5, 0x10, 0x8F, 0x55, 0x3F, 0x7D, 0x0F])]
#[case("3.14159265358979323", &[0xF7, 0x13, 0xDF, 0xFB, 0xA0, 0x9E, 0xF6, 0x2F, 0x1E, 0x5C, 0x04]
)]
#[case("3.1415926535897932384626", &[0xF7, 0x17, 0xD5, 0x72, 0x49, 0x64, 0xCC, 0xAF, 0xEF, 0x8F, 0x0F, 0xA7, 0x06]
)]
#[case("3.141592653589793238462643383", &[0xF7, 0x1B, 0xCB, 0xB7, 0x3C, 0x92, 0x86, 0x40, 0x9F, 0x1B, 0x01, 0x1F, 0xAA, 0x26, 0x0A]
)]
#[case("3.14159265358979323846264338327950", &[0xF7, 0x1F, 0xC1, 0x8E, 0x29, 0xE5, 0xE3, 0x56, 0xD5, 0xDF, 0xC5, 0x10, 0x8F, 0x55, 0x3F, 0x7D, 0x0F]
)]
fn decimals_long(#[case] expected_txt: &str, #[case] ion_data: &[u8]) -> IonResult<()> {
use crate::ion_data::IonEq;
use crate::lazy::decoder::{LazyRawReader, LazyRawValue};
Expand Down Expand Up @@ -587,14 +571,19 @@ mod tests {
#[case("2023-10-15T05:04Z", &[0x83, 0x35, 0x7D, 0x85, 0x00])]
#[case("2023-10-15T05:04:03Z", &[0x84, 0x35, 0x7D, 0x85, 0x30, 0x00])]
#[case("2023-10-15T05:04:03.123-00:00", &[0x85, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01])]
#[case("2023-10-15T05:04:03.000123-00:00", &[0x86, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00])]
#[case("2023-10-15T05:04:03.000000123-00:00", &[0x87, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00, 0x00])]
#[case("2023-10-15T05:04:03.000123-00:00", &[0x86, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00]
)]
#[case("2023-10-15T05:04:03.000000123-00:00", &[0x87, 0x35, 0x7D, 0x85, 0x38, 0xEC, 0x01, 0x00, 0x00]
)]
#[case("2023-10-15T05:04+01:00", &[0x88, 0x35, 0x7D, 0x85, 0xE0, 0x01])]
#[case("2023-10-15T05:04-01:00", &[0x88, 0x35, 0x7D, 0x85, 0xA0, 0x01])]
#[case("2023-10-15T05:04:03+01:00", &[0x89, 0x35, 0x7D, 0x85, 0xE0, 0x0D])]
#[case("2023-10-15T05:04:03.123+01:00", &[0x8A, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00])]
#[case("2023-10-15T05:04:03.000123+01:00", &[0x8B, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00])]
#[case("2023-10-15T05:04:03.000000123+01:00", &[0x8C, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00, 0x00])]
#[case("2023-10-15T05:04:03.123+01:00", &[0x8A, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00]
)]
#[case("2023-10-15T05:04:03.000123+01:00", &[0x8B, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00]
)]
#[case("2023-10-15T05:04:03.000000123+01:00", &[0x8C, 0x35, 0x7D, 0x85, 0xE0, 0x0D, 0x7B, 0x00, 0x00, 0x00]
)]
fn timestamps_short(#[case] expected_txt: &str, #[case] ion_data: &[u8]) -> IonResult<()> {
use crate::lazy::decoder::{LazyRawReader, LazyRawValue};
use crate::lazy::text::raw::v1_1::reader::LazyRawTextReader_1_1;
Expand Down Expand Up @@ -625,9 +614,12 @@ mod tests {
#[case("1947-12T", &[0xF8, 0x07, 0x9B, 0x07, 0x03])]
#[case("1947-12-23T", &[0xF8, 0x07, 0x9B, 0x07, 0x5F])]
#[case("1947-12-23T11:22-00:00", &[0xF8, 0x0D, 0x9B, 0x07, 0xDF, 0x65, 0xFD, 0x3F])]
#[case("1947-12-23T11:22:33+01:00", &[0xF8, 0x0F, 0x9B, 0x07, 0xDF, 0x65, 0x71, 0x57, 0x08])]
#[case("1947-12-23T11:22:33.127+01:15", &[0xF8, 0x13, 0x9B, 0x07, 0xDF, 0x65, 0xAD, 0x57, 0x08, 0x07, 0x7F])]
#[case("1947-12-23T11:22:33-01:00", &[0xF8, 0x0F, 0x9B, 0x07, 0xDF, 0x65, 0x91, 0x55, 0x08])]
#[case("1947-12-23T11:22:33+01:00", &[0xF8, 0x0F, 0x9B, 0x07, 0xDF, 0x65, 0x71, 0x57, 0x08]
)]
#[case("1947-12-23T11:22:33.127+01:15", &[0xF8, 0x13, 0x9B, 0x07, 0xDF, 0x65, 0xAD, 0x57, 0x08, 0x07, 0x7F]
)]
#[case("1947-12-23T11:22:33-01:00", &[0xF8, 0x0F, 0x9B, 0x07, 0xDF, 0x65, 0x91, 0x55, 0x08]
)]
fn timestamps_long(#[case] expected_txt: &str, #[case] ion_data: &[u8]) -> IonResult<()> {
use crate::lazy::decoder::{LazyRawReader, LazyRawValue};
use crate::lazy::text::raw::v1_1::reader::LazyRawTextReader_1_1;
Expand Down
14 changes: 7 additions & 7 deletions src/lazy/binary/raw/v1_1/type_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ pub enum OpcodeType {
Struct, // 0xD2-0xDF -
IonVersionMarker, // 0xE0 -

SymbolAddress, // 0xE1-0xE3 -
AnnotationSymAddress, // 0xE4-0xE6 -
AnnotationFlexSym, // 0xE7-0xE9 -
NullNull, // 0xEA -
TypedNull, // 0xEB -
Nop, // 0xEC-0xED -
// Reserved
SymbolAddress, // 0xE1-0xE3 -
AnnotationSymAddress, // 0xE4-0xE6 -
AnnotationFlexSym, // 0xE7-0xE9 -
NullNull, // 0xEA -
TypedNull, // 0xEB -
Nop, // 0xEC-0xED -
SystemSymbolAddress, // 0xEE
SystemMacroInvoke, // 0xEF -
DelimitedContainerClose, // 0xF0
ListDelimited, // 0xF1
Expand Down
5 changes: 5 additions & 0 deletions src/lazy/binary/raw/v1_1/type_descriptor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ impl Opcode {
),
(0xE, 0xC) => (Nop, InOpcode(0), OpcodeKind::Control),
(0xE, 0xD) => (Nop, FlexUIntFollows, OpcodeKind::Control),
(0xE, 0xE) => (
SystemSymbolAddress,
InOpcode(1),
OpcodeKind::Value(IonType::Symbol),
),
(0xF, 0x0) => (DelimitedContainerClose, InOpcode(0), OpcodeKind::Control),
(0xF, 0x1) => (ListDelimited, Unknown, OpcodeKind::Value(IonType::List)),
(0xF, 0x2) => (SExpDelimited, Unknown, OpcodeKind::Value(IonType::SExp)),
Expand Down
48 changes: 35 additions & 13 deletions src/lazy/binary/raw/v1_1/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ use crate::lazy::decoder::{HasRange, HasSpan, RawVersionMarker};
use crate::lazy::expanded::EncodingContextRef;
use crate::lazy::span::Span;
use crate::lazy::str_ref::StrRef;
use crate::types::SymbolAddress;
use crate::v1_1::FlexUInt;
use crate::{
constants,
lazy::{
binary::{
encoded_value::{EncodedHeader, EncodedValue},
Expand Down Expand Up @@ -181,7 +183,6 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for &'top LazyRawBinaryValue_1
}
}


/// This is a fast path for reading values that we know need to be resolved.
///
/// When a `LazyValue` wrapping a raw binary value calls `read()`, it's clear that the `RawValueRef` will
Expand Down Expand Up @@ -799,33 +800,54 @@ impl<'top> LazyRawBinaryValue_1_1<'top> {

/// Helper method called by [`Self::read_symbol`]. Reads the current value as a symbol ID.
///
/// The opcode's low nibble (1, 2, or 3) is the number of `FixedUInt` bytes that make up the
/// encoded address. A per-width bias is added to the decoded value: 0 for one byte, 256 for
/// two bytes, and 65,792 for three bytes, so each wider encoding starts where the previous
/// one's range ends.
///
/// # Errors
/// Returns an error if the address bytes cannot be read from the value's body or if the
/// decoded value does not fit in a `usize`.
///
/// # Panics
/// Panics via `unreachable!` if the low nibble is not 1, 2, or 3.
fn read_symbol_id(&'top self) -> IonResult<SymbolId> {
    // Indexed by `length_code - 1`; a `const` so the table is not rebuilt on every call.
    const BIASES: [usize; 3] = [0, 256, 65792];
    let length_code = self.encoded_value.header.low_nibble();
    if (1..=3).contains(&length_code) {
        let (id, _) = self
            .value_body_buffer()
            .read_fixed_uint(length_code.into())?;
        let id = usize::try_from(id.value())?;
        Ok(id + BIASES[(length_code - 1) as usize])
    } else {
        unreachable!("invalid length code for symbol ID");
    }
}

/// Helper method called by [`Self::read_symbol`]. Decodes a single-byte `FixedUInt` from the
/// start of the value's body and returns it as a symbol address.
fn read_system_symbol_address(&self) -> IonResult<SymbolAddress> {
    // `read_fixed_uint` yields the decoded integer paired with a second element that is
    // not needed here.
    let (address, _) = self.value_body_buffer().read_fixed_uint(1)?;
    address.value().expect_usize()
}

/// Helper method called by [`Self::read`]. Reads the current value as a symbol.
fn read_symbol(&'top self) -> IonResult<RawSymbolRef<'top>> {
debug_assert!(self.encoded_value.ion_type() == IonType::Symbol);
let type_code = self.encoded_value.header.ion_type_code;
if type_code == OpcodeType::InlineSymbol {
let raw_bytes = self.value_body();
let text = std::str::from_utf8(raw_bytes)
.map_err(|_| IonError::decoding_error("found symbol with invalid UTF-8 data"))?;
Ok(RawSymbolRef::from(text))
} else if type_code == OpcodeType::SymbolAddress {
let symbol_id = self.read_symbol_id()?;
Ok(RawSymbolRef::SymbolId(symbol_id))
} else {
unreachable!("invalid Opcode type found for symbol");
match type_code {
OpcodeType::InlineSymbol => {
let raw_bytes = self.value_body();
let text = std::str::from_utf8(raw_bytes).map_err(|_| {
IonError::decoding_error("found symbol with invalid UTF-8 data")
})?;
Ok(RawSymbolRef::from(text))
}
OpcodeType::SymbolAddress => {
let symbol_id = self.read_symbol_id()?;
Ok(RawSymbolRef::SymbolId(symbol_id))
}
OpcodeType::SystemSymbolAddress => {
// In order to minimize the changes needed to introduce a second address space
// for symbols in Ion 1.1, system symbol IDs are resolved eagerly and returned
// as `Text`.
// Read the next byte after the opcode as a 1-byte FixedUInt address.
let symbol_address = self.read_system_symbol_address()?;
// SYSTEM_SYMBOLS does not contain $0...
let text = constants::v1_1::SYSTEM_SYMBOLS[symbol_address - 1];
// ...so all of its indexes are shifted by one. ^^^^^^^^^^^^^^^^^^
Ok(RawSymbolRef::Text(text))
}
other => unreachable!("invalid Opcode type found for symbol: {:?}", other),
}
}

Expand Down
Loading

0 comments on commit 042d714

Please sign in to comment.