Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds binary 1.1 read support for symbol address annotations #814

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 181 additions & 73 deletions src/lazy/binary/raw/v1_1/immutable_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,9 @@ impl<'a> ImmutableBuffer<'a> {
EExpressionWithLengthPrefix => {
ParseValueExprResult::EExp(self.read_eexp_with_length_prefix(opcode))
}
AnnotationFlexSym => ParseValueExprResult::Value(self.read_annotated_value(opcode)),
AnnotationSymAddress => todo!("symbol address annotations"),
AnnotationFlexSym | AnnotationSymAddress => {
ParseValueExprResult::Value(self.read_annotated_value(opcode))
}
_ if opcode.ion_type().is_some() => {
ParseValueExprResult::Value(self.read_value_without_annotations(opcode))
}
Expand Down Expand Up @@ -505,8 +506,7 @@ impl<'a> ImmutableBuffer<'a> {
}
Ok((None, after_nops))
} else {
self.read_value(opcode)
.map(|(v, after)| (Some(v), after))
self.read_value(opcode).map(|(v, after)| (Some(v), after))
Comment on lines -508 to +509
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ cargo fmt re-flowed some things in this file even though they hadn't changed in this commit. You can ignore this and the one below (lines 630-654).

}
}

Expand Down Expand Up @@ -627,28 +627,33 @@ impl<'a> ImmutableBuffer<'a> {
})?;

let header_offset = input.offset();
let (total_length, length_length, value_body_length, delimited_contents) = if opcode.is_delimited_start() {
let (contents, after) = input.peek_delimited_container(opcode)?;
let total_length = after.offset() - self.offset();
let value_body_length = total_length - 1; // Total length - sizeof(opcode)
(total_length, 0, value_body_length, contents)
} else {
let length = match header.length_type() {
LengthType::InOpcode(n) => FlexUInt::new(0, n as u64),
LengthType::Unknown => FlexUInt::new(0, 0), // Delimited value, we do not know the size.
// This call to `read_value_length` is not always inlined, so we avoid the method call
// if possible.
LengthType::FlexUIntFollows => input.consume(1).read_flex_uint()?.0,
};

let (total_length, length_length, value_body_length, delimited_contents) =
if opcode.is_delimited_start() {
let (contents, after) = input.peek_delimited_container(opcode)?;
let total_length = after.offset() - self.offset();
let value_body_length = total_length - 1; // Total length - sizeof(opcode)
(total_length, 0, value_body_length, contents)
} else {
let length = match header.length_type() {
LengthType::InOpcode(n) => FlexUInt::new(0, n as u64),
LengthType::Unknown => FlexUInt::new(0, 0), // Delimited value, we do not know the size.
// This call to `read_value_length` is not always inlined, so we avoid the method call
// if possible.
LengthType::FlexUIntFollows => input.consume(1).read_flex_uint()?.0,
};

let length_length = length.size_in_bytes() as u8;
let value_length = length.value() as usize; // ha
let total_length = 1 // Header byte
let length_length = length.size_in_bytes() as u8;
let value_length = length.value() as usize; // ha
let total_length = 1 // Header byte
+ length_length as usize
+ value_length;
(total_length, length_length, value_length, DelimitedContents::None)
};
(
total_length,
length_length,
value_length,
DelimitedContents::None,
)
};

if total_length > input.len() {
return IonResult::incomplete(
Expand Down Expand Up @@ -718,8 +723,12 @@ impl<'a> ImmutableBuffer<'a> {
fn read_annotations_sequence(self, opcode: Opcode) -> ParseResult<'a, EncodedAnnotations> {
match opcode.opcode_type {
OpcodeType::AnnotationFlexSym => self.read_flex_sym_annotations_sequence(opcode),
OpcodeType::SymbolAddress => self.read_symbol_address_annotations_sequence(opcode),
_ => unreachable!("read_annotations_sequence called for non-annotations opcode"),
OpcodeType::AnnotationSymAddress => {
self.read_symbol_address_annotations_sequence(opcode)
}
opcode => unreachable!(
"read_annotations_sequence called for non-annotations opcode {opcode:?}"
),
}
}

Expand All @@ -728,68 +737,137 @@ impl<'a> ImmutableBuffer<'a> {
opcode: Opcode,
) -> ParseResult<'a, EncodedAnnotations> {
let input_after_opcode = self.consume(1);
// TODO: This implementation actively reads the annotations, which isn't necessary.
// At this phase of parsing we can just identify the buffer slice that contains
// the annotations and remember their encoding; later on, the annotations iterator
// can actually do the reading. That optimization would be impactful for FlexSyms
// that represent inline text.
Comment on lines -731 to -735
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ I did this optimization for skipping FlexUInt symbol addresses, but doing it for FlexSym would be much more involved. I introduced two methods below to encapsulate that logic:

  • consume_flex_uint (skips without reading)
  • consume_flex_sym (skips by reading)

let (sequence, remaining_input) = match opcode.low_nibble() {
let (header_length, remaining_input) = match opcode.low_nibble() {
7 => {
let (flex_sym, remaining_input) = input_after_opcode.read_flex_sym()?;
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: 1, // 0xE7
sequence_length: u16::try_from(flex_sym.size_in_bytes()).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
Comment on lines -739 to -747
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ Previously I was constructing an EncodedAnnotations struct in each match arm. I lifted out out to the end of the method, which I think helped readability.

(sequence, remaining_input)
let header_length = 1;
let remaining_input = input_after_opcode.consume_flex_sym()?;
(header_length, remaining_input)
}
8 => {
let (flex_sym1, input_after_sym1) = input_after_opcode.read_flex_sym()?;
let (flex_sym2, input_after_sym2) = input_after_sym1.read_flex_sym()?;
let combined_length = flex_sym1.size_in_bytes() + flex_sym2.size_in_bytes();
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: 1, // 0xE8
sequence_length: u16::try_from(combined_length).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
(sequence, input_after_sym2)
let header_length = 1;
let remaining_input = input_after_opcode.consume_flex_sym()?.consume_flex_sym()?;
(header_length, remaining_input)
}
9 => {
let (flex_uint, remaining_input) = input_after_opcode.read_flex_uint()?;
let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length: u8::try_from(1 + flex_uint.size_in_bytes()).map_err(|_| {
IonError::decoding_error("found a 256+ byte annotations header")
})?,
sequence_length: u16::try_from(flex_uint.value()).map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB.",
)
})?,
};
(
sequence,
remaining_input.consume(sequence.sequence_length as usize),
)
let (flex_uint, input_after_header) = input_after_opcode.read_flex_uint()?;
let sequence_length = flex_uint.value() as usize;
if input_after_header.len() < sequence_length {
return IonResult::incomplete(
"reading an annotations sequence",
input_after_header.offset(),
);
}
// Header = 0xE6, followed by FlexUInt size in bytes
let header_length = u8::try_from(1 + flex_uint.size_in_bytes()).map_err(|_| {
IonError::decoding_error("found a 256+ byte annotations header")
})?;
let remaining = input_after_header.consume(sequence_length);
(header_length, remaining)
}
_ => unreachable!("reading flexsym annotations sequence with invalid length code"),
};

let sequence_length =
u16::try_from(remaining_input.offset() - (self.offset() + header_length as usize))
.map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB",
)
})?;

let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::FlexSym,
header_length,
sequence_length,
};
Comment on lines +778 to +782
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ Here's the instantiation of the EncodedAnnotations.


Ok((sequence, remaining_input))
}

/// Skips beyond a `FlexSym` at the head of the buffer, returning the remaining slice.
fn consume_flex_sym(self) -> IonResult<Self> {
// TODO: As an optimization, see if we can avoid actually reading the flex_sym.
let (flex_sym, remaining) = self.read_flex_sym()?;
if let FlexSymValue::Opcode(opcode) = flex_sym.value() {
todo!("FlexSym escapes in annotation sequences; opcode: {opcode:?}");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ The skipper doesn't (yet) attempt to interpret the opcode and skip whatever it indicates.

}

Ok(remaining)
}

/// Skips beyond a `FlexUInt` at the head of the buffer, returning the remaining slice.
fn consume_flex_uint(self) -> IonResult<Self> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ Skips a FlexUInt without actually reading it.

match self.peek_next_byte() {
Some(0) => {
// This is a big FlexUInt; for now, fall back to the read impl.
let (_flex_uint, remaining) = self.read_flex_uint()?;
return Ok(remaining);
}
Some(byte) => {
let num_bytes = (byte.trailing_zeros() + 1) as usize;
// If we have enough data in the buffer, skip over the FlexUInt and return the tail.
if self.len() >= num_bytes {
return Ok(self.consume(num_bytes));
}
// Otherwise, it's incomplete.
}
None => {
// If there's no data in the buffer, it's incomplete.
}
}

IonResult::incomplete("skipping a FlexUInt", self.offset())
}

fn read_symbol_address_annotations_sequence(
self,
_opcode: Opcode,
opcode: Opcode,
) -> ParseResult<'a, EncodedAnnotations> {
todo!()
let input_after_opcode = self.consume(1);
let (header_length, remaining_input) = match opcode.low_nibble() {
4 => {
let remaining = input_after_opcode.consume_flex_uint()?;
(/* Header=0xE4, Length=*/ 1, remaining)
}
5 => {
let remaining = input_after_opcode
.consume_flex_uint()?
.consume_flex_uint()?;
(/* Header=0xE5, Length=*/ 1, remaining)
}
6 => {
let (flex_uint, input_after_header) = input_after_opcode.read_flex_uint()?;
let sequence_length = flex_uint.value() as usize;
if input_after_header.len() < sequence_length {
return IonResult::incomplete(
"reading an annotations sequence",
input_after_header.offset(),
);
}
// Header = 0xE6, followed by FlexUInt size in bytes
let header_length = u8::try_from(1 + flex_uint.size_in_bytes()).map_err(|_| {
IonError::decoding_error("found a 256+ byte annotations header")
})?;
let remaining = input_after_header.consume(sequence_length);
(header_length, remaining)
}
_ => unreachable!("reading flexsym annotations sequence with invalid length code"),
};

let sequence_length =
u16::try_from(remaining_input.offset() - (self.offset() + header_length as usize))
.map_err(|_| {
IonError::decoding_error(
"the maximum supported annotations sequence length is 65KB",
)
})?;

let sequence = EncodedAnnotations {
encoding: AnnotationsEncoding::SymbolAddress,
header_length,
sequence_length,
};
Ok((sequence, remaining_input))
}

#[inline]
Expand Down Expand Up @@ -1104,6 +1182,36 @@ mod tests {
assert_eq!(pad_size, 4);
}

#[rstest]
#[case::single_address(&[0xE4, 0x07], 1, 1)]
#[case::two_addresses(&[0xE5, 0x07, 0x09], 1, 2)]
#[case::three_addresses(&[0xE6, 0x07, 0x07, 0x09, 0x0B], 2, 3)]
#[case::single_flex_sym(&[0xE7, 0x07], 1, 1)]
#[case::two_flex_syms(&[0xE8, 0x07, 0x09], 1, 2)]
#[case::three_flex_syms(&[0xE9, 0x07, 0x07, 0x09, 0x0B], 2, 3)]
fn read_annotations_sequence(
#[case] input: &[u8],
#[case] expected_header_length: usize,
#[case] expected_sequence_length: usize,
) -> IonResult<()> {
let context = EncodingContext::empty();
let buffer = ImmutableBuffer::new(context.get_ref(), input);
let (sequence, remaining) =
buffer.read_annotations_sequence(Opcode::from_byte(buffer.data[0]))?;
assert_eq!(
sequence.header_length as usize, expected_header_length,
"header length actual {} != expected {}",
sequence.header_length as usize, expected_header_length
);
assert_eq!(
sequence.sequence_length as usize, expected_sequence_length,
"sequence length actual {} != expected {}",
sequence.sequence_length as usize, expected_sequence_length
);
assert!(remaining.is_empty(), "remaining input was not empty");
Ok(())
}

fn eexp_test(
macro_source: &str,
encode_macro_fn: impl FnOnce(MacroAddress) -> Vec<u8>,
Expand Down
1 change: 1 addition & 0 deletions src/lazy/binary/raw/v1_1/type_descriptor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ impl Opcode {
(0xD, _) => (Struct, low_nibble, Some(IonType::Struct)),
(0xE, 0x0) => (IonVersionMarker, low_nibble, None),
(0xE, 0x1..=0x3) => (SymbolAddress, low_nibble, Some(IonType::Symbol)),
(0xE, 0x4..=0x6) => (AnnotationSymAddress, low_nibble, None),
(0xE, 0x7..=0x9) => (AnnotationFlexSym, low_nibble, None),
(0xE, 0xA) => (NullNull, low_nibble, Some(IonType::Null)),
(0xE, 0xB) => (TypedNull, low_nibble, Some(IonType::Null)),
Expand Down
Loading