From 6a48a28f4cc182112d9005f66f8ab4336ace3293 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 3 Jul 2024 20:03:05 +0500 Subject: [PATCH] Allow to have attributes in closing tags (compatibility with the Adobe Flash parser) --- Changelog.md | 2 + src/reader/buffered_reader.rs | 48 ------- src/reader/mod.rs | 235 +++++++++++++--------------------- src/reader/slice_reader.rs | 23 ---- tests/issues.rs | 27 ++++ 5 files changed, 117 insertions(+), 218 deletions(-) diff --git a/Changelog.md b/Changelog.md index 35b51fc1..c36ddad8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -24,7 +24,9 @@ ### Misc Changes - [#780]: `reader::Parser`, `reader::ElementParser` and `reader::PiParser` moved to the new module `parser`. +- [#776]: Allow to have attributes in the end tag for compatibility reasons with Adobe Flash XML parser. +[#776]: https://github.com/tafia/quick-xml/issues/776 [#780]: https://github.com/tafia/quick-xml/pull/780 [#781]: https://github.com/tafia/quick-xml/pull/781 diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index c8af425e..95ac1dc7 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -101,54 +101,6 @@ macro_rules! impl_buffered_source { ReadTextResult::UpToEof(&buf[start..]) } - #[inline] - $($async)? fn read_bytes_until $(<$lf>)? ( - &mut self, - byte: u8, - buf: &'b mut Vec<u8>, - position: &mut u64, - ) -> io::Result<(&'b [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - let mut read = 0; - let start = buf.len(); - loop { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(e); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - - let used = i + 1; - self $(.$reader)? 
.consume(used); - read += used as u64; - - *position += read; - return Ok((&buf[start..], true)); - } - None => { - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used as u64; - } - } - } - - *position += read; - Ok((&buf[start..], false)) - } - #[inline] $($async)? fn read_with<$($lf,)? P: Parser>( &mut self, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 46a30e86..6e030b73 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -345,18 +345,26 @@ macro_rules! read_until_close { } }, // `` we will parse `` as end tag + // `` which probably no one existing parser + // does. This is malformed XML, however it is tolerated by some parsers + // (e.g. the one used by Adobe Flash) and such documents do exist in the wild. Ok(Some(b'/')) => match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) + .read_with(ElementParser::Outside, $buf, &mut $self.state.offset) $(.$await)? { - Ok((bytes, true)) => $self.state.emit_end(bytes), - Ok((_, false)) => { + Ok(bytes) => $self.state.emit_end(bytes), + Err(e) => { // We want to report error at `<`, but offset was increased, // so return it back (-1 for `<`) $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) + Err(e) } - Err(e) => Err(Error::Io(e.into())), }, // ` match $reader @@ -824,39 +832,6 @@ trait XmlSource<'r, B> { /// [events]: crate::events::Event fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>; - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte` (exclusive), - /// and a flag noting whether `byte` was found in the input or not. 
- /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// (b"abc".as_ref(), true) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut u64, - ) -> io::Result<(&'r [u8], bool)>; - /// Read input until processing instruction is finished. /// /// This method expect that start sequence of a parser already was read. @@ -1022,115 +997,6 @@ mod test { $buf:expr $(, $async:ident, $await:ident)? ) => { - mod read_bytes_until { - use super::*; - // Use Bytes for printing bytes as strings for ASCII range - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that search in the empty buffer returns `None` - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 0; - let mut input = b"".as_ref(); - // ^= 0 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), false) - ); - assert_eq!(position, 0); - } - - /// Checks that search in the buffer non-existent value returns entire buffer - /// as a result and set `position` to `len()` - #[$test] - $($async)? fn non_existent() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef".as_ref(); - // ^= 6 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? 
- .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), false) - ); - assert_eq!(position, 6); - } - - /// Checks that search in the buffer an element that is located in the front of - /// buffer returns empty slice as a result and set `position` to one symbol - /// after match (`1`) - #[$test] - $($async)? fn at_the_start() { - let buf = $buf; - let mut position = 0; - let mut input = b"*abcdef".as_ref(); - // ^= 1 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), true) - ); - assert_eq!(position, 1); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the middle of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match - #[$test] - $($async)? fn inside() { - let buf = $buf; - let mut position = 0; - let mut input = b"abc*def".as_ref(); - // ^= 4 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abc"), true) - ); - assert_eq!(position, 4); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the end of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match (`len()`) - #[$test] - $($async)? fn in_the_end() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef*".as_ref(); - // ^= 7 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? 
- .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), true) - ); - assert_eq!(position, 7); // position after the symbol matched - } - } - mod read_bang_element { use super::*; use crate::errors::{Error, SyntaxError}; @@ -1693,6 +1559,81 @@ mod test { assert_eq!(position, 42); } } + + mod close { + use super::*; + use pretty_assertions::assert_eq; + + #[$test] + $($async)? fn empty_tag() { + let buf = $buf; + let mut position = 1; + let mut input = b"/ >".as_ref(); + // ^= 4 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/ ") + ); + assert_eq!(position, 4); + } + + #[$test] + $($async)? fn normal() { + let buf = $buf; + let mut position = 1; + let mut input = b"/tag>".as_ref(); + // ^= 6 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/tag") + ); + assert_eq!(position, 6); + } + + #[$test] + $($async)? fn empty_ns_empty_tag() { + let buf = $buf; + let mut position = 1; + let mut input = b"/:>".as_ref(); + // ^= 4 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/:") + ); + assert_eq!(position, 4); + } + + #[$test] + $($async)? fn empty_ns() { + let buf = $buf; + let mut position = 1; + let mut input = b"/:tag>".as_ref(); + // ^= 7 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/:tag") + ); + assert_eq!(position, 7); + } + + #[$test] + $($async)? fn with_attributes() { + let buf = $buf; + let mut position = 1; + let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); + // ^= 40 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
.unwrap()), + Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#) + ); + assert_eq!(position, 40); + } + } } /// Ensures, that no empty `Text` events are generated diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 1afc3e36..6b4c2804 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -284,29 +284,6 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } - #[inline] - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut u64, - ) -> io::Result<(&'a [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - if let Some(i) = memchr::memchr(byte, self) { - *position += i as u64 + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - Ok((bytes, true)) - } else { - *position += self.len() as u64; - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) - } - } - #[inline] fn read_with

<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]> where diff --git a/tests/issues.rs b/tests/issues.rs index e4a75696..c14f09c6 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -364,3 +364,30 @@ fn issue774() { Event::End(BytesEnd::new("tag")) ); } + +/// Regression test for https://github.com/tafia/quick-xml/issues/776 +#[test] +fn issue776() { + let mut reader = Reader::from_str(r#"<tag></tag/><tag></tag attr=">">"#); + // We still think that the name of the end tag is everything between `</` and `>` + // and if we do not disable this check we get error + reader.config_mut().check_end_names = false; + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new("tag/")) + ); + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new(r#"tag attr=">""#)) + ); +}