Skip to content
Merged
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ resolve predefined entities.
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.

### Bug Fixes

Expand Down Expand Up @@ -101,6 +102,7 @@ resolve predefined entities.
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
[#753]: https://github.com/tafia/quick-xml/pull/753
[#754]: https://github.com/tafia/quick-xml/pull/754
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
Expand Down
2 changes: 1 addition & 1 deletion src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
};

/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
Expand Down
118 changes: 39 additions & 79 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@ use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;

use crate::errors::{Error, Result, SyntaxError};
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
#[inline]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
use crate::encoding::UTF8_BOM;

Expand All @@ -31,6 +32,7 @@ macro_rules! impl_buffered_source {
}

#[cfg(feature = "encoding")]
#[inline]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Expand Down Expand Up @@ -91,49 +93,50 @@ macro_rules! impl_buffered_source {
Ok((&buf[start..], done))
}

$($async)? fn read_pi $(<$lf>)? (
#[inline]
$($async)? fn read_with<$($lf,)? P: Parser>(
&mut self,
mut parser: P,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
let mut parser = super::PiParser::default();

) -> Result<&'b [u8]> {
let mut read = 0;
let mut done = false;
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

match parser.feed(available) {
Some(i) => {
// We does not include `>` in data
buf.extend_from_slice(&available[..i - 1]);
done = true;
i
}
None => {
buf.extend_from_slice(available);
available.len()
}
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

if let Some(i) = parser.feed(available) {
buf.extend_from_slice(&available[..i]);

// +1 for `>` which we do not include
self $(.$reader)? .consume(i + 1);
read += i + 1;

*position += read;
return Ok(&buf[start..]);
}

// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
*position += read;

Ok((&buf[start..], done))
*position += read;
Err(Error::Syntax(P::eof_error()))
}

#[inline]
$($async)? fn read_bang_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
Expand Down Expand Up @@ -185,49 +188,6 @@ macro_rules! impl_buffered_source {
}

#[inline]
$($async)? fn read_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<&'b [u8]> {
let mut state = ReadElementState::Elem;
let mut read = 0;

let start = buf.len();
loop {
match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(available) => {
if let Some((consumed, used)) = state.change(available) {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
read += used;

// Position now just after the `>` symbol
*position += read;
return Ok(&buf[start..]);
} else {
// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};
}

*position += read;
Err(Error::Syntax(SyntaxError::UnclosedTag))
}

$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Expand All @@ -247,25 +207,25 @@ macro_rules! impl_buffered_source {
}
}

$($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
#[inline]
$($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

match self.peek_one() $(.$await)? ? {
Some(b) if b == byte => {
*position += 1;
self $(.$reader)? .consume(1);
Ok(true)
}
_ => Ok(false),
}
}

#[inline]
$($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => Ok(None),
Ok(n) => Ok(Some(n[0])),
Ok(n) => Ok(n.first().cloned()),
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
};
Expand Down
122 changes: 122 additions & 0 deletions src/reader/element.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
//! Contains a parser for an XML element.

use crate::errors::SyntaxError;
use crate::reader::Parser;

/// A parser that searches for the `>` symbol which ends an element, ignoring
/// any `>` that appears inside a quoted region.
///
/// Two kinds of quoted regions are recognized: double-quoted (`"..."`) and
/// single-quoted (`'...'`). A `>` found inside such a region is not reported
/// as a match. A region starts and ends with the same quote symbol, and that
/// symbol cannot be escaped inside the region (it can only be written as an
/// XML character reference or a named entity, and neither of those contains
/// a literal quote).
///
/// To use the parser, create an instance and [`feed`] data into it. When the
/// `>` is found, [`Some`] with its position in the fed slice is returned.
/// Otherwise [`None`] is returned and the parser remembers whether the slice
/// ended inside a quoted region. Usually you expect the search to succeed, so
/// keep feeding new data until it does.
///
/// NOTE: a parser is intended for a single search. After a successful match
/// it should not be used anymore; create a new parser if you want to perform
/// a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::reader::{ElementParser, Parser};
///
/// let mut parser = ElementParser::default();
///
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<my-element"), None);
/// // ...get a new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
/// //                       ^       ^
/// //                       0       8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ElementParser {
    /// The initial state: inside an element, but outside of any attribute value.
    Outside,
    /// Inside a single-quoted region (`'...'`).
    SingleQ,
    /// Inside a double-quoted region (`"..."`).
    DoubleQ,
}

impl Parser for ElementParser {
    /// Scans `bytes` for the `>` symbol that ends the element.
    ///
    /// Returns `Some(index)` with the index of that `>` inside `bytes`, or
    /// `None` if no `>` outside of a quoted region was found. In the latter
    /// case the quoting state reached at the end of `bytes` is stored in
    /// `self`, so the search can be continued by feeding the next chunk.
    #[inline]
    fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Only `>` and the two quote symbols can influence the result, so jump
        // straight from one of them to the next
        for pos in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
            let byte = bytes[pos];
            match *self {
                Self::Outside => match byte {
                    // `>` counts as the end of the element only outside of quotes
                    b'>' => return Some(pos),
                    b'\'' => *self = Self::SingleQ,
                    b'"' => *self = Self::DoubleQ,
                    _ => {}
                },
                // A quoted region can be closed only by the quote that opened it
                Self::SingleQ if byte == b'\'' => *self = Self::Outside,
                Self::DoubleQ if byte == b'"' => *self = Self::Outside,
                // Any other symbol inside a quoted region changes nothing
                _ => {}
            }
        }
        None
    }

    /// Returns the syntax error that describes an element which was not closed.
    #[inline]
    fn eof_error() -> SyntaxError {
        SyntaxError::UnclosedTag
    }
}

impl Default for ElementParser {
#[inline]
fn default() -> Self {
Self::Outside
}
}

#[test]
fn parse() {
    use pretty_assertions::assert_eq;
    use ElementParser::*;

    /// Feeds `bytes` once to a parser that starts in the `initial` state.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the element ends,
    /// or `Err(state)` with the state the parser was left in when the end of
    /// the element was not found.
    fn feed_once(bytes: &[u8], initial: ElementParser) -> Result<usize, ElementParser> {
        let mut parser = initial;
        let found = parser.feed(bytes);
        found.ok_or(parser)
    }

    // Empty input: nothing is found and the state is unchanged
    assert_eq!(feed_once(b"", Outside), Err(Outside));
    assert_eq!(feed_once(b"", SingleQ), Err(SingleQ));
    assert_eq!(feed_once(b"", DoubleQ), Err(DoubleQ));

    // A single quote toggles only the single-quoted region
    assert_eq!(feed_once(b"'", Outside), Err(SingleQ));
    assert_eq!(feed_once(b"'", SingleQ), Err(Outside));
    assert_eq!(feed_once(b"'", DoubleQ), Err(DoubleQ));

    // A double quote toggles only the double-quoted region
    assert_eq!(feed_once(b"\"", Outside), Err(DoubleQ));
    assert_eq!(feed_once(b"\"", SingleQ), Err(SingleQ));
    assert_eq!(feed_once(b"\"", DoubleQ), Err(Outside));

    // `>` is matched only outside of quoted regions
    assert_eq!(feed_once(b">", Outside), Ok(0));
    assert_eq!(feed_once(b">", SingleQ), Err(SingleQ));
    assert_eq!(feed_once(b">", DoubleQ), Err(DoubleQ));

    // Quotes followed by `>`: the result depends on the starting state
    assert_eq!(feed_once(b"''>", Outside), Ok(2));
    assert_eq!(feed_once(b"''>", SingleQ), Err(SingleQ));
    assert_eq!(feed_once(b"''>", DoubleQ), Err(DoubleQ));
}
Loading