Skip to content

Commit 569b794

Browse files
committed
Generalize reading methods of PI and element
They are identical except for the type of parser used.
1 parent 6c55956 commit 569b794

File tree

6 files changed

+76
-118
lines changed

6 files changed

+76
-118
lines changed

src/reader/async_tokio.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ use crate::errors::{Error, Result, SyntaxError};
88
use crate::events::Event;
99
use crate::name::{QName, ResolveResult};
1010
use crate::reader::buffered_reader::impl_buffered_source;
11-
use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span};
11+
use crate::reader::{
12+
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
13+
};
1214

1315
/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
1416
///

src/reader/buffered_reader.rs

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ use std::fs::File;
55
use std::io::{self, BufRead, BufReader};
66
use std::path::Path;
77

8-
use crate::errors::{Error, Result, SyntaxError};
8+
use crate::errors::{Error, Result};
99
use crate::events::Event;
1010
use crate::name::QName;
11-
use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource};
11+
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
1212

1313
macro_rules! impl_buffered_source {
1414
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
@@ -91,12 +91,12 @@ macro_rules! impl_buffered_source {
9191
Ok((&buf[start..], done))
9292
}
9393

94-
$($async)? fn read_pi $(<$lf>)? (
94+
$($async)? fn read<$($lf,)? P: Parser>(
9595
&mut self,
9696
buf: &'b mut Vec<u8>,
9797
position: &mut usize,
9898
) -> Result<&'b [u8]> {
99-
let mut parser = super::PiParser::default();
99+
let mut parser = P::default();
100100

101101
let mut read = 0;
102102
let start = buf.len();
@@ -131,7 +131,7 @@ macro_rules! impl_buffered_source {
131131
}
132132

133133
*position += read;
134-
Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
134+
Err(Error::Syntax(P::eof_error()))
135135
}
136136

137137
$($async)? fn read_bang_element $(<$lf>)? (
@@ -184,50 +184,6 @@ macro_rules! impl_buffered_source {
184184
Err(bang_type.to_err())
185185
}
186186

187-
#[inline]
188-
$($async)? fn read_element $(<$lf>)? (
189-
&mut self,
190-
buf: &'b mut Vec<u8>,
191-
position: &mut usize,
192-
) -> Result<&'b [u8]> {
193-
let mut parser = ElementParser::default();
194-
let mut read = 0;
195-
196-
let start = buf.len();
197-
loop {
198-
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
199-
Ok(n) if n.is_empty() => break,
200-
Ok(n) => n,
201-
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
202-
Err(e) => {
203-
*position += read;
204-
return Err(Error::Io(e.into()));
205-
}
206-
};
207-
208-
if let Some(used) = parser.feed(available) {
209-
buf.extend_from_slice(&available[..used]);
210-
211-
// +1 for `>` which we do not include
212-
self $(.$reader)? .consume(used + 1);
213-
read += used + 1;
214-
215-
// Position now just after the `>` symbol
216-
*position += read;
217-
return Ok(&buf[start..]);
218-
}
219-
220-
// The `>` symbol not yet found, continue reading
221-
buf.extend_from_slice(available);
222-
let used = available.len();
223-
self $(.$reader)? .consume(used);
224-
read += used;
225-
}
226-
227-
*position += read;
228-
Err(Error::Syntax(SyntaxError::UnclosedTag))
229-
}
230-
231187
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
232188
loop {
233189
break match self $(.$reader)? .fill_buf() $(.$await)? {

src/reader/element.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
//! Contains a parser for an XML element.
22
3+
use crate::errors::SyntaxError;
4+
use crate::reader::Parser;
5+
36
/// A parser that searches for a `>` symbol in the slice outside of quoted regions.
47
///
58
/// The parser considers two quoted regions: a double-quoted (`"..."`) and
@@ -21,8 +24,9 @@
2124
/// # Example
2225
///
2326
/// ```
24-
/// # use quick_xml::reader::ElementParser;
2527
/// # use pretty_assertions::assert_eq;
28+
/// use quick_xml::reader::{ElementParser, Parser};
29+
///
2630
/// let mut parser = ElementParser::default();
2731
///
2832
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
@@ -47,10 +51,10 @@ pub enum ElementParser {
4751
DoubleQ,
4852
}
4953

50-
impl ElementParser {
54+
impl Parser for ElementParser {
5155
/// Returns number of consumed bytes or `None` if `>` was not found in `bytes`.
5256
#[inline]
53-
pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
57+
fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
5458
for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
5559
*self = match (*self, bytes[i]) {
5660
// only allowed to match `>` while we are in state `Outside`
@@ -67,6 +71,11 @@ impl ElementParser {
6771
}
6872
None
6973
}
74+
75+
#[inline]
76+
fn eof_error() -> SyntaxError {
77+
SyntaxError::UnclosedTag
78+
}
7079
}
7180

7281
impl Default for ElementParser {

src/reader/mod.rs

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ macro_rules! read_until_close {
361361
},
362362
// `<?` - processing instruction
363363
Ok(Some(b'?')) => match $reader
364-
.read_pi($buf, &mut $self.state.offset)
364+
.read::<PiParser>($buf, &mut $self.state.offset)
365365
$(.$await)?
366366
{
367367
Ok(bytes) => $self.state.emit_question_mark(bytes),
@@ -374,7 +374,7 @@ macro_rules! read_until_close {
374374
},
375375
// `<...` - opening or self-closed tag
376376
Ok(Some(_)) => match $reader
377-
.read_element($buf, &mut $self.state.offset)
377+
.read::<ElementParser>($buf, &mut $self.state.offset)
378378
$(.$await)?
379379
{
380380
Ok(bytes) => $self.state.emit_start(bytes),
@@ -763,6 +763,25 @@ impl<R> Reader<R> {
763763

764764
////////////////////////////////////////////////////////////////////////////////////////////////////
765765

766+
/// Used to decouple reading of data from data source and parsing XML structure from it.
767+
///
768+
/// This trait is implemented for every parser that processes a piece of XML grammar.
769+
pub trait Parser: Default {
770+
/// Process new data and try to determine end of the parsed thing.
771+
///
772+
/// Returns position of the end of thing in `bytes` in case of successful search
773+
/// and `None` otherwise.
774+
///
775+
/// # Parameters
776+
/// - `bytes`: a slice to find the end of a thing
777+
/// Should contain text in ASCII-compatible encoding
778+
fn feed(&mut self, bytes: &[u8]) -> Option<usize>;
779+
780+
/// Returns parse error produced by this parser in case of reaching end of
781+
/// input without finding the end of a parsed thing.
782+
fn eof_error() -> SyntaxError;
783+
}
784+
766785
/// Represents an input for a reader that can return borrowed data.
767786
///
768787
/// There are two implementors of this trait: generic one that read data from
@@ -821,20 +840,20 @@ trait XmlSource<'r, B> {
821840

822841
/// Read input until the construct recognized by the parser `P` is finished.
823842
///
824-
/// This method expect that `<?` already was read.
843+
/// This method expects that the start sequence of a parser has already been read.
825844
///
826-
/// Returns a slice of data read up to end of processing instruction (`>`),
827-
/// which does not include into result (`?` at the end included).
845+
/// Returns a slice of data read up to the end of a chunk; the end delimiter
846+
/// itself is not included in the result.
828847
///
829-
/// If input (`Self`) is exhausted and nothing was read, returns `None`.
848+
/// If input (`Self`) is exhausted before the end is found, returns the parser's `eof_error()` as a `SyntaxError`.
830849
///
831850
/// # Parameters
832851
/// - `buf`: Buffer that could be filled from an input (`Self`) and
833852
/// from which [events] could borrow their data
834853
/// - `position`: Will be increased by amount of bytes consumed
835854
///
836855
/// [events]: crate::events::Event
837-
fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>;
856+
fn read<P: Parser>(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>;
838857

839858
/// Read input until comment or CDATA is finished.
840859
///
@@ -853,30 +872,6 @@ trait XmlSource<'r, B> {
853872
/// [events]: crate::events::Event
854873
fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>;
855874

856-
/// Read input until XML element is closed by approaching a `>` symbol.
857-
/// Returns a buffer that contains a data between `<` and `>` or
858-
/// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`.
859-
///
860-
/// Derived from `read_until`, but modified to handle XML attributes
861-
/// using a minimal state machine.
862-
///
863-
/// Attribute values are [defined] as follows:
864-
/// ```plain
865-
/// AttValue := '"' (([^<&"]) | Reference)* '"'
866-
/// | "'" (([^<&']) | Reference)* "'"
867-
/// ```
868-
/// (`Reference` is something like `&quot;`, but we don't care about
869-
/// escaped characters at this level)
870-
///
871-
/// # Parameters
872-
/// - `buf`: Buffer that could be filled from an input (`Self`) and
873-
/// from which [events] could borrow their data
874-
/// - `position`: Will be increased by amount of bytes consumed
875-
///
876-
/// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
877-
/// [events]: crate::events::Event
878-
fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>;
879-
880875
/// Consume and discard all the whitespace until the next non-whitespace
881876
/// character or EOF.
882877
///
@@ -1510,6 +1505,7 @@ mod test {
15101505
mod read_element {
15111506
use super::*;
15121507
use crate::errors::{Error, SyntaxError};
1508+
use crate::reader::ElementParser;
15131509
use crate::utils::Bytes;
15141510
use pretty_assertions::assert_eq;
15151511

@@ -1521,7 +1517,7 @@ mod test {
15211517
let mut input = b"".as_ref();
15221518
// ^= 1
15231519

1524-
match $source(&mut input).read_element(buf, &mut position) $(.$await)? {
1520+
match $source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? {
15251521
Err(Error::Syntax(SyntaxError::UnclosedTag)) => {}
15261522
x => panic!(
15271523
"Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`",
@@ -1543,7 +1539,7 @@ mod test {
15431539
// ^= 2
15441540

15451541
assert_eq!(
1546-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1542+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
15471543
Bytes(b"")
15481544
);
15491545
assert_eq!(position, 2);
@@ -1557,7 +1553,7 @@ mod test {
15571553
// ^= 5
15581554

15591555
assert_eq!(
1560-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1556+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
15611557
Bytes(b"tag")
15621558
);
15631559
assert_eq!(position, 5);
@@ -1571,7 +1567,7 @@ mod test {
15711567
// ^= 3
15721568

15731569
assert_eq!(
1574-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1570+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
15751571
Bytes(b":")
15761572
);
15771573
assert_eq!(position, 3);
@@ -1585,7 +1581,7 @@ mod test {
15851581
// ^= 6
15861582

15871583
assert_eq!(
1588-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1584+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
15891585
Bytes(b":tag")
15901586
);
15911587
assert_eq!(position, 6);
@@ -1599,7 +1595,7 @@ mod test {
15991595
// ^= 39
16001596

16011597
assert_eq!(
1602-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1598+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16031599
Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
16041600
);
16051601
assert_eq!(position, 39);
@@ -1618,7 +1614,7 @@ mod test {
16181614
// ^= 3
16191615

16201616
assert_eq!(
1621-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1617+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16221618
Bytes(b"/")
16231619
);
16241620
assert_eq!(position, 3);
@@ -1632,7 +1628,7 @@ mod test {
16321628
// ^= 6
16331629

16341630
assert_eq!(
1635-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1631+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16361632
Bytes(b"tag/")
16371633
);
16381634
assert_eq!(position, 6);
@@ -1646,7 +1642,7 @@ mod test {
16461642
// ^= 4
16471643

16481644
assert_eq!(
1649-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1645+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16501646
Bytes(b":/")
16511647
);
16521648
assert_eq!(position, 4);
@@ -1660,7 +1656,7 @@ mod test {
16601656
// ^= 7
16611657

16621658
assert_eq!(
1663-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1659+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16641660
Bytes(b":tag/")
16651661
);
16661662
assert_eq!(position, 7);
@@ -1674,7 +1670,7 @@ mod test {
16741670
// ^= 42
16751671

16761672
assert_eq!(
1677-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1673+
Bytes($source(&mut input).read::<ElementParser>(buf, &mut position) $(.$await)? .unwrap()),
16781674
Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
16791675
);
16801676
assert_eq!(position, 42);

0 commit comments

Comments
 (0)