From 4522d9dfa073840a8da2b477948a092fb2d4af33 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 1 Jun 2024 20:20:08 +0500 Subject: [PATCH] Allow `&#0;` character references We should either restrict all invalid characters, both in literal form and as character references, or none of them. Disallowing only this one character is inconsistent. Because checking literal forms means that we would have to decode and check all of the input, this would hurt performance. We are not ready to accept that performance loss for now. Users of the Reader API can perform such checks themselves. --- Changelog.md | 2 ++ src/escape.rs | 14 ++------------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Changelog.md b/Changelog.md index dfed863b..ae887d94 100644 --- a/Changelog.md +++ b/Changelog.md @@ -80,6 +80,7 @@ resolve predefined entities. - [#734]: No longer resolve predefined entities (`lt`, `gt`, `apos`, `quot`, `amp`) in `unescape_with` family of methods. You should do that by yourself using the methods listed above. +- [#750]: Remove `EscapeError::EntityWithNull` and allow `&#0;` character reference. [#275]: https://github.com/tafia/quick-xml/issues/275 [#362]: https://github.com/tafia/quick-xml/issues/362 @@ -98,6 +99,7 @@ resolve predefined entities. [#738]: https://github.com/tafia/quick-xml/pull/738 [#743]: https://github.com/tafia/quick-xml/pull/743 [#748]: https://github.com/tafia/quick-xml/pull/748 +[#750]: https://github.com/tafia/quick-xml/pull/750 [`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html diff --git a/src/escape.rs b/src/escape.rs index 54972624..48946a38 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -10,8 +10,6 @@ use pretty_assertions::assert_eq; /// Error for XML escape / unescape. 
#[derive(Clone, Debug)] pub enum EscapeError { - /// Entity with Null character - EntityWithNull(Range), /// Unrecognized escape symbol UnrecognizedSymbol(Range, String), /// Cannot find `;` after `&` @@ -31,11 +29,6 @@ pub enum EscapeError { impl std::fmt::Display for EscapeError { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - EscapeError::EntityWithNull(e) => write!( - f, - "Error while escaping character at range {:?}: Null character entity not allowed", - e - ), EscapeError::UnrecognizedSymbol(rge, res) => write!( f, "Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}", @@ -251,7 +244,7 @@ where // search for character correctness let pat = &raw[start + 1..end]; if let Some(entity) = pat.strip_prefix('#') { - let codepoint = parse_number(entity, start..end)?; + let codepoint = parse_number(entity)?; unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4])); } else if let Some(value) = resolve_entity(pat) { unescaped.push_str(value); @@ -1792,15 +1785,12 @@ pub fn resolve_html5_entity(entity: &str) -> Option<&'static str> { Some(s) } -fn parse_number(bytes: &str, range: Range) -> Result { +fn parse_number(bytes: &str) -> Result { let code = if let Some(hex_digits) = bytes.strip_prefix('x') { parse_hexadecimal(hex_digits) } else { parse_decimal(bytes) }?; - if code == 0 { - return Err(EscapeError::EntityWithNull(range)); - } match std::char::from_u32(code) { Some(c) => Ok(c), None => Err(EscapeError::InvalidCodepoint(code)),