quick_xml/
encoding.rs

1//! A module for wrappers that encode / decode data.
2
3use std::borrow::Cow;
4
5#[cfg(feature = "encoding")]
6use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
7
8#[cfg(feature = "encoding")]
9use crate::Error;
10use crate::Result;
11
/// The Unicode byte order mark (U+FEFF) as it appears in UTF-8: `EF BB BF`.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
/// The Unicode byte order mark (U+FEFF) as it appears in little-endian
/// UTF-16: `FF FE`.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
#[cfg(feature = "encoding")]
pub(crate) const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";
/// The Unicode byte order mark (U+FEFF) as it appears in big-endian
/// UTF-16: `FE FF`.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
#[cfg(feature = "encoding")]
pub(crate) const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
23
/// Converts raw byte slices read from an XML document into strings.
///
/// When the [`encoding`] feature is enabled, the decoder uses the encoding named
/// by the `encoding` key of the XML declaration. It assumes UTF-8 if the document
/// has no `<?xml ?>` declaration, if the declaration does not define the
/// `encoding` key, or if the key names an unknown encoding.
///
/// Any UTF-8 compatible encoding that the `encoding_rs` crate supports can be
/// used. [*UTF-16 and ISO-2022-JP are not supported at present*][utf16].
///
/// When the [`encoding`] feature is disabled, the decoder is always a UTF-8
/// decoder and any XML declaration is ignored.
///
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Decoder {
    /// Encoding applied by the decoding methods; the field exists only when
    /// the `encoding` feature is enabled.
    #[cfg(feature = "encoding")]
    pub(crate) encoding: &'static Encoding,
}
43
44impl Decoder {
45    pub(crate) fn utf8() -> Self {
46        Decoder {
47            #[cfg(feature = "encoding")]
48            encoding: UTF_8,
49        }
50    }
51
52    #[cfg(all(test, feature = "encoding", feature = "serialize"))]
53    pub(crate) fn utf16() -> Self {
54        Decoder { encoding: UTF_16LE }
55    }
56}
57
58impl Decoder {
59    /// Returns the `Reader`s encoding.
60    ///
61    /// This encoding will be used by [`decode`].
62    ///
63    /// [`decode`]: Self::decode
64    #[cfg(feature = "encoding")]
65    pub const fn encoding(&self) -> &'static Encoding {
66        self.encoding
67    }
68
69    /// ## Without `encoding` feature
70    ///
71    /// Decodes an UTF-8 slice regardless of XML declaration and ignoring BOM
72    /// if it is present in the `bytes`.
73    ///
74    /// ## With `encoding` feature
75    ///
76    /// Decodes specified bytes using encoding, declared in the XML, if it was
77    /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
78    /// in the `bytes`.
79    ///
80    /// ----
81    /// Returns an error in case of malformed sequences in the `bytes`.
82    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
83        #[cfg(not(feature = "encoding"))]
84        let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));
85
86        #[cfg(feature = "encoding")]
87        let decoded = decode(bytes, self.encoding);
88
89        decoded
90    }
91
92    /// Like [`decode`][Self::decode] but using a pre-allocated buffer.
93    pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<()> {
94        #[cfg(not(feature = "encoding"))]
95        buf.push_str(std::str::from_utf8(bytes)?);
96
97        #[cfg(feature = "encoding")]
98        decode_into(bytes, self.encoding, buf)?;
99
100        Ok(())
101    }
102
103    /// Decodes the `Cow` buffer, preserves the lifetime
104    pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result<Cow<'b, str>> {
105        match bytes {
106            Cow::Borrowed(bytes) => self.decode(bytes),
107            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
108            Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
109        }
110    }
111}
112
/// Decodes `bytes` using the given `encoding`.
///
/// The data is decoded without BOM handling and without replacement
/// characters, so a sequence that cannot be decoded is reported instead of
/// being substituted.
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
    let decoded = encoding.decode_without_bom_handling_and_without_replacement(bytes);
    match decoded {
        Some(text) => Ok(text),
        None => Err(Error::NonDecodable(None)),
    }
}
122
/// Like [`decode`], but appends the decoded text to the caller-provided `buf`
/// instead of returning a new string.
///
/// Returns an error if `bytes` contains a malformed sequence.
#[cfg(feature = "encoding")]
pub fn decode_into(bytes: &[u8], encoding: &'static Encoding, buf: &mut String) -> Result<()> {
    // UTF-8 input needs no transcoding: validate it and append it as is.
    if encoding == UTF_8 {
        let text = std::str::from_utf8(bytes)?;
        buf.push_str(text);
        return Ok(());
    }

    let mut decoder = encoding.new_decoder_without_bom_handling();

    // `None` can be returned only if the required size would overflow `usize`,
    // but in that case `String::reserve` would panic as well.
    let required = decoder
        .max_utf8_buffer_length_without_replacement(bytes.len())
        .unwrap();
    buf.reserve(required);

    match decoder.decode_to_string_without_replacement(bytes, buf, true) {
        (DecoderResult::InputEmpty, consumed) => {
            debug_assert_eq!(consumed, bytes.len());
            Ok(())
        }
        (DecoderResult::Malformed(..), _) => Err(Error::NonDecodable(None)),
        // Enough space for the whole output was reserved above.
        (DecoderResult::OutputFull, _) => unreachable!(),
    }
}
150
/// Automatic encoding detection of XML files using the
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
///
/// If an encoding is detected, `Some` is returned with the encoding and the
/// size of the BOM in bytes, if detection was performed using a BOM, or zero,
/// if detection was performed without a BOM.
///
/// If the encoding was not recognized, `None` is returned.
///
/// Because the [`encoding_rs`] crate supports only a subset of those encodings,
/// only the supported subset is detected, which is UTF-8, UTF-16 BE and
/// UTF-16 LE.
///
/// The algorithm suggests examining up to the first 4 bytes to determine the
/// encoding according to the following table:
///
/// | Bytes       |Detected encoding
/// |-------------|------------------------------------------
/// | **BOM**
/// |`FE FF ## ##`|UTF-16, big-endian
/// |`FF FE ## ##`|UTF-16, little-endian
/// |`EF BB BF`   |UTF-8
/// | **No BOM**
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
#[cfg(feature = "encoding")]
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
    // The checks run in the same order as the rows of the table above:
    // BOM signatures first, then the start of an `<?xm` declaration.
    if bytes.starts_with(UTF16_BE_BOM) {
        Some((UTF_16BE, 2))
    } else if bytes.starts_with(UTF16_LE_BOM) {
        Some((UTF_16LE, 2))
    } else if bytes.starts_with(UTF8_BOM) {
        Some((UTF_8, 3))
    } else if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) {
        // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
        Some((UTF_16BE, 0))
    } else if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) {
        // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
        Some((UTF_16LE, 0))
    } else if bytes.starts_with(&[b'<', b'?', b'x', b'm']) {
        // Some ASCII compatible encoding
        Some((UTF_8, 0))
    } else {
        None
    }
}
189        _ => None,
190    }
191}