quick_xml/reader/
mod.rs

//! Contains a high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, Result, SyntaxError};
10use crate::events::Event;
11use crate::parser::{ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by modifying the properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of a mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both of these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both of these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
177
178impl Config {
179    /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
180    ///
181    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
182    ///
183    /// WARNING: With this option every text events will be trimmed which is
184    /// incorrect behavior when text events delimited by comments, processing
185    /// instructions or CDATA sections. To correctly trim data manually apply
186    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
187    /// only to necessary events.
188    /// </div>
189    ///
190    /// [`trim_text_start`]: Self::trim_text_start
191    /// [`trim_text_end`]: Self::trim_text_end
192    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
193    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
194    #[inline]
195    pub fn trim_text(&mut self, trim: bool) {
196        self.trim_text_start = trim;
197        self.trim_text_end = trim;
198    }
199
200    /// Turn on or off all checks for well-formedness. Currently it is that settings:
201    /// - [`check_comments`](Self::check_comments)
202    /// - [`check_end_names`](Self::check_end_names)
203    #[inline]
204    pub fn enable_all_checks(&mut self, enable: bool) {
205        self.check_comments = enable;
206        self.check_end_names = enable;
207    }
208}
209
210impl Default for Config {
211    fn default() -> Self {
212        Self {
213            allow_unmatched_ends: false,
214            check_comments: false,
215            check_end_names: true,
216            expand_empty_elements: false,
217            trim_markup_names_in_closing_tags: true,
218            trim_text_start: false,
219            trim_text_end: false,
220        }
221    }
222}
223
224////////////////////////////////////////////////////////////////////////////////////////////////////
225
/// Shared body of the "read next event" methods. Drives the [`ParseState`]
/// state machine stored in `$self.state.state` until one event (or an error)
/// is produced and evaluates to that result.
///
/// Parameters:
/// - `$self`: the reader; its `state` field (parse state, config, offset,
///   encoding) is read and updated
/// - `$buf`: identifier of the buffer passed to `$reader.read_text` and to
///   `$read_until_close`; it is reassigned when `read_text` hands the buffer back
/// - `$reader`: expression for the data source; must provide `detect_encoding`
///   (with the `encoding` feature) or `remove_utf8_bom` (without it),
///   `skip_whitespace` and `read_text`
/// - `$read_until_close`: name of the method on `$self` that parses one markup
///   item after `<` was seen
/// - `$await`: optional `await` token, supplied by asynchronous callers
///
/// After the event is computed, the state is forced to `Done` on `Eof` and on
/// every error except `Error::IllFormed`.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If the encoding was set explicitly, we do not need to detect it. For example,
                    // explicit UTF-8 is set automatically if the Reader was created using `from_str`.
                    // But we still need to remove the BOM for consistency with the code path
                    // used when the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // No event is produced for this transition; loop again
                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideText => { // Go to InsideMarkup or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            // Trailing text that is empty is reported as `Eof` directly
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we already consume the wrong data
            // and change the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as `Eof`, terminates parsing
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
303
/// Read bytes up to the `>` and skip it. This macro is expected to be expanded
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Sets the parser state to `InsideText` before reading anything.
///
/// Parameters:
/// - `$self`: the reader whose `state` is updated
/// - `$buf`: buffer handed to the `read_bang_element` / `read_with` calls
/// - `$reader`: expression for the data source; must provide `peek_one`,
///   `read_bang_element` and `read_with`
/// - `$await`: optional `await` token, supplied by asynchronous callers
///
/// When reading the markup fails, or the input ends right after `<`,
/// `last_error_offset` is set to the position of the `<`. An error returned by
/// `peek_one` itself is wrapped into `Error::Io` and leaves `last_error_offset`
/// untouched.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        // Offset just after the already consumed `<`
        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we anyway allow to parse them because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which user should call manually
            // - if we just look for `>` we will parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >` which probably no one existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`, but offset was increased,
                // so return it back (-1 for `<`)
                $self.state.last_error_offset = start - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            // Failure while peeking the next byte
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
406
/// Generalization of the `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event that closes the element named `$end`
/// (nested elements with the same name are tracked) and evaluates to the span
/// from the position at which the macro was entered up to the beginning of
/// that `End` event. On a read error, or when `Eof` is met first, returns
/// from the enclosing function with the error.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // The end of the span is the position recorded right before the matching
        // End event is read, so that position must be exactly where the End event
        // begins. If only whitespace separates the previous event from the End
        // event and `trim_text_start` is enabled, the whitespace would be skipped
        // without producing an event and the recorded position would point before
        // it. Start trimming is therefore switched off for the duration of the scan.
        //
        // Recording the position after the End event is not an option either:
        // with `trim_markup_names_in_closing_tags` enabled (the default) the size
        // that the End event occupies in the source is unknown, so the position
        // cannot be corrected backwards. Tweaking the configuration is unavoidable.
        let saved_trim = std::mem::replace(&mut $self.config_mut().trim_text_start, false);

        let span_start = $self.buffer_position();
        let mut nesting = 0;
        // The loop only decides the outcome; the configuration is restored in
        // one place, after the loop, for every way of leaving it
        let outcome = loop {
            $clear
            let event_start = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => break Err(e),
                Ok(Event::Eof) => break Err(Error::missed_end($end, $self.decoder())),

                Ok(Event::Start(e)) if e.name() == $end => nesting += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if nesting == 0 {
                        break Ok(span_start..event_start);
                    }
                    nesting -= 1;
                }
                _ => (),
            }
        };
        $self.config_mut().trim_text_start = saved_trim;

        match outcome {
            Ok(span) => span,
            Err(e) => return Err(e),
        }
    }};
}
461
462#[cfg(feature = "async-tokio")]
463mod async_tokio;
464mod buffered_reader;
465mod ns_reader;
466mod slice_reader;
467mod state;
468
469pub use ns_reader::NsReader;
470
/// Range of the input, in bytes, that corresponds to some piece of XML.
/// As with any [`Range`], the start is inclusive and the end is exclusive.
pub type Span = Range<u64>;
473
474////////////////////////////////////////////////////////////////////////////////////////////////////
475
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init         -- "(no event)"\n                                       --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText   -- "#lt;false#gt;\n(no event)"\nText                    --> InsideMarkup
///   end
///   InsideText     -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty    -- End                   --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Leaving this
    /// state never emits an event: `read_event_impl!` only performs encoding
    /// detection / BOM removal and then switches to `InsideText`. The reader
    /// will never return to this state.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `InsideMarkup` state.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters to this state when it is in a `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which reader returned to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when an `Eof` event is generated or an error
    /// other than `Error::IllFormed` occurred (after an ill-formed error parsing
    /// may continue). This is the last state, the reader stays in it forever.
    Done,
}
520
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by a BOM or by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        // `EncodingRef` is `Copy`, so the payload can be bound by value
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }
    /// Returns `true` while the encoding is only a guess (`Implicit` or
    /// `BomDetected`) and `false` once it is final (`Explicit` or `XmlDetected`).
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
567
568////////////////////////////////////////////////////////////////////////////////////////////////////
569
/// Byte-level access to the reader wrapped by a [`Reader`]. Every byte consumed
/// through this stream is added to the shared offset, which keeps
/// [`Reader::buffer_position()`] accurate.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source of bytes
    inner: &'r mut R,
    /// Position counter that is advanced by the number of bytes consumed
    offset: &'r mut u64,
}

impl<R> BinaryStream<'_, R> {
    /// Returns the current position, in bytes, in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Gets a shared reference to the wrapped reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        &*self.inner
    }

    /// Gets a mutable reference to the wrapped reader.
    ///
    /// Reading through the returned reference bypasses the position tracking,
    /// so error positions reported afterwards will be wrong. Read from this
    /// stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        &mut *self.inner
    }
}

impl<R: io::Read> io::Read for BinaryStream<'_, R> {
    /// Reads from the wrapped reader and advances the offset by the number of
    /// bytes actually read. On error the offset is left unchanged.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(count) => {
                *self.offset += count as u64;
                Ok(count)
            }
            Err(err) => Err(err),
        }
    }
}

impl<R: io::BufRead> io::BufRead for BinaryStream<'_, R> {
    /// Exposes the buffer of the wrapped reader. Nothing is consumed, so the
    /// offset does not change.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Marks `amt` bytes as consumed in the wrapped reader and advances the
    /// offset by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        self.inner.consume(amt);
        *self.offset += amt as u64;
    }
}
629
630////////////////////////////////////////////////////////////////////////////////////////////////////
631
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
693
694/// Builder methods
695impl<R> Reader<R> {
696    /// Creates a `Reader` that reads from a given reader.
697    pub fn from_reader(reader: R) -> Self {
698        Self {
699            reader,
700            state: ReaderState::default(),
701        }
702    }
703
704    /// Returns reference to the parser configuration
705    pub const fn config(&self) -> &Config {
706        &self.state.config
707    }
708
709    /// Returns mutable reference to the parser configuration
710    pub fn config_mut(&mut self) -> &mut Config {
711        &mut self.state.config
712    }
713}
714
715/// Getters
716impl<R> Reader<R> {
717    /// Consumes `Reader` returning the underlying reader
718    ///
719    /// Can be used to compute line and column of a parsing error position
720    ///
721    /// # Examples
722    ///
723    /// ```
724    /// # use pretty_assertions::assert_eq;
725    /// use std::{str, io::Cursor};
726    /// use quick_xml::events::Event;
727    /// use quick_xml::reader::Reader;
728    ///
729    /// let xml = r#"<tag1 att1 = "test">
730    ///                 <tag2><!--Test comment-->Test</tag2>
731    ///                 <tag3>Test 2</tag3>
732    ///              </tag1>"#;
733    /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
734    /// let mut buf = Vec::new();
735    ///
736    /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
737    ///     // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
738    ///     let end_pos = reader.buffer_position() as usize;
739    ///     let mut cursor = reader.into_inner();
740    ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
741    ///         .expect("can't make a string");
742    ///     let mut line = 1;
743    ///     let mut column = 0;
744    ///     for c in s.chars() {
745    ///         if c == '\n' {
746    ///             line += 1;
747    ///             column = 0;
748    ///         } else {
749    ///             column += 1;
750    ///         }
751    ///     }
752    ///     (line, column)
753    /// }
754    ///
755    /// loop {
756    ///     match reader.read_event_into(&mut buf) {
757    ///         Ok(Event::Start(ref e)) => match e.name().as_ref() {
758    ///             b"tag1" | b"tag2" => (),
759    ///             tag => {
760    ///                 assert_eq!(b"tag3", tag);
761    ///                 assert_eq!((3, 22), into_line_and_column(reader));
762    ///                 break;
763    ///             }
764    ///         },
765    ///         Ok(Event::Eof) => unreachable!(),
766    ///         _ => (),
767    ///     }
768    ///     buf.clear();
769    /// }
770    /// ```
771    pub fn into_inner(self) -> R {
772        self.reader
773    }
774
775    /// Gets a reference to the underlying reader.
776    pub const fn get_ref(&self) -> &R {
777        &self.reader
778    }
779
780    /// Gets a mutable reference to the underlying reader.
781    ///
782    /// Avoid read from this reader because this will not update reader's position
783    /// and will lead to incorrect positions of errors. If you want to read, use
784    /// [`stream()`] instead.
785    ///
786    /// [`stream()`]: Self::stream
787    pub fn get_mut(&mut self) -> &mut R {
788        &mut self.reader
789    }
790
791    /// Gets the current byte position in the input data.
792    pub const fn buffer_position(&self) -> u64 {
793        // when internal state is InsideMarkup, we have actually read until '<',
794        // which we don't want to show
795        if let ParseState::InsideMarkup = self.state.state {
796            self.state.offset - 1
797        } else {
798            self.state.offset
799        }
800    }
801
802    /// Gets the last error byte position in the input data. If there is no errors
803    /// yet, returns `0`.
804    ///
805    /// Unlike `buffer_position` it will point to the place where it is rational
806    /// to report error to the end user. For example, all [`SyntaxError`]s are
807    /// reported when the parser sees EOF inside of some kind of markup. The
808    /// `buffer_position()` will point to the last byte of input which is not
809    /// very useful. `error_position()` will point to the start of corresponding
810    /// markup element (i. e. to the `<` character).
811    ///
812    /// This position is always `<= buffer_position()`.
813    pub const fn error_position(&self) -> u64 {
814        self.state.last_error_offset
815    }
816
817    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
818    ///
819    /// If [`encoding`] feature is enabled, the used encoding may change after
820    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
821    ///
822    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
823    /// defaults to UTF-8.
824    ///
825    /// [`encoding`]: ../index.html#encoding
826    #[inline]
827    pub const fn decoder(&self) -> Decoder {
828        self.state.decoder()
829    }
830
831    /// Get the direct access to the underlying reader, but tracks the amount of
832    /// read data and update [`Reader::buffer_position()`] accordingly.
833    ///
834    /// Note, that this method gives you access to the internal reader and read
835    /// data will not be returned in any subsequent events read by `read_event`
836    /// family of methods.
837    ///
838    /// # Example
839    ///
840    /// This example demonstrates how to read stream raw bytes from an XML document.
841    /// This could be used to implement streaming read of text, or to read raw binary
842    /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
843    /// valid XML, but XML-derived file formats exist where such documents are valid).
844    ///
845    /// ```
846    /// # use pretty_assertions::assert_eq;
847    /// use std::io::{BufRead, Read};
848    /// use quick_xml::events::{BytesEnd, BytesStart, Event};
849    /// use quick_xml::reader::Reader;
850    ///
851    /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
852    /// //                                 ^    ^               ^     ^
853    /// //                                 0    5              21    27
854    ///
855    /// assert_eq!(
856    ///     (reader.read_event().unwrap(), reader.buffer_position()),
857    ///     // 5 - end of the `<tag>`
858    ///     (Event::Start(BytesStart::new("tag")), 5)
859    /// );
860    ///
861    /// // Reading directly from underlying reader will not update position
862    /// // let mut inner = reader.get_mut();
863    ///
864    /// // Reading from the stream() advances position
865    /// let mut inner = reader.stream();
866    ///
867    /// // Read binary data. We must know its size
868    /// let mut binary = [0u8; 16];
869    /// inner.read_exact(&mut binary).unwrap();
870    /// assert_eq!(&binary, b"binary << data&>");
871    /// // 21 - end of the `binary << data&>`
872    /// assert_eq!(inner.offset(), 21);
873    /// assert_eq!(reader.buffer_position(), 21);
874    ///
875    /// assert_eq!(
876    ///     (reader.read_event().unwrap(), reader.buffer_position()),
877    ///     // 27 - end of the `</tag>`
878    ///     (Event::End(BytesEnd::new("tag")), 27)
879    /// );
880    ///
881    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
882    /// ```
883    #[inline]
884    pub fn stream(&mut self) -> BinaryStream<R> {
885        BinaryStream {
886            inner: &mut self.reader,
887            offset: &mut self.state.offset,
888        }
889    }
890}
891
892/// Private sync reading methods
893impl<R> Reader<R> {
894    /// Read text into the given buffer, and return an event that borrows from
895    /// either that buffer or from the input itself, based on the type of the
896    /// reader.
897    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
898    where
899        R: XmlSource<'i, B>,
900    {
901        read_event_impl!(self, buf, self.reader, read_until_close)
902    }
903
904    /// Private function to read until `>` is found. This function expects that
905    /// it was called just after encounter a `<` symbol.
906    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
907    where
908        R: XmlSource<'i, B>,
909    {
910        read_until_close!(self, buf, self.reader)
911    }
912}
913
914////////////////////////////////////////////////////////////////////////////////////////////////////
915
/// Outcome of an attempt to read XML textual data from the reader.
enum ReadTextResult<'r, B> {
    /// The very first byte was the start of markup (`<` character).
    /// Carries the buffer, which is handed back so it can be used on the next
    /// iteration cycle (needed to satisfy the borrow checker).
    Markup(B),
    /// A block of text that ends right before the start of markup (`<` character).
    UpToMarkup(&'r [u8]),
    /// A block of text that lasts until EOF; no start of markup (`<` character)
    /// was found.
    UpToEof(&'r [u8]),
    /// An IO error happened while reading.
    Err(io::Error),
}
929
930/// Represents an input for a reader that can return borrowed data.
931///
932/// There are two implementors of this trait: generic one that read data from
933/// `Self`, copies some part of it into a provided buffer of type `B` and then
934/// returns data that borrow from that buffer.
935///
936/// The other implementor is for `&[u8]` and instead of copying data returns
937/// borrowed data from `Self` instead. This implementation allows zero-copy
938/// deserialization.
939///
940/// # Parameters
941/// - `'r`: lifetime of a buffer from which events will borrow
942/// - `B`: a type of a buffer that can be used to store data read from `Self` and
943///   from which events can borrow
944trait XmlSource<'r, B> {
945    /// Removes UTF-8 BOM if it is present
946    #[cfg(not(feature = "encoding"))]
947    fn remove_utf8_bom(&mut self) -> io::Result<()>;
948
949    /// Determines encoding from the start of input and removes BOM if it is present
950    #[cfg(feature = "encoding")]
951    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;
952
953    /// Read input until start of markup (the `<`) is found or end of input is reached.
954    ///
955    /// # Parameters
956    /// - `buf`: Buffer that could be filled from an input (`Self`) and
957    ///   from which [events] could borrow their data
958    /// - `position`: Will be increased by amount of bytes consumed
959    ///
960    /// [events]: crate::events::Event
961    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;
962
963    /// Read input until processing instruction is finished.
964    ///
965    /// This method expect that start sequence of a parser already was read.
966    ///
967    /// Returns a slice of data read up to the end of the thing being parsed.
968    /// The end of thing and the returned content is determined by the used parser.
969    ///
970    /// If input (`Self`) is exhausted and no bytes was read, or if the specified
971    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
972    ///
973    /// # Parameters
974    /// - `buf`: Buffer that could be filled from an input (`Self`) and
975    ///   from which [events] could borrow their data
976    /// - `position`: Will be increased by amount of bytes consumed
977    ///
978    /// A `P` type parameter is used to preserve state between calls to the underlying
979    /// reader which provides bytes fed into the parser.
980    ///
981    /// [events]: crate::events::Event
982    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8]>
983    where
984        P: Parser;
985
986    /// Read input until comment or CDATA is finished.
987    ///
988    /// This method expect that `<` already was read.
989    ///
990    /// Returns a slice of data read up to end of comment or CDATA (`>`),
991    /// which does not include into result.
992    ///
993    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
994    ///
995    /// # Parameters
996    /// - `buf`: Buffer that could be filled from an input (`Self`) and
997    ///   from which [events] could borrow their data
998    /// - `position`: Will be increased by amount of bytes consumed
999    ///
1000    /// [events]: crate::events::Event
1001    fn read_bang_element(&mut self, buf: B, position: &mut u64) -> Result<(BangType, &'r [u8])>;
1002
1003    /// Consume and discard all the whitespace until the next non-whitespace
1004    /// character or EOF.
1005    ///
1006    /// # Parameters
1007    /// - `position`: Will be increased by amount of bytes consumed
1008    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;
1009
1010    /// Return one character without consuming it, so that future `read_*` calls
1011    /// will still include it. On EOF, return `None`.
1012    fn peek_one(&mut self) -> io::Result<Option<u8>>;
1013}
1014
/// Kinds of markup that start with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Holds the balance of `<` (+1) and `>` (-1) seen so far
    DocType(i32),
}
1025impl BangType {
1026    #[inline(always)]
1027    const fn new(byte: Option<u8>) -> Result<Self> {
1028        Ok(match byte {
1029            Some(b'[') => Self::CData,
1030            Some(b'-') => Self::Comment,
1031            Some(b'D') | Some(b'd') => Self::DocType(0),
1032            _ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)),
1033        })
1034    }
1035
1036    /// If element is finished, returns its content up to `>` symbol and
1037    /// an index of this symbol, otherwise returns `None`
1038    ///
1039    /// # Parameters
1040    /// - `buf`: buffer with data consumed on previous iterations
1041    /// - `chunk`: data read on current iteration and not yet consumed from reader
1042    #[inline(always)]
1043    fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1044        match self {
1045            Self::Comment => {
1046                for i in memchr::memchr_iter(b'>', chunk) {
1047                    // Need to read at least 6 symbols (`!---->`) for properly finished comment
1048                    // <!----> - XML comment
1049                    //  012345 - i
1050                    if buf.len() + i > 4 {
1051                        if chunk[..i].ends_with(b"--") {
1052                            // We cannot strip last `--` from the buffer because we need it in case of
1053                            // check_comments enabled option. XML standard requires that comment
1054                            // will not end with `--->` sequence because this is a special case of
1055                            // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1056                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1057                        }
1058                        // End sequence `-|->` was splitted at |
1059                        //        buf --/   \-- chunk
1060                        if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1061                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1062                        }
1063                        // End sequence `--|>` was splitted at |
1064                        //         buf --/   \-- chunk
1065                        if i == 0 && buf.ends_with(b"--") {
1066                            return Some((&[], i + 1)); // +1 for `>`
1067                        }
1068                    }
1069                }
1070            }
1071            Self::CData => {
1072                for i in memchr::memchr_iter(b'>', chunk) {
1073                    if chunk[..i].ends_with(b"]]") {
1074                        return Some((&chunk[..i], i + 1)); // +1 for `>`
1075                    }
1076                    // End sequence `]|]>` was splitted at |
1077                    //        buf --/   \-- chunk
1078                    if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1079                        return Some((&chunk[..i], i + 1)); // +1 for `>`
1080                    }
1081                    // End sequence `]]|>` was splitted at |
1082                    //         buf --/   \-- chunk
1083                    if i == 0 && buf.ends_with(b"]]") {
1084                        return Some((&[], i + 1)); // +1 for `>`
1085                    }
1086                }
1087            }
1088            Self::DocType(ref mut balance) => {
1089                for i in memchr::memchr2_iter(b'<', b'>', chunk) {
1090                    if chunk[i] == b'<' {
1091                        *balance += 1;
1092                    } else {
1093                        if *balance == 0 {
1094                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1095                        }
1096                        *balance -= 1;
1097                    }
1098                }
1099            }
1100        }
1101        None
1102    }
1103    #[inline]
1104    const fn to_err(&self) -> Error {
1105        match self {
1106            Self::CData => Error::Syntax(SyntaxError::UnclosedCData),
1107            Self::Comment => Error::Syntax(SyntaxError::UnclosedComment),
1108            Self::DocType(_) => Error::Syntax(SyntaxError::UnclosedDoctype),
1109        }
1110    }
1111}
1112
1113////////////////////////////////////////////////////////////////////////////////////////////////////
1114
1115#[cfg(test)]
1116mod test {
1117    /// Checks the internal implementation of the various reader methods
1118    macro_rules! check {
1119        (
1120            #[$test:meta]
1121            $read_event:ident,
1122            $read_until_close:ident,
1123            // constructor of the XML source on which internal functions will be called
1124            $source:path,
1125            // constructor of the buffer to which read data will stored
1126            $buf:expr
1127            $(, $async:ident, $await:ident)?
1128        ) => {
1129            mod read_bang_element {
1130                use super::*;
1131                use crate::errors::{Error, SyntaxError};
1132                use crate::reader::BangType;
1133                use crate::utils::Bytes;
1134
1135                /// Checks that reading CDATA content works correctly
1136                mod cdata {
1137                    use super::*;
1138                    use pretty_assertions::assert_eq;
1139
1140                    /// Checks that if input begins like CDATA element, but CDATA start sequence
1141                    /// is not finished, parsing ends with an error
1142                    #[$test]
1143                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1144                    $($async)? fn not_properly_start() {
1145                        let buf = $buf;
1146                        let mut position = 1;
1147                        let mut input = b"![]]>other content".as_ref();
1148                        //                ^= 1
1149
1150                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1151                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1152                            x => panic!(
1153                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1154                                x
1155                            ),
1156                        }
1157                        assert_eq!(position, 1);
1158                    }
1159
1160                    /// Checks that if CDATA startup sequence was matched, but an end sequence
1161                    /// is not found, parsing ends with an error
1162                    #[$test]
1163                    $($async)? fn not_closed() {
1164                        let buf = $buf;
1165                        let mut position = 1;
1166                        let mut input = b"![CDATA[other content".as_ref();
1167                        //                ^= 1                 ^= 22
1168
1169                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1170                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1171                            x => panic!(
1172                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1173                                x
1174                            ),
1175                        }
1176                        assert_eq!(position, 22);
1177                    }
1178
1179                    /// Checks that CDATA element without content inside parsed successfully
1180                    #[$test]
1181                    $($async)? fn empty() {
1182                        let buf = $buf;
1183                        let mut position = 1;
1184                        let mut input = b"![CDATA[]]>other content".as_ref();
1185                        //                ^= 1       ^= 12
1186
1187                        let (ty, bytes) = $source(&mut input)
1188                            .read_bang_element(buf, &mut position)
1189                            $(.$await)?
1190                            .unwrap();
1191                        assert_eq!(
1192                            (ty, Bytes(bytes)),
1193                            (BangType::CData, Bytes(b"![CDATA[]]"))
1194                        );
1195                        assert_eq!(position, 12);
1196                    }
1197
1198                    /// Checks that CDATA element with content parsed successfully.
1199                    /// Additionally checks that sequences inside CDATA that may look like
1200                    /// a CDATA end sequence do not interrupt CDATA parsing
1201                    #[$test]
1202                    $($async)? fn with_content() {
1203                        let buf = $buf;
1204                        let mut position = 1;
1205                        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1206                        //                ^= 1                        ^= 29
1207
1208                        let (ty, bytes) = $source(&mut input)
1209                            .read_bang_element(buf, &mut position)
1210                            $(.$await)?
1211                            .unwrap();
1212                        assert_eq!(
1213                            (ty, Bytes(bytes)),
1214                            (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
1215                        );
1216                        assert_eq!(position, 29);
1217                    }
1218                }
1219
1220                /// Checks that reading XML comments works correctly. According to the [specification],
1221                /// comment data can contain any sequence except `--`:
1222                ///
1223                /// ```peg
1224                /// comment = '<--' (!'--' char)* '-->';
1225                /// char = [#x1-#x2C]
1226                ///      / [#x2E-#xD7FF]
1227                ///      / [#xE000-#xFFFD]
1228                ///      / [#x10000-#x10FFFF]
1229                /// ```
1230                ///
1231                /// The presence of this limitation, however, is simply a poorly designed specification
1232                /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1233                /// presence of these sequences by default. This tests allow such content.
1234                ///
1235                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1236                mod comment {
1237                    use super::*;
1238                    use pretty_assertions::assert_eq;
1239
1240                    #[$test]
1241                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1242                    $($async)? fn not_properly_start() {
1243                        let buf = $buf;
1244                        let mut position = 1;
1245                        let mut input = b"!- -->other content".as_ref();
1246                        //                ^= 1
1247
1248                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1249                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1250                            x => panic!(
1251                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1252                                x
1253                            ),
1254                        }
1255                        assert_eq!(position, 1);
1256                    }
1257
1258                    #[$test]
1259                    $($async)? fn not_properly_end() {
1260                        let buf = $buf;
1261                        let mut position = 1;
1262                        let mut input = b"!->other content".as_ref();
1263                        //                ^= 1            ^= 17
1264
1265                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1266                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1267                            x => panic!(
1268                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1269                                x
1270                            ),
1271                        }
1272                        assert_eq!(position, 17);
1273                    }
1274
1275                    #[$test]
1276                    $($async)? fn not_closed1() {
1277                        let buf = $buf;
1278                        let mut position = 1;
1279                        let mut input = b"!--other content".as_ref();
1280                        //                ^= 1            ^= 17
1281
1282                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1283                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1284                            x => panic!(
1285                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1286                                x
1287                            ),
1288                        }
1289                        assert_eq!(position, 17);
1290                    }
1291
1292                    #[$test]
1293                    $($async)? fn not_closed2() {
1294                        let buf = $buf;
1295                        let mut position = 1;
1296                        let mut input = b"!-->other content".as_ref();
1297                        //                ^= 1             ^= 18
1298
1299                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1300                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1301                            x => panic!(
1302                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1303                                x
1304                            ),
1305                        }
1306                        assert_eq!(position, 18);
1307                    }
1308
1309                    #[$test]
1310                    $($async)? fn not_closed3() {
1311                        let buf = $buf;
1312                        let mut position = 1;
1313                        let mut input = b"!--->other content".as_ref();
1314                        //                ^= 1              ^= 19
1315
1316                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1317                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1318                            x => panic!(
1319                                "Expected `Err(Syntax(_))`, but got `{:?}`",
1320                                x
1321                            ),
1322                        }
1323                        assert_eq!(position, 19);
1324                    }
1325
1326                    #[$test]
1327                    $($async)? fn empty() {
1328                        let buf = $buf;
1329                        let mut position = 1;
1330                        let mut input = b"!---->other content".as_ref();
1331                        //                ^= 1  ^= 7
1332
1333                        let (ty, bytes) = $source(&mut input)
1334                            .read_bang_element(buf, &mut position)
1335                            $(.$await)?
1336                            .unwrap();
1337                        assert_eq!(
1338                            (ty, Bytes(bytes)),
1339                            (BangType::Comment, Bytes(b"!----"))
1340                        );
1341                        assert_eq!(position, 7);
1342                    }
1343
1344                    #[$test]
1345                    $($async)? fn with_content() {
1346                        let buf = $buf;
1347                        let mut position = 1;
1348                        let mut input = b"!--->comment<--->other content".as_ref();
1349                        //                ^= 1             ^= 18
1350
1351                        let (ty, bytes) = $source(&mut input)
1352                            .read_bang_element(buf, &mut position)
1353                            $(.$await)?
1354                            .unwrap();
1355                        assert_eq!(
1356                            (ty, Bytes(bytes)),
1357                            (BangType::Comment, Bytes(b"!--->comment<---"))
1358                        );
1359                        assert_eq!(position, 18);
1360                    }
1361                }
1362
1363                /// Checks that reading DOCTYPE definition works correctly
1364                mod doctype {
1365                    use super::*;
1366
1367                    mod uppercase {
1368                        use super::*;
1369                        use pretty_assertions::assert_eq;
1370
1371                        #[$test]
1372                        $($async)? fn not_properly_start() {
1373                            let buf = $buf;
1374                            let mut position = 1;
1375                            let mut input = b"!D other content".as_ref();
1376                            //                ^= 1            ^= 17
1377
1378                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1379                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1380                                x => panic!(
1381                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1382                                    x
1383                                ),
1384                            }
1385                            assert_eq!(position, 17);
1386                        }
1387
1388                        #[$test]
1389                        $($async)? fn without_space() {
1390                            let buf = $buf;
1391                            let mut position = 1;
1392                            let mut input = b"!DOCTYPEother content".as_ref();
1393                            //                ^= 1                 ^= 22
1394
1395                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1396                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1397                                x => panic!(
1398                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1399                                    x
1400                                ),
1401                            }
1402                            assert_eq!(position, 22);
1403                        }
1404
1405                        #[$test]
1406                        $($async)? fn empty() {
1407                            let buf = $buf;
1408                            let mut position = 1;
1409                            let mut input = b"!DOCTYPE>other content".as_ref();
1410                            //                ^= 1     ^= 10
1411
1412                            let (ty, bytes) = $source(&mut input)
1413                                .read_bang_element(buf, &mut position)
1414                                $(.$await)?
1415                                .unwrap();
1416                            assert_eq!(
1417                                (ty, Bytes(bytes)),
1418                                (BangType::DocType(0), Bytes(b"!DOCTYPE"))
1419                            );
1420                            assert_eq!(position, 10);
1421                        }
1422
1423                        #[$test]
1424                        $($async)? fn not_closed() {
1425                            let buf = $buf;
1426                            let mut position = 1;
1427                            let mut input = b"!DOCTYPE other content".as_ref();
1428                            //                ^= 1                  ^23
1429
1430                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1431                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1432                                x => panic!(
1433                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1434                                    x
1435                                ),
1436                            }
1437                            assert_eq!(position, 23);
1438                        }
1439                    }
1440
1441                    mod lowercase {
1442                        use super::*;
1443                        use pretty_assertions::assert_eq;
1444
1445                        #[$test]
1446                        $($async)? fn not_properly_start() {
1447                            let buf = $buf;
1448                            let mut position = 1;
1449                            let mut input = b"!d other content".as_ref();
1450                            //                ^= 1            ^= 17
1451
1452                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1453                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1454                                x => panic!(
1455                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1456                                    x
1457                                ),
1458                            }
1459                            assert_eq!(position, 17);
1460                        }
1461
1462                        #[$test]
1463                        $($async)? fn without_space() {
1464                            let buf = $buf;
1465                            let mut position = 1;
1466                            let mut input = b"!doctypeother content".as_ref();
1467                            //                ^= 1                 ^= 22
1468
1469                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1470                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1471                                x => panic!(
1472                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1473                                    x
1474                                ),
1475                            }
1476                            assert_eq!(position, 22);
1477                        }
1478
1479                        #[$test]
1480                        $($async)? fn empty() {
1481                            let buf = $buf;
1482                            let mut position = 1;
1483                            let mut input = b"!doctype>other content".as_ref();
1484                            //                ^= 1     ^= 10
1485
1486                            let (ty, bytes) = $source(&mut input)
1487                                .read_bang_element(buf, &mut position)
1488                                $(.$await)?
1489                                .unwrap();
1490                            assert_eq!(
1491                                (ty, Bytes(bytes)),
1492                                (BangType::DocType(0), Bytes(b"!doctype"))
1493                            );
1494                            assert_eq!(position, 10);
1495                        }
1496
1497                        #[$test]
1498                        $($async)? fn not_closed() {
1499                            let buf = $buf;
1500                            let mut position = 1;
1501                            let mut input = b"!doctype other content".as_ref();
1502                            //                ^= 1                  ^= 23
1503
1504                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1505                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1506                                x => panic!(
1507                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1508                                    x
1509                                ),
1510                            }
1511                            assert_eq!(position, 23);
1512                        }
1513                    }
1514                }
1515            }
1516
1517            mod read_element {
1518                use super::*;
1519                use crate::errors::{Error, SyntaxError};
1520                use crate::parser::ElementParser;
1521                use crate::utils::Bytes;
1522                use pretty_assertions::assert_eq;
1523
1524                /// Checks that nothing was read from empty buffer
1525                #[$test]
1526                $($async)? fn empty() {
1527                    let buf = $buf;
1528                    let mut position = 1;
1529                    let mut input = b"".as_ref();
1530                    //                ^= 1
1531
1532                    match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1533                        Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1534                        x => panic!(
1535                            "Expected `Err(Syntax(_))`, but got `{:?}`",
1536                            x
1537                        ),
1538                    }
1539                    assert_eq!(position, 1);
1540                }
1541
1542                mod open {
1543                    use super::*;
1544                    use pretty_assertions::assert_eq;
1545
1546                    #[$test]
1547                    $($async)? fn empty_tag() {
1548                        let buf = $buf;
1549                        let mut position = 1;
1550                        let mut input = b">".as_ref();
1551                        //                 ^= 2
1552
1553                        assert_eq!(
1554                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1555                            Bytes(b"")
1556                        );
1557                        assert_eq!(position, 2);
1558                    }
1559
1560                    #[$test]
1561                    $($async)? fn normal() {
1562                        let buf = $buf;
1563                        let mut position = 1;
1564                        let mut input = b"tag>".as_ref();
1565                        //                    ^= 5
1566
1567                        assert_eq!(
1568                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1569                            Bytes(b"tag")
1570                        );
1571                        assert_eq!(position, 5);
1572                    }
1573
1574                    #[$test]
1575                    $($async)? fn empty_ns_empty_tag() {
1576                        let buf = $buf;
1577                        let mut position = 1;
1578                        let mut input = b":>".as_ref();
1579                        //                  ^= 3
1580
1581                        assert_eq!(
1582                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1583                            Bytes(b":")
1584                        );
1585                        assert_eq!(position, 3);
1586                    }
1587
1588                    #[$test]
1589                    $($async)? fn empty_ns() {
1590                        let buf = $buf;
1591                        let mut position = 1;
1592                        let mut input = b":tag>".as_ref();
1593                        //                     ^= 6
1594
1595                        assert_eq!(
1596                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1597                            Bytes(b":tag")
1598                        );
1599                        assert_eq!(position, 6);
1600                    }
1601
1602                    #[$test]
1603                    $($async)? fn with_attributes() {
1604                        let buf = $buf;
1605                        let mut position = 1;
1606                        let mut input = br#"tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1607                        //                                                        ^= 39
1608
1609                        assert_eq!(
1610                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1611                            Bytes(br#"tag  attr-1=">"  attr2  =  '>'  3attr"#)
1612                        );
1613                        assert_eq!(position, 39);
1614                    }
1615                }
1616
1617                mod self_closed {
1618                    use super::*;
1619                    use pretty_assertions::assert_eq;
1620
1621                    #[$test]
1622                    $($async)? fn empty_tag() {
1623                        let buf = $buf;
1624                        let mut position = 1;
1625                        let mut input = b"/>".as_ref();
1626                        //                  ^= 3
1627
1628                        assert_eq!(
1629                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1630                            Bytes(b"/")
1631                        );
1632                        assert_eq!(position, 3);
1633                    }
1634
1635                    #[$test]
1636                    $($async)? fn normal() {
1637                        let buf = $buf;
1638                        let mut position = 1;
1639                        let mut input = b"tag/>".as_ref();
1640                        //                     ^= 6
1641
1642                        assert_eq!(
1643                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1644                            Bytes(b"tag/")
1645                        );
1646                        assert_eq!(position, 6);
1647                    }
1648
1649                    #[$test]
1650                    $($async)? fn empty_ns_empty_tag() {
1651                        let buf = $buf;
1652                        let mut position = 1;
1653                        let mut input = b":/>".as_ref();
1654                        //                   ^= 4
1655
1656                        assert_eq!(
1657                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1658                            Bytes(b":/")
1659                        );
1660                        assert_eq!(position, 4);
1661                    }
1662
1663                    #[$test]
1664                    $($async)? fn empty_ns() {
1665                        let buf = $buf;
1666                        let mut position = 1;
1667                        let mut input = b":tag/>".as_ref();
1668                        //                      ^= 7
1669
1670                        assert_eq!(
1671                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1672                            Bytes(b":tag/")
1673                        );
1674                        assert_eq!(position, 7);
1675                    }
1676
1677                    #[$test]
1678                    $($async)? fn with_attributes() {
1679                        let buf = $buf;
1680                        let mut position = 1;
1681                        let mut input = br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/>"#.as_ref();
1682                        //                                                           ^= 42
1683
1684                        assert_eq!(
1685                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1686                            Bytes(br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/"#)
1687                        );
1688                        assert_eq!(position, 42);
1689                    }
1690                }
1691
1692                mod close {
1693                    use super::*;
1694                    use pretty_assertions::assert_eq;
1695
1696                    #[$test]
1697                    $($async)? fn empty_tag() {
1698                        let buf = $buf;
1699                        let mut position = 1;
1700                        let mut input = b"/ >".as_ref();
1701                        //                   ^= 4
1702
1703                        assert_eq!(
1704                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1705                            Bytes(b"/ ")
1706                        );
1707                        assert_eq!(position, 4);
1708                    }
1709
1710                    #[$test]
1711                    $($async)? fn normal() {
1712                        let buf = $buf;
1713                        let mut position = 1;
1714                        let mut input = b"/tag>".as_ref();
1715                        //                     ^= 6
1716
1717                        assert_eq!(
1718                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1719                            Bytes(b"/tag")
1720                        );
1721                        assert_eq!(position, 6);
1722                    }
1723
1724                    #[$test]
1725                    $($async)? fn empty_ns_empty_tag() {
1726                        let buf = $buf;
1727                        let mut position = 1;
1728                        let mut input = b"/:>".as_ref();
1729                        //                   ^= 4
1730
1731                        assert_eq!(
1732                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1733                            Bytes(b"/:")
1734                        );
1735                        assert_eq!(position, 4);
1736                    }
1737
1738                    #[$test]
1739                    $($async)? fn empty_ns() {
1740                        let buf = $buf;
1741                        let mut position = 1;
1742                        let mut input = b"/:tag>".as_ref();
1743                        //                      ^= 7
1744
1745                        assert_eq!(
1746                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1747                            Bytes(b"/:tag")
1748                        );
1749                        assert_eq!(position, 7);
1750                    }
1751
1752                    #[$test]
1753                    $($async)? fn with_attributes() {
1754                        let buf = $buf;
1755                        let mut position = 1;
1756                        let mut input = br#"/tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1757                        //                                                         ^= 40
1758
1759                        assert_eq!(
1760                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1761                            Bytes(br#"/tag  attr-1=">"  attr2  =  '>'  3attr"#)
1762                        );
1763                        assert_eq!(position, 40);
1764                    }
1765                }
1766            }
1767
1768            /// Ensures, that no empty `Text` events are generated
1769            mod $read_event {
1770                use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
1771                use crate::reader::Reader;
1772                use pretty_assertions::assert_eq;
1773
1774                /// When `encoding` feature is enabled, encoding should be detected
1775                /// from BOM (UTF-8) and BOM should be stripped.
1776                ///
1777                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1778                /// character should be stripped for consistency
1779                #[$test]
1780                $($async)? fn bom_from_reader() {
1781                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1782
1783                    assert_eq!(
1784                        reader.$read_event($buf) $(.$await)? .unwrap(),
1785                        Event::Text(BytesText::from_escaped("\u{feff}"))
1786                    );
1787
1788                    assert_eq!(
1789                        reader.$read_event($buf) $(.$await)? .unwrap(),
1790                        Event::Eof
1791                    );
1792                }
1793
1794                /// When parsing from &str, encoding is fixed (UTF-8), so
1795                /// - when `encoding` feature is disabled, the behavior the
1796                ///   same as in `bom_from_reader` text
1797                /// - when `encoding` feature is enabled, the behavior should
1798                ///   stay consistent, so the first BOM character is stripped
1799                #[$test]
1800                $($async)? fn bom_from_str() {
1801                    let mut reader = Reader::from_str("\u{feff}\u{feff}");
1802
1803                    assert_eq!(
1804                        reader.$read_event($buf) $(.$await)? .unwrap(),
1805                        Event::Text(BytesText::from_escaped("\u{feff}"))
1806                    );
1807
1808                    assert_eq!(
1809                        reader.$read_event($buf) $(.$await)? .unwrap(),
1810                        Event::Eof
1811                    );
1812                }
1813
1814                #[$test]
1815                $($async)? fn declaration() {
1816                    let mut reader = Reader::from_str("<?xml ?>");
1817
1818                    assert_eq!(
1819                        reader.$read_event($buf) $(.$await)? .unwrap(),
1820                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1821                    );
1822                }
1823
1824                #[$test]
1825                $($async)? fn doctype() {
1826                    let mut reader = Reader::from_str("<!DOCTYPE x>");
1827
1828                    assert_eq!(
1829                        reader.$read_event($buf) $(.$await)? .unwrap(),
1830                        Event::DocType(BytesText::from_escaped("x"))
1831                    );
1832                }
1833
1834                #[$test]
1835                $($async)? fn processing_instruction() {
1836                    let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
1837
1838                    assert_eq!(
1839                        reader.$read_event($buf) $(.$await)? .unwrap(),
1840                        Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
1841                    );
1842                }
1843
1844                /// Lone closing tags are not allowed, so testing it together with start tag
1845                #[$test]
1846                $($async)? fn start_and_end() {
1847                    let mut reader = Reader::from_str("<tag></tag>");
1848
1849                    assert_eq!(
1850                        reader.$read_event($buf) $(.$await)? .unwrap(),
1851                        Event::Start(BytesStart::new("tag"))
1852                    );
1853
1854                    assert_eq!(
1855                        reader.$read_event($buf) $(.$await)? .unwrap(),
1856                        Event::End(BytesEnd::new("tag"))
1857                    );
1858                }
1859
1860                #[$test]
1861                $($async)? fn empty() {
1862                    let mut reader = Reader::from_str("<tag/>");
1863
1864                    assert_eq!(
1865                        reader.$read_event($buf) $(.$await)? .unwrap(),
1866                        Event::Empty(BytesStart::new("tag"))
1867                    );
1868                }
1869
1870                #[$test]
1871                $($async)? fn text() {
1872                    let mut reader = Reader::from_str("text");
1873
1874                    assert_eq!(
1875                        reader.$read_event($buf) $(.$await)? .unwrap(),
1876                        Event::Text(BytesText::from_escaped("text"))
1877                    );
1878                }
1879
1880                #[$test]
1881                $($async)? fn cdata() {
1882                    let mut reader = Reader::from_str("<![CDATA[]]>");
1883
1884                    assert_eq!(
1885                        reader.$read_event($buf) $(.$await)? .unwrap(),
1886                        Event::CData(BytesCData::new(""))
1887                    );
1888                }
1889
1890                #[$test]
1891                $($async)? fn comment() {
1892                    let mut reader = Reader::from_str("<!---->");
1893
1894                    assert_eq!(
1895                        reader.$read_event($buf) $(.$await)? .unwrap(),
1896                        Event::Comment(BytesText::from_escaped(""))
1897                    );
1898                }
1899
1900                #[$test]
1901                $($async)? fn eof() {
1902                    let mut reader = Reader::from_str("");
1903
1904                    assert_eq!(
1905                        reader.$read_event($buf) $(.$await)? .unwrap(),
1906                        Event::Eof
1907                    );
1908                }
1909            }
1910        };
1911    }
1912
1913    // Export macros for the child modules:
1914    // - buffered_reader
1915    // - slice_reader
1916    pub(super) use check;
1917}