toml_edit/parser/
trivia.rs

1use std::ops::RangeInclusive;
2
3use nom8::branch::alt;
4use nom8::bytes::one_of;
5use nom8::bytes::take_while;
6use nom8::bytes::take_while1;
7use nom8::combinator::eof;
8use nom8::combinator::opt;
9use nom8::multi::many0_count;
10use nom8::multi::many1_count;
11use nom8::prelude::*;
12use nom8::sequence::terminated;
13
14use crate::parser::prelude::*;
15
/// Converts `bytes` to a `&str` without paying for UTF-8 validation in release builds.
///
/// When `debug_assertions` are enabled the bytes are validated anyway and the function
/// panics with `safety_justification` as the message if they are not valid UTF-8.
///
/// # Safety
///
/// The caller must guarantee that `bytes` is valid UTF-8; `safety_justification` should
/// state why that holds at the call site.
pub(crate) unsafe fn from_utf8_unchecked<'b>(
    bytes: &'b [u8],
    safety_justification: &'static str,
) -> &'b str {
    if !cfg!(debug_assertions) {
        return std::str::from_utf8_unchecked(bytes);
    }
    // Catch problems more quickly when testing
    std::str::from_utf8(bytes).expect(safety_justification)
}
27
// wschar = ( %x20 /              ; Space
//            %x09 )              ; Horizontal tab
/// The two bytes that count as inline whitespace: space and horizontal tab.
pub(crate) const WSCHAR: (u8, u8) = (0x20, 0x09);
31
32// ws = *wschar
33pub(crate) fn ws(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
34    take_while(WSCHAR)
35        .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` filters out on-ASCII") })
36        .parse(input)
37}
38
// non-ascii = %x80-D7FF / %xE000-10FFFF
// - ASCII is 0xxxxxxx
// - First byte for UTF-8 is 11xxxxxx
// - Subsequent UTF-8 bytes are 10xxxxxx
/// Every byte with the high bit set, i.e. any byte of a multi-byte UTF-8 sequence.
pub(crate) const NON_ASCII: RangeInclusive<u8> = 0x80..=u8::MAX;
44
45// non-eol = %x09 / %x20-7E / non-ascii
46pub(crate) const NON_EOL: (u8, RangeInclusive<u8>, RangeInclusive<u8>) =
47    (0x09, 0x20..=0x7E, NON_ASCII);
48
// comment-start-symbol = %x23 ; #
/// The byte that introduces a comment (`#`).
pub(crate) const COMMENT_START_SYMBOL: u8 = 0x23;
51
52// comment = comment-start-symbol *non-eol
53pub(crate) fn comment(input: Input<'_>) -> IResult<Input<'_>, &[u8], ParserError<'_>> {
54    (COMMENT_START_SYMBOL, take_while(NON_EOL))
55        .recognize()
56        .parse(input)
57}
58
59// newline = ( %x0A /              ; LF
60//             %x0D.0A )           ; CRLF
61pub(crate) fn newline(input: Input<'_>) -> IResult<Input<'_>, u8, ParserError<'_>> {
62    alt((
63        one_of(LF).value(b'\n'),
64        (one_of(CR), one_of(LF)).value(b'\n'),
65    ))
66    .parse(input)
67}
/// Line feed (`\n`, `%x0A`).
pub(crate) const LF: u8 = 0x0A;
/// Carriage return (`\r`, `%x0D`).
pub(crate) const CR: u8 = 0x0D;
70
71// ws-newline       = *( wschar / newline )
72pub(crate) fn ws_newline(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
73    many0_count(alt((newline.value(&b"\n"[..]), take_while1(WSCHAR))))
74        .recognize()
75        .map(|b| unsafe {
76            from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII")
77        })
78        .parse(input)
79}
80
81// ws-newlines      = newline *( wschar / newline )
82pub(crate) fn ws_newlines(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
83    (newline, ws_newline)
84        .recognize()
85        .map(|b| unsafe {
86            from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII")
87        })
88        .parse(input)
89}
90
91// note: this rule is not present in the original grammar
92// ws-comment-newline = *( ws-newline-nonempty / comment )
93pub(crate) fn ws_comment_newline(input: Input<'_>) -> IResult<Input<'_>, &[u8], ParserError<'_>> {
94    many0_count(alt((
95        many1_count(alt((take_while1(WSCHAR), newline.value(&b"\n"[..])))).value(()),
96        comment.value(()),
97    )))
98    .recognize()
99    .parse(input)
100}
101
102// note: this rule is not present in the original grammar
103// line-ending = newline / eof
104pub(crate) fn line_ending(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
105    alt((newline.value("\n"), eof.value(""))).parse(input)
106}
107
108// note: this rule is not present in the original grammar
109// line-trailing = ws [comment] skip-line-ending
110pub(crate) fn line_trailing(
111    input: Input<'_>,
112) -> IResult<Input<'_>, std::ops::Range<usize>, ParserError<'_>> {
113    terminated((ws, opt(comment)).span(), line_ending).parse(input)
114}
115
#[cfg(test)]
mod test {
    use super::*;

    /// `ws_comment_newline` must consume each sample completely and hand back exactly
    /// the bytes it was given.
    #[test]
    fn trivia() {
        let inputs = [
            // nothing at all
            "",
            // a single space
            " ",
            // a single line break
            "\n",
            // comments separated by blank lines
            "\n# comment\n\n# comment2\n\n\n",
            // line break followed by indentation only
            "\n        ",
            // leading comment, trailing indentation without a final line break
            "# comment\n# comment2\n\n\n   ",
        ];
        for input in inputs {
            dbg!(input);
            let parsed = ws_comment_newline.parse(new_input(input)).finish();
            assert!(parsed.is_ok(), "{:?}", parsed);
            let consumed = parsed.unwrap();
            assert_eq!(consumed, input.as_bytes());
        }
    }
}