toml_edit/parser/
strings.rs

1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
5use nom8::branch::alt;
6use nom8::bytes::any;
7use nom8::bytes::none_of;
8use nom8::bytes::one_of;
9use nom8::bytes::tag;
10use nom8::bytes::take_while;
11use nom8::bytes::take_while1;
12use nom8::bytes::take_while_m_n;
13use nom8::combinator::cut;
14use nom8::combinator::fail;
15use nom8::combinator::opt;
16use nom8::combinator::peek;
17use nom8::combinator::success;
18use nom8::multi::many0_count;
19use nom8::multi::many1_count;
20use nom8::sequence::delimited;
21use nom8::sequence::preceded;
22use nom8::sequence::terminated;
23
24use crate::parser::errors::CustomError;
25use crate::parser::numbers::HEXDIG;
26use crate::parser::prelude::*;
27use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28
29// ;; String
30
31// string = ml-basic-string / basic-string / ml-literal-string / literal-string
32pub(crate) fn string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
33    alt((
34        ml_basic_string,
35        basic_string,
36        ml_literal_string,
37        literal_string.map(Cow::Borrowed),
38    ))
39    .parse(input)
40}
41
42// ;; Basic String
43
44// basic-string = quotation-mark *basic-char quotation-mark
45pub(crate) fn basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
46    let (mut input, _) = one_of(QUOTATION_MARK).parse(input)?;
47
48    let mut c = Cow::Borrowed("");
49    if let Some((i, ci)) = ok_error(basic_chars.parse(input))? {
50        input = i;
51        c = ci;
52    }
53    while let Some((i, ci)) = ok_error(basic_chars.parse(input))? {
54        input = i;
55        c.to_mut().push_str(&ci);
56    }
57
58    let (input, _) = cut(one_of(QUOTATION_MARK))
59        .context(Context::Expression("basic string"))
60        .parse(input)?;
61
62    Ok((input, c))
63}
64
// quotation-mark = %x22            ; "
/// The `"` byte that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
67
68// basic-char = basic-unescaped / escaped
69fn basic_chars(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
70    alt((
71        // Deviate from the official grammar by batching the unescaped chars so we build a string a
72        // chunk at a time, rather than a `char` at a time.
73        take_while1(BASIC_UNESCAPED)
74            .map_res(std::str::from_utf8)
75            .map(Cow::Borrowed),
76        escaped.map(|c| Cow::Owned(String::from(c))),
77    ))
78    .parse(input)
79}
80
81// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
82pub(crate) const BASIC_UNESCAPED: (
83    (u8, u8),
84    u8,
85    RangeInclusive<u8>,
86    RangeInclusive<u8>,
87    RangeInclusive<u8>,
88) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
89
90// escaped = escape escape-seq-char
91fn escaped(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
92    preceded(ESCAPE, escape_seq_char).parse(input)
93}
94
// escape = %x5C                    ; \
/// The `\` byte that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
97
98// escape-seq-char =  %x22         ; "    quotation mark  U+0022
99// escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
100// escape-seq-char =/ %x62         ; b    backspace       U+0008
101// escape-seq-char =/ %x66         ; f    form feed       U+000C
102// escape-seq-char =/ %x6E         ; n    line feed       U+000A
103// escape-seq-char =/ %x72         ; r    carriage return U+000D
104// escape-seq-char =/ %x74         ; t    tab             U+0009
105// escape-seq-char =/ %x75 4HEXDIG ; uXXXX                U+XXXX
106// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX            U+XXXXXXXX
107fn escape_seq_char(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
108    dispatch! {any;
109        b'b' => success('\u{8}'),
110        b'f' => success('\u{c}'),
111        b'n' => success('\n'),
112        b'r' => success('\r'),
113        b't' => success('\t'),
114        b'u' => cut(hexescape::<4>).context(Context::Expression("unicode 4-digit hex code")),
115        b'U' => cut(hexescape::<8>).context(Context::Expression("unicode 8-digit hex code")),
116        b'\\' => success('\\'),
117        b'"' => success('"'),
118        _ => {
119            cut(fail::<_, char, _>)
120            .context(Context::Expression("escape sequence"))
121            .context(Context::Expected(ParserValue::CharLiteral('b')))
122            .context(Context::Expected(ParserValue::CharLiteral('f')))
123            .context(Context::Expected(ParserValue::CharLiteral('n')))
124            .context(Context::Expected(ParserValue::CharLiteral('r')))
125            .context(Context::Expected(ParserValue::CharLiteral('t')))
126            .context(Context::Expected(ParserValue::CharLiteral('u')))
127            .context(Context::Expected(ParserValue::CharLiteral('U')))
128            .context(Context::Expected(ParserValue::CharLiteral('\\')))
129            .context(Context::Expected(ParserValue::CharLiteral('"')))
130        }
131    }
132    .parse(input)
133}
134
135pub(crate) fn hexescape<const N: usize>(
136    input: Input<'_>,
137) -> IResult<Input<'_>, char, ParserError<'_>> {
138    take_while_m_n(0, N, HEXDIG)
139        .verify(|b: &[u8]| b.len() == N)
140        .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
141        .map_opt(|s| u32::from_str_radix(s, 16).ok())
142        .map_res(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
143        .parse(input)
144}
145
146// ;; Multiline Basic String
147
148// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
149//                   ml-basic-string-delim
150fn ml_basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
151    delimited(
152        ML_BASIC_STRING_DELIM,
153        preceded(opt(newline), cut(ml_basic_body)),
154        cut(ML_BASIC_STRING_DELIM),
155    )
156    .context(Context::Expression("multiline basic string"))
157    .parse(input)
158}
159
// ml-basic-string-delim = 3quotation-mark
/// The `"""` delimiter that opens and closes a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
162
163// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
164fn ml_basic_body(mut input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
165    let mut c = Cow::Borrowed("");
166    if let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
167        input = i;
168        c = ci;
169    }
170    while let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
171        input = i;
172        c.to_mut().push_str(&ci);
173    }
174
175    while let Some((i, qi)) = ok_error(mlb_quotes(none_of(b'\"').value(())).parse(input))? {
176        if let Some((i, ci)) = ok_error(mlb_content.parse(i))? {
177            input = i;
178            c.to_mut().push_str(qi);
179            c.to_mut().push_str(&ci);
180            while let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
181                input = i;
182                c.to_mut().push_str(&ci);
183            }
184        } else {
185            break;
186        }
187    }
188
189    if let Some((i, qi)) = ok_error(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(())).parse(input))?
190    {
191        input = i;
192        c.to_mut().push_str(qi);
193    }
194
195    Ok((input, c))
196}
197
198// mlb-content = mlb-char / newline / mlb-escaped-nl
199// mlb-char = mlb-unescaped / escaped
200fn mlb_content(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
201    alt((
202        // Deviate from the official grammar by batching the unescaped chars so we build a string a
203        // chunk at a time, rather than a `char` at a time.
204        take_while1(MLB_UNESCAPED)
205            .map_res(std::str::from_utf8)
206            .map(Cow::Borrowed),
207        // Order changed fromg grammar so `escaped` can more easily `cut` on bad escape sequences
208        mlb_escaped_nl.map(|_| Cow::Borrowed("")),
209        escaped.map(|c| Cow::Owned(String::from(c))),
210        newline.map(|_| Cow::Borrowed("\n")),
211    ))
212    .parse(input)
213}
214
215// mlb-quotes = 1*2quotation-mark
216fn mlb_quotes<'i>(
217    mut term: impl nom8::Parser<Input<'i>, (), ParserError<'i>>,
218) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
219    move |input| {
220        let res = terminated(b"\"\"", peek(term.by_ref()))
221            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
222            .parse(input);
223
224        match res {
225            Err(nom8::Err::Error(_)) => terminated(b"\"", peek(term.by_ref()))
226                .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
227                .parse(input),
228            res => res,
229        }
230    }
231}
232
233// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
234pub(crate) const MLB_UNESCAPED: (
235    (u8, u8),
236    u8,
237    RangeInclusive<u8>,
238    RangeInclusive<u8>,
239    RangeInclusive<u8>,
240) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
241
242// mlb-escaped-nl = escape ws newline *( wschar / newline
243// When the last non-whitespace character on a line is a \,
244// it will be trimmed along with all whitespace
245// (including newlines) up to the next non-whitespace
246// character or closing delimiter.
247fn mlb_escaped_nl(input: Input<'_>) -> IResult<Input<'_>, (), ParserError<'_>> {
248    many1_count((ESCAPE, ws, ws_newlines))
249        .value(())
250        .parse(input)
251}
252
253// ;; Literal String
254
255// literal-string = apostrophe *literal-char apostrophe
256pub(crate) fn literal_string(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
257    delimited(APOSTROPHE, cut(take_while(LITERAL_CHAR)), cut(APOSTROPHE))
258        .map_res(std::str::from_utf8)
259        .context(Context::Expression("literal string"))
260        .parse(input)
261}
262
// apostrophe = %x27 ; ' apostrophe
/// The `'` byte that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
265
266// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
267pub(crate) const LITERAL_CHAR: (
268    u8,
269    RangeInclusive<u8>,
270    RangeInclusive<u8>,
271    RangeInclusive<u8>,
272) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
273
274// ;; Multiline Literal String
275
276// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
277//                     ml-literal-string-delim
278fn ml_literal_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
279    delimited(
280        (ML_LITERAL_STRING_DELIM, opt(newline)),
281        cut(ml_literal_body.map(|t| {
282            if t.contains("\r\n") {
283                Cow::Owned(t.replace("\r\n", "\n"))
284            } else {
285                Cow::Borrowed(t)
286            }
287        })),
288        cut(ML_LITERAL_STRING_DELIM),
289    )
290    .context(Context::Expression("multiline literal string"))
291    .parse(input)
292}
293
// ml-literal-string-delim = 3apostrophe
/// The `'''` delimiter that opens and closes a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
296
297// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
298fn ml_literal_body(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
299    (
300        many0_count(mll_content),
301        many0_count((
302            mll_quotes(none_of(APOSTROPHE).value(())),
303            many1_count(mll_content),
304        )),
305        opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
306    )
307        .recognize()
308        .map_res(std::str::from_utf8)
309        .parse(input)
310}
311
312// mll-content = mll-char / newline
313fn mll_content(input: Input<'_>) -> IResult<Input<'_>, u8, ParserError<'_>> {
314    alt((one_of(MLL_CHAR), newline)).parse(input)
315}
316
317// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
318const MLL_CHAR: (
319    u8,
320    RangeInclusive<u8>,
321    RangeInclusive<u8>,
322    RangeInclusive<u8>,
323) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
324
325// mll-quotes = 1*2apostrophe
326fn mll_quotes<'i>(
327    mut term: impl nom8::Parser<Input<'i>, (), ParserError<'i>>,
328) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
329    move |input| {
330        let res = terminated(b"''", peek(term.by_ref()))
331            .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
332            .parse(input);
333
334        match res {
335            Err(nom8::Err::Error(_)) => terminated(b"'", peek(term.by_ref()))
336                .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
337                .parse(input),
338            res => res,
339        }
340    }
341}
342
#[cfg(test)]
mod test {
    use super::*;

    // Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are resolved in a basic string.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input)).finish();
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multiline basic strings: leading newline trimmed, escaped quotes and backslashes kept,
    // and unterminated inputs rejected.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input)).finish();
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input)).finish();
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash swallows the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input)).finish();
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input)).finish();
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: the expected value is the input minus its quotes.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input)).finish();
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multiline literal strings are returned verbatim, minus delimiters and a leading newline.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input)).finish();
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the newline that follows it (4 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input)).finish();
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
toml_edit/parser/strings.rs

toml_edit/parser/
strings.rs