1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
5use nom8::branch::alt;
6use nom8::bytes::any;
7use nom8::bytes::none_of;
8use nom8::bytes::one_of;
9use nom8::bytes::tag;
10use nom8::bytes::take_while;
11use nom8::bytes::take_while1;
12use nom8::bytes::take_while_m_n;
13use nom8::combinator::cut;
14use nom8::combinator::fail;
15use nom8::combinator::opt;
16use nom8::combinator::peek;
17use nom8::combinator::success;
18use nom8::multi::many0_count;
19use nom8::multi::many1_count;
20use nom8::sequence::delimited;
21use nom8::sequence::preceded;
22use nom8::sequence::terminated;
23
24use crate::parser::errors::CustomError;
25use crate::parser::numbers::HEXDIG;
26use crate::parser::prelude::*;
27use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28
29pub(crate) fn string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
33 alt((
34 ml_basic_string,
35 basic_string,
36 ml_literal_string,
37 literal_string.map(Cow::Borrowed),
38 ))
39 .parse(input)
40}
41
42pub(crate) fn basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
46 let (mut input, _) = one_of(QUOTATION_MARK).parse(input)?;
47
48 let mut c = Cow::Borrowed("");
49 if let Some((i, ci)) = ok_error(basic_chars.parse(input))? {
50 input = i;
51 c = ci;
52 }
53 while let Some((i, ci)) = ok_error(basic_chars.parse(input))? {
54 input = i;
55 c.to_mut().push_str(&ci);
56 }
57
58 let (input, _) = cut(one_of(QUOTATION_MARK))
59 .context(Context::Expression("basic string"))
60 .parse(input)?;
61
62 Ok((input, c))
63}
64
/// The double-quote byte (`"`, `0x22`) that delimits basic strings.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
67
68fn basic_chars(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
70 alt((
71 take_while1(BASIC_UNESCAPED)
74 .map_res(std::str::from_utf8)
75 .map(Cow::Borrowed),
76 escaped.map(|c| Cow::Owned(String::from(c))),
77 ))
78 .parse(input)
79}
80
81pub(crate) const BASIC_UNESCAPED: (
83 (u8, u8),
84 u8,
85 RangeInclusive<u8>,
86 RangeInclusive<u8>,
87 RangeInclusive<u8>,
88) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
89
90fn escaped(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
92 preceded(ESCAPE, escape_seq_char).parse(input)
93}
94
/// The backslash byte (`\`, `0x5C`) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = 0x5C;
97
98fn escape_seq_char(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
108 dispatch! {any;
109 b'b' => success('\u{8}'),
110 b'f' => success('\u{c}'),
111 b'n' => success('\n'),
112 b'r' => success('\r'),
113 b't' => success('\t'),
114 b'u' => cut(hexescape::<4>).context(Context::Expression("unicode 4-digit hex code")),
115 b'U' => cut(hexescape::<8>).context(Context::Expression("unicode 8-digit hex code")),
116 b'\\' => success('\\'),
117 b'"' => success('"'),
118 _ => {
119 cut(fail::<_, char, _>)
120 .context(Context::Expression("escape sequence"))
121 .context(Context::Expected(ParserValue::CharLiteral('b')))
122 .context(Context::Expected(ParserValue::CharLiteral('f')))
123 .context(Context::Expected(ParserValue::CharLiteral('n')))
124 .context(Context::Expected(ParserValue::CharLiteral('r')))
125 .context(Context::Expected(ParserValue::CharLiteral('t')))
126 .context(Context::Expected(ParserValue::CharLiteral('u')))
127 .context(Context::Expected(ParserValue::CharLiteral('U')))
128 .context(Context::Expected(ParserValue::CharLiteral('\\')))
129 .context(Context::Expected(ParserValue::CharLiteral('"')))
130 }
131 }
132 .parse(input)
133}
134
135pub(crate) fn hexescape<const N: usize>(
136 input: Input<'_>,
137) -> IResult<Input<'_>, char, ParserError<'_>> {
138 take_while_m_n(0, N, HEXDIG)
139 .verify(|b: &[u8]| b.len() == N)
140 .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
141 .map_opt(|s| u32::from_str_radix(s, 16).ok())
142 .map_res(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
143 .parse(input)
144}
145
146fn ml_basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
151 delimited(
152 ML_BASIC_STRING_DELIM,
153 preceded(opt(newline), cut(ml_basic_body)),
154 cut(ML_BASIC_STRING_DELIM),
155 )
156 .context(Context::Expression("multiline basic string"))
157 .parse(input)
158}
159
/// Three consecutive double quotes (`"""`): the multi-line basic string delimiter.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = &[b'"'; 3];
162
163fn ml_basic_body(mut input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
165 let mut c = Cow::Borrowed("");
166 if let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
167 input = i;
168 c = ci;
169 }
170 while let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
171 input = i;
172 c.to_mut().push_str(&ci);
173 }
174
175 while let Some((i, qi)) = ok_error(mlb_quotes(none_of(b'\"').value(())).parse(input))? {
176 if let Some((i, ci)) = ok_error(mlb_content.parse(i))? {
177 input = i;
178 c.to_mut().push_str(qi);
179 c.to_mut().push_str(&ci);
180 while let Some((i, ci)) = ok_error(mlb_content.parse(input))? {
181 input = i;
182 c.to_mut().push_str(&ci);
183 }
184 } else {
185 break;
186 }
187 }
188
189 if let Some((i, qi)) = ok_error(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(())).parse(input))?
190 {
191 input = i;
192 c.to_mut().push_str(qi);
193 }
194
195 Ok((input, c))
196}
197
198fn mlb_content(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
201 alt((
202 take_while1(MLB_UNESCAPED)
205 .map_res(std::str::from_utf8)
206 .map(Cow::Borrowed),
207 mlb_escaped_nl.map(|_| Cow::Borrowed("")),
209 escaped.map(|c| Cow::Owned(String::from(c))),
210 newline.map(|_| Cow::Borrowed("\n")),
211 ))
212 .parse(input)
213}
214
215fn mlb_quotes<'i>(
217 mut term: impl nom8::Parser<Input<'i>, (), ParserError<'i>>,
218) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
219 move |input| {
220 let res = terminated(b"\"\"", peek(term.by_ref()))
221 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
222 .parse(input);
223
224 match res {
225 Err(nom8::Err::Error(_)) => terminated(b"\"", peek(term.by_ref()))
226 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
227 .parse(input),
228 res => res,
229 }
230 }
231}
232
233pub(crate) const MLB_UNESCAPED: (
235 (u8, u8),
236 u8,
237 RangeInclusive<u8>,
238 RangeInclusive<u8>,
239 RangeInclusive<u8>,
240) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
241
242fn mlb_escaped_nl(input: Input<'_>) -> IResult<Input<'_>, (), ParserError<'_>> {
248 many1_count((ESCAPE, ws, ws_newlines))
249 .value(())
250 .parse(input)
251}
252
253pub(crate) fn literal_string(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
257 delimited(APOSTROPHE, cut(take_while(LITERAL_CHAR)), cut(APOSTROPHE))
258 .map_res(std::str::from_utf8)
259 .context(Context::Expression("literal string"))
260 .parse(input)
261}
262
/// The apostrophe byte (`'`, `0x27`) that delimits literal strings.
pub(crate) const APOSTROPHE: u8 = 0x27;
265
266pub(crate) const LITERAL_CHAR: (
268 u8,
269 RangeInclusive<u8>,
270 RangeInclusive<u8>,
271 RangeInclusive<u8>,
272) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
273
274fn ml_literal_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
279 delimited(
280 (ML_LITERAL_STRING_DELIM, opt(newline)),
281 cut(ml_literal_body.map(|t| {
282 if t.contains("\r\n") {
283 Cow::Owned(t.replace("\r\n", "\n"))
284 } else {
285 Cow::Borrowed(t)
286 }
287 })),
288 cut(ML_LITERAL_STRING_DELIM),
289 )
290 .context(Context::Expression("multiline literal string"))
291 .parse(input)
292}
293
/// Three consecutive apostrophes (`'''`): the multi-line literal string delimiter.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
296
297fn ml_literal_body(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
299 (
300 many0_count(mll_content),
301 many0_count((
302 mll_quotes(none_of(APOSTROPHE).value(())),
303 many1_count(mll_content),
304 )),
305 opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
306 )
307 .recognize()
308 .map_res(std::str::from_utf8)
309 .parse(input)
310}
311
312fn mll_content(input: Input<'_>) -> IResult<Input<'_>, u8, ParserError<'_>> {
314 alt((one_of(MLL_CHAR), newline)).parse(input)
315}
316
317const MLL_CHAR: (
319 u8,
320 RangeInclusive<u8>,
321 RangeInclusive<u8>,
322 RangeInclusive<u8>,
323) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
324
325fn mll_quotes<'i>(
327 mut term: impl nom8::Parser<Input<'i>, (), ParserError<'i>>,
328) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
329 move |input| {
330 let res = terminated(b"''", peek(term.by_ref()))
331 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
332 .parse(input);
333
334 match res {
335 Err(nom8::Err::Error(_)) => terminated(b"'", peek(term.by_ref()))
336 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
337 .parse(input),
338 res => res,
339 }
340 }
341}
342
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the `string` parser over `input` and asserts that it decodes to `expected`.
    fn assert_decodes(input: &str, expected: &str) {
        let parsed = string.parse(new_input(input)).finish();
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    /// Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are decoded in a basic string.
    #[test]
    fn basic_string() {
        assert_decodes(
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#,
            "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}",
        );
    }

    /// Multi-line basic strings: leading newline trimmed, quotes and escapes in the body,
    /// and unterminated inputs rejected.
    #[test]
    fn ml_basic_string() {
        let valid = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];
        for (input, expected) in valid.iter().copied() {
            assert_decodes(input, expected);
        }

        let invalid = [r#"""" """#, r#"""" \""""#];
        for input in invalid.iter().copied() {
            let parsed = string.parse(new_input(input)).finish();
            assert!(parsed.is_err());
        }
    }

    /// A backslash at the end of a line swallows the newline and the whitespace after it.
    #[test]
    fn ml_basic_string_escape_ws() {
        let folded = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in folded.iter().copied() {
            assert_decodes(input, "The quick brown fox jumps over the lazy dog.");
        }

        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in empties.iter().copied() {
            assert_decodes(input, "");
        }
    }

    /// Literal strings are returned verbatim, minus the surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];
        for input in inputs.iter().copied() {
            // Strip one delimiter byte from each end.
            assert_decodes(input, &input[1..input.len() - 1]);
        }
    }

    /// Multi-line literal strings are returned verbatim, minus the delimiters and an
    /// immediately following first newline.
    #[test]
    fn ml_literal_string() {
        let single_line = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in single_line.iter().copied() {
            // Strip the three-byte delimiter from each end.
            assert_decodes(input, &input[3..input.len() - 3]);
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed newline (4 bytes).
        assert_decodes(input, &input[4..input.len() - 3]);
    }
}