rnix/
tokenizer.rs

//! The tokenizer: turns a string into tokens, such as numbers, strings, and keywords

use crate::SyntaxKind::{self, *};

/// How an identifier-like run of characters is classified by lookahead:
/// a plain identifier, a path, a `<...>` store path, or a URI.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum IdentType {
    Ident,
    Path,
    Store,
    Uri,
}

fn is_valid_path_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '_' | '.' | '+' | '-')
}

fn is_valid_uri_char(c: char) -> bool {
    match c {
        '%' | '?' | ':' | '@' | '&' | '=' | '$' | ',' | '!' | '~' | '*' | '\'' => true,
        _ => is_valid_path_char(c),
    }
}

/// The current lexing context. Contexts are kept on a stack so that nested
/// constructs (strings containing interpolations, paths with interpolated
/// segments) resume correctly across calls to `next`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Context {
    StringBody { multiline: bool },
    StringEnd,
    Interpol { brackets: u32 },
    InterpolStart,
    Path,
}

#[derive(Clone, Copy)]
struct State<'a> {
    input: &'a str,
    offset: usize,
}

impl PartialEq for State<'_> {
    fn eq(&self, other: &Self) -> bool {
        // Two states are equal only if they borrow the same input (by pointer
        // identity) and sit at the same offset.
        std::ptr::eq(self.input, other.input) && self.offset == other.offset
    }
}

impl Eq for State<'_> {}

pub type Token<'a> = (SyntaxKind, &'a str);

/// A convenience function for tokenizing the given input
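///
/// A small usage sketch (doctest left as `ignore`; it assumes `SyntaxKind`
/// implements `PartialEq` and `Debug`, and the expected kinds follow from the
/// match arms below):
///
/// ```ignore
/// let tokens = tokenize("1 + 2");
/// assert_eq!(tokens.len(), 5); // integer, whitespace, `+`, whitespace, integer
/// assert_eq!(tokens[0], (TOKEN_INTEGER, "1"));
/// assert_eq!(tokens[2], (TOKEN_ADD, "+"));
/// ```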
pub fn tokenize(input: &str) -> Vec<Token<'_>> {
    Tokenizer::new(input).collect()
}

/// The tokenizer. You may want to use the `tokenize` convenience function from this module instead.
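///
/// `Tokenizer` implements `Iterator`, yielding `(SyntaxKind, &str)` pairs, so
/// tokens can also be pulled lazily. A sketch (doctest left as `ignore`;
/// assumes `SyntaxKind` implements `PartialEq` and `Debug`):
///
/// ```ignore
/// let kinds: Vec<SyntaxKind> = Tokenizer::new("x: y").map(|(kind, _)| kind).collect();
/// assert_eq!(kinds, [TOKEN_IDENT, TOKEN_COLON, TOKEN_WHITESPACE, TOKEN_IDENT]);
/// ```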
pub struct Tokenizer<'a> {
    ctx: Vec<Context>,
    state: State<'a>,
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self { ctx: Vec::new(), state: State { input, offset: 0 } }
    }
}

impl Tokenizer<'_> {
    fn remaining(&self) -> &str {
        &self.state.input[self.state.offset..]
    }
    fn peek(&self) -> Option<char> {
        self.remaining().chars().next()
    }
    fn next(&mut self) -> Option<char> {
        let c = self.peek();
        if let Some(c) = c {
            self.state.offset += c.len_utf8();
        }
        c
    }
    // Consume `s` if the remaining input starts with it.
    fn starts_with_bump(&mut self, s: &str) -> bool {
        let starts_with = self.remaining().starts_with(s);
        if starts_with {
            self.state.offset += s.len();
        }
        starts_with
    }
    // The text consumed between an earlier state and the current position.
    fn str_since<'a>(&self, past: State<'a>) -> &'a str {
        &past.input[past.offset..self.state.offset]
    }

    fn push_ctx(&mut self, ctx: Context) {
        self.ctx.push(ctx)
    }

    fn pop_ctx(&mut self, ctx: Context) {
        debug_assert_eq!(self.ctx.last(), Some(&ctx));
        self.ctx.pop();
    }

    // Advance past characters matching `f`, returning the number of bytes consumed.
    fn consume<F>(&mut self, mut f: F) -> usize
    where
        F: FnMut(char) -> bool,
    {
        let len: usize =
            self.remaining().chars().take_while(|&c| f(c)).map(|c| c.len_utf8()).sum::<usize>();
        self.state.offset += len;
        len
    }
    // Lex string content, stopping just before the closing quote(s) or the
    // start of an interpolation; the closing quotes themselves are emitted
    // later via `Context::StringEnd`.
    fn next_string(&mut self, multiline: bool) -> SyntaxKind {
        loop {
            let start = self.state;
            match self.next() {
                None => {
                    self.pop_ctx(Context::StringBody { multiline });
                    return TOKEN_ERROR;
                }
                Some('"') if !multiline => {
                    self.state = start;
                    self.pop_ctx(Context::StringBody { multiline: false });
                    self.push_ctx(Context::StringEnd);
                    return TOKEN_STRING_CONTENT;
                }
                Some('\\') if !multiline => match self.next() {
                    None => return TOKEN_ERROR,
                    Some(_) => (),
                },

                Some('\'') if multiline => match self.peek() {
                    None => return TOKEN_ERROR,
                    Some('\'') => match {
                        self.next();
                        self.peek()
                    } {
                        Some('\'') | Some('$') => {
                            self.next().unwrap();
                        }
                        Some('\\') => {
                            self.next().unwrap();
                            if self.next().is_none() {
                                return TOKEN_ERROR;
                            }
                        }
                        _ => {
                            self.state = start;
                            self.pop_ctx(Context::StringBody { multiline: true });
                            self.push_ctx(Context::StringEnd);
                            return TOKEN_STRING_CONTENT;
                        }
                    },
                    Some(_) => (),
                },

                Some('$') => match self.peek() {
                    Some('$') => {
                        self.next().unwrap();
                    }
                    Some('{') => {
                        self.state = start;
                        self.push_ctx(Context::InterpolStart);
                        return TOKEN_STRING_CONTENT;
                    }
                    _ => (),
                },
                Some(_) => (),
            }
        }
    }

    // Consume the rest of a path. If an interpolation follows, the Path
    // context stays open; a path ending in '/' is an error.
    fn check_path_since(&mut self, past: State) -> SyntaxKind {
        self.consume(is_valid_path_char);
        if self.remaining().starts_with("${") {
            self.ctx.push(Context::InterpolStart);
        } else if self.str_since(past).ends_with('/') {
            return TOKEN_ERROR;
        } else {
            self.pop_ctx(Context::Path);
        }
        TOKEN_PATH
    }

    fn next_inner(&mut self) -> Option<SyntaxKind> {
        let start = self.state;

        // Resume any multi-token construct (string body, interpolation, path)
        // that a previous call left on the context stack
        loop {
            match self.ctx.last() {
                Some(Context::InterpolStart) => {
                    self.pop_ctx(Context::InterpolStart);
                    self.ctx.push(Context::Interpol { brackets: 0 });
                    if self.starts_with_bump("${") {
                        return Some(TOKEN_INTERPOL_START);
                    } else {
                        unreachable!()
                    }
                }
                Some(Context::Path) => {
                    if self.starts_with_bump("${") {
                        self.ctx.push(Context::Interpol { brackets: 0 });
                        return Some(TOKEN_INTERPOL_START);
                    } else if self.peek().map_or(false, is_valid_path_char) {
                        return Some(self.check_path_since(start));
                    } else {
                        self.pop_ctx(Context::Path);
                    }
                }
                Some(Context::StringBody { multiline }) => {
                    let token = self.next_string(*multiline);
                    // Don't emit zero-length string content, e.g. when the string
                    // ends or an interpolation starts immediately
                    if self.state == start {
                        continue;
                    }
                    return Some(token);
                }
                Some(Context::StringEnd) => {
                    self.pop_ctx(Context::StringEnd);
                    let status = match self.peek() {
                        Some('"') => {
                            self.next().unwrap();
                            true
                        }
                        Some('\'') => match {
                            self.next().unwrap();
                            self.peek()
                        } {
                            Some('\'') => {
                                self.next().unwrap();
                                true
                            }
                            _ => false,
                        },
                        _ => false,
                    };
                    return Some(if status { TOKEN_STRING_END } else { TOKEN_ERROR });
                }
                _ => (),
            }
            break;
        }

        if self.consume(char::is_whitespace) > 0 {
            return Some(TOKEN_WHITESPACE);
        }

        if self.peek() == Some('#') {
            self.consume(|c| c != '\n');
            return Some(TOKEN_COMMENT);
        }
        if self.starts_with_bump("/*") {
            loop {
                self.consume(|c| c != '*');
                self.next(); // consume the '*', if any
                match self.peek() {
                    None => return Some(TOKEN_ERROR),
                    Some('/') => {
                        self.next().unwrap();
                        return Some(TOKEN_COMMENT);
                    }
                    _ => (),
                }
            }
        }

        if self.starts_with_bump("...") {
            return Some(TOKEN_ELLIPSIS);
        }

        // Look ahead (without consuming) to classify what follows as a path,
        // a `<...>` store path, or a URI
        let store_path = self.peek() == Some('<');
        let kind = {
            let skipped = self
                .remaining()
                .chars()
                .take_while(|&c| match c {
                    '<' | '/' => store_path,
                    _ => is_valid_path_char(c),
                })
                .collect::<String>();

            let mut lookahead = self.remaining().chars().skip(skipped.chars().count());

            match (lookahead.next(), lookahead.next()) {
                // a//b parses as Update(a, b)
                (Some('/'), Some('/')) => None,
                (Some('/'), Some('*')) => None,
                (Some('/'), Some(c)) if !c.is_whitespace() => Some(IdentType::Path),
                (Some('>'), _) => Some(IdentType::Store),
                (Some(':'), Some(c)) if is_valid_uri_char(c) && !skipped.contains('_') => {
                    Some(IdentType::Uri)
                }
                _ => None,
            }
        };

        let c = self.next()?;

        if c == '~' || kind == Some(IdentType::Path) {
            return Some(if c == '~' && self.next() != Some('/') {
                TOKEN_ERROR
            } else {
                self.push_ctx(Context::Path);
                self.check_path_since(start)
            });
        }

        Some(match c {
            '=' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_EQUAL
            }
            '!' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_NOT_EQUAL
            }
            '!' => TOKEN_INVERT,
            '{' => {
                if let Some(Context::Interpol { brackets }) = self.ctx.last_mut() {
                    *brackets += 1;
                }
                TOKEN_L_BRACE
            }
            '}' => {
                if let Some(Context::Interpol { brackets }) = self.ctx.last_mut() {
                    match brackets.checked_sub(1) {
                        Some(new) => *brackets = new,
                        None => {
                            self.pop_ctx(Context::Interpol { brackets: 0 });
                            return Some(TOKEN_INTERPOL_END);
                        }
                    }
                }
                TOKEN_R_BRACE
            }
            '[' => TOKEN_L_BRACK,
            ']' => TOKEN_R_BRACK,
            '@' => TOKEN_AT,
            ':' => TOKEN_COLON,
            ',' => TOKEN_COMMA,
            '.' => {
                if self.peek().map_or(false, |x| ('0'..='9').contains(&x)) {
                    self.consume(|c| ('0'..='9').contains(&c));
                    self.consume_scientific()
                } else {
                    TOKEN_DOT
                }
            }
            '=' => TOKEN_ASSIGN,
            '?' => TOKEN_QUESTION,
            ';' => TOKEN_SEMICOLON,
            '(' => TOKEN_L_PAREN,
            ')' => TOKEN_R_PAREN,
            '+' if self.peek() == Some('+') => {
                self.next().unwrap();
                TOKEN_CONCAT
            }
            '-' if self.peek() == Some('>') => {
                self.next().unwrap();
                TOKEN_IMPLICATION
            }
            '/' if self.peek() == Some('/') => {
                self.next().unwrap();
                TOKEN_UPDATE
            }
            '+' => TOKEN_ADD,
            '-' => TOKEN_SUB,
            '*' => TOKEN_MUL,
            '/' => TOKEN_DIV,
            '<' if kind == Some(IdentType::Store) => {
                self.consume(is_valid_path_char);
                if self.next() != Some('>') {
                    TOKEN_ERROR
                } else {
                    TOKEN_PATH
                }
            }
            '&' if self.peek() == Some('&') => {
                self.next().unwrap();
                TOKEN_AND_AND
            }
            '|' if self.peek() == Some('|') => {
                self.next().unwrap();
                TOKEN_OR_OR
            }
            '<' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_LESS_OR_EQ
            }
            '<' => TOKEN_LESS,
            '>' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_MORE_OR_EQ
            }
            '>' => TOKEN_MORE,
            '$' if self.peek() == Some('{') => {
                self.next().unwrap();
                self.push_ctx(Context::Interpol { brackets: 0 });
                TOKEN_INTERPOL_START
            }
            'a'..='z' | 'A'..='Z' | '_' => {
                let kind = match kind {
                    // It's detected as store if it ends with >, but if it
                    // didn't start with <, that's wrong
                    Some(IdentType::Store) | None => IdentType::Ident,
                    Some(kind) => kind,
                };
                self.consume(|c| match c {
                    'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' | '\'' => true,
                    c => kind == IdentType::Uri && is_valid_uri_char(c),
                });
                match kind {
                    IdentType::Ident => match self.str_since(start) {
                        "assert" => TOKEN_ASSERT,
                        "else" => TOKEN_ELSE,
                        "if" => TOKEN_IF,
                        "in" => TOKEN_IN,
                        "inherit" => TOKEN_INHERIT,
                        "let" => TOKEN_LET,
                        // "or" is a contextual keyword and will be handled in the parser.
                        "or" => TOKEN_OR,
                        "rec" => TOKEN_REC,
                        "then" => TOKEN_THEN,
                        "with" => TOKEN_WITH,
                        _ => TOKEN_IDENT,
                    },
                    IdentType::Uri => TOKEN_URI,
                    IdentType::Path => panic!("paths are checked earlier"),
                    IdentType::Store => panic!("store paths are checked earlier"),
                }
            }
            '"' => {
                self.push_ctx(Context::StringBody { multiline: false });
                TOKEN_STRING_START
            }
            '\'' if self.peek() == Some('\'') => {
                self.next().unwrap();
                self.push_ctx(Context::StringBody { multiline: true });
                TOKEN_STRING_START
            }
            '0'..='9' => {
                self.consume(|c| ('0'..='9').contains(&c));
                if self.peek() == Some('.') {
                    self.next().unwrap();
                    self.consume(|c| ('0'..='9').contains(&c));
                    self.consume_scientific()
                } else {
                    TOKEN_INTEGER
                }
            }
            _ => TOKEN_ERROR,
        })
    }

    // Consume an optional exponent (`e`/`E`, optional sign, digits). Called
    // after the fractional part, so the token is a float either way.
    fn consume_scientific(&mut self) -> SyntaxKind {
        if self.peek() == Some('e') || self.peek() == Some('E') {
            self.next().unwrap();
            if self.peek() == Some('-') || self.peek() == Some('+') {
                self.next().unwrap();
            }
            if self.consume(|c| ('0'..='9').contains(&c)) == 0 {
                return TOKEN_ERROR;
            }
        }
        TOKEN_FLOAT
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.state;
        self.next_inner().map(|syntax_kind| (syntax_kind, self.str_since(start)))
    }
}
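
// Illustrative sketch of the context stack in action: a string with an
// interpolated identifier is emitted as separate start/content/interpolation
// tokens. Assumes `SyntaxKind` implements `Debug` and `PartialEq`.
#[cfg(test)]
mod interpolation_sketch {
    use super::*;

    #[test]
    fn string_with_interpolation() {
        let kinds: Vec<SyntaxKind> =
            tokenize(r#""a${b}c""#).into_iter().map(|(kind, _)| kind).collect();
        assert_eq!(
            kinds,
            [
                TOKEN_STRING_START,
                TOKEN_STRING_CONTENT,
                TOKEN_INTERPOL_START,
                TOKEN_IDENT,
                TOKEN_INTERPOL_END,
                TOKEN_STRING_CONTENT,
                TOKEN_STRING_END,
            ]
        );
    }
}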