use crate::SyntaxKind::{self, *};

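/// How an identifier-like token is classified by lookahead: a plain
/// identifier, a path literal, a store path (`<...>`), or a URI.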
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum IdentType {
    Ident,
    Path,
    Store,
    Uri,
}

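/// Returns true for characters allowed in a path literal.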
fn is_valid_path_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '_' | '.' | '+' | '-')
}

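/// Returns true for characters allowed in a URI literal (a superset of the
/// path characters).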
fn is_valid_uri_char(c: char) -> bool {
    match c {
        '%' | '?' | ':' | '@' | '&' | '=' | '$' | ',' | '!' | '~' | '*' | '\'' => true,
        _ => is_valid_path_char(c),
    }
}

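/// A lexing mode kept on the tokenizer's context stack: inside a string body,
/// right before a string's closing delimiter, inside an interpolation, waiting
/// for an interpolation to start, or in the middle of a path.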
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Context {
    StringBody { multiline: bool },
    StringEnd,
    Interpol { brackets: u32 },
    InterpolStart,
    Path,
}

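/// A cheap, copyable cursor over the input: the full source string plus the
/// current byte offset.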
#[derive(Clone, Copy)]
struct State<'a> {
    input: &'a str,
    offset: usize,
}

impl PartialEq for State<'_> {
    fn eq(&self, other: &Self) -> bool {
        std::ptr::eq(self.input, other.input) && self.offset == other.offset
    }
}

impl Eq for State<'_> {}

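/// A single token: its `SyntaxKind` paired with the slice of source it covers.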
pub type Token<'a> = (SyntaxKind, &'a str);

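/// Convenience function: tokenizes the entire input and collects the tokens into a `Vec`.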
pub fn tokenize(input: &str) -> Vec<Token<'_>> {
    Tokenizer::new(input).collect()
}

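/// The tokenizer: an iterator over the tokens of the input, driven by a stack
/// of lexing contexts.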
pub struct Tokenizer<'a> {
    ctx: Vec<Context>,
    state: State<'a>,
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self { ctx: Vec::new(), state: State { input, offset: 0 } }
    }
}

impl Tokenizer<'_> {
    fn remaining(&self) -> &str {
        &self.state.input[self.state.offset..]
    }
    fn peek(&self) -> Option<char> {
        self.remaining().chars().next()
    }
    fn next(&mut self) -> Option<char> {
        let c = self.peek();
        if let Some(c) = c {
            self.state.offset += c.len_utf8();
        }
        c
    }
    fn starts_with_bump(&mut self, s: &str) -> bool {
        let starts_with = self.remaining().starts_with(s);
        if starts_with {
            self.state.offset += s.len();
        }
        starts_with
    }
    fn str_since<'a>(&self, past: State<'a>) -> &'a str {
        &past.input[past.offset..self.state.offset]
    }

    fn push_ctx(&mut self, ctx: Context) {
        self.ctx.push(ctx)
    }

    fn pop_ctx(&mut self, ctx: Context) {
        debug_assert_eq!(self.ctx.last(), Some(&ctx));
        self.ctx.pop();
    }

    fn consume<F>(&mut self, mut f: F) -> usize
    where
        F: FnMut(char) -> bool,
    {
        let len: usize =
            self.remaining().chars().take_while(|&c| f(c)).map(|c| c.len_utf8()).sum::<usize>();
        self.state.offset += len;
        len
    }
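    /// Lexes string content until the closing delimiter, an interpolation
    /// start (`${`), or end of input, adjusting the context stack as it goes.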
    fn next_string(&mut self, multiline: bool) -> SyntaxKind {
        loop {
            let start = self.state;
            match self.next() {
                None => {
                    self.pop_ctx(Context::StringBody { multiline });
                    return TOKEN_ERROR;
                }
                Some('"') if !multiline => {
                    self.state = start;
                    self.pop_ctx(Context::StringBody { multiline: false });
                    self.push_ctx(Context::StringEnd);
                    return TOKEN_STRING_CONTENT;
                }
                Some('\\') if !multiline => match self.next() {
                    None => return TOKEN_ERROR,
                    Some(_) => (),
                },

                Some('\'') if multiline => match self.peek() {
                    None => return TOKEN_ERROR,
                    Some('\'') => match {
                        self.next();
                        self.peek()
                    } {
                        Some('\'') | Some('$') => {
                            self.next().unwrap();
                        }
                        Some('\\') => {
                            self.next().unwrap();
                            if self.next().is_none() {
                                return TOKEN_ERROR;
                            }
                        }
                        _ => {
                            self.state = start;
                            self.pop_ctx(Context::StringBody { multiline: true });
                            self.push_ctx(Context::StringEnd);
                            return TOKEN_STRING_CONTENT;
                        }
                    },
                    Some(_) => (),
                },

                Some('$') => match self.peek() {
                    Some('$') => {
                        self.next().unwrap();
                    }
                    Some('{') => {
                        self.state = start;
                        self.push_ctx(Context::InterpolStart);
                        return TOKEN_STRING_CONTENT;
                    }
                    _ => (),
                },
                Some(_) => (),
            }
        }
    }

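    /// Consumes the remainder of a path and decides whether it continues into
    /// an interpolation, ends cleanly, or is malformed (a trailing `/`).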
    fn check_path_since(&mut self, past: State) -> SyntaxKind {
        self.consume(is_valid_path_char);
        if self.remaining().starts_with("${") {
            self.ctx.push(Context::InterpolStart);
        } else if self.str_since(past).ends_with('/') {
            return TOKEN_ERROR;
        } else {
            self.pop_ctx(Context::Path);
        }
        TOKEN_PATH
    }

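    /// Produces the kind of the next token, driven by the current lexing
    /// context; the `Iterator` impl pairs it with the matching source slice.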
    fn next_inner(&mut self) -> Option<SyntaxKind> {
        let start = self.state;

        loop {
            match self.ctx.last() {
                Some(Context::InterpolStart) => {
                    self.pop_ctx(Context::InterpolStart);
                    self.ctx.push(Context::Interpol { brackets: 0 });
                    if self.starts_with_bump("${") {
                        return Some(TOKEN_INTERPOL_START);
                    } else {
                        unreachable!()
                    }
                }
                Some(Context::Path) => {
                    if self.starts_with_bump("${") {
                        self.ctx.push(Context::Interpol { brackets: 0 });
                        return Some(TOKEN_INTERPOL_START);
                    } else if self.peek().map_or(false, is_valid_path_char) {
                        return Some(self.check_path_since(start));
                    } else {
                        self.pop_ctx(Context::Path);
                    }
                }
                Some(Context::StringBody { multiline }) => {
                    let token = self.next_string(*multiline);
                    if self.state == start {
                        continue;
                    }
                    return Some(token);
                }
                Some(Context::StringEnd) => {
                    self.pop_ctx(Context::StringEnd);
                    let status = match self.peek() {
                        Some('"') => {
                            self.next().unwrap();
                            true
                        }
                        Some('\'') => match {
                            self.next().unwrap();
                            self.peek()
                        } {
                            Some('\'') => {
                                self.next().unwrap();
                                true
                            }
                            _ => false,
                        },
                        _ => false,
                    };
                    return Some(if status { TOKEN_STRING_END } else { TOKEN_ERROR });
                }
                _ => (),
            }
            break;
        }

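        // No special context is active: lex whitespace, comments, or an ordinary token.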
        if self.consume(char::is_whitespace) > 0 {
            return Some(TOKEN_WHITESPACE);
        }

        if self.peek() == Some('#') {
            self.consume(|c| c != '\n');
            return Some(TOKEN_COMMENT);
        }
        if self.starts_with_bump("/*") {
            loop {
                self.consume(|c| c != '*');
                self.next();
                match self.peek() {
                    None => return Some(TOKEN_ERROR),
                    Some('/') => {
                        self.next().unwrap();
                        return Some(TOKEN_COMMENT);
                    }
                    _ => (),
                }
            }
        }

        if self.starts_with_bump("...") {
            return Some(TOKEN_ELLIPSIS);
        }

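        // Look ahead to classify the upcoming token as a path, a store path
        // (`<...>`), or a URI; `None` means it is lexed as an ordinary token.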
        let store_path = self.peek() == Some('<');
        let kind = {
            let skipped = self
                .remaining()
                .chars()
                .take_while(|&c| match c {
                    '<' | '/' => store_path,
                    _ => is_valid_path_char(c),
                })
                .collect::<String>();

            let mut lookahead = self.remaining().chars().skip(skipped.chars().count());

            match (lookahead.next(), lookahead.next()) {
                (Some('/'), Some('/')) => None,
                (Some('/'), Some('*')) => None,
                (Some('/'), Some(c)) if !c.is_whitespace() => Some(IdentType::Path),
                (Some('>'), _) => Some(IdentType::Store),
                (Some(':'), Some(c)) if is_valid_uri_char(c) && !skipped.contains('_') => {
                    Some(IdentType::Uri)
                }
                _ => None,
            }
        };

        let c = self.next()?;

        if c == '~' || kind == Some(IdentType::Path) {
            return Some(if c == '~' && self.next() != Some('/') {
                TOKEN_ERROR
            } else {
                self.push_ctx(Context::Path);
                self.check_path_since(start)
            });
        }

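        // Single-character dispatch: operators, punctuation, literals, keywords
        // and identifiers.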
        Some(match c {
            '=' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_EQUAL
            }
            '!' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_NOT_EQUAL
            }
            '!' => TOKEN_INVERT,
            '{' => {
                if let Some(Context::Interpol { brackets }) = self.ctx.last_mut() {
                    *brackets += 1;
                }
                TOKEN_L_BRACE
            }
            '}' => {
                if let Some(Context::Interpol { brackets }) = self.ctx.last_mut() {
                    match brackets.checked_sub(1) {
                        Some(new) => *brackets = new,
                        None => {
                            self.pop_ctx(Context::Interpol { brackets: 0 });
                            return Some(TOKEN_INTERPOL_END);
                        }
                    }
                }
                TOKEN_R_BRACE
            }
            '[' => TOKEN_L_BRACK,
            ']' => TOKEN_R_BRACK,
            '@' => TOKEN_AT,
            ':' => TOKEN_COLON,
            ',' => TOKEN_COMMA,
            '.' => {
                if self.peek().map_or(false, |x| ('0'..='9').contains(&x)) {
                    self.consume(|c| ('0'..='9').contains(&c));
                    self.consume_scientific()
                } else {
                    TOKEN_DOT
                }
            }
            '=' => TOKEN_ASSIGN,
            '?' => TOKEN_QUESTION,
            ';' => TOKEN_SEMICOLON,
            '(' => TOKEN_L_PAREN,
            ')' => TOKEN_R_PAREN,
            '+' if self.peek() == Some('+') => {
                self.next().unwrap();
                TOKEN_CONCAT
            }
            '-' if self.peek() == Some('>') => {
                self.next().unwrap();
                TOKEN_IMPLICATION
            }
            '/' if self.peek() == Some('/') => {
                self.next().unwrap();
                TOKEN_UPDATE
            }
            '+' => TOKEN_ADD,
            '-' => TOKEN_SUB,
            '*' => TOKEN_MUL,
            '/' => TOKEN_DIV,
            '<' if kind == Some(IdentType::Store) => {
                self.consume(is_valid_path_char);
                if self.next() != Some('>') {
                    TOKEN_ERROR
                } else {
                    TOKEN_PATH
                }
            }
            '&' if self.peek() == Some('&') => {
                self.next().unwrap();
                TOKEN_AND_AND
            }
            '|' if self.peek() == Some('|') => {
                self.next().unwrap();
                TOKEN_OR_OR
            }
            '<' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_LESS_OR_EQ
            }
            '<' => TOKEN_LESS,
            '>' if self.peek() == Some('=') => {
                self.next().unwrap();
                TOKEN_MORE_OR_EQ
            }
            '>' => TOKEN_MORE,
            '$' if self.peek() == Some('{') => {
                self.next().unwrap();
                self.push_ctx(Context::Interpol { brackets: 0 });
                TOKEN_INTERPOL_START
            }
            'a'..='z' | 'A'..='Z' | '_' => {
                let kind = match kind {
                    Some(IdentType::Store) | None => IdentType::Ident,
                    Some(kind) => kind,
                };
                self.consume(|c| match c {
                    'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' | '\'' => true,
                    c => kind == IdentType::Uri && is_valid_uri_char(c),
                });
                match kind {
                    IdentType::Ident => match self.str_since(start) {
                        "assert" => TOKEN_ASSERT,
                        "else" => TOKEN_ELSE,
                        "if" => TOKEN_IF,
                        "in" => TOKEN_IN,
                        "inherit" => TOKEN_INHERIT,
                        "let" => TOKEN_LET,
                        "or" => TOKEN_OR,
                        "rec" => TOKEN_REC,
                        "then" => TOKEN_THEN,
                        "with" => TOKEN_WITH,
                        _ => TOKEN_IDENT,
                    },
                    IdentType::Uri => TOKEN_URI,
                    IdentType::Path => panic!("paths are checked earlier"),
                    IdentType::Store => panic!("store paths are checked earlier"),
                }
            }
            '"' => {
                self.push_ctx(Context::StringBody { multiline: false });
                TOKEN_STRING_START
            }
            '\'' if self.peek() == Some('\'') => {
                self.next().unwrap();
                self.push_ctx(Context::StringBody { multiline: true });
                TOKEN_STRING_START
            }
            '0'..='9' => {
                self.consume(|c| ('0'..='9').contains(&c));
                if self.peek() == Some('.') {
                    self.next().unwrap();
                    self.consume(|c| ('0'..='9').contains(&c));
                    self.consume_scientific()
                } else {
                    TOKEN_INTEGER
                }
            }
            _ => TOKEN_ERROR,
        })
    }

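    /// Consumes an optional exponent suffix (e.g. `e-10`) after a float's
    /// digits; returns the float token kind, or an error if the exponent has
    /// no digits.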
    fn consume_scientific(&mut self) -> SyntaxKind {
        if self.peek() == Some('e') || self.peek() == Some('E') {
            self.next().unwrap();
            if self.peek() == Some('-') || self.peek() == Some('+') {
                self.next().unwrap();
            }
            if self.consume(|c| ('0'..='9').contains(&c)) == 0 {
                return TOKEN_ERROR;
            }
        }
        TOKEN_FLOAT
    }
}

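// Each iteration lexes one token kind and pairs it with the source slice it spans.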
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.state;
        self.next_inner().map(|syntax_kind| (syntax_kind, self.str_since(start)))
    }
}