lexical_parse_integer/
shared.rs

1//! Shared algorithms and utilities for parsing integers.
2//!
3//! These allow implementations of partial and complete parsers
4//! using a single code-path via macros.
5//!
6//! This looks relatively, complex, but it's quite simple. Almost all
7//! of these branches are resolved at compile-time, so the resulting
8//! code is quite small while handling all of the internal complexity.
9//!
10//! 1. Helpers to process ok and error results for both complete and partial
11//!     parsers. They have different APIs, and mixing the APIs leads to
12//!     substantial performance hits.
13//! 2. Overflow checking on invalid digits for partial parsers, while
14//!     just returning invalid digits for complete parsers.
15//! 3. A format-aware sign parser.
16//! 4. Digit parsing algorithms which explicitly wrap on overflow, for no
17//!     additional overhead. This has major performance wins for **most**
18//!     real-world integers, so most valid input will be substantially faster.
19//! 5. An algorithm to detect if overflow occurred. This is comprehensive,
20//!     and short-circuits for common cases.
21//! 6. A parsing algorithm for unsigned integers, always producing positive
22//!     values. This avoids any unnecessary branching.
23//! 7. Multi-digit optimizations for larger sizes.
24
25#![doc(hidden)]
26
27use lexical_util::format::NumberFormat;
28use lexical_util::num::{as_cast, Integer, UnsignedInteger};
29use lexical_util::step::max_step;
30
31/// Return an error, returning the index and the error.
32macro_rules! into_error {
33    ($code:ident, $index:expr) => {
34        Err((lexical_util::error::Error::$code($index)))
35    };
36}
37
38/// Return an value for a complete parser.
39macro_rules! into_ok_complete {
40    ($value:expr, $index:expr) => {
41        Ok(as_cast($value))
42    };
43}
44
45/// Return an value and index for a partial parser.
46macro_rules! into_ok_partial {
47    ($value:expr, $index:expr) => {
48        Ok((as_cast($value), $index))
49    };
50}
51
52/// Return an error for a complete parser upon an invalid digit.
53macro_rules! invalid_digit_complete {
54    (
55        $value:ident,
56        $iter:ident,
57        $format:ident,
58        $is_negative:ident,
59        $start_index:ident,
60        $t:ident,
61        $u:ident
62    ) => {{
63        // Don't do any overflow checking here: we don't need it.
64        into_error!(InvalidDigit, $iter.cursor() - 1)
65    }};
66}
67
68/// Return a value for a partial parser upon an invalid digit.
69/// This checks for numeric overflow, and returns the appropriate error.
70macro_rules! invalid_digit_partial {
71    (
72        $value:ident,
73        $iter:ident,
74        $format:ident,
75        $is_negative:ident,
76        $start_index:ident,
77        $t:ident,
78        $u:ident
79    ) => {{
80        let radix = NumberFormat::<{ $format }>::MANTISSA_RADIX;
81        let count = $iter.current_count() - $start_index - 1;
82        if is_overflow::<$t, $u, $format>($value, count, $is_negative) {
83            let min = min_step(radix, <$t as Integer>::BITS, <$t>::IS_SIGNED);
84            if <$t>::IS_SIGNED && $is_negative {
85                into_error!(Underflow, (count - 1).min(min + 1))
86            } else {
87                into_error!(Overflow, (count - 1).min(min + 1))
88            }
89        } else if <$t>::IS_SIGNED && $is_negative {
90            into_ok_partial!($value.wrapping_neg(), $iter.cursor() - 1)
91        } else {
92            into_ok_partial!($value, $iter.cursor() - 1)
93        }
94    }};
95}
96
97/// Parse the sign from the leading digits.
98///
99/// This routine does the following:
100///
101/// 1. Parses the sign digit.
102/// 2. Handles if positive signs before integers are not allowed.
103/// 3. Handles negative signs if the type is unsigned.
104/// 4. Handles if the sign is required, but missing.
105/// 5. Handles if the iterator is empty, before or after parsing the sign.
106/// 6. Handles if the iterator has invalid, leading zeros.
107///
108/// Returns if the value is negative, or any values detected when
109/// validating the input.
110macro_rules! parse_sign {
111    ($iter:ident, $format:ident) => {
112        //  This works in all cases, and gets a few handy
113        //  optimizations:
114        //  1. It minimizes branching: we either need to subslice
115        //      or return an offset from the loop. We can't increment
116        //      the iterator in the loop or it decimates performance.
117        //
118        //  Using `iter.peek()` means we respect digit separators at
119        //  the start of the number, when they're valid.
120        //
121        //  All the other cases are removed at compile time.
122        //  Note: the compiler isn't smart enough to realize that
123        //  `Some(_) if !$format.call() =>` and `Some(_) =>` are
124        //  mutually exclusive, so make sure we manually expand
125        //  these cases.
126        match $iter.peek() {
127            Some(&b'+') if !$format.no_positive_mantissa_sign() => (false, 1),
128            Some(&b'+') if $format.no_positive_mantissa_sign() => {
129                return into_error!(InvalidPositiveSign, 0);
130            },
131            // Don't add the check for the negative sign here if unsigned,
132            // since it absolutely decimates performance. If it's for a
133            // partial parser, we'll simply get 0 digits parsed, like before.
134            // Complete parsers will still error, like before. That is, it's
135            // correct **enough**.
136            Some(&b'-') if T::IS_SIGNED => (true, 1),
137            Some(_) if $format.required_mantissa_sign() => return into_error!(MissingSign, 0),
138            _ => (false, 0),
139        }
140    };
141}
142
143/// Determine if the value has overflowed.
144#[cfg_attr(not(feature = "compact"), inline)]
145pub(super) fn is_overflow<T, U, const FORMAT: u128>(
146    value: U,
147    count: usize,
148    is_negative: bool,
149) -> bool
150where
151    T: Integer,
152    U: UnsignedInteger,
153{
154    let format = NumberFormat::<{ FORMAT }> {};
155
156    let max = max_step(format.radix(), T::BITS, T::IS_SIGNED);
157    let radix: U = as_cast(format.radix());
158    let min_value: U = radix.pow(max as u32 - 1);
159    if T::IS_SIGNED {
160        // Signed type: have to deal with 2's complement.
161        let max_value: U = as_cast::<U, _>(T::MAX) + U::ONE;
162        if count > max
163            || (count == max
164                && (value < min_value || value > max_value || (!is_negative && value == max_value)))
165        {
166            // Must have overflowed, or wrapped.
167            // 1. Guaranteed overflow due to too many digits.
168            // 2. Guaranteed overflow due to wrap.
169            // 3. Guaranteed overflow since it's too large for the signed type.
170            // 4. Guaranteed overflow due to 2's complement.
171            return true;
172        }
173    } else if count > max || (count == max && value < min_value) {
174        // Must have overflowed: too many digits or wrapped.
175        return true;
176    }
177    false
178}
179
180/// Parse the value for the given type.
181macro_rules! parse_value {
182    (
183        $iter:ident,
184        $is_negative:ident,
185        $format:ident,
186        $start_index:ident,
187        $t:ident,
188        $u:ident,
189        $parser:ident,
190        $invalid_digit:ident,
191        $into_ok:ident
192    ) => {{
193        // Use a simple optimization: parse as an unsigned integer, using
194        // unsigned arithmetic , avoiding any branching in the initial stage.
195        // We can then validate the input based on the signed integer limits,
196        // and cast the value over, which is fast. Leads to substantial
197        // improvements due to decreased branching for all but `i8`.
198        let mut value = <$u>::ZERO;
199        let format = NumberFormat::<{ $format }> {};
200        $parser!(value, $iter, $format, $is_negative, $start_index, $t, $u, $invalid_digit);
201        let count = $iter.current_count() - $start_index;
202
203        if is_overflow::<$t, $u, $format>(value, count, $is_negative) {
204            let min = min_step(format.radix(), <$t as Integer>::BITS, <$t>::IS_SIGNED);
205            if <$t>::IS_SIGNED && $is_negative {
206                into_error!(Underflow, (count - 1).min(min + 1))
207            } else {
208                into_error!(Overflow, (count - 1).min(min + 1))
209            }
210        } else if <$t>::IS_SIGNED && $is_negative {
211            // Need to cast it to the signed type first, so we don't
212            // get an invalid representation for i128 if it's widened.
213            $into_ok!(as_cast::<$t, _>(value.wrapping_neg()), $iter.length())
214        } else {
215            $into_ok!(value, $iter.length())
216        }
217    }};
218}
219
220/// Parse a single digit at a time.
221/// This has no multiple-digit optimizations.
222#[rustfmt::skip]
223macro_rules! parse_1digit {
224    (
225        $value:ident,
226        $iter:ident,
227        $format:ident,
228        $is_negative:ident,
229        $start_index:ident,
230        $t:ident,
231        $u:ident,
232        $invalid_digit:ident
233    ) => {{
234        let format = NumberFormat::<{ $format }>;
235        let radix = NumberFormat::<{ $format }>::MANTISSA_RADIX;
236
237        // Do our slow parsing algorithm: 1 digit at a time.
238        while let Some(&c) = $iter.next() {
239            let digit = match char_to_digit_const(c, radix) {
240                Some(v) => v,
241                None => {
242                    // Need to check for a base suffix, if so, return a valid value.
243                    // We can't have a base suffix at the first value (need at least
244                    // 1 digit).
245                    let base_suffix = format.base_suffix();
246                    if cfg!(feature = "format") && base_suffix != 0 && $iter.cursor() - $start_index > 1 {
247                        let is_suffix = if format.case_sensitive_base_suffix() {
248                            c == base_suffix
249                        } else {
250                            c.to_ascii_lowercase() == base_suffix.to_ascii_lowercase()
251                        };
252                        if is_suffix && $iter.is_done() {
253                            // Break out of the loop, we've finished parsing.
254                            break;
255                        } else if is_suffix {
256                            // Haven't finished parsing, so we're going to call
257                            // invalid_digit!. Need to ensure we include the
258                            // base suffix in that.
259                            // SAFETY: safe since the iterator is not empty, as checked
260                            // in `$iter.is_done()` above.
261                            unsafe { $iter.step_unchecked() };
262                        }
263                    }
264                    // Might have handled our base-prefix here.
265                    return $invalid_digit!(
266                        $value,
267                        $iter,
268                        $format,
269                        $is_negative,
270                        $start_index,
271                        $t,
272                        $u
273                    );
274                },
275            };
276            $value = $value.wrapping_mul(as_cast(radix));
277            $value = $value.wrapping_add(as_cast(digit));
278        }
279    }};
280}
281
282/// Generic algorithm for both partial and complete parsers.
283///
284/// * `invalid_digit` - Behavior on finding an invalid digit.
285/// * `into_ok` - Behavior when returning a valid value.
286#[rustfmt::skip]
287macro_rules! algorithm {
288    (
289        $bytes:ident,
290        $format:ident,
291        $t:ident,
292        $u:ident,
293        $parser:ident,
294        $invalid_digit:ident,
295        $into_ok:ident
296    ) => {{
297        let format = NumberFormat::<{ $format }> {};
298
299        // WARNING:
300        // --------
301        // None of this code can be changed for optimization reasons.
302        // Do not change it without benchmarking every change.
303        //  1. You cannot use the NoSkipIterator in the loop,
304        //      you must either return a subslice (indexing)
305        //      or increment outside of the loop.
306        //      Failing to do so leads to numerous more, unnecessary
307        //      conditional move instructions, killing performance.
308        //  2. Return a 0 or 1 shift, and indexing unchecked outside
309        //      of the loop is slightly faster.
310        //  3. Partial and complete parsers cannot be efficiently done
311        //      together.
312        //
313        // If you try to refactor without carefully monitoring benchmarks or
314        // assembly generation, please log the number of wasted hours: so
315        //  16 hours so far.
316
317        // With `step_by_unchecked`, this is sufficiently optimized.
318        // Removes conditional paths, to, which simplifies maintenance.
319        // The skip version of the iterator automatically coalesces to
320        // the noskip iterator.
321        let mut byte = $bytes.bytes::<{ $format }>();
322
323        let mut iter = byte.integer_iter();
324        let (is_negative, shift) = parse_sign!(iter, format);
325        // SAFETY: safe since we shift at most one for a parsed sign byte.
326        unsafe { iter.step_by_unchecked(shift) };
327        if iter.is_done() {
328            return into_error!(Empty, shift);
329        }
330        // Skip any leading zeros.
331        let mut start_index = iter.cursor();
332        let zeros = iter.skip_zeros();
333        start_index += zeros;
334
335        // Now, check to see if we have a valid base prefix.
336        let base_prefix = format.base_prefix();
337        let mut is_prefix = false;
338        if cfg!(feature = "format") && base_prefix != 0 && zeros == 1 {
339            // Check to see if the next character is the base prefix.
340            // We must have a format like `0x`, `0d`, `0o`. Note:
341            if let Some(&c) = iter.peek() {
342                is_prefix = if format.case_sensitive_base_prefix() {
343                    c == base_prefix
344                } else {
345                    c.to_ascii_lowercase() == base_prefix.to_ascii_lowercase()
346                };
347                if is_prefix {
348                    // SAFETY: safe since we `byte.len() >= 1`.
349                    unsafe { iter.step_unchecked() };
350                    if iter.is_done() {
351                        return into_error!(Empty, iter.cursor());
352                    } else {
353                        start_index += 1;
354                    }
355                }
356            }
357        }
358
359        // If we have a format that doesn't accept leading zeros,
360        // check if the next value is invalid. It's invalid if the
361        // first is 0, and the next is not a valid digit.
362        if cfg!(feature = "format") && !is_prefix && format.no_integer_leading_zeros() && zeros != 0 {
363            // Cannot have a base prefix and no leading zeros.
364            let index = iter.cursor() - zeros;
365            if zeros > 1 {
366                return into_error!(InvalidLeadingZeros, index);
367            }
368            match iter.peek().map(|&c| char_to_digit_const(c, format.radix())) {
369                // Valid digit, we have an invalid value.
370                Some(Some(_)) => return into_error!(InvalidLeadingZeros, index),
371                // Either not a digit that follows, or nothing follows.
372                _ => return $into_ok!(<$t>::ZERO, index),
373            };
374        }
375
376        //  NOTE:
377        //      Don't add optimizations for 128-bit integers.
378        //      128-bit multiplication is rather efficient, it's only division
379        //      that's very slow. Any shortcut optimizations increasing branching,
380        //      and even if parsing a 64-bit integer is marginally faster, it
381        //      culminates in **way** slower performance overall for simple
382        //      integers, and no improvement for large integers.
383        parse_value!(
384            iter,
385            is_negative,
386            $format,
387            start_index,
388            $t,
389            $u,
390            $parser,
391            $invalid_digit,
392            $into_ok
393        )
394    }};
395}