lexical_parse_integer/shared.rs
1//! Shared algorithms and utilities for parsing integers.
2//!
3//! These allow implementations of partial and complete parsers
4//! using a single code-path via macros.
5//!
6//! This looks relatively, complex, but it's quite simple. Almost all
7//! of these branches are resolved at compile-time, so the resulting
8//! code is quite small while handling all of the internal complexity.
9//!
10//! 1. Helpers to process ok and error results for both complete and partial
11//! parsers. They have different APIs, and mixing the APIs leads to
12//! substantial performance hits.
13//! 2. Overflow checking on invalid digits for partial parsers, while
14//! just returning invalid digits for complete parsers.
15//! 3. A format-aware sign parser.
16//! 4. Digit parsing algorithms which explicitly wrap on overflow, for no
17//! additional overhead. This has major performance wins for **most**
18//! real-world integers, so most valid input will be substantially faster.
19//! 5. An algorithm to detect if overflow occurred. This is comprehensive,
20//! and short-circuits for common cases.
21//! 6. A parsing algorithm for unsigned integers, always producing positive
22//! values. This avoids any unnecessary branching.
23//! 7. Multi-digit optimizations for larger sizes.
24
25#![doc(hidden)]
26
27use lexical_util::format::NumberFormat;
28use lexical_util::num::{as_cast, Integer, UnsignedInteger};
29use lexical_util::step::max_step;
30
31/// Return an error, returning the index and the error.
32macro_rules! into_error {
33 ($code:ident, $index:expr) => {
34 Err((lexical_util::error::Error::$code($index)))
35 };
36}
37
38/// Return an value for a complete parser.
39macro_rules! into_ok_complete {
40 ($value:expr, $index:expr) => {
41 Ok(as_cast($value))
42 };
43}
44
45/// Return an value and index for a partial parser.
46macro_rules! into_ok_partial {
47 ($value:expr, $index:expr) => {
48 Ok((as_cast($value), $index))
49 };
50}
51
52/// Return an error for a complete parser upon an invalid digit.
53macro_rules! invalid_digit_complete {
54 (
55 $value:ident,
56 $iter:ident,
57 $format:ident,
58 $is_negative:ident,
59 $start_index:ident,
60 $t:ident,
61 $u:ident
62 ) => {{
63 // Don't do any overflow checking here: we don't need it.
64 into_error!(InvalidDigit, $iter.cursor() - 1)
65 }};
66}
67
68/// Return a value for a partial parser upon an invalid digit.
69/// This checks for numeric overflow, and returns the appropriate error.
70macro_rules! invalid_digit_partial {
71 (
72 $value:ident,
73 $iter:ident,
74 $format:ident,
75 $is_negative:ident,
76 $start_index:ident,
77 $t:ident,
78 $u:ident
79 ) => {{
80 let radix = NumberFormat::<{ $format }>::MANTISSA_RADIX;
81 let count = $iter.current_count() - $start_index - 1;
82 if is_overflow::<$t, $u, $format>($value, count, $is_negative) {
83 let min = min_step(radix, <$t as Integer>::BITS, <$t>::IS_SIGNED);
84 if <$t>::IS_SIGNED && $is_negative {
85 into_error!(Underflow, (count - 1).min(min + 1))
86 } else {
87 into_error!(Overflow, (count - 1).min(min + 1))
88 }
89 } else if <$t>::IS_SIGNED && $is_negative {
90 into_ok_partial!($value.wrapping_neg(), $iter.cursor() - 1)
91 } else {
92 into_ok_partial!($value, $iter.cursor() - 1)
93 }
94 }};
95}
96
97/// Parse the sign from the leading digits.
98///
99/// This routine does the following:
100///
101/// 1. Parses the sign digit.
102/// 2. Handles if positive signs before integers are not allowed.
103/// 3. Handles negative signs if the type is unsigned.
104/// 4. Handles if the sign is required, but missing.
105/// 5. Handles if the iterator is empty, before or after parsing the sign.
106/// 6. Handles if the iterator has invalid, leading zeros.
107///
108/// Returns if the value is negative, or any values detected when
109/// validating the input.
110macro_rules! parse_sign {
111 ($iter:ident, $format:ident) => {
112 // This works in all cases, and gets a few handy
113 // optimizations:
114 // 1. It minimizes branching: we either need to subslice
115 // or return an offset from the loop. We can't increment
116 // the iterator in the loop or it decimates performance.
117 //
118 // Using `iter.peek()` means we respect digit separators at
119 // the start of the number, when they're valid.
120 //
121 // All the other cases are removed at compile time.
122 // Note: the compiler isn't smart enough to realize that
123 // `Some(_) if !$format.call() =>` and `Some(_) =>` are
124 // mutually exclusive, so make sure we manually expand
125 // these cases.
126 match $iter.peek() {
127 Some(&b'+') if !$format.no_positive_mantissa_sign() => (false, 1),
128 Some(&b'+') if $format.no_positive_mantissa_sign() => {
129 return into_error!(InvalidPositiveSign, 0);
130 },
131 // Don't add the check for the negative sign here if unsigned,
132 // since it absolutely decimates performance. If it's for a
133 // partial parser, we'll simply get 0 digits parsed, like before.
134 // Complete parsers will still error, like before. That is, it's
135 // correct **enough**.
136 Some(&b'-') if T::IS_SIGNED => (true, 1),
137 Some(_) if $format.required_mantissa_sign() => return into_error!(MissingSign, 0),
138 _ => (false, 0),
139 }
140 };
141}
142
143/// Determine if the value has overflowed.
144#[cfg_attr(not(feature = "compact"), inline)]
145pub(super) fn is_overflow<T, U, const FORMAT: u128>(
146 value: U,
147 count: usize,
148 is_negative: bool,
149) -> bool
150where
151 T: Integer,
152 U: UnsignedInteger,
153{
154 let format = NumberFormat::<{ FORMAT }> {};
155
156 let max = max_step(format.radix(), T::BITS, T::IS_SIGNED);
157 let radix: U = as_cast(format.radix());
158 let min_value: U = radix.pow(max as u32 - 1);
159 if T::IS_SIGNED {
160 // Signed type: have to deal with 2's complement.
161 let max_value: U = as_cast::<U, _>(T::MAX) + U::ONE;
162 if count > max
163 || (count == max
164 && (value < min_value || value > max_value || (!is_negative && value == max_value)))
165 {
166 // Must have overflowed, or wrapped.
167 // 1. Guaranteed overflow due to too many digits.
168 // 2. Guaranteed overflow due to wrap.
169 // 3. Guaranteed overflow since it's too large for the signed type.
170 // 4. Guaranteed overflow due to 2's complement.
171 return true;
172 }
173 } else if count > max || (count == max && value < min_value) {
174 // Must have overflowed: too many digits or wrapped.
175 return true;
176 }
177 false
178}
179
180/// Parse the value for the given type.
181macro_rules! parse_value {
182 (
183 $iter:ident,
184 $is_negative:ident,
185 $format:ident,
186 $start_index:ident,
187 $t:ident,
188 $u:ident,
189 $parser:ident,
190 $invalid_digit:ident,
191 $into_ok:ident
192 ) => {{
193 // Use a simple optimization: parse as an unsigned integer, using
194 // unsigned arithmetic , avoiding any branching in the initial stage.
195 // We can then validate the input based on the signed integer limits,
196 // and cast the value over, which is fast. Leads to substantial
197 // improvements due to decreased branching for all but `i8`.
198 let mut value = <$u>::ZERO;
199 let format = NumberFormat::<{ $format }> {};
200 $parser!(value, $iter, $format, $is_negative, $start_index, $t, $u, $invalid_digit);
201 let count = $iter.current_count() - $start_index;
202
203 if is_overflow::<$t, $u, $format>(value, count, $is_negative) {
204 let min = min_step(format.radix(), <$t as Integer>::BITS, <$t>::IS_SIGNED);
205 if <$t>::IS_SIGNED && $is_negative {
206 into_error!(Underflow, (count - 1).min(min + 1))
207 } else {
208 into_error!(Overflow, (count - 1).min(min + 1))
209 }
210 } else if <$t>::IS_SIGNED && $is_negative {
211 // Need to cast it to the signed type first, so we don't
212 // get an invalid representation for i128 if it's widened.
213 $into_ok!(as_cast::<$t, _>(value.wrapping_neg()), $iter.length())
214 } else {
215 $into_ok!(value, $iter.length())
216 }
217 }};
218}
219
220/// Parse a single digit at a time.
221/// This has no multiple-digit optimizations.
222#[rustfmt::skip]
223macro_rules! parse_1digit {
224 (
225 $value:ident,
226 $iter:ident,
227 $format:ident,
228 $is_negative:ident,
229 $start_index:ident,
230 $t:ident,
231 $u:ident,
232 $invalid_digit:ident
233 ) => {{
234 let format = NumberFormat::<{ $format }>;
235 let radix = NumberFormat::<{ $format }>::MANTISSA_RADIX;
236
237 // Do our slow parsing algorithm: 1 digit at a time.
238 while let Some(&c) = $iter.next() {
239 let digit = match char_to_digit_const(c, radix) {
240 Some(v) => v,
241 None => {
242 // Need to check for a base suffix, if so, return a valid value.
243 // We can't have a base suffix at the first value (need at least
244 // 1 digit).
245 let base_suffix = format.base_suffix();
246 if cfg!(feature = "format") && base_suffix != 0 && $iter.cursor() - $start_index > 1 {
247 let is_suffix = if format.case_sensitive_base_suffix() {
248 c == base_suffix
249 } else {
250 c.to_ascii_lowercase() == base_suffix.to_ascii_lowercase()
251 };
252 if is_suffix && $iter.is_done() {
253 // Break out of the loop, we've finished parsing.
254 break;
255 } else if is_suffix {
256 // Haven't finished parsing, so we're going to call
257 // invalid_digit!. Need to ensure we include the
258 // base suffix in that.
259 // SAFETY: safe since the iterator is not empty, as checked
260 // in `$iter.is_done()` above.
261 unsafe { $iter.step_unchecked() };
262 }
263 }
264 // Might have handled our base-prefix here.
265 return $invalid_digit!(
266 $value,
267 $iter,
268 $format,
269 $is_negative,
270 $start_index,
271 $t,
272 $u
273 );
274 },
275 };
276 $value = $value.wrapping_mul(as_cast(radix));
277 $value = $value.wrapping_add(as_cast(digit));
278 }
279 }};
280}
281
282/// Generic algorithm for both partial and complete parsers.
283///
284/// * `invalid_digit` - Behavior on finding an invalid digit.
285/// * `into_ok` - Behavior when returning a valid value.
286#[rustfmt::skip]
287macro_rules! algorithm {
288 (
289 $bytes:ident,
290 $format:ident,
291 $t:ident,
292 $u:ident,
293 $parser:ident,
294 $invalid_digit:ident,
295 $into_ok:ident
296 ) => {{
297 let format = NumberFormat::<{ $format }> {};
298
299 // WARNING:
300 // --------
301 // None of this code can be changed for optimization reasons.
302 // Do not change it without benchmarking every change.
303 // 1. You cannot use the NoSkipIterator in the loop,
304 // you must either return a subslice (indexing)
305 // or increment outside of the loop.
306 // Failing to do so leads to numerous more, unnecessary
307 // conditional move instructions, killing performance.
308 // 2. Return a 0 or 1 shift, and indexing unchecked outside
309 // of the loop is slightly faster.
310 // 3. Partial and complete parsers cannot be efficiently done
311 // together.
312 //
313 // If you try to refactor without carefully monitoring benchmarks or
314 // assembly generation, please log the number of wasted hours: so
315 // 16 hours so far.
316
317 // With `step_by_unchecked`, this is sufficiently optimized.
318 // Removes conditional paths, to, which simplifies maintenance.
319 // The skip version of the iterator automatically coalesces to
320 // the noskip iterator.
321 let mut byte = $bytes.bytes::<{ $format }>();
322
323 let mut iter = byte.integer_iter();
324 let (is_negative, shift) = parse_sign!(iter, format);
325 // SAFETY: safe since we shift at most one for a parsed sign byte.
326 unsafe { iter.step_by_unchecked(shift) };
327 if iter.is_done() {
328 return into_error!(Empty, shift);
329 }
330 // Skip any leading zeros.
331 let mut start_index = iter.cursor();
332 let zeros = iter.skip_zeros();
333 start_index += zeros;
334
335 // Now, check to see if we have a valid base prefix.
336 let base_prefix = format.base_prefix();
337 let mut is_prefix = false;
338 if cfg!(feature = "format") && base_prefix != 0 && zeros == 1 {
339 // Check to see if the next character is the base prefix.
340 // We must have a format like `0x`, `0d`, `0o`. Note:
341 if let Some(&c) = iter.peek() {
342 is_prefix = if format.case_sensitive_base_prefix() {
343 c == base_prefix
344 } else {
345 c.to_ascii_lowercase() == base_prefix.to_ascii_lowercase()
346 };
347 if is_prefix {
348 // SAFETY: safe since we `byte.len() >= 1`.
349 unsafe { iter.step_unchecked() };
350 if iter.is_done() {
351 return into_error!(Empty, iter.cursor());
352 } else {
353 start_index += 1;
354 }
355 }
356 }
357 }
358
359 // If we have a format that doesn't accept leading zeros,
360 // check if the next value is invalid. It's invalid if the
361 // first is 0, and the next is not a valid digit.
362 if cfg!(feature = "format") && !is_prefix && format.no_integer_leading_zeros() && zeros != 0 {
363 // Cannot have a base prefix and no leading zeros.
364 let index = iter.cursor() - zeros;
365 if zeros > 1 {
366 return into_error!(InvalidLeadingZeros, index);
367 }
368 match iter.peek().map(|&c| char_to_digit_const(c, format.radix())) {
369 // Valid digit, we have an invalid value.
370 Some(Some(_)) => return into_error!(InvalidLeadingZeros, index),
371 // Either not a digit that follows, or nothing follows.
372 _ => return $into_ok!(<$t>::ZERO, index),
373 };
374 }
375
376 // NOTE:
377 // Don't add optimizations for 128-bit integers.
378 // 128-bit multiplication is rather efficient, it's only division
379 // that's very slow. Any shortcut optimizations increasing branching,
380 // and even if parsing a 64-bit integer is marginally faster, it
381 // culminates in **way** slower performance overall for simple
382 // integers, and no improvement for large integers.
383 parse_value!(
384 iter,
385 is_negative,
386 $format,
387 start_index,
388 $t,
389 $u,
390 $parser,
391 $invalid_digit,
392 $into_ok
393 )
394 }};
395}