lexical_util/
div128.rs

1//! Optimized division algorithms for u128.
2//!
3//! # Fast Algorithms
4//!
5//! The more optimized algorithms for calculating the divisor constants are
6//! based off of the paper "Division by Invariant Integers Using Multiplication",
7//! by T. Granlund and P. Montgomery, in "Proc. of the SIGPLAN94 Conference
8//! on Programming Language Design and Implementation", available online
9//! [here](https://gmplib.org/~tege/divcnst-pldi94.pdf).
10//!
11//! This approach is derived from the Rust algorithm for formatting 128-bit
12//! values, and therefore is similarly dual-licensed under MIT/Apache-2.0.
13//!
14//! # Fallback Algorithms
15//!
16//! The slower algorithms in this module are derived off of `dtolnay/itoa`
17//! and Rust's compiler-builtins crate. This copies a specific
18//! path of LLVM's `__udivmodti4` intrinsic, which does division/
19//! modulus for u128 in a single step. Rust implements both division
20//! and modulus in terms of this intrinsic, but calls the intrinsic
21//! twice for subsequent division and modulus operations on the same
22//! dividend/divisor, leading to significant performance overhead.
23//!
24//! This module calculates the optimal divisors for each radix,
25//! and exports a general-purpose division algorithm for u128 where
26//! the divisor can fit in a u64. The moderate algorithm is derived from
27//! dtolnay/itoa, which can be found
28//! [here](https://github.com/dtolnay/itoa/blob/master/src/udiv128.rs), which
29//! in turn is derived from Rust's compiler-builtins crate, which can be found
30//! [here](https://github.com/rust-lang-nursery/compiler-builtins/blob/master/src/int/udiv.rs).
31//!
32//! Licensing for these routines is therefore subject to an MIT/Illinois
33//! dual license (a BSD-like license), while the rest of the module is
34//! subject to an MIT/Apache-2.0 dual-license.
35//!
36//! # Generation
37//!
38//! See [`etc/div128.py`] for the script to generate the divisors and the
39//! constants, and the division algorithm.
40//!
41//! [`etc/div128.py`]: https://github.com/Alexhuszagh/rust-lexical/blob/main/lexical-util/etc/div128.py
42
43#![cfg(not(feature = "compact"))]
44#![cfg(feature = "write")]
45
46use crate::assert::debug_assert_radix;
47use crate::mul::mulhi;
48
49/// Calculate a div/remainder algorithm optimized for power-of-two radixes.
50/// This is trivial: the number of digits we process is `64 / log2(radix)`.
51/// Therefore, the `shr` is `log2(radix) * digits`, and the mask is just the
52/// lower `shr` bits of the digits.
53#[inline]
54#[allow(clippy::many_single_char_names)]
55pub const fn pow2_u128_divrem(n: u128, mask: u64, shr: u32) -> (u128, u64) {
56    let quot = n >> shr;
57    let rem = mask & n as u64;
58    (quot, rem)
59}
60
61/// Fast division/remainder algorithm for u128, without a fast native approximation.
62#[inline]
63#[allow(clippy::many_single_char_names)]
64pub fn fast_u128_divrem(
65    n: u128,
66    d: u64,
67    fast: u128,
68    fast_shr: u32,
69    factor: u128,
70    factor_shr: u32,
71) -> (u128, u64) {
72    let quot = if n < fast {
73        ((n >> fast_shr) as u64 / (d >> fast_shr)) as u128
74    } else {
75        mulhi::<u128, u64>(n, factor) >> factor_shr
76    };
77    let rem = (n - quot * d as u128) as u64;
78    (quot, rem)
79}
80
81/// Fast division/remainder algorithm for u128, without a fast native approximation.
82#[inline]
83#[allow(clippy::many_single_char_names)]
84pub fn moderate_u128_divrem(n: u128, d: u64, factor: u128, factor_shr: u32) -> (u128, u64) {
85    let quot = mulhi::<u128, u64>(n, factor) >> factor_shr;
86    let rem = (n - quot * d as u128) as u64;
87    (quot, rem)
88}
89
90/// Optimized fallback division/remainder algorithm for u128.
91///
92/// This is because the codegen for u128 divrem is very inefficient in Rust,
93/// calling both `__udivmodti4` twice internally, rather than a single time.
94///
95/// This is still a fair bit slower than the optimized algorithms described
96/// in the above paper, but this is a suitable fallback when we cannot use
97/// the faster algorithm.
98#[inline]
99#[allow(clippy::many_single_char_names)]
100pub fn slow_u128_divrem(n: u128, d: u64, d_ctlz: u32) -> (u128, u64) {
101    // Ensure we have the correct number of leading zeros passed.
102    debug_assert_eq!(d_ctlz, d.leading_zeros());
103
104    // Optimize if we can divide using u64 first.
105    let high = (n >> 64) as u64;
106    if high == 0 {
107        let low = n as u64;
108        return ((low / d) as u128, low % d);
109    }
110
111    // sr = 1 + u64::BITS + d.leading_zeros() - high.leading_zeros();
112    let sr = 65 + d_ctlz - high.leading_zeros();
113
114    // 1 <= sr <= u64::BITS - 1
115    let mut q: u128 = n << (128 - sr);
116    let mut r: u128 = n >> sr;
117    let mut carry: u64 = 0;
118
119    // Don't use a range because they may generate references to memcpy in unoptimized code
120    // Loop invariants:  r < d; carry is 0 or 1
121    let mut i = 0;
122    while i < sr {
123        i += 1;
124
125        // r:q = ((r:q) << 1) | carry
126        r = (r << 1) | (q >> 127);
127        q = (q << 1) | carry as u128;
128
129        // carry = 0
130        // if r >= d {
131        //     r -= d;
132        //     carry = 1;
133        // }
134        let s = (d as u128).wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
135        carry = (s & 1) as u64;
136        r -= (d as u128) & s as u128;
137    }
138
139    ((q << 1) | carry as u128, r as u64)
140}
141
142/// Calculate the div/remainder of a value based on the radix.
143///
144/// This uses the largest divisor possible for the given size,
145/// and uses various fast-path approximations for different types.
146///
147/// 1. Powers-of-two can be cleanly split into 2 64-bit products.
148/// 2. Division that can be simulated as if by multiplication by a constant.
149/// 3. Cases of 2. with a power-of-two divisor.
150/// 4. Fallback cases.
151///
152/// This returns the quotient and the remainder.
153/// For the number of digits processed, see
154/// [min_step](crate::step::min_step).
155#[inline(always)]
156pub fn u128_divrem(n: u128, radix: u32) -> (u128, u64) {
157    debug_assert_radix(radix);
158
159    if cfg!(feature = "radix") {
160        match radix {
161            2 => u128_divrem_2(n),
162            3 => u128_divrem_3(n),
163            4 => u128_divrem_4(n),
164            5 => u128_divrem_5(n),
165            6 => u128_divrem_6(n),
166            7 => u128_divrem_7(n),
167            8 => u128_divrem_8(n),
168            9 => u128_divrem_9(n),
169            10 => u128_divrem_10(n),
170            11 => u128_divrem_11(n),
171            12 => u128_divrem_12(n),
172            13 => u128_divrem_13(n),
173            14 => u128_divrem_14(n),
174            15 => u128_divrem_15(n),
175            16 => u128_divrem_16(n),
176            17 => u128_divrem_17(n),
177            18 => u128_divrem_18(n),
178            19 => u128_divrem_19(n),
179            20 => u128_divrem_20(n),
180            21 => u128_divrem_21(n),
181            22 => u128_divrem_22(n),
182            23 => u128_divrem_23(n),
183            24 => u128_divrem_24(n),
184            25 => u128_divrem_25(n),
185            26 => u128_divrem_26(n),
186            27 => u128_divrem_27(n),
187            28 => u128_divrem_28(n),
188            29 => u128_divrem_29(n),
189            30 => u128_divrem_30(n),
190            31 => u128_divrem_31(n),
191            32 => u128_divrem_32(n),
192            33 => u128_divrem_33(n),
193            34 => u128_divrem_34(n),
194            35 => u128_divrem_35(n),
195            36 => u128_divrem_36(n),
196            _ => unreachable!(),
197        }
198    } else if cfg!(feature = "power-of-two") {
199        match radix {
200            2 => u128_divrem_2(n),
201            4 => u128_divrem_4(n),
202            8 => u128_divrem_8(n),
203            10 => u128_divrem_10(n),
204            16 => u128_divrem_16(n),
205            32 => u128_divrem_32(n),
206            _ => unreachable!(),
207        }
208    } else {
209        u128_divrem_10(n)
210    }
211}
212
213// AUTO-GENERATED
214// These functions were auto-generated by `etc/div128.py`.
215// Do not edit them unless there is a good reason to.
216// Preferably, edit the source code to generate the constants.
217//
218// The seemingly magical values are all derived there, and are explained
219// in the function signatures of the functions they call.
220
221#[inline]
222const fn u128_divrem_2(n: u128) -> (u128, u64) {
223    pow2_u128_divrem(n, 18446744073709551615, 64)
224}
225
226#[inline]
227fn u128_divrem_3(n: u128) -> (u128, u64) {
228    slow_u128_divrem(n, 12157665459056928801, 0)
229}
230
231#[inline]
232const fn u128_divrem_4(n: u128) -> (u128, u64) {
233    pow2_u128_divrem(n, 18446744073709551615, 64)
234}
235
236#[inline]
237fn u128_divrem_5(n: u128) -> (u128, u64) {
238    moderate_u128_divrem(n, 7450580596923828125, 105312291668557186697918027683670432319, 61)
239}
240
241#[inline]
242fn u128_divrem_6(n: u128) -> (u128, u64) {
243    fast_u128_divrem(
244        n,
245        4738381338321616896,
246        309485009821345068724781056,
247        24,
248        165591931273573223021296166324748699891,
249        61,
250    )
251}
252
253#[inline]
254fn u128_divrem_7(n: u128) -> (u128, u64) {
255    moderate_u128_divrem(n, 3909821048582988049, 200683792729517998822275406364627986707, 61)
256}
257
258#[inline]
259const fn u128_divrem_8(n: u128) -> (u128, u64) {
260    pow2_u128_divrem(n, 9223372036854775807, 63)
261}
262
263#[inline]
264fn u128_divrem_9(n: u128) -> (u128, u64) {
265    slow_u128_divrem(n, 12157665459056928801, 0)
266}
267
268#[inline]
269fn u128_divrem_10(n: u128) -> (u128, u64) {
270    fast_u128_divrem(
271        n,
272        10000000000000000000,
273        9671406556917033397649408,
274        19,
275        156927543384667019095894735580191660403,
276        62,
277    )
278}
279
280#[inline]
281fn u128_divrem_11(n: u128) -> (u128, u64) {
282    slow_u128_divrem(n, 5559917313492231481, 1)
283}
284
285#[inline]
286fn u128_divrem_12(n: u128) -> (u128, u64) {
287    slow_u128_divrem(n, 2218611106740436992, 3)
288}
289
290#[inline]
291fn u128_divrem_13(n: u128) -> (u128, u64) {
292    moderate_u128_divrem(n, 8650415919381337933, 181410402513790565292660635782582404765, 62)
293}
294
295#[inline]
296fn u128_divrem_14(n: u128) -> (u128, u64) {
297    fast_u128_divrem(
298        n,
299        2177953337809371136,
300        1208925819614629174706176,
301        16,
302        1407280417134467544760816054546363235,
303        53,
304    )
305}
306
307#[inline]
308fn u128_divrem_15(n: u128) -> (u128, u64) {
309    moderate_u128_divrem(n, 6568408355712890625, 1866504587258795246613513364166764993, 55)
310}
311
312#[inline]
313const fn u128_divrem_16(n: u128) -> (u128, u64) {
314    pow2_u128_divrem(n, 18446744073709551615, 64)
315}
316
317#[inline]
318fn u128_divrem_17(n: u128) -> (u128, u64) {
319    moderate_u128_divrem(n, 2862423051509815793, 68529153692836345537218837732158950089, 59)
320}
321
322#[inline]
323fn u128_divrem_18(n: u128) -> (u128, u64) {
324    fast_u128_divrem(
325        n,
326        6746640616477458432,
327        604462909807314587353088,
328        15,
329        232601011830094623283686247347795155951,
330        62,
331    )
332}
333
334#[inline]
335fn u128_divrem_19(n: u128) -> (u128, u64) {
336    moderate_u128_divrem(n, 15181127029874798299, 25842538415601616733690423925257626679, 60)
337}
338
339#[inline]
340fn u128_divrem_20(n: u128) -> (u128, u64) {
341    fast_u128_divrem(
342        n,
343        1638400000000000000,
344        4951760157141521099596496896,
345        28,
346        239452428260295134118491722992235809941,
347        60,
348    )
349}
350
351#[inline]
352fn u128_divrem_21(n: u128) -> (u128, u64) {
353    moderate_u128_divrem(n, 3243919932521508681, 120939747781233590383781714337497669585, 60)
354}
355
356#[inline]
357fn u128_divrem_22(n: u128) -> (u128, u64) {
358    slow_u128_divrem(n, 6221821273427820544, 1)
359}
360
361#[inline]
362fn u128_divrem_23(n: u128) -> (u128, u64) {
363    moderate_u128_divrem(n, 11592836324538749809, 270731922700393644432243678371210997949, 63)
364}
365
366#[inline]
367fn u128_divrem_24(n: u128) -> (u128, u64) {
368    fast_u128_divrem(
369        n,
370        876488338465357824,
371        10141204801825835211973625643008,
372        39,
373        55950381945266105153185943557606235389,
374        57,
375    )
376}
377
378#[inline]
379fn u128_divrem_25(n: u128) -> (u128, u64) {
380    moderate_u128_divrem(n, 1490116119384765625, 131640364585696483372397534604588040399, 59)
381}
382
383#[inline]
384fn u128_divrem_26(n: u128) -> (u128, u64) {
385    fast_u128_divrem(
386        n,
387        2481152873203736576,
388        151115727451828646838272,
389        13,
390        316239166637962178669658228673482425689,
391        61,
392    )
393}
394
395#[inline]
396fn u128_divrem_27(n: u128) -> (u128, u64) {
397    slow_u128_divrem(n, 4052555153018976267, 2)
398}
399
400#[inline]
401fn u128_divrem_28(n: u128) -> (u128, u64) {
402    fast_u128_divrem(
403        n,
404        6502111422497947648,
405        1237940039285380274899124224,
406        26,
407        241348591538561183926479953354701294803,
408        62,
409    )
410}
411
412#[inline]
413fn u128_divrem_29(n: u128) -> (u128, u64) {
414    moderate_u128_divrem(n, 10260628712958602189, 152941450056053853841698190746050519297, 62)
415}
416
417#[inline]
418fn u128_divrem_30(n: u128) -> (u128, u64) {
419    slow_u128_divrem(n, 15943230000000000000, 0)
420}
421
422#[inline]
423fn u128_divrem_31(n: u128) -> (u128, u64) {
424    moderate_u128_divrem(n, 787662783788549761, 124519929891402176328714857711808162537, 58)
425}
426
427#[inline]
428const fn u128_divrem_32(n: u128) -> (u128, u64) {
429    pow2_u128_divrem(n, 1152921504606846975, 60)
430}
431
432#[inline]
433fn u128_divrem_33(n: u128) -> (u128, u64) {
434    slow_u128_divrem(n, 1667889514952984961, 3)
435}
436
437#[inline]
438fn u128_divrem_34(n: u128) -> (u128, u64) {
439    fast_u128_divrem(
440        n,
441        2386420683693101056,
442        75557863725914323419136,
443        12,
444        328792707121977505492535302517672775183,
445        61,
446    )
447}
448
449#[inline]
450fn u128_divrem_35(n: u128) -> (u128, u64) {
451    moderate_u128_divrem(n, 3379220508056640625, 116097442450503652080238494022501325491, 60)
452}
453
454#[inline]
455fn u128_divrem_36(n: u128) -> (u128, u64) {
456    fast_u128_divrem(
457        n,
458        4738381338321616896,
459        309485009821345068724781056,
460        24,
461        165591931273573223021296166324748699891,
462        61,
463    )
464}