vu128/vu128.rs
1// Copyright (c) 2024 John Millikin <john@john-millikin.com>
2//
3// Permission to use, copy, modify, and/or distribute this software for any
4// purpose with or without fee is hereby granted.
5//
6// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
7// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
8// AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
9// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
10// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
11// OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
12// PERFORMANCE OF THIS SOFTWARE.
13//
14// SPDX-License-Identifier: 0BSD
15
16//! # vu128: Efficient variable-length integers
17//!
18//! `vu128` is a variable-length integer encoding, with smaller values being
19//! encoded using fewer bytes. Integer sizes up to 128 bits are supported.
20//! The compression ratio of `vu128` equals or exceeds the widely used [VLQ]
21//! and [LEB128] encodings, and is faster on modern pipelined architectures.
22//!
23//! [VLQ]: https://en.wikipedia.org/wiki/Variable-length_quantity
24//! [LEB128]: https://en.wikipedia.org/wiki/LEB128
25//!
26//! # Encoding details
27//!
28//! Values in the range `[0, 2^7)` are encoded as a single byte with
29//! the same bits as the original value.
30//!
31//! Values in the range `[2^7, 2^28)` are encoded as a unary length prefix,
32//! followed by `(length*7)` bits, in little-endian order. This is conceptually
33//! similar to LEB128, but the continuation bits are placed in upper half
34//! of the initial byte. This arrangement is also known as a "prefix varint".
35//!
36//! ```text
37//! MSB ------------------ LSB
38//!
39//! 10101011110011011110 Input value (0xABCDE)
40//! 0101010 1111001 1011110 Zero-padded to a multiple of 7 bits
41//! 01010101 11100110 ___11110 Grouped into octets, with 3 continuation bits
42//! 01010101 11100110 11011110 Continuation bits `110` added
43//! 0x55 0xE6 0xDE In hexadecimal
44//!
45//! [0xDE, 0xE6, 0x55] Encoded output (order is little-endian)
46//! ```
47//!
48//! Values in the range `[2^28, 2^128)` are encoded as a binary length prefix,
49//! followed by payload bytes, in little-endian order. To differentiate this
50//! format from the format of smaller values, the top 4 bits of the first byte
51//! are set. The length prefix value is the number of payload bytes minus one;
52//! equivalently it is the total length of the encoded value minus two.
53//!
54//! ```text
55//! MSB ------------------------------------ LSB
56//!
57//! 10010001101000101011001111000 Input value (0x12345678)
58//! 00010010 00110100 01010110 01111000 Zero-padded to a multiple of 8 bits
59//! 00010010 00110100 01010110 01111000 11110011 Prefix byte is `0xF0 | (4 - 1)`
60//! 0x12 0x34 0x56 0x78 0xF3 In hexadecimal
61//!
62//! [0xF3, 0x78, 0x56, 0x34, 0x12] Encoded output (order is little-endian)
63//! ```
64//!
65//! # Handling of over-long encodings
66//!
67//! The `vu128` format permits over-long encodings, which encode a value using
68//! a byte sequence that is unnecessarily long:
69//!
70//! * Zero-padding beyond that required to reach a multiple of 7 or 8 bits.
71//! * Using a length prefix byte for a value in the range `[0, 2^7)`.
72//! * Using a binary length prefix byte for a value in the range `[0, 2^28)`.
73//!
74//! The `encode_*` functions in this module will not generate such over-long
75//! encodings, but the `decode_*` functions will accept them. This is intended
76//! to allow `vu128` values to be placed in a buffer before the value to be
77//! written is known. Applications that require a single canonical encoding for
78//! any given value should perform appropriate checking in their own code.
79//!
80//! # Signed integers and floating-point values
81//!
82//! Signed integers and IEEE-754 floating-point values may be encoded with
83//! `vu128` by mapping them to unsigned integers. It is recommended that the
84//! mapping functions be chosen so as to minimize the number of zeroes in the
85//! higher-order bits, which enables better compression.
86//!
87//! This library includes helper functions that use Protocol Buffer's ["ZigZag"
88//! encoding] for signed integers and reverse-endian layout for floating-point.
89//!
90//! ["ZigZag" encoding]: https://protobuf.dev/programming-guides/encoding/#signed-ints
91
92#![no_std]
93#![warn(clippy::must_use_candidate)]
94#![warn(clippy::undocumented_unsafe_blocks)]
95#![warn(missing_docs)]
96
97use core::mem;
98
99/// Returns the encoded length in a `vu128` prefix byte.
100///
101/// # Examples
102///
103/// ```
104/// let mut buf = [0u8; 5];
105/// let encoded_len = vu128::encode_u32(&mut buf, 12345);
106/// assert_eq!(vu128::encoded_len(buf[0]), encoded_len);
107/// ```
108#[must_use]
109pub const fn encoded_len(b: u8) -> usize {
110 if b < 0b10000000 {
111 return 1;
112 }
113 if b < 0b11000000 {
114 return 2;
115 }
116 if b < 0b11100000 {
117 return 3;
118 }
119 if b < 0b11110000 {
120 return 4;
121 }
122 ((b & 0x0F) + 2) as usize
123}
124
125/// Encodes a `u32` into a buffer, returning the encoded length.
126///
127/// The contents of the buffer beyond the returned length are unspecified.
128///
129/// # Examples
130///
131/// ```
132/// let mut buf = [0u8; 5];
133/// let encoded_len = vu128::encode_u32(&mut buf, 12345);
134/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
135/// ```
136#[inline]
137#[must_use]
138pub fn encode_u32(buf: &mut [u8; 5], value: u32) -> usize {
139 let mut x = value;
140 if x < 0x80 {
141 buf[0] = x as u8;
142 return 1;
143 }
144 if x < 0x10000000 {
145 if x < 0x00004000 {
146 x <<= 2;
147 buf[0] = 0x80 | ((x as u8) >> 2);
148 buf[1] = (x >> 8) as u8;
149 return 2;
150 }
151 if x < 0x00200000 {
152 x <<= 3;
153 buf[0] = 0xC0 | ((x as u8) >> 3);
154 buf[1] = (x >> 8) as u8;
155 buf[2] = (x >> 16) as u8;
156 return 3;
157 }
158 x <<= 4;
159 buf[0] = 0xE0 | ((x as u8) >> 4);
160 buf[1] = (x >> 8) as u8;
161 buf[2] = (x >> 16) as u8;
162 buf[3] = (x >> 24) as u8;
163 return 4;
164 }
165
166 // SAFETY: buf has a const length of `size_of::<u32>() + 1`.
167 unsafe {
168 ptr_from_mut::<[u8; mem::size_of::<u32>() + 1]>(buf)
169 .cast::<u8>()
170 .add(1)
171 .cast::<u32>()
172 .write_unaligned(x.to_le());
173 }
174
175 buf[0] = 0xF3;
176 5
177}
178
179/// Encodes a `u64` into a buffer, returning the encoded length.
180///
181/// The contents of the buffer beyond the returned length are unspecified.
182///
183/// # Examples
184///
185/// ```
186/// let mut buf = [0u8; 9];
187/// let encoded_len = vu128::encode_u64(&mut buf, 12345);
188/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
189/// ```
190#[inline]
191#[must_use]
192pub fn encode_u64(buf: &mut [u8; 9], value: u64) -> usize {
193 let mut x = value;
194 if x < 0x80 {
195 buf[0] = x as u8;
196 return 1;
197 }
198 if x < 0x10000000 {
199 if x < 0x00004000 {
200 x <<= 2;
201 buf[0] = 0x80 | ((x as u8) >> 2);
202 buf[1] = (x >> 8) as u8;
203 return 2;
204 }
205 if x < 0x00200000 {
206 x <<= 3;
207 buf[0] = 0xC0 | ((x as u8) >> 3);
208 buf[1] = (x >> 8) as u8;
209 buf[2] = (x >> 16) as u8;
210 return 3;
211 }
212 x <<= 4;
213 buf[0] = 0xE0 | ((x as u8) >> 4);
214 buf[1] = (x >> 8) as u8;
215 buf[2] = (x >> 16) as u8;
216 buf[3] = (x >> 24) as u8;
217 return 4;
218 }
219
220 // SAFETY: buf has a const length of `size_of::<u64>() + 1`.
221 unsafe {
222 ptr_from_mut::<[u8; mem::size_of::<u64>() + 1]>(buf)
223 .cast::<u8>()
224 .add(1)
225 .cast::<u64>()
226 .write_unaligned(x.to_le());
227 }
228
229 const LEN_MASK: u8 = 0b111;
230 let len = ((x.leading_zeros() >> 3) as u8) ^ LEN_MASK;
231 buf[0] = 0xF0 | len;
232 (len + 2) as usize
233}
234
235/// Encodes a `u128` into a buffer, returning the encoded length.
236///
237/// The contents of the buffer beyond the returned length are unspecified.
238///
239/// # Examples
240///
241/// ```
242/// let mut buf = [0u8; 17];
243/// let encoded_len = vu128::encode_u128(&mut buf, 12345);
244/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
245/// ```
246#[inline]
247#[must_use]
248pub fn encode_u128(buf: &mut [u8; 17], value: u128) -> usize {
249 if value < 0x80 {
250 buf[0] = value as u8;
251 return 1;
252 }
253 if value < 0x10000000 {
254 // SAFETY: A `[u8; 17]` can be safely truncated to a `[u8; 5]`.
255 let buf_u32 = unsafe {
256 &mut *(ptr_from_mut::<[u8; 17]>(buf).cast::<[u8; 5]>())
257 };
258 return encode_u32(buf_u32, value as u32);
259 }
260
261 // SAFETY: buf has a const length of `size_of::<u128>() + 1`.
262 unsafe {
263 ptr_from_mut::<[u8; mem::size_of::<u128>() + 1]>(buf)
264 .cast::<u8>()
265 .add(1)
266 .cast::<u128>()
267 .write_unaligned(value.to_le());
268 }
269
270 const LEN_MASK: u8 = 0b1111;
271 let len = ((value.leading_zeros() >> 3) as u8) ^ LEN_MASK;
272 buf[0] = 0xF0 | len;
273 (len + 2) as usize
274}
275
276/// Decodes a `u32` from a buffer, returning the value and encoded length.
277///
278/// # Examples
279///
280/// ```
281/// let mut buf = [0u8; 5];
282/// let encoded_len = vu128::encode_u32(&mut buf, 123);
283/// assert_eq!(vu128::decode_u32(&buf), (123, encoded_len));
284/// ```
285#[inline]
286#[must_use]
287pub fn decode_u32(buf: &[u8; 5]) -> (u32, usize) {
288 let buf0 = buf[0] as u32;
289 if (buf0 & 0x80) == 0 {
290 return (buf0, 1);
291 }
292 if (buf0 & 0b01000000) == 0 {
293 let low = (buf0 as u8) & 0x3F;
294 let value = ((buf[1] as u32) << 6) | (low as u32);
295 return (value, 2);
296 }
297 if buf0 >= 0xF0 {
298 let len = ((buf0 as u8) & 0x0F) + 2;
299 let value = ((buf[4] as u32) << 24)
300 | ((buf[3] as u32) << 16)
301 | ((buf[2] as u32) << 8)
302 | (buf[1] as u32);
303 return (value, len as usize);
304 }
305 if (buf0 & 0b00100000) == 0 {
306 let low = (buf0 as u8) & 0x1F;
307 let value = ((buf[2] as u32) << 13)
308 | ((buf[1] as u32) << 5)
309 | (low as u32);
310 return (value, 3);
311 }
312 let value = ((buf[3] as u32) << 20)
313 | ((buf[2] as u32) << 12)
314 | ((buf[1] as u32) << 4)
315 | (buf0 & 0x0F);
316 (value, 4)
317}
318
319/// Decodes a `u64` from a buffer, returning the value and encoded length.
320///
321/// # Examples
322///
323/// ```
324/// let mut buf = [0u8; 9];
325/// let encoded_len = vu128::encode_u64(&mut buf, 123);
326/// assert_eq!(vu128::decode_u64(&buf), (123, encoded_len));
327/// ```
328#[inline]
329#[must_use]
330pub fn decode_u64(buf: &[u8; 9]) -> (u64, usize) {
331 let buf0 = buf[0] as u64;
332 if (buf0 & 0x80) == 0 {
333 return (buf0, 1);
334 }
335 if buf0 < 0xF0 {
336 if (buf0 & 0b01000000) == 0 {
337 let low = (buf0 as u8) & 0b00111111;
338 let value = ((buf[1] as u32) << 6) | (low as u32);
339 return (value as u64, 2);
340 }
341 if (buf0 & 0b00100000) == 0 {
342 let low = (buf0 as u8) & 0b00011111;
343 let value = ((buf[2] as u32) << 13)
344 | ((buf[1] as u32) << 5)
345 | (low as u32);
346 return (value as u64, 3);
347 }
348 let value = ((buf[3] as u32) << 20)
349 | ((buf[2] as u32) << 12)
350 | ((buf[1] as u32) << 4)
351 | ((buf0 as u32) & 0b00001111);
352 return (value as u64, 4);
353 }
354
355 // SAFETY: buf has a const length of `size_of::<u64>() + 1`.
356 let value = u64::from_le(unsafe {
357 ptr_from_ref::<[u8; mem::size_of::<u64>() + 1]>(buf)
358 .cast::<u8>()
359 .add(1)
360 .cast::<u64>()
361 .read_unaligned()
362 });
363
364 const LEN_MASK: u8 = 0b111;
365 let len = buf[0] & 0x0F;
366 let mask_octets = (len & LEN_MASK) ^ LEN_MASK;
367 let mask = u64::MAX >> (mask_octets * 8);
368 (value & mask, (len + 2) as usize)
369}
370
371/// Decodes a `u128` from a buffer, returning the value and encoded length.
372///
373/// # Examples
374///
375/// ```
376/// let mut buf = [0u8; 17];
377/// let encoded_len = vu128::encode_u128(&mut buf, 123);
378/// assert_eq!(vu128::decode_u128(&buf), (123, encoded_len));
379/// ```
380#[inline]
381#[must_use]
382pub fn decode_u128(buf: &[u8; 17]) -> (u128, usize) {
383 if (buf[0] & 0x80) == 0 {
384 return (buf[0] as u128, 1);
385 }
386 if buf[0] < 0xF0 {
387 // SAFETY: A `[u8; 17]` can be safely truncated to a `[u8; 5]`.
388 let buf_u32 = unsafe {
389 &*(ptr_from_ref::<[u8; 17]>(buf).cast::<[u8; 5]>())
390 };
391 let (value, len) = decode_u32(buf_u32);
392 return (value as u128, len);
393 }
394
395 // SAFETY: buf has a const length of `size_of::<u128>() + 1`.
396 let value = u128::from_le(unsafe {
397 ptr_from_ref::<[u8; mem::size_of::<u128>() + 1]>(buf)
398 .cast::<u8>()
399 .add(1)
400 .cast::<u128>()
401 .read_unaligned()
402 });
403 const LEN_MASK: u8 = 0b1111;
404 let len = buf[0] & 0x0F;
405 let mask_octets = (len & LEN_MASK) ^ LEN_MASK;
406 let mask = u128::MAX >> (mask_octets * 8);
407 (value & mask, (len + 2) as usize)
408}
409
410macro_rules! encode_iNN {
411 ($(#[$docs:meta])* $name:ident ( $it:ident, $ut:ident, $encode_fn:ident ) ) => {
412 $(#[$docs])*
413 #[inline]
414 #[must_use]
415 pub fn $name(buf: &mut [u8; mem::size_of::<$ut>()+1], value: $it) -> usize {
416 const ZIGZAG_SHIFT: u8 = ($ut::BITS as u8) - 1;
417 let zigzag = ((value >> ZIGZAG_SHIFT) as $ut) ^ ((value << 1) as $ut);
418 $encode_fn(buf, zigzag)
419 }
420 };
421}
422
423macro_rules! decode_iNN {
424 ($(#[$docs:meta])* $name:ident ( $it:ident, $ut:ident, $decode_fn:ident ) ) => {
425 $(#[$docs])*
426 #[inline]
427 #[must_use]
428 pub fn $name(buf: &[u8; mem::size_of::<$ut>()+1]) -> ($it, usize) {
429 let (zz, len) = $decode_fn(buf);
430 let value = ((zz >> 1) as $it) ^ (-((zz & 1) as $it));
431 (value, len)
432 }
433 };
434}
435
436encode_iNN! {
437 /// Encodes an `i32` into a buffer, returning the encoded length.
438 ///
439 /// The contents of the buffer beyond the returned length are unspecified.
440 ///
441 /// # Examples
442 ///
443 /// ```
444 /// let mut buf = [0u8; 5];
445 /// let encoded_len = vu128::encode_i32(&mut buf, 123);
446 /// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
447 /// ```
448 encode_i32(i32, u32, encode_u32)
449}
450
451encode_iNN! {
452 /// Encodes an `i64` into a buffer, returning the encoded length.
453 ///
454 /// The contents of the buffer beyond the returned length are unspecified.
455 ///
456 /// # Examples
457 ///
458 /// ```
459 /// let mut buf = [0u8; 9];
460 /// let encoded_len = vu128::encode_i64(&mut buf, 123);
461 /// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
462 /// ```
463 encode_i64(i64, u64, encode_u64)
464}
465
466encode_iNN! {
467 /// Encodes an `i128` into a buffer, returning the encoded length.
468 ///
469 /// The contents of the buffer beyond the returned length are unspecified.
470 ///
471 /// # Examples
472 ///
473 /// ```
474 /// let mut buf = [0u8; 17];
475 /// let encoded_len = vu128::encode_i128(&mut buf, 123);
476 /// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
477 /// ```
478 encode_i128(i128, u128, encode_u128)
479}
480
481decode_iNN! {
482 /// Decodes an `i32` from a buffer, returning the value and encoded length.
483 ///
484 /// # Examples
485 ///
486 /// ```
487 /// let mut buf = [0u8; 5];
488 /// let encoded_len = vu128::encode_i32(&mut buf, 123);
489 /// assert_eq!(vu128::decode_i32(&buf), (123, encoded_len));
490 /// ```
491 decode_i32(i32, u32, decode_u32)
492}
493
494decode_iNN! {
495 /// Decodes an `i64` from a buffer, returning the value and encoded length.
496 ///
497 /// # Examples
498 ///
499 /// ```
500 /// let mut buf = [0u8; 9];
501 /// let encoded_len = vu128::encode_i64(&mut buf, 123);
502 /// assert_eq!(vu128::decode_i64(&buf), (123, encoded_len));
503 /// ```
504 decode_i64(i64, u64, decode_u64)
505}
506
507decode_iNN! {
508 /// Decodes an `i128` from a buffer, returning the value and encoded length.
509 ///
510 /// # Examples
511 ///
512 /// ```
513 /// let mut buf = [0u8; 17];
514 /// let encoded_len = vu128::encode_i128(&mut buf, 123);
515 /// assert_eq!(vu128::decode_i128(&buf), (123, encoded_len));
516 /// ```
517 decode_i128(i128, u128, decode_u128)
518}
519
520/// Encodes an `f32` into a buffer, returning the encoded length.
521///
522/// The contents of the buffer beyond the returned length are unspecified.
523///
524/// # Examples
525///
526/// ```
527/// let mut buf = [0u8; 5];
528/// let encoded_len = vu128::encode_f32(&mut buf, 2.5);
529/// assert_eq!(&buf[..encoded_len], &[0x80, 0x81]);
530/// ```
531#[inline]
532#[must_use]
533pub fn encode_f32(buf: &mut [u8; 5], value: f32) -> usize {
534 encode_u32(buf, value.to_bits().swap_bytes())
535}
536
537/// Encodes an `f64` into a buffer, returning the encoded length.
538///
539/// The contents of the buffer beyond the returned length are unspecified.
540///
541/// # Examples
542///
543/// ```
544/// let mut buf = [0u8; 9];
545/// let encoded_len = vu128::encode_f64(&mut buf, 2.5);
546/// assert_eq!(&buf[..encoded_len], &[0x80, 0x11]);
547/// ```
548#[inline]
549#[must_use]
550pub fn encode_f64(buf: &mut [u8; 9], value: f64) -> usize {
551 encode_u64(buf, value.to_bits().swap_bytes())
552}
553
554/// Decodes an `f32` from a buffer, returning the value and encoded length.
555///
556/// # Examples
557///
558/// ```
559/// let mut buf = [0u8; 5];
560/// let encoded_len = vu128::encode_f32(&mut buf, 2.5);
561/// assert_eq!(vu128::decode_f32(&buf), (2.5, encoded_len));
562/// ```
563#[inline]
564#[must_use]
565pub fn decode_f32(buf: &[u8; 5]) -> (f32, usize) {
566 let (swapped, len) = decode_u32(buf);
567 (f32::from_bits(swapped.swap_bytes()), len)
568}
569
570/// Decodes an `f64` from a buffer, returning the value and encoded length.
571///
572/// # Examples
573///
574/// ```
575/// let mut buf = [0u8; 9];
576/// let encoded_len = vu128::encode_f64(&mut buf, 2.5);
577/// assert_eq!(vu128::decode_f64(&buf), (2.5, encoded_len));
578/// ```
579#[inline]
580#[must_use]
581pub fn decode_f64(buf: &[u8; 9]) -> (f64, usize) {
582 let (swapped, len) = decode_u64(buf);
583 (f64::from_bits(swapped.swap_bytes()), len)
584}
585
586#[inline(always)]
587const fn ptr_from_ref<T: ?Sized>(r: &T) -> *const T {
588 r
589}
590
591#[inline(always)]
592fn ptr_from_mut<T: ?Sized>(r: &mut T) -> *mut T {
593 r
594}