vu128/
vu128.rs

1// Copyright (c) 2024 John Millikin <john@john-millikin.com>
2//
3// Permission to use, copy, modify, and/or distribute this software for any
4// purpose with or without fee is hereby granted.
5//
6// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
7// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
8// AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
9// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
10// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
11// OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
12// PERFORMANCE OF THIS SOFTWARE.
13//
14// SPDX-License-Identifier: 0BSD
15
16//! # vu128: Efficient variable-length integers
17//!
18//! `vu128` is a variable-length integer encoding, with smaller values being
19//! encoded using fewer bytes. Integer sizes up to 128 bits are supported.
20//! The compression ratio of `vu128` equals or exceeds the widely used [VLQ]
21//! and [LEB128] encodings, and is faster on modern pipelined architectures.
22//!
23//! [VLQ]: https://en.wikipedia.org/wiki/Variable-length_quantity
24//! [LEB128]: https://en.wikipedia.org/wiki/LEB128
25//!
26//! # Encoding details
27//!
28//! Values in the range `[0, 2^7)` are encoded as a single byte with
29//! the same bits as the original value.
30//!
31//! Values in the range `[2^7, 2^28)` are encoded as a unary length prefix,
32//! followed by `(length*7)` bits, in little-endian order. This is conceptually
33//! similar to LEB128, but the continuation bits are placed in upper half
34//! of the initial byte. This arrangement is also known as a "prefix varint".
35//!
36//! ```text
37//! MSB ------------------ LSB
38//!
39//!       10101011110011011110  Input value (0xABCDE)
40//!    0101010 1111001 1011110  Zero-padded to a multiple of 7 bits
41//! 01010101 11100110 ___11110  Grouped into octets, with 3 continuation bits
42//! 01010101 11100110 11011110  Continuation bits `110` added
43//!     0x55     0xE6     0xDE  In hexadecimal
44//!
45//!         [0xDE, 0xE6, 0x55]  Encoded output (order is little-endian)
46//! ```
47//!
48//! Values in the range `[2^28, 2^128)` are encoded as a binary length prefix,
49//! followed by payload bytes, in little-endian order. To differentiate this
50//! format from the format of smaller values, the top 4 bits of the first byte
51//! are set. The length prefix value is the number of payload bytes minus one;
52//! equivalently it is the total length of the encoded value minus two.
53//!
54//! ```text
55//! MSB ------------------------------------ LSB
56//!
57//!                10010001101000101011001111000  Input value (0x12345678)
58//!          00010010 00110100 01010110 01111000  Zero-padded to a multiple of 8 bits
59//! 00010010 00110100 01010110 01111000 11110011  Prefix byte is `0xF0 | (4 - 1)`
60//!     0x12     0x34     0x56     0x78     0xF3  In hexadecimal
61//!
62//!               [0xF3, 0x78, 0x56, 0x34, 0x12]  Encoded output (order is little-endian)
63//! ```
64//!
65//! # Handling of over-long encodings
66//!
67//! The `vu128` format permits over-long encodings, which encode a value using
68//! a byte sequence that is unnecessarily long:
69//!
70//! * Zero-padding beyond that required to reach a multiple of 7 or 8 bits.
71//! * Using a length prefix byte for a value in the range `[0, 2^7)`.
72//! * Using a binary length prefix byte for a value in the range `[0, 2^28)`.
73//!
74//! The `encode_*` functions in this module will not generate such over-long
75//! encodings, but the `decode_*` functions will accept them. This is intended
76//! to allow `vu128` values to be placed in a buffer before the value to be
77//! written is known. Applications that require a single canonical encoding for
78//! any given value should perform appropriate checking in their own code.
79//!
80//! # Signed integers and floating-point values
81//!
82//! Signed integers and IEEE-754 floating-point values may be encoded with
83//! `vu128` by mapping them to unsigned integers. It is recommended that the
84//! mapping functions be chosen so as to minimize the number of zeroes in the
85//! higher-order bits, which enables better compression.
86//!
87//! This library includes helper functions that use Protocol Buffer's ["ZigZag"
88//! encoding] for signed integers and reverse-endian layout for floating-point.
89//!
90//! ["ZigZag" encoding]: https://protobuf.dev/programming-guides/encoding/#signed-ints
91
92#![no_std]
93#![warn(clippy::must_use_candidate)]
94#![warn(clippy::undocumented_unsafe_blocks)]
95#![warn(missing_docs)]
96
97use core::mem;
98
99/// Returns the encoded length in a `vu128` prefix byte.
100///
101/// # Examples
102///
103/// ```
104/// let mut buf = [0u8; 5];
105/// let encoded_len = vu128::encode_u32(&mut buf, 12345);
106/// assert_eq!(vu128::encoded_len(buf[0]), encoded_len);
107/// ```
108#[must_use]
109pub const fn encoded_len(b: u8) -> usize {
110	if b < 0b10000000 {
111		return 1;
112	}
113	if b < 0b11000000 {
114		return 2;
115	}
116	if b < 0b11100000 {
117		return 3;
118	}
119	if b < 0b11110000 {
120		return 4;
121	}
122	((b & 0x0F) + 2) as usize
123}
124
125/// Encodes a `u32` into a buffer, returning the encoded length.
126///
127/// The contents of the buffer beyond the returned length are unspecified.
128///
129/// # Examples
130///
131/// ```
132/// let mut buf = [0u8; 5];
133/// let encoded_len = vu128::encode_u32(&mut buf, 12345);
134/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
135/// ```
136#[inline]
137#[must_use]
138pub fn encode_u32(buf: &mut [u8; 5], value: u32) -> usize {
139	let mut x = value;
140	if x < 0x80 {
141		buf[0] = x as u8;
142		return 1;
143	}
144	if x < 0x10000000 {
145		if x < 0x00004000 {
146			x <<= 2;
147			buf[0] = 0x80 | ((x as u8) >> 2);
148			buf[1] = (x >> 8) as u8;
149			return 2;
150		}
151		if x < 0x00200000 {
152			x <<= 3;
153			buf[0] = 0xC0 | ((x as u8) >> 3);
154			buf[1] = (x >> 8) as u8;
155			buf[2] = (x >> 16) as u8;
156			return 3;
157		}
158		x <<= 4;
159		buf[0] = 0xE0 | ((x as u8) >> 4);
160		buf[1] = (x >> 8) as u8;
161		buf[2] = (x >> 16) as u8;
162		buf[3] = (x >> 24) as u8;
163		return 4;
164	}
165
166	// SAFETY: buf has a const length of `size_of::<u32>() + 1`.
167	unsafe {
168		ptr_from_mut::<[u8; mem::size_of::<u32>() + 1]>(buf)
169			.cast::<u8>()
170			.add(1)
171			.cast::<u32>()
172			.write_unaligned(x.to_le());
173	}
174
175	buf[0] = 0xF3;
176	5
177}
178
179/// Encodes a `u64` into a buffer, returning the encoded length.
180///
181/// The contents of the buffer beyond the returned length are unspecified.
182///
183/// # Examples
184///
185/// ```
186/// let mut buf = [0u8; 9];
187/// let encoded_len = vu128::encode_u64(&mut buf, 12345);
188/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
189/// ```
190#[inline]
191#[must_use]
192pub fn encode_u64(buf: &mut [u8; 9], value: u64) -> usize {
193	let mut x = value;
194	if x < 0x80 {
195		buf[0] = x as u8;
196		return 1;
197	}
198	if x < 0x10000000 {
199		if x < 0x00004000 {
200			x <<= 2;
201			buf[0] = 0x80 | ((x as u8) >> 2);
202			buf[1] = (x >> 8) as u8;
203			return 2;
204		}
205		if x < 0x00200000 {
206			x <<= 3;
207			buf[0] = 0xC0 | ((x as u8) >> 3);
208			buf[1] = (x >> 8) as u8;
209			buf[2] = (x >> 16) as u8;
210			return 3;
211		}
212		x <<= 4;
213		buf[0] = 0xE0 | ((x as u8) >> 4);
214		buf[1] = (x >> 8) as u8;
215		buf[2] = (x >> 16) as u8;
216		buf[3] = (x >> 24) as u8;
217		return 4;
218	}
219
220	// SAFETY: buf has a const length of `size_of::<u64>() + 1`.
221	unsafe {
222		ptr_from_mut::<[u8; mem::size_of::<u64>() + 1]>(buf)
223			.cast::<u8>()
224			.add(1)
225			.cast::<u64>()
226			.write_unaligned(x.to_le());
227	}
228
229	const LEN_MASK: u8 = 0b111;
230	let len = ((x.leading_zeros() >> 3) as u8) ^ LEN_MASK;
231	buf[0] = 0xF0 | len;
232	(len + 2) as usize
233}
234
235/// Encodes a `u128` into a buffer, returning the encoded length.
236///
237/// The contents of the buffer beyond the returned length are unspecified.
238///
239/// # Examples
240///
241/// ```
242/// let mut buf = [0u8; 17];
243/// let encoded_len = vu128::encode_u128(&mut buf, 12345);
244/// assert_eq!(&buf[..encoded_len], &[0xB9, 0xC0]);
245/// ```
246#[inline]
247#[must_use]
248pub fn encode_u128(buf: &mut [u8; 17], value: u128) -> usize {
249	if value < 0x80 {
250		buf[0] = value as u8;
251		return 1;
252	}
253	if value < 0x10000000 {
254		// SAFETY: A `[u8; 17]` can be safely truncated to a `[u8; 5]`.
255		let buf_u32 = unsafe {
256			&mut *(ptr_from_mut::<[u8; 17]>(buf).cast::<[u8; 5]>())
257		};
258		return encode_u32(buf_u32, value as u32);
259	}
260
261	// SAFETY: buf has a const length of `size_of::<u128>() + 1`.
262	unsafe {
263		ptr_from_mut::<[u8; mem::size_of::<u128>() + 1]>(buf)
264			.cast::<u8>()
265			.add(1)
266			.cast::<u128>()
267			.write_unaligned(value.to_le());
268	}
269
270	const LEN_MASK: u8 = 0b1111;
271	let len = ((value.leading_zeros() >> 3) as u8) ^ LEN_MASK;
272	buf[0] = 0xF0 | len;
273	(len + 2) as usize
274}
275
276/// Decodes a `u32` from a buffer, returning the value and encoded length.
277///
278/// # Examples
279///
280/// ```
281/// let mut buf = [0u8; 5];
282/// let encoded_len = vu128::encode_u32(&mut buf, 123);
283/// assert_eq!(vu128::decode_u32(&buf), (123, encoded_len));
284/// ```
285#[inline]
286#[must_use]
287pub fn decode_u32(buf: &[u8; 5]) -> (u32, usize) {
288	let buf0 = buf[0] as u32;
289	if (buf0 & 0x80) == 0 {
290		return (buf0, 1);
291	}
292	if (buf0 & 0b01000000) == 0 {
293		let low = (buf0 as u8) & 0x3F;
294		let value = ((buf[1] as u32) << 6) | (low as u32);
295		return (value, 2);
296	}
297	if buf0 >= 0xF0 {
298		let len = ((buf0 as u8) & 0x0F) + 2;
299		let value = ((buf[4] as u32) << 24)
300		          | ((buf[3] as u32) << 16)
301		          | ((buf[2] as u32) << 8)
302		          | (buf[1] as u32);
303		return (value, len as usize);
304	}
305	if (buf0 & 0b00100000) == 0 {
306		let low = (buf0 as u8) & 0x1F;
307		let value = ((buf[2] as u32) << 13)
308		          | ((buf[1] as u32) << 5)
309		          | (low as u32);
310		return (value, 3);
311	}
312	let value = ((buf[3] as u32) << 20)
313	          | ((buf[2] as u32) << 12)
314	          | ((buf[1] as u32) << 4)
315	          | (buf0 & 0x0F);
316	(value, 4)
317}
318
319/// Decodes a `u64` from a buffer, returning the value and encoded length.
320///
321/// # Examples
322///
323/// ```
324/// let mut buf = [0u8; 9];
325/// let encoded_len = vu128::encode_u64(&mut buf, 123);
326/// assert_eq!(vu128::decode_u64(&buf), (123, encoded_len));
327/// ```
328#[inline]
329#[must_use]
330pub fn decode_u64(buf: &[u8; 9]) -> (u64, usize) {
331	let buf0 = buf[0] as u64;
332	if (buf0 & 0x80) == 0 {
333		return (buf0, 1);
334	}
335	if buf0 < 0xF0 {
336		if (buf0 & 0b01000000) == 0 {
337			let low = (buf0 as u8) & 0b00111111;
338			let value = ((buf[1] as u32) << 6) | (low as u32);
339			return (value as u64, 2);
340		}
341		if (buf0 & 0b00100000) == 0 {
342			let low = (buf0 as u8) & 0b00011111;
343			let value = ((buf[2] as u32) << 13)
344			          | ((buf[1] as u32) << 5)
345			          | (low as u32);
346			return (value as u64, 3);
347		}
348		let value = ((buf[3] as u32) << 20)
349		          | ((buf[2] as u32) << 12)
350		          | ((buf[1] as u32) << 4)
351		          | ((buf0 as u32) & 0b00001111);
352		return (value as u64, 4);
353	}
354
355	// SAFETY: buf has a const length of `size_of::<u64>() + 1`.
356	let value = u64::from_le(unsafe {
357		ptr_from_ref::<[u8; mem::size_of::<u64>() + 1]>(buf)
358			.cast::<u8>()
359			.add(1)
360			.cast::<u64>()
361			.read_unaligned()
362	});
363
364	const LEN_MASK: u8 = 0b111;
365	let len = buf[0] & 0x0F;
366	let mask_octets = (len & LEN_MASK) ^ LEN_MASK;
367	let mask = u64::MAX >> (mask_octets * 8);
368	(value & mask, (len + 2) as usize)
369}
370
371/// Decodes a `u128` from a buffer, returning the value and encoded length.
372///
373/// # Examples
374///
375/// ```
376/// let mut buf = [0u8; 17];
377/// let encoded_len = vu128::encode_u128(&mut buf, 123);
378/// assert_eq!(vu128::decode_u128(&buf), (123, encoded_len));
379/// ```
380#[inline]
381#[must_use]
382pub fn decode_u128(buf: &[u8; 17]) -> (u128, usize) {
383	if (buf[0] & 0x80) == 0 {
384		return (buf[0] as u128, 1);
385	}
386	if buf[0] < 0xF0 {
387		// SAFETY: A `[u8; 17]` can be safely truncated to a `[u8; 5]`.
388		let buf_u32 = unsafe {
389			&*(ptr_from_ref::<[u8; 17]>(buf).cast::<[u8; 5]>())
390		};
391		let (value, len) = decode_u32(buf_u32);
392		return (value as u128, len);
393	}
394
395	// SAFETY: buf has a const length of `size_of::<u128>() + 1`.
396	let value = u128::from_le(unsafe {
397		ptr_from_ref::<[u8; mem::size_of::<u128>() + 1]>(buf)
398			.cast::<u8>()
399			.add(1)
400			.cast::<u128>()
401			.read_unaligned()
402	});
403	const LEN_MASK: u8 = 0b1111;
404	let len = buf[0] & 0x0F;
405	let mask_octets = (len & LEN_MASK) ^ LEN_MASK;
406	let mask = u128::MAX >> (mask_octets * 8);
407	(value & mask, (len + 2) as usize)
408}
409
410macro_rules! encode_iNN {
411	($(#[$docs:meta])* $name:ident ( $it:ident, $ut:ident, $encode_fn:ident ) ) => {
412		$(#[$docs])*
413		#[inline]
414		#[must_use]
415		pub fn $name(buf: &mut [u8; mem::size_of::<$ut>()+1], value: $it) -> usize {
416			const ZIGZAG_SHIFT: u8 = ($ut::BITS as u8) - 1;
417			let zigzag = ((value >> ZIGZAG_SHIFT) as $ut) ^ ((value << 1) as $ut);
418			$encode_fn(buf, zigzag)
419		}
420	};
421}
422
423macro_rules! decode_iNN {
424	($(#[$docs:meta])* $name:ident ( $it:ident, $ut:ident, $decode_fn:ident ) ) => {
425		$(#[$docs])*
426		#[inline]
427		#[must_use]
428		pub fn $name(buf: &[u8; mem::size_of::<$ut>()+1]) -> ($it, usize) {
429			let (zz, len) = $decode_fn(buf);
430			let value = ((zz >> 1) as $it) ^ (-((zz & 1) as $it));
431			(value, len)
432		}
433	};
434}
435
436encode_iNN! {
437	/// Encodes an `i32` into a buffer, returning the encoded length.
438	///
439	/// The contents of the buffer beyond the returned length are unspecified.
440	///
441	/// # Examples
442	///
443	/// ```
444	/// let mut buf = [0u8; 5];
445	/// let encoded_len = vu128::encode_i32(&mut buf, 123);
446	/// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
447	/// ```
448	encode_i32(i32, u32, encode_u32)
449}
450
451encode_iNN! {
452	/// Encodes an `i64` into a buffer, returning the encoded length.
453	///
454	/// The contents of the buffer beyond the returned length are unspecified.
455	///
456	/// # Examples
457	///
458	/// ```
459	/// let mut buf = [0u8; 9];
460	/// let encoded_len = vu128::encode_i64(&mut buf, 123);
461	/// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
462	/// ```
463	encode_i64(i64, u64, encode_u64)
464}
465
466encode_iNN! {
467	/// Encodes an `i128` into a buffer, returning the encoded length.
468	///
469	/// The contents of the buffer beyond the returned length are unspecified.
470	///
471	/// # Examples
472	///
473	/// ```
474	/// let mut buf = [0u8; 17];
475	/// let encoded_len = vu128::encode_i128(&mut buf, 123);
476	/// assert_eq!(&buf[..encoded_len], &[0xB6, 0x03]);
477	/// ```
478	encode_i128(i128, u128, encode_u128)
479}
480
481decode_iNN! {
482	/// Decodes an `i32` from a buffer, returning the value and encoded length.
483	///
484	/// # Examples
485	///
486	/// ```
487	/// let mut buf = [0u8; 5];
488	/// let encoded_len = vu128::encode_i32(&mut buf, 123);
489	/// assert_eq!(vu128::decode_i32(&buf), (123, encoded_len));
490	/// ```
491	decode_i32(i32, u32, decode_u32)
492}
493
494decode_iNN! {
495	/// Decodes an `i64` from a buffer, returning the value and encoded length.
496	///
497	/// # Examples
498	///
499	/// ```
500	/// let mut buf = [0u8; 9];
501	/// let encoded_len = vu128::encode_i64(&mut buf, 123);
502	/// assert_eq!(vu128::decode_i64(&buf), (123, encoded_len));
503	/// ```
504	decode_i64(i64, u64, decode_u64)
505}
506
507decode_iNN! {
508	/// Decodes an `i128` from a buffer, returning the value and encoded length.
509	///
510	/// # Examples
511	///
512	/// ```
513	/// let mut buf = [0u8; 17];
514	/// let encoded_len = vu128::encode_i128(&mut buf, 123);
515	/// assert_eq!(vu128::decode_i128(&buf), (123, encoded_len));
516	/// ```
517	decode_i128(i128, u128, decode_u128)
518}
519
520/// Encodes an `f32` into a buffer, returning the encoded length.
521///
522/// The contents of the buffer beyond the returned length are unspecified.
523///
524/// # Examples
525///
526/// ```
527/// let mut buf = [0u8; 5];
528/// let encoded_len = vu128::encode_f32(&mut buf, 2.5);
529/// assert_eq!(&buf[..encoded_len], &[0x80, 0x81]);
530/// ```
531#[inline]
532#[must_use]
533pub fn encode_f32(buf: &mut [u8; 5], value: f32) -> usize {
534	encode_u32(buf, value.to_bits().swap_bytes())
535}
536
537/// Encodes an `f64` into a buffer, returning the encoded length.
538///
539/// The contents of the buffer beyond the returned length are unspecified.
540///
541/// # Examples
542///
543/// ```
544/// let mut buf = [0u8; 9];
545/// let encoded_len = vu128::encode_f64(&mut buf, 2.5);
546/// assert_eq!(&buf[..encoded_len], &[0x80, 0x11]);
547/// ```
548#[inline]
549#[must_use]
550pub fn encode_f64(buf: &mut [u8; 9], value: f64) -> usize {
551	encode_u64(buf, value.to_bits().swap_bytes())
552}
553
554/// Decodes an `f32` from a buffer, returning the value and encoded length.
555///
556/// # Examples
557///
558/// ```
559/// let mut buf = [0u8; 5];
560/// let encoded_len = vu128::encode_f32(&mut buf, 2.5);
561/// assert_eq!(vu128::decode_f32(&buf), (2.5, encoded_len));
562/// ```
563#[inline]
564#[must_use]
565pub fn decode_f32(buf: &[u8; 5]) -> (f32, usize) {
566	let (swapped, len) = decode_u32(buf);
567	(f32::from_bits(swapped.swap_bytes()), len)
568}
569
570/// Decodes an `f64` from a buffer, returning the value and encoded length.
571///
572/// # Examples
573///
574/// ```
575/// let mut buf = [0u8; 9];
576/// let encoded_len = vu128::encode_f64(&mut buf, 2.5);
577/// assert_eq!(vu128::decode_f64(&buf), (2.5, encoded_len));
578/// ```
579#[inline]
580#[must_use]
581pub fn decode_f64(buf: &[u8; 9]) -> (f64, usize) {
582	let (swapped, len) = decode_u64(buf);
583	(f64::from_bits(swapped.swap_bytes()), len)
584}
585
586#[inline(always)]
587const fn ptr_from_ref<T: ?Sized>(r: &T) -> *const T {
588	r
589}
590
591#[inline(always)]
592fn ptr_from_mut<T: ?Sized>(r: &mut T) -> *mut T {
593	r
594}