// fastcdc/v2016/mod.rs

//
// Copyright (c) 2023 Nathan Fiedler
//

//! This module implements the canonical FastCDC algorithm as described in the
//! [paper](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf)
//! by Wen Xia, et al., in 2016.
//!
//! The algorithm incorporates a simplified hash judgement using the fast Gear
//! hash, sub-minimum chunk cut-point skipping, and normalized chunking to
//! produce chunks of a more consistent length.
//!
//! There are two ways to use the `FastCDC` struct defined in this module. One
//! is to invoke `cut()` while managing your own `start` and `remaining`
//! values. The other is to use the struct as an `Iterator` that yields `Chunk`
//! structs which represent the offset and size of the chunks. Note that
//! attempting to use both `cut()` and `Iterator` on the same `FastCDC`
//! instance will yield incorrect results.
//!
//! Note that the `cut()` function returns the 64-bit hash of the chunk, which
//! may be useful in scenarios involving chunk size prediction using historical
//! data, such as in RapidCDC or SuperCDC. This hash value is also given in the
//! `hash` field of the `Chunk` struct. While this value has rather low entropy,
//! it is computationally cost-free and can be put to some use with additional
//! record keeping.
//!
//! The `StreamCDC` implementation is similar to `FastCDC` except that it will
//! read data from a `Read` into an internal buffer of `max_size` and produce
//! `ChunkData` values from the `Iterator`.
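//!
//! As an illustration of the first approach described above, here is a
//! minimal sketch of driving `cut()` manually while managing the `start` and
//! `remaining` values yourself (the fixture path matches the other examples
//! in this module):
//!
//! ```no_run
//! use fastcdc::v2016::FastCDC;
//! let contents = std::fs::read("test/fixtures/SekienAkashita.jpg").unwrap();
//! let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
//! let mut start = 0;
//! let mut remaining = contents.len();
//! while remaining > 0 {
//!     // cut() returns the chunk hash and the byte offset of the cut point
//!     let (hash, cutpoint) = chunker.cut(start, remaining);
//!     let length = cutpoint - start;
//!     println!("hash={} offset={} length={}", hash, start, length);
//!     start = cutpoint;
//!     remaining -= length;
//! }
//! ```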
use std::fmt;
use std::io::Read;

/// Smallest acceptable value for the minimum chunk size.
pub const MINIMUM_MIN: u32 = 64;
/// Largest acceptable value for the minimum chunk size.
pub const MINIMUM_MAX: u32 = 1_048_576;
/// Smallest acceptable value for the average chunk size.
pub const AVERAGE_MIN: u32 = 256;
/// Largest acceptable value for the average chunk size.
pub const AVERAGE_MAX: u32 = 4_194_304;
/// Smallest acceptable value for the maximum chunk size.
pub const MAXIMUM_MIN: u32 = 1024;
/// Largest acceptable value for the maximum chunk size.
pub const MAXIMUM_MAX: u32 = 16_777_216;

//
// Masks for each of the desired number of bits, where 0 through 5 are unused.
// The values for sizes 64 bytes through 128 kilobytes come from the C
// reference implementation (found in the destor repository) while the extra
// values come from the restic-FastCDC repository. The FastCDC paper claims
// that the deduplication ratio is slightly improved when the mask bits are
// spread relatively evenly, hence these seemingly "magic" values. Each
// non-padding MASKS[n] entry contains exactly n one bits (see
// test_masks_bit_counts in the tests below).
//
const MASKS: [u64; 26] = [
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0x0000000001804110, // unused except for NC 3
    0x0000000001803110, // 64B
    0x0000000018035100, // 128B
    0x0000001800035300, // 256B
    0x0000019000353000, // 512B
    0x0000590003530000, // 1KB
    0x0000d90003530000, // 2KB
    0x0000d90103530000, // 4KB
    0x0000d90303530000, // 8KB
    0x0000d90313530000, // 16KB
    0x0000d90f03530000, // 32KB
    0x0000d90303537000, // 64KB
    0x0000d90703537000, // 128KB
    0x0000d90707537000, // 256KB
    0x0000d91707537000, // 512KB
    0x0000d91747537000, // 1MB
    0x0000d91767537000, // 2MB
    0x0000d93767537000, // 4MB
    0x0000d93777537000, // 8MB
    0x0000d93777577000, // 16MB
    0x0000db3777577000, // unused except for NC 3
];

//
// GEAR contains seemingly random numbers which are created by computing the
// MD5 digest of values from 0 to 255, using only the high 8 bytes of the 16
// byte digest. This is the "gear hash" referred to in the FastCDC paper.
//
// The program to produce this table is named table64.rs in examples.
//
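// A rough sketch of that generation step, assuming the RustCrypto `md5`
// crate (already used by the tests below); the exact input encoding used by
// table64.rs is an assumption here, not confirmed from its source:
//
//     use md5::{Digest, Md5};
//     let gear: Vec<u64> = (0u8..=255)
//         .map(|i| {
//             // take the high (first) 8 bytes of the 16-byte digest
//             let digest = Md5::digest([i]);
//             u64::from_be_bytes(digest[..8].try_into().unwrap())
//         })
//         .collect();
//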
#[rustfmt::skip]
const GEAR: [u64; 256] = [
    0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7,
    0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649,
    0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226,
    0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14,
    0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6,
    0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb,
    0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98,
    0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99,
    0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01,
    0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f,
    0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915,
    0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2,
    0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565,
    0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968,
    0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116,
    0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2,
    0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040,
    0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3,
    0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651,
    0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc,
    0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666,
    0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3,
    0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163,
    0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6,
    0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0,
    0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba,
    0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9,
    0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55,
    0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52,
    0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda,
    0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e,
    0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a,
    0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba,
    0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748,
    0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e,
    0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69,
    0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0,
    0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b,
    0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5,
    0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362,
    0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845,
    0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2,
    0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71,
    0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f,
    0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd,
    0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78,
    0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa,
    0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2,
    0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482,
    0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627,
    0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b,
    0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe,
    0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87,
    0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81,
    0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e,
    0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea,
    0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2,
    0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06,
    0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c,
    0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042,
    0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6,
    0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb,
    0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780,
    0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1
];

// Find the next chunk cut point in the source.
fn cut(
    source: &[u8],
    min_size: usize,
    avg_size: usize,
    max_size: usize,
    mask_s: u64,
    mask_l: u64,
) -> (u64, usize) {
    let mut remaining = source.len();
    if remaining <= min_size {
        return (0, remaining);
    }
    let mut center = avg_size;
    if remaining > max_size {
        remaining = max_size;
    } else if remaining < center {
        center = remaining;
    }
    let mut index = min_size;
    // Paraphrasing from the paper: Use the mask with more 1 bits for the
    // hash judgment when the current chunking position is smaller than the
    // desired size, which makes it harder to generate smaller chunks.
    let mut hash: u64 = 0;
    while index < center {
        hash = (hash << 1).wrapping_add(GEAR[source[index] as usize]);
        if (hash & mask_s) == 0 {
            return (hash, index);
        }
        index += 1;
    }
    // Again, paraphrasing: use the mask with fewer 1 bits for the hash
    // judgment when the current chunking position is larger than the
    // desired size, which makes it easier to generate larger chunks.
    let last_pos = remaining;
    while index < last_pos {
        hash = (hash << 1).wrapping_add(GEAR[source[index] as usize]);
        if (hash & mask_l) == 0 {
            return (hash, index);
        }
        index += 1;
    }
    // If all else fails, return the largest chunk. This will happen with
    // pathological data, such as all zeroes.
    (hash, index)
}

///
/// The level for the normalized chunking used by FastCDC and StreamCDC.
///
/// Normalized chunking "generates chunks whose sizes are normalized to a
/// specified region centered at the expected chunk size," as described in
/// section 4.4 of the FastCDC 2016 paper.
///
/// Note that lower levels of normalization will result in a larger range of
/// generated chunk sizes. It may be beneficial to widen the minimum/maximum
/// chunk size values given to the `FastCDC` constructor in that case.
///
/// Note that higher levels of normalization may result in the final chunk of
/// data being smaller than the minimum chunk size, which results in a hash
/// value of zero (`0`) since no calculations are performed for sub-minimum
/// chunks.
///
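/// For example, a minimal sketch of selecting a non-default level via
/// `FastCDC::with_level` (the fixture path matches the other examples in
/// this module):
///
/// ```no_run
/// # use std::fs;
/// # use fastcdc::v2016::{FastCDC, Normalization};
/// let contents = fs::read("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = FastCDC::with_level(&contents, 4096, 16384, 65535, Normalization::Level3);
/// for entry in chunker {
///     println!("offset={} length={}", entry.offset, entry.length);
/// }
/// ```
///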
pub enum Normalization {
    /// No chunk size normalization, produces a wide range of chunk sizes.
    Level0,
    /// Level 1 normalization, in which fewer chunks are outside of the desired range.
    Level1,
    /// Level 2 normalization, where most chunks are of the desired size.
    Level2,
    /// Level 3 normalization, nearly all chunks are the desired size.
    Level3,
}

impl Normalization {
    fn bits(&self) -> u32 {
        match self {
            Normalization::Level0 => 0,
            Normalization::Level1 => 1,
            Normalization::Level2 => 2,
            Normalization::Level3 => 3,
        }
    }
}

///
/// Represents a chunk returned from the FastCDC iterator.
///
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct Chunk {
    /// The gear hash value as of the end of the chunk.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: usize,
    /// Length of the chunk in bytes.
    pub length: usize,
}

///
/// The FastCDC chunker implementation from 2016.
///
/// Use `new` to construct an instance, and then iterate over the `Chunk`s via
/// the `Iterator` trait.
///
/// This example reads a file into memory and splits it into chunks that are
/// roughly 16 KB in size. The minimum and maximum sizes are the absolute
/// limits on the returned chunk sizes. With this algorithm, it is helpful to
/// be more lenient on the maximum chunk size, as the results are highly
/// dependent on the input data. Changing the minimum chunk size will also
/// affect the results, since the algorithm uses the minimum as a starting
/// point and may therefore find different cut points (cut-point skipping).
///
/// ```no_run
/// # use std::fs;
/// # use fastcdc::v2016::FastCDC;
/// let contents = fs::read("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = FastCDC::new(&contents, 8192, 16384, 65535);
/// for entry in chunker {
///     println!("offset={} length={}", entry.offset, entry.length);
/// }
/// ```
///
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FastCDC<'a> {
    source: &'a [u8],
    processed: usize,
    remaining: usize,
    min_size: usize,
    avg_size: usize,
    max_size: usize,
    mask_s: u64,
    mask_l: u64,
}

impl<'a> FastCDC<'a> {
    ///
    /// Construct a `FastCDC` that will process the given slice of bytes.
    ///
    /// Uses chunk size normalization level 1 by default.
    ///
    pub fn new(source: &'a [u8], min_size: u32, avg_size: u32, max_size: u32) -> Self {
        FastCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
    }

    ///
    /// Create a new `FastCDC` with the given normalization level.
    ///
    pub fn with_level(
        source: &'a [u8],
        min_size: u32,
        avg_size: u32,
        max_size: u32,
        level: Normalization,
    ) -> Self {
        assert!(min_size >= MINIMUM_MIN);
        assert!(min_size <= MINIMUM_MAX);
        assert!(avg_size >= AVERAGE_MIN);
        assert!(avg_size <= AVERAGE_MAX);
        assert!(max_size >= MAXIMUM_MIN);
        assert!(max_size <= MAXIMUM_MAX);
        let bits = logarithm2(avg_size);
        let normalization = level.bits();
        let mask_s = MASKS[(bits + normalization) as usize];
        let mask_l = MASKS[(bits - normalization) as usize];
        Self {
            source,
            processed: 0,
            remaining: source.len(),
            min_size: min_size as usize,
            avg_size: avg_size as usize,
            max_size: max_size as usize,
            mask_s,
            mask_l,
        }
    }

    ///
    /// Find the next cut point in the data, where `start` is the position from
    /// which to start processing the source data, and `remaining` is the
    /// number of bytes left to be processed.
    ///
    /// The returned 2-tuple consists of the 64-bit hash (fingerprint) and the
    /// byte offset of the end of the chunk.
    ///
    /// There is a special case in which the remaining bytes are less than the
    /// minimum chunk size, at which point this function returns a hash of 0 and
    /// the cut point is the end of the source data.
    ///
    pub fn cut(&self, start: usize, remaining: usize) -> (u64, usize) {
        let end = start + remaining;
        let (hash, count) = cut(
            &self.source[start..end],
            self.min_size,
            self.avg_size,
            self.max_size,
            self.mask_s,
            self.mask_l,
        );
        (hash, start + count)
    }
}

impl<'a> Iterator for FastCDC<'a> {
    type Item = Chunk;

    fn next(&mut self) -> Option<Chunk> {
        if self.remaining == 0 {
            None
        } else {
            let (hash, cutpoint) = self.cut(self.processed, self.remaining);
            if cutpoint == 0 {
                None
            } else {
                let offset = self.processed;
                let length = cutpoint - offset;
                self.processed += length;
                self.remaining -= length;
                Some(Chunk {
                    hash,
                    offset,
                    length,
                })
            }
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // NOTE: This intentionally returns the upper bound for both `size_hint`
        // values, as the upper bound doesn't actually seem to get used by `std`
        // and using the actual lower bound is practically guaranteed to require
        // a second capacity growth.
        let upper_bound = self.remaining / self.min_size;
        (upper_bound, Some(upper_bound))
    }
}

///
/// The error type returned from the `StreamCDC` iterator.
///
#[derive(Debug)]
pub enum Error {
    /// End of source data reached.
    Empty,
    /// An I/O error occurred.
    IoError(std::io::Error),
    /// Something unexpected happened.
    Other(String),
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "chunker error: {self:?}")
    }
}

impl std::error::Error for Error {}

impl From<std::io::Error> for Error {
    fn from(error: std::io::Error) -> Self {
        Error::IoError(error)
    }
}

impl From<Error> for std::io::Error {
    fn from(error: Error) -> Self {
        match error {
            Error::IoError(ioerr) => ioerr,
            Error::Empty => Self::from(std::io::ErrorKind::UnexpectedEof),
            Error::Other(str) => Self::new(std::io::ErrorKind::Other, str),
        }
    }
}

///
/// Represents a chunk returned from the StreamCDC iterator.
///
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct ChunkData {
    /// The gear hash value as of the end of the chunk.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: u64,
    /// Length of the chunk in bytes.
    pub length: usize,
    /// Source bytes contained in this chunk.
    pub data: Vec<u8>,
}

///
/// The FastCDC chunker implementation from 2016 with streaming support.
///
/// Use `new` to construct an instance, and then iterate over the `ChunkData`s
/// via the `Iterator` trait.
///
/// Note that this struct allocates a `Vec<u8>` of `max_size` bytes to act as a
/// buffer when reading from the source and finding chunk boundaries.
///
/// ```no_run
/// # use std::fs::File;
/// # use fastcdc::v2016::StreamCDC;
/// let source = File::open("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = StreamCDC::new(source, 4096, 16384, 65535);
/// for result in chunker {
///     let chunk = result.unwrap();
///     println!("offset={} length={}", chunk.offset, chunk.length);
/// }
/// ```
///
pub struct StreamCDC<R: Read> {
    /// Buffer of data from source for finding cut points.
    buffer: Vec<u8>,
    /// Maximum capacity of the buffer (always `max_size`).
    capacity: usize,
    /// Number of relevant bytes in the `buffer`.
    length: usize,
    /// Source from which data is read into `buffer`.
    source: R,
    /// Number of bytes read from the source so far.
    processed: u64,
    /// True when the source produces no more data.
    eof: bool,
    min_size: usize,
    avg_size: usize,
    max_size: usize,
    mask_s: u64,
    mask_l: u64,
}

impl<R: Read> StreamCDC<R> {
    ///
    /// Construct a `StreamCDC` that will process bytes from the given source.
    ///
    /// Uses chunk size normalization level 1 by default.
    ///
    pub fn new(source: R, min_size: u32, avg_size: u32, max_size: u32) -> Self {
        StreamCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
    }

    ///
    /// Create a new `StreamCDC` with the given normalization level.
    ///
    pub fn with_level(
        source: R,
        min_size: u32,
        avg_size: u32,
        max_size: u32,
        level: Normalization,
    ) -> Self {
        assert!(min_size >= MINIMUM_MIN);
        assert!(min_size <= MINIMUM_MAX);
        assert!(avg_size >= AVERAGE_MIN);
        assert!(avg_size <= AVERAGE_MAX);
        assert!(max_size >= MAXIMUM_MIN);
        assert!(max_size <= MAXIMUM_MAX);
        let bits = logarithm2(avg_size);
        let normalization = level.bits();
        let mask_s = MASKS[(bits + normalization) as usize];
        let mask_l = MASKS[(bits - normalization) as usize];
        Self {
            buffer: vec![0_u8; max_size as usize],
            capacity: max_size as usize,
            length: 0,
            source,
            eof: false,
            processed: 0,
            min_size: min_size as usize,
            avg_size: avg_size as usize,
            max_size: max_size as usize,
            mask_s,
            mask_l,
        }
    }

    /// Fill the buffer with data from the source, returning the number of bytes
    /// read (zero if end of source has been reached).
    fn fill_buffer(&mut self) -> Result<usize, Error> {
        // this code originally copied from asuran crate
        if self.eof {
            Ok(0)
        } else {
            let mut all_bytes_read = 0;
            while !self.eof && self.length < self.capacity {
                let bytes_read = self.source.read(&mut self.buffer[self.length..])?;
                if bytes_read == 0 {
                    self.eof = true;
                } else {
                    self.length += bytes_read;
                    all_bytes_read += bytes_read;
                }
            }
            Ok(all_bytes_read)
        }
    }

    /// Drains a specified number of bytes from the buffer, then resizes the
    /// buffer back to `capacity` size in preparation for further reads.
    fn drain_bytes(&mut self, count: usize) -> Result<Vec<u8>, Error> {
        // this code originally copied from asuran crate
        if count > self.length {
            Err(Error::Other(format!(
                "drain_bytes() called with count larger than length: {} > {}",
                count, self.length
            )))
        } else {
            let data = self.buffer.drain(..count).collect::<Vec<u8>>();
            self.length -= count;
            self.buffer.resize(self.capacity, 0_u8);
            Ok(data)
        }
    }

    /// Find the next chunk in the source. If the end of the source has been
    /// reached, returns `Error::Empty` as the error.
    fn read_chunk(&mut self) -> Result<ChunkData, Error> {
        self.fill_buffer()?;
        if self.length == 0 {
            Err(Error::Empty)
        } else {
            let (hash, count) = cut(
                &self.buffer[..self.length],
                self.min_size,
                self.avg_size,
                self.max_size,
                self.mask_s,
                self.mask_l,
            );
            if count == 0 {
                Err(Error::Empty)
            } else {
                let offset = self.processed;
                self.processed += count as u64;
                let data = self.drain_bytes(count)?;
                Ok(ChunkData {
                    hash,
                    offset,
                    length: count,
                    data,
                })
            }
        }
    }
}

impl<R: Read> Iterator for StreamCDC<R> {
    type Item = Result<ChunkData, Error>;

    fn next(&mut self) -> Option<Result<ChunkData, Error>> {
        let slice = self.read_chunk();
        if let Err(Error::Empty) = slice {
            None
        } else {
            Some(slice)
        }
    }
}

///
/// Base-2 logarithm function for unsigned 32-bit integers, rounded to the
/// nearest integer rather than truncated.
///
fn logarithm2(value: u32) -> u32 {
    f64::from(value).log2().round() as u32
}

#[cfg(test)]
mod tests {
    use super::*;
    use md5::{Digest, Md5};
    use std::fs::{self, File};

    #[test]
    fn test_logarithm2() {
        assert_eq!(logarithm2(0), 0);
        assert_eq!(logarithm2(1), 0);
        assert_eq!(logarithm2(2), 1);
        assert_eq!(logarithm2(3), 2);
        assert_eq!(logarithm2(5), 2);
        assert_eq!(logarithm2(6), 3);
        assert_eq!(logarithm2(11), 3);
        assert_eq!(logarithm2(12), 4);
        assert_eq!(logarithm2(19), 4);
        assert_eq!(logarithm2(64), 6);
        assert_eq!(logarithm2(128), 7);
        assert_eq!(logarithm2(256), 8);
        assert_eq!(logarithm2(512), 9);
        assert_eq!(logarithm2(1024), 10);
        assert_eq!(logarithm2(16383), 14);
        assert_eq!(logarithm2(16384), 14);
        assert_eq!(logarithm2(16385), 14);
        assert_eq!(logarithm2(32767), 15);
        assert_eq!(logarithm2(32768), 15);
        assert_eq!(logarithm2(32769), 15);
        assert_eq!(logarithm2(65535), 16);
        assert_eq!(logarithm2(65536), 16);
        assert_eq!(logarithm2(65537), 16);
        assert_eq!(logarithm2(1_048_575), 20);
        assert_eq!(logarithm2(1_048_576), 20);
        assert_eq!(logarithm2(1_048_577), 20);
        assert_eq!(logarithm2(4_194_303), 22);
        assert_eq!(logarithm2(4_194_304), 22);
        assert_eq!(logarithm2(4_194_305), 22);
        assert_eq!(logarithm2(16_777_215), 24);
        assert_eq!(logarithm2(16_777_216), 24);
        assert_eq!(logarithm2(16_777_217), 24);
    }

    #[test]
    #[should_panic]
    fn test_minimum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 63, 256, 1024);
    }

    #[test]
    #[should_panic]
    fn test_minimum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 67_108_867, 256, 1024);
    }

    #[test]
    #[should_panic]
    fn test_average_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 255, 1024);
    }

    #[test]
    #[should_panic]
    fn test_average_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 268_435_457, 1024);
    }

    #[test]
    #[should_panic]
    fn test_maximum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1023);
    }

    #[test]
    #[should_panic]
    fn test_maximum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1_073_741_825);
    }

    #[test]
    fn test_masks() {
        let source = [0u8; 1024];
        let chunker = FastCDC::new(&source, 64, 256, 1024);
        assert_eq!(chunker.mask_l, MASKS[7]);
        assert_eq!(chunker.mask_s, MASKS[9]);
        let chunker = FastCDC::new(&source, 8192, 16384, 32768);
        assert_eq!(chunker.mask_l, MASKS[13]);
        assert_eq!(chunker.mask_s, MASKS[15]);
        let chunker = FastCDC::new(&source, 1_048_576, 4_194_304, 16_777_216);
        assert_eq!(chunker.mask_l, MASKS[21]);
        assert_eq!(chunker.mask_s, MASKS[23]);
    }
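
    // Sanity check of the property noted in the comment on MASKS: every
    // non-padding MASKS[n] entry contains exactly n one bits, which is what
    // makes it a hash judgment mask for an average size of 2^n bytes.
    #[test]
    fn test_masks_bit_counts() {
        for (n, mask) in MASKS.iter().enumerate().skip(5) {
            assert_eq!(mask.count_ones() as usize, n);
        }
    }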

    #[test]
    fn test_cut_all_zeros() {
        // for all zeros, always returns chunks of maximum size
        let array = [0u8; 10240];
        let chunker = FastCDC::new(&array, 64, 256, 1024);
        let mut cursor: usize = 0;
        for _ in 0..10 {
            let (hash, pos) = chunker.cut(cursor, 10240 - cursor);
            assert_eq!(hash, 14169102344523991076);
            assert_eq!(pos, cursor + 1024);
            cursor = pos;
        }
        // assert that nothing more should be returned
        let (_, pos) = chunker.cut(cursor, 10240 - cursor);
        assert_eq!(pos, 10240);
    }

    #[test]
    fn test_cut_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (17968276318003433923, 21325),
            (4098594969649699419, 17140),
            (15733367461443853673, 28084),
            (4509236223063678303, 18217),
            (2504464741100432583, 24700),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    #[test]
    fn test_cut_sekien_32k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 8192, 32768, 131072);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> =
            vec![(15733367461443853673, 66549), (2504464741100432583, 42917)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    #[test]
    fn test_cut_sekien_64k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 16384, 65536, 262144);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![(2504464741100432583, 109466)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    struct ExpectedChunk {
        hash: u64,
        offset: u64,
        length: usize,
        digest: String,
    }

    #[test]
    fn test_iter_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        // The digest values are not needed here, but they serve to validate
        // that the streaming version tested below is returning the correct
        // chunk data on each iteration.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 4098594969649699419,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 15733367461443853673,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut index = 0;
        for chunk in chunker {
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset as usize);
            assert_eq!(chunk.length, expected_chunks[index].length);
            let mut hasher = Md5::new();
            hasher.update(&contents[chunk.offset..chunk.offset + chunk.length]);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 5);
    }

    #[test]
    fn test_cut_sekien_16k_nc_0() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 4096, 16384, 65535, Normalization::Level0);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (221561130519947581, 6634),
            (15733367461443853673, 59915),
            (10460176299449652894, 25597),
            (6197802202431009942, 5237),
            (2504464741100432583, 12083),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    #[test]
    fn test_cut_sekien_16k_nc_3() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 4096, 16384, 65535, Normalization::Level3);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (14582375164208481996, 17350),
            (13104072099671895560, 19911),
            (6161241554519610597, 17426),
            (16009206469796846404, 17519),
            (10460176299449652894, 19940),
            (2504464741100432583, 17320),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    #[test]
    fn test_error_fmt() {
        let err = Error::Empty;
        assert_eq!(format!("{err}"), "chunker error: Empty");
    }
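
    // Checks the From<Error> for std::io::Error conversion above: Empty
    // should map to UnexpectedEof and Other should map to ErrorKind::Other
    // (the "oops" message here is arbitrary).
    #[test]
    fn test_error_into_io_error() {
        let ioerr: std::io::Error = Error::Empty.into();
        assert_eq!(ioerr.kind(), std::io::ErrorKind::UnexpectedEof);
        let ioerr: std::io::Error = Error::Other("oops".into()).into();
        assert_eq!(ioerr.kind(), std::io::ErrorKind::Other);
    }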

    #[test]
    fn test_stream_sekien_16k_chunks() {
        let file_result = File::open("test/fixtures/SekienAkashita.jpg");
        assert!(file_result.is_ok());
        let file = file_result.unwrap();
        // The set of expected results should match the non-streaming version.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 4098594969649699419,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 15733367461443853673,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = StreamCDC::new(file, 4096, 16384, 65535);
        let mut index = 0;
        for result in chunker {
            assert!(result.is_ok());
            let chunk = result.unwrap();
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset);
            assert_eq!(chunk.length, expected_chunks[index].length);
            let mut hasher = Md5::new();
            hasher.update(&chunk.data);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 5);
    }
}