// fastcdc/v2020/mod.rs

//
// Copyright (c) 2023 Nathan Fiedler
//

//! This module implements the canonical FastCDC algorithm as described in the
//! [paper](https://ieeexplore.ieee.org/document/9055082) by Wen Xia, et al., in
//! 2020.
//!
//! The algorithm incorporates a simplified hash judgement using the fast Gear
//! hash, sub-minimum chunk cut-point skipping, normalized chunking to produce
//! chunks of a more consistent length, and "rolling two bytes each time".
//! According to the authors, this should be 30-40% faster than the 2016 version
//! while producing the same cut points. Benchmarks on several large files on an
//! Apple M1 show about a 20% improvement, but results may vary depending on CPU
//! architecture, file size, chunk size, etc.
//!
//! There are two ways in which to use the `FastCDC` struct defined in this
//! module. One is to simply invoke `cut()` while managing your own `start` and
//! `remaining` values. The other is to use the struct as an `Iterator` that
//! yields `Chunk` structs which represent the offset and size of the chunks.
//! Note that attempting to use both `cut()` and `Iterator` on the same
//! `FastCDC` instance will yield incorrect results.
//!
//! Note that the `cut()` function returns the 64-bit hash of the chunk, which
//! may be useful in scenarios involving chunk size prediction using historical
//! data, such as in RapidCDC or SuperCDC. This hash value is also given in the
//! `hash` field of the `Chunk` struct. While this value has rather low entropy,
//! it is computationally cost-free and can be put to some use with additional
//! record keeping.
//!
//! The `StreamCDC` implementation is similar to `FastCDC` except that it will
//! read data from a `Read` into an internal buffer of `max_size` and produce
//! `ChunkData` values from the `Iterator`.
use std::fmt;
use std::io::Read;

#[cfg(any(feature = "tokio", feature = "futures"))]
mod async_stream_cdc;
#[cfg(any(feature = "tokio", feature = "futures"))]
pub use async_stream_cdc::*;

//
// Bounds on the chunker parameters; the `FastCDC` and `StreamCDC` constructors
// assert that the sizes given to them fall within these ranges.
//

/// Smallest acceptable value for the minimum chunk size.
pub const MINIMUM_MIN: u32 = 64;
/// Largest acceptable value for the minimum chunk size.
pub const MINIMUM_MAX: u32 = 1_048_576;
/// Smallest acceptable value for the average chunk size.
pub const AVERAGE_MIN: u32 = 256;
/// Largest acceptable value for the average chunk size.
pub const AVERAGE_MAX: u32 = 4_194_304;
/// Smallest acceptable value for the maximum chunk size.
pub const MAXIMUM_MIN: u32 = 1024;
/// Largest acceptable value for the maximum chunk size.
pub const MAXIMUM_MAX: u32 = 16_777_216;

//
// Masks for each of the desired number of effective bits. The table is indexed
// by `logarithm2(avg_size) ± normalization`, which (given AVERAGE_MIN of 256
// and at most 3 levels of normalization) is never smaller than 5, hence
// entries 0 through 4 are padding. The values for sizes 64 bytes through 128
// kilo-bytes come from the C reference implementation (found in the destor
// repository) while the extra values come from the restic-FastCDC repository.
// The FastCDC paper claims that the deduplication ratio is slightly improved
// when the mask bits are spread relatively evenly, hence these seemingly
// "magic" values.
//
pub(self) const MASKS: [u64; 26] = [
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0,                  // padding
    0x0000000001804110, // unused except for NC 3
    0x0000000001803110, // 64B
    0x0000000018035100, // 128B
    0x0000001800035300, // 256B
    0x0000019000353000, // 512B
    0x0000590003530000, // 1KB
    0x0000d90003530000, // 2KB
    0x0000d90103530000, // 4KB
    0x0000d90303530000, // 8KB
    0x0000d90313530000, // 16KB
    0x0000d90f03530000, // 32KB
    0x0000d90303537000, // 64KB
    0x0000d90703537000, // 128KB
    0x0000d90707537000, // 256KB
    0x0000d91707537000, // 512KB
    0x0000d91747537000, // 1MB
    0x0000d91767537000, // 2MB
    0x0000d93767537000, // 4MB
    0x0000d93777537000, // 8MB
    0x0000d93777577000, // 16MB
    0x0000db3777577000, // unused except for NC 3
];

//
// GEAR contains seemingly random numbers which are created by computing the
// MD5 digest of values from 0 to 255, using only the high 8 bytes of the 16
// byte digest. This is the "gear hash" referred to in the FastCDC paper.
//
// The program to produce this table is named table64.rs in examples.
//
#[rustfmt::skip]
const GEAR: [u64; 256] = [
    0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7,
    0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649,
    0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226,
    0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14,
    0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6,
    0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb,
    0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98,
    0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99,
    0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01,
    0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f,
    0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915,
    0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2,
    0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565,
    0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968,
    0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116,
    0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2,
    0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040,
    0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3,
    0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651,
    0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc,
    0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666,
    0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3,
    0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163,
    0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6,
    0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0,
    0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba,
    0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9,
    0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55,
    0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52,
    0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda,
    0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e,
    0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a,
    0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba,
    0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748,
    0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e,
    0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69,
    0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0,
    0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b,
    0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5,
    0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362,
    0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845,
    0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2,
    0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71,
    0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f,
    0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd,
    0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78,
    0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa,
    0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2,
    0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482,
    0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627,
    0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b,
    0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe,
    0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87,
    0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81,
    0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e,
    0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea,
    0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2,
    0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06,
    0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c,
    0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042,
    0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6,
    0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb,
    0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780,
    0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1
];

//
// GEAR table in which all values have been shifted left 1 bit, as per the
// FastCDC 2020 paper, section 3.7. Precomputed so the "rolling two bytes each
// time" inner loop in `cut()` avoids a shift per even-offset byte.
//
// The program to produce this table is named table64ls.rs in examples.
//
#[rustfmt::skip]
const GEAR_LS: [u64; 256] = [
    0x76ba78fa40fc6fb8, 0xf09ad1752224610c, 0x9aa5101f105ce530, 0xd59f1c9c33fb994e,
    0x863e70bbf7a2c656, 0x3abe4e003c4b57cc, 0x062617bc7935b322, 0x89644acedd36ec92,
    0x54653653c11d6932, 0x6cff97a43caefab0, 0x004f7555b4559ed6, 0xc7de5ab58e78444c,
    0x1121e49adda6256e, 0x5013c06d0a3af8fc, 0xe14dfcbc0027b036, 0x3a04c6088a59d828,
    0x070c8c64c91c491e, 0x9b559e7b9b257368, 0xebc025cc7830f0ac, 0x10c5f3a70438016c,
    0x505ee670ea1edf14, 0x3cb07b8d839616de, 0xf4628b6d2e874fe2, 0x57641fdc80900fd6,
    0x629679fc0f7074ba, 0x73b84f1315b7341e, 0x6e07ebd23754c57c, 0x9e1770cd02befb30,
    0x7b30cf883d53b9a6, 0x37c3f4ca8857e458, 0x28601b498aac63b2, 0xcd31d3978ca8b932,
    0x8ec620fd8c9d254a, 0x8cb043f8cad2d448, 0xec32d80c9045e16e, 0x90b7d083e6a4bc02,
    0xeca579390b2e9fea, 0x95b06a5f59d3c7d2, 0x54dda3b9d66bd31c, 0x8de90775b822d01e,
    0x79fb182fd39e25e2, 0x137078bc5d4ac8e2, 0x5cccb9fa49c72552, 0xd86789ca0997122a,
    0x7f7362bf93fce8a2, 0xaffa3fa328be432a, 0x635bc10a6887dfb0, 0x4abdf930c7c3c5a4,
    0x21d56c011aac859e, 0x8de090c64af59008, 0x4a5b8854f1344fa6, 0xb555bf39cef5eaca,
    0xd68f39aa7b0ffd76, 0xc613c5a7f27b6e5e, 0x8ae71ff7543ff3ac, 0xd7aafe7e4b5ef2d0,
    0xcec0a90db21a1494, 0xc29a172cc77f7b5c, 0x6f77b1b02dd60828, 0xbdf149e2d66b422c,
    0xcf265b0b555ffdac, 0x102c3975d219fa90, 0x0aaa0f7d6529e116, 0x22469d4dffa73364,
    0x5ae19e96486be604, 0xa51352eacb785a4e, 0x1cab086fff9533bc, 0x2da4e096e22b8080,
    0x1113779bf8cc1c82, 0xbc1a9ccfb924251a, 0xe553f122e0c7db2e, 0x8716d3a813c02dc6,
    0xbe9fdb14bb14872e, 0x01e83b9e83a807ee, 0x9029d6071ca4c07e, 0x3b5f7598b1c5aca2,
    0xfc5e8b1c97c2e15e, 0x88afd8829bed5280, 0x0dcc5e28a2246628, 0x7a2029a2e7752598,
    0xbde631c4bdaaeec0, 0x3fd41bd3bf950a4a, 0x8b3bc3ced840c496, 0x5dd8312c2fc5accc,
    0x24d4580d56b50796, 0x62642a646c1ec264, 0xca842a07b7680246, 0x5acf850fd4113566,
    0xd9277feb4ad7ad6c, 0x9ff6406d956db31a, 0x9cf6f0b637cf5a9e, 0xdb685dec313fa2c6,
    0xb920a510e07311ec, 0x6cbf383a58d23108, 0x8c80b06d01b337fc, 0x79a8c4980eb27d8c,
    0xfe3d51b0baf8b00a, 0x029085a9016ae292, 0x16c93796b5050d10, 0x79aae11daf3631e0,
    0xd30f90c5f7a5e5e0, 0x304e62ce3e19b058, 0x75e27d162db180c6, 0x1d4621397b2a3774,
    0xa28208b7f670b95a, 0x559783415e3efa60, 0xcc889b13be077fbe, 0x43198ee370311ff2,
    0x3068853b60387376, 0x4295ba0ffc10d43e, 0x1e0f83363ed67ff2, 0xad452f637e9ffcaa,
    0x29aab1c9278a9f8e, 0x817f8498ec8aa596, 0x2634e0df150a4196, 0x64453a64526b7aa4,
    0x4ac1a1ebb89fdf5e, 0x3b798906ab2d376c, 0x1fb038730b816ad4, 0xc0702fc2ac1e57b4,
    0x83769f03b12565aa, 0x61890c9e9c51a5ae, 0x7d9893f3b3ad84c6, 0xa260fd336a574cbc,
    0x15e56d11b5094ea4, 0xebae4a477236416c, 0xdb2bfe3fe8c6900c, 0xac5e42aaa8b06734,
    0x819c8ff11266c68c, 0x90f047ca113681b0, 0xc8e4f8fd78db2b24, 0xb51ee4efd292e174,
    0xf945e80d639176a0, 0xb63f152be2f220e0, 0xa6095f3f92050c0a, 0xa88156ff9261ce90,
    0x625a4bf794556b42, 0x21e949646949aaea, 0x20603aaa08fce8e6, 0x76c6e510d8c2b23c,
    0x5268056ef8889c0c, 0x8a3e6949a7d2fbae, 0x62b1b029b0378af6, 0x06928484b737b4d2,
    0xc4065ff3ca65b376, 0xc55c0cd71642f3ca, 0x2a8bc2185f1ae3b0, 0xffee907d65a47f80,
    0x0128bf9d7b931b0c, 0x0ec9777d3364d944, 0x363d8c4509817f86, 0xb1c1f989e1546c56,
    0xbe957df50f1a8b1a, 0xfae9585f2c0f8a32, 0x49c7f66fbf197f52, 0x7ed2fc87958ae9ca,
    0x0de2947ed1e99aa6, 0x49447a0ede39ca44, 0xce4b9b00910d1990, 0x7e78e53d648c86c4,
    0xb1ed9aaf67983db0, 0xc653ca484aa82aee, 0xc554d115ab5c3580, 0x14484acc4d37f08a,
    0x2d16348ea7594e96, 0xef135fdffe5cfc78, 0xd866c41276df99b6, 0x99d1ea17a5181364,
    0x00d45b72b5d15526, 0x2eb61ac4787a3518, 0x30c0ba726a6718b6, 0xb76dec26d95a78e2,
    0x1ea7440e03f1b14c, 0x5718b5a5cfd278ce, 0x816b58a24f595452, 0x18f7ec7840eb12be,
    0xf17b3efc029500b8, 0x6593d3e9f3918064, 0xdfac09304fd723e6, 0x57c8b3e90582df7a,
    0xb259c18ae8b55518, 0x15551f6531b2cb72, 0x566ff258d900762a, 0x18a94bd29c1e1cf0,
    0x2bf36dd218146064, 0xcf273f5486d8f0e8, 0xa2d7fd1ed5148192, 0x8930570c4c7fa5f4,
    0xc50bf673f309cb06, 0xef351bee5aec33a6, 0xe5af351bd1abba3c, 0xa206e6a9accd09c4,
    0x00990549ccd151ca, 0x63a814ecd16089cc, 0xae0af0a717a05822, 0xb68a8620f18be904,
    0x2ee24376fed4a35a, 0xe7ab997a69dff1ba, 0xc86f40fa6adc2f9a, 0x8f64f0408792ac4e,
    0x3f64a2827c83a934, 0x99ae16c0ca4a27a6, 0x392b663d14369364, 0x95ce7bfa37969836,
    0x69b3066363eb6e1e, 0xf09c73e44671b25e, 0x30c27a940c9be840, 0xe3b1b5c4be179d7c,
    0x67eef82b5d0abdf8, 0x7911677225d62138, 0x2ad45d92d75fdd4a, 0x35400b6bc15a1d0e,
    0xaa01ae0a4f89771c, 0xc6d8ae32c8439888, 0x2789a50d986ddc72, 0xaca9447b03165502,
    0xef63b827a2c357b8, 0xe69e89bcbf1abd6a, 0xc0e2fc2e94d91344, 0xa8fb2c924cd4423c,
    0xb6274864576d3d20, 0xeecd2c13f16bf878, 0x43cd58ab7db9b592, 0x36ad6c56c22cdbd4,
    0xe91ecd7272f2fd38, 0x6be665f381cd5d34, 0x98e67ed5350f1b60, 0x7b42c3c839821184,
    0x6fae95ca6b229aa2, 0x9a92761623a6c8d2, 0x9c4c9a3bf752e834, 0x53a3e5b8e86db80c,
    0xe0e7002cc098544e, 0x463a6dd2dd27e7aa, 0xeccd10232f071a32, 0x945506121555a818,
    0xe3cec2b22cd166ba, 0xe6c646c92fee614e, 0x602101c6e6f3ba9a, 0xa05bd452e304e084,
    0x858bd70b1e64c4be, 0xf0d5f73dbf5f7bfe, 0xb5dc1b0d09216548, 0xc2e6cd664d0c13ec,
    0x5c1c6b41fc8c2e7c, 0xa340fbd27d049e22, 0x0f371622bd499950, 0x275324e8ab1f5d76,
    0xf63cdc45c1140766, 0xd4c6bfb746d31ba0, 0x9ea6cb2650a074b8, 0x9bc7663cdfabaf00,
    0x1c7c8443a6c28826, 0xde29a1b0d7e34458, 0xc3b061a7e2d8bbb6, 0x557a56548a2a09c2
];

// Find the next chunk cut point in the source.
//
// Returns a 2-tuple of the gear hash at the cut point and the byte offset of
// the cut within `source` (i.e. the length of the chunk). Two bytes are
// processed per loop iteration, per section 3.7 of the 2020 paper ("rolling
// two bytes each time"): the even-offset byte is folded in via the left-shifted
// gear table and judged against the left-shifted mask, then the odd-offset byte
// via the normal table and mask.
#[allow(clippy::too_many_arguments)]
pub(self) fn cut(
    source: &[u8],
    min_size: usize,
    avg_size: usize,
    max_size: usize,
    mask_s: u64,
    mask_l: u64,
    mask_s_ls: u64,
    mask_l_ls: u64,
) -> (u64, usize) {
    let mut remaining = source.len();
    // Sub-minimum chunk: no hashing is performed, so the hash reported for
    // such a (final) chunk is always zero.
    if remaining <= min_size {
        return (0, remaining);
    }
    // `center` is the offset at which judgement switches from the stricter
    // mask_s to the sparser mask_l (normalized chunking, section 4.4 of the
    // 2016 paper).
    let mut center = avg_size;
    if remaining > max_size {
        remaining = max_size;
    } else if remaining < center {
        center = remaining;
    }
    // Cut-point skipping: begin hashing at min_size; `index` counts byte
    // pairs, so `a = index * 2` is the even byte of the current pair.
    let mut index = min_size / 2;
    let mut hash: u64 = 0;
    while index < center / 2 {
        let a = index * 2;
        hash = (hash << 2).wrapping_add(GEAR_LS[source[a] as usize]);
        if (hash & mask_s_ls) == 0 {
            return (hash, a);
        }
        hash = hash.wrapping_add(GEAR[source[a + 1] as usize]);
        if (hash & mask_s) == 0 {
            return (hash, a + 1);
        }
        index += 1;
    }
    // Past the desired average size: mask_l has fewer bits set, making a cut
    // more likely before max_size is reached.
    while index < remaining / 2 {
        let a = index * 2;
        hash = (hash << 2).wrapping_add(GEAR_LS[source[a] as usize]);
        if (hash & mask_l_ls) == 0 {
            return (hash, a);
        }
        hash = hash.wrapping_add(GEAR[source[a + 1] as usize]);
        if (hash & mask_l) == 0 {
            return (hash, a + 1);
        }
        index += 1;
    }
    // If all else fails, return the largest chunk. This will happen with
    // pathological data, such as all zeroes.
    (hash, remaining)
}

///
/// The level for the normalized chunking used by FastCDC.
///
/// Normalized chunking "generates chunks whose sizes are normalized to a
/// specified region centered at the expected chunk size," as described in
/// section 4.4 of the FastCDC 2016 paper.
///
/// Note that lower levels of normalization will result in a larger range of
/// generated chunk sizes. It may be beneficial to widen the minimum/maximum
/// chunk size values given to the `FastCDC` constructor in that case.
///
/// Note that higher levels of normalization may result in the final chunk of
/// data being smaller than the minimum chunk size, which results in a hash
/// value of zero since no calculations are performed for sub-minimum chunks.
///
// Derives added: a public, fieldless enum should be printable, comparable,
// and freely copyable so callers can log it, test against it, and reuse one
// value for several chunker constructions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Normalization {
    /// No chunk size normalization, produces a wide range of chunk sizes.
    Level0,
    /// Level 1 normalization, in which fewer chunks are outside of the desired range.
    Level1,
    /// Level 2 normalization, where most chunks are of the desired size.
    Level2,
    /// Level 3 normalization, nearly all chunks are the desired size.
    Level3,
}

320impl Normalization {
321    pub(self) fn bits(&self) -> u32 {
322        match self {
323            Normalization::Level0 => 0,
324            Normalization::Level1 => 1,
325            Normalization::Level2 => 2,
326            Normalization::Level3 => 3,
327        }
328    }
329}
330
///
/// Represents a chunk returned from the FastCDC iterator.
///
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct Chunk {
    /// The gear hash value as of the end of the chunk.
    ///
    /// This is zero for a final chunk smaller than the minimum chunk size,
    /// since no hashing is performed for sub-minimum chunks.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: usize,
    /// Length of the chunk in bytes.
    pub length: usize,
}

///
/// The FastCDC chunker implementation from 2020.
///
/// Use `new` to construct an instance, and then iterate over the `Chunk`s via
/// the `Iterator` trait.
///
/// This example reads a file into memory and splits it into chunks that are
/// roughly 16 KB in size. The minimum and maximum sizes are the absolute limit
/// on the returned chunk sizes. With this algorithm, it is helpful to be more
/// lenient on the maximum chunk size as the results are highly dependent on the
/// input data. Changing the minimum chunk size will affect the results as the
/// algorithm may find different cut points given it uses the minimum as a
/// starting point (cut-point skipping).
///
/// ```no_run
/// use std::fs;
/// use fastcdc::v2020;
/// let contents = fs::read("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = v2020::FastCDC::new(&contents, 8192, 16384, 65535);
/// for entry in chunker {
///     println!("offset={} size={}", entry.offset, entry.length);
/// }
/// ```
///
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FastCDC<'a> {
    /// The source data to be chunked.
    source: &'a [u8],
    /// Number of bytes already emitted as chunks (iterator cursor).
    processed: usize,
    /// Number of bytes of `source` not yet chunked.
    remaining: usize,
    /// Minimum chunk size, in bytes.
    min_size: usize,
    /// Desired average chunk size, in bytes.
    avg_size: usize,
    /// Maximum chunk size, in bytes.
    max_size: usize,
    /// Judgement mask used before a chunk reaches the average size.
    mask_s: u64,
    /// Judgement mask used after a chunk reaches the average size.
    mask_l: u64,
    /// `mask_s` shifted left one bit, for the even-offset byte test.
    mask_s_ls: u64,
    /// `mask_l` shifted left one bit, for the even-offset byte test.
    mask_l_ls: u64,
}

382impl<'a> FastCDC<'a> {
383    ///
384    /// Construct a `FastCDC` that will process the given slice of bytes.
385    ///
386    /// Uses chunk size normalization level 1 by default.
387    ///
388    pub fn new(source: &'a [u8], min_size: u32, avg_size: u32, max_size: u32) -> Self {
389        FastCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
390    }
391
392    ///
393    /// Create a new `FastCDC` with the given normalization level.
394    ///
395    pub fn with_level(
396        source: &'a [u8],
397        min_size: u32,
398        avg_size: u32,
399        max_size: u32,
400        level: Normalization,
401    ) -> Self {
402        assert!(min_size >= MINIMUM_MIN);
403        assert!(min_size <= MINIMUM_MAX);
404        assert!(avg_size >= AVERAGE_MIN);
405        assert!(avg_size <= AVERAGE_MAX);
406        assert!(max_size >= MAXIMUM_MIN);
407        assert!(max_size <= MAXIMUM_MAX);
408        let bits = logarithm2(avg_size);
409        let normalization = level.bits();
410        let mask_s = MASKS[(bits + normalization) as usize];
411        let mask_l = MASKS[(bits - normalization) as usize];
412        Self {
413            source,
414            processed: 0,
415            remaining: source.len(),
416            min_size: min_size as usize,
417            avg_size: avg_size as usize,
418            max_size: max_size as usize,
419            mask_s,
420            mask_l,
421            mask_s_ls: mask_s << 1,
422            mask_l_ls: mask_l << 1,
423        }
424    }
425
426    ///
427    /// Find the next cut point in the data, where `start` is the position from
428    /// which to start processing the source data, and `remaining` are the
429    /// number of bytes left to be processed.
430    ///
431    /// The returned 2-tuple consists of the 64-bit hash (fingerprint) and the
432    /// byte offset of the end of the chunk. Note that the hash values may
433    /// differ from those produced by the v2016 chunker.
434    ///
435    /// There is a special case in which the remaining bytes are less than the
436    /// minimum chunk size, at which point this function returns a hash of 0 and
437    /// the cut point is the end of the source data.
438    ///
439    pub fn cut(&self, start: usize, remaining: usize) -> (u64, usize) {
440        let end = start + remaining;
441        let (hash, count) = cut(
442            &self.source[start..end],
443            self.min_size,
444            self.avg_size,
445            self.max_size,
446            self.mask_s,
447            self.mask_l,
448            self.mask_s_ls,
449            self.mask_l_ls,
450        );
451        (hash, start + count)
452    }
453}
454
455impl<'a> Iterator for FastCDC<'a> {
456    type Item = Chunk;
457
458    fn next(&mut self) -> Option<Chunk> {
459        if self.remaining == 0 {
460            None
461        } else {
462            let (hash, cutpoint) = self.cut(self.processed, self.remaining);
463            if cutpoint == 0 {
464                None
465            } else {
466                let offset = self.processed;
467                let length = cutpoint - offset;
468                self.processed += length;
469                self.remaining -= length;
470                Some(Chunk {
471                    hash,
472                    offset,
473                    length,
474                })
475            }
476        }
477    }
478
479    fn size_hint(&self) -> (usize, Option<usize>) {
480        // NOTE: This intentionally returns the upper bound for both `size_hint`
481        // values, as the upper bound doesn't actually seem to get used by `std`
482        // and using the actual lower bound is practically guaranteed to require
483        // a second capacity growth.
484        let upper_bound = self.remaining / self.min_size;
485        (upper_bound, Some(upper_bound))
486    }
487}
488
///
/// The error type returned from the `StreamCDC` iterator.
///
#[derive(Debug)]
pub enum Error {
    /// End of source data reached; `StreamCDC`'s iterator translates this
    /// into `None` rather than yielding it to the caller.
    Empty,
    /// An I/O error occurred while reading from the source.
    IoError(std::io::Error),
    /// Something unexpected happened.
    Other(String),
}

502impl fmt::Display for Error {
503    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
504        write!(f, "chunker error: {self:?}")
505    }
506}
507
// Marker implementation so `Error` can be used as a `dyn std::error::Error`
// (e.g. boxed, or converted by the `?` operator); the default trait methods
// suffice since `Display` and `Debug` are already provided.
impl std::error::Error for Error {}

510impl From<std::io::Error> for Error {
511    fn from(error: std::io::Error) -> Self {
512        Error::IoError(error)
513    }
514}
515
516impl From<Error> for std::io::Error {
517    fn from(error: Error) -> Self {
518        match error {
519            Error::IoError(ioerr) => ioerr,
520            Error::Empty => Self::from(std::io::ErrorKind::UnexpectedEof),
521            Error::Other(str) => Self::new(std::io::ErrorKind::Other, str),
522        }
523    }
524}
525
///
/// Represents a chunk returned from the StreamCDC iterator.
///
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct ChunkData {
    /// The gear hash value as of the end of the chunk.
    pub hash: u64,
    /// Starting byte position within the source.
    pub offset: u64,
    /// Length of the chunk in bytes (equals `data.len()`).
    pub length: usize,
    /// Source bytes contained in this chunk.
    pub data: Vec<u8>,
}

///
/// The FastCDC chunker implementation from 2020 with streaming support.
///
/// Use `new` to construct an instance, and then iterate over the `ChunkData`s
/// via the `Iterator` trait.
///
/// Note that this struct allocates a `Vec<u8>` of `max_size` bytes to act as a
/// buffer when reading from the source and finding chunk boundaries.
///
/// ```no_run
/// # use std::fs::File;
/// # use fastcdc::v2020::StreamCDC;
/// let source = File::open("test/fixtures/SekienAkashita.jpg").unwrap();
/// let chunker = StreamCDC::new(source, 4096, 16384, 65535);
/// for result in chunker {
///     let chunk = result.unwrap();
///     println!("offset={} length={}", chunk.offset, chunk.length);
/// }
/// ```
///
pub struct StreamCDC<R: Read> {
    /// Buffer of data from source for finding cut points.
    buffer: Vec<u8>,
    /// Maximum capacity of the buffer (always `max_size`).
    capacity: usize,
    /// Number of relevant bytes in the `buffer`.
    length: usize,
    /// Source from which data is read into `buffer`.
    source: R,
    /// Number of bytes read from the source so far.
    processed: u64,
    /// True when the source produces no more data.
    eof: bool,
    /// Minimum chunk size, in bytes.
    min_size: usize,
    /// Desired average chunk size, in bytes.
    avg_size: usize,
    /// Maximum chunk size, in bytes.
    max_size: usize,
    /// Judgement mask used before a chunk reaches the average size.
    mask_s: u64,
    /// Judgement mask used after a chunk reaches the average size.
    mask_l: u64,
    /// `mask_s` shifted left one bit, for the even-offset byte test.
    mask_s_ls: u64,
    /// `mask_l` shifted left one bit, for the even-offset byte test.
    mask_l_ls: u64,
}

583impl<R: Read> StreamCDC<R> {
584    ///
585    /// Construct a `StreamCDC` that will process bytes from the given source.
586    ///
587    /// Uses chunk size normalization level 1 by default.
588    ///
589    pub fn new(source: R, min_size: u32, avg_size: u32, max_size: u32) -> Self {
590        StreamCDC::with_level(source, min_size, avg_size, max_size, Normalization::Level1)
591    }
592
593    ///
594    /// Create a new `StreamCDC` with the given normalization level.
595    ///
596    pub fn with_level(
597        source: R,
598        min_size: u32,
599        avg_size: u32,
600        max_size: u32,
601        level: Normalization,
602    ) -> Self {
603        assert!(min_size >= MINIMUM_MIN);
604        assert!(min_size <= MINIMUM_MAX);
605        assert!(avg_size >= AVERAGE_MIN);
606        assert!(avg_size <= AVERAGE_MAX);
607        assert!(max_size >= MAXIMUM_MIN);
608        assert!(max_size <= MAXIMUM_MAX);
609        let bits = logarithm2(avg_size);
610        let normalization = level.bits();
611        let mask_s = MASKS[(bits + normalization) as usize];
612        let mask_l = MASKS[(bits - normalization) as usize];
613        Self {
614            buffer: vec![0_u8; max_size as usize],
615            capacity: max_size as usize,
616            length: 0,
617            source,
618            eof: false,
619            processed: 0,
620            min_size: min_size as usize,
621            avg_size: avg_size as usize,
622            max_size: max_size as usize,
623            mask_s,
624            mask_l,
625            mask_s_ls: mask_s << 1,
626            mask_l_ls: mask_l << 1,
627        }
628    }
629
630    /// Fill the buffer with data from the source, returning the number of bytes
631    /// read (zero if end of source has been reached).
632    fn fill_buffer(&mut self) -> Result<usize, Error> {
633        // this code originally copied from asuran crate
634        if self.eof {
635            Ok(0)
636        } else {
637            let mut all_bytes_read = 0;
638            while !self.eof && self.length < self.capacity {
639                let bytes_read = self.source.read(&mut self.buffer[self.length..])?;
640                if bytes_read == 0 {
641                    self.eof = true;
642                } else {
643                    self.length += bytes_read;
644                    all_bytes_read += bytes_read;
645                }
646            }
647            Ok(all_bytes_read)
648        }
649    }
650
651    /// Drains a specified number of bytes from the buffer, then resizes the
652    /// buffer back to `capacity` size in preparation for further reads.
653    fn drain_bytes(&mut self, count: usize) -> Result<Vec<u8>, Error> {
654        // this code originally copied from asuran crate
655        if count > self.length {
656            Err(Error::Other(format!(
657                "drain_bytes() called with count larger than length: {} > {}",
658                count, self.length
659            )))
660        } else {
661            let data = self.buffer.drain(..count).collect::<Vec<u8>>();
662            self.length -= count;
663            self.buffer.resize(self.capacity, 0_u8);
664            Ok(data)
665        }
666    }
667
668    /// Find the next chunk in the source. If the end of the source has been
669    /// reached, returns `Error::Empty` as the error.
670    fn read_chunk(&mut self) -> Result<ChunkData, Error> {
671        self.fill_buffer()?;
672        if self.length == 0 {
673            Err(Error::Empty)
674        } else {
675            let (hash, count) = cut(
676                &self.buffer[..self.length],
677                self.min_size,
678                self.avg_size,
679                self.max_size,
680                self.mask_s,
681                self.mask_l,
682                self.mask_s_ls,
683                self.mask_l_ls,
684            );
685            if count == 0 {
686                Err(Error::Empty)
687            } else {
688                let offset = self.processed;
689                self.processed += count as u64;
690                let data = self.drain_bytes(count)?;
691                Ok(ChunkData {
692                    hash,
693                    offset,
694                    length: count,
695                    data,
696                })
697            }
698        }
699    }
700}
701
702impl<R: Read> Iterator for StreamCDC<R> {
703    type Item = Result<ChunkData, Error>;
704
705    fn next(&mut self) -> Option<Result<ChunkData, Error>> {
706        let slice = self.read_chunk();
707        if let Err(Error::Empty) = slice {
708            None
709        } else {
710            Some(slice)
711        }
712    }
713}
714
///
/// Base-2 logarithm, rounded to the nearest integer, for unsigned 32-bit
/// integers. An input of zero yields zero (the negative-infinity result of
/// `log2(0)` saturates to zero when cast back to `u32`).
///
pub(self) fn logarithm2(value: u32) -> u32 {
    let as_float = f64::from(value);
    as_float.log2().round() as u32
}
721
#[cfg(test)]
mod tests {
    use super::*;
    use md5::{Digest, Md5};
    use std::fs::{self, File};

    // Exercise the rounding behavior of logarithm2() across the range of
    // supported chunk sizes, including values just below, at, and just
    // above several powers of two.
    #[test]
    fn test_logarithm2() {
        assert_eq!(logarithm2(0), 0);
        assert_eq!(logarithm2(1), 0);
        assert_eq!(logarithm2(2), 1);
        assert_eq!(logarithm2(3), 2);
        assert_eq!(logarithm2(5), 2);
        assert_eq!(logarithm2(6), 3);
        assert_eq!(logarithm2(11), 3);
        assert_eq!(logarithm2(12), 4);
        assert_eq!(logarithm2(19), 4);
        assert_eq!(logarithm2(64), 6);
        assert_eq!(logarithm2(128), 7);
        assert_eq!(logarithm2(256), 8);
        assert_eq!(logarithm2(512), 9);
        assert_eq!(logarithm2(1024), 10);
        assert_eq!(logarithm2(16383), 14);
        assert_eq!(logarithm2(16384), 14);
        assert_eq!(logarithm2(16385), 14);
        assert_eq!(logarithm2(32767), 15);
        assert_eq!(logarithm2(32768), 15);
        assert_eq!(logarithm2(32769), 15);
        assert_eq!(logarithm2(65535), 16);
        assert_eq!(logarithm2(65536), 16);
        assert_eq!(logarithm2(65537), 16);
        assert_eq!(logarithm2(1_048_575), 20);
        assert_eq!(logarithm2(1_048_576), 20);
        assert_eq!(logarithm2(1_048_577), 20);
        assert_eq!(logarithm2(4_194_303), 22);
        assert_eq!(logarithm2(4_194_304), 22);
        assert_eq!(logarithm2(4_194_305), 22);
        assert_eq!(logarithm2(16_777_215), 24);
        assert_eq!(logarithm2(16_777_216), 24);
        assert_eq!(logarithm2(16_777_217), 24);
    }

    // A minimum size of 63 is below the accepted lower bound (64 is used in
    // the passing tests below), so construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_minimum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 63, 256, 1024);
    }

    // A minimum size above 67_108_866 exceeds the accepted upper bound, so
    // construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_minimum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 67_108_867, 256, 1024);
    }

    // An average size of 255 is below the accepted lower bound (256 is used
    // in the passing tests below), so construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_average_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 255, 1024);
    }

    // An average size above 268_435_456 exceeds the accepted upper bound,
    // so construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_average_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 268_435_457, 1024);
    }

    // A maximum size of 1023 is below the accepted lower bound (1024 is
    // used in the passing tests below), so construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_maximum_too_low() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1023);
    }

    // A maximum size above 1_073_741_824 exceeds the accepted upper bound,
    // so construction is expected to panic.
    #[test]
    #[should_panic]
    fn test_maximum_too_high() {
        let array = [0u8; 1024];
        FastCDC::new(&array, 64, 256, 1_073_741_825);
    }

    // Verify that the small and large hash-judgement masks are selected
    // from the MASKS table based on the configured average chunk size
    // (mask_s two entries above mask_l in each case).
    #[test]
    fn test_masks() {
        let source = [0u8; 1024];
        let chunker = FastCDC::new(&source, 64, 256, 1024);
        assert_eq!(chunker.mask_l, MASKS[7]);
        assert_eq!(chunker.mask_s, MASKS[9]);
        let chunker = FastCDC::new(&source, 8192, 16384, 32768);
        assert_eq!(chunker.mask_l, MASKS[13]);
        assert_eq!(chunker.mask_s, MASKS[15]);
        let chunker = FastCDC::new(&source, 1_048_576, 4_194_304, 16_777_216);
        assert_eq!(chunker.mask_l, MASKS[21]);
        assert_eq!(chunker.mask_s, MASKS[23]);
    }

    #[test]
    fn test_cut_all_zeros() {
        // for all zeros, always returns chunks of maximum size
        let array = [0u8; 10240];
        let chunker = FastCDC::new(&array, 64, 256, 1024);
        let mut cursor: usize = 0;
        for _ in 0..10 {
            let (hash, pos) = chunker.cut(cursor, 10240 - cursor);
            // every max-size chunk of zeros produces the same hash value
            assert_eq!(hash, 14169102344523991076);
            assert_eq!(pos, cursor + 1024);
            cursor = pos;
        }
        // assert that nothing more should be returned
        let (_, pos) = chunker.cut(cursor, 10240 - cursor);
        assert_eq!(pos, 10240);
    }

    // Drive cut() manually over a real file with ~16 KB average chunks and
    // compare each (hash, length) pair against known-good values.
    #[test]
    fn test_cut_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (17968276318003433923, 21325),
            (8197189939299398838, 17140),
            (13019990849178155730, 28084),
            (4509236223063678303, 18217),
            (2504464741100432583, 24700),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        // the expected chunks must account for the entire file
        assert_eq!(remaining, 0);
    }

    // Same file as above with ~32 KB average chunks; larger parameters
    // yield fewer, larger chunks.
    #[test]
    fn test_cut_sekien_32k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 8192, 32768, 131072);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> =
            vec![(15733367461443853673, 66549), (6321136627705800457, 42917)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Same file with ~64 KB average chunks; the whole file fits into a
    // single chunk at these parameters.
    #[test]
    fn test_cut_sekien_64k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::new(&contents, 16384, 65536, 262144);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![(2504464741100432583, 109466)];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Known-good values for one chunk of the test fixture: gear hash,
    // stream offset, byte length, and MD5 digest (hex) of the chunk data.
    struct ExpectedChunk {
        hash: u64,
        offset: u64,
        length: usize,
        digest: String,
    }

    // Exercise the FastCDC Iterator implementation and confirm it yields
    // the same chunks as the manual cut() loop in the 16k test above.
    #[test]
    fn test_iter_sekien_16k_chunks() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        // The digest values are not needed here, but they serve to validate
        // that the streaming version tested below is returning the correct
        // chunk data on each iteration.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 8197189939299398838,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 13019990849178155730,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = FastCDC::new(&contents, 4096, 16384, 65535);
        let mut index = 0;
        for chunk in chunker {
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset as usize);
            assert_eq!(chunk.length, expected_chunks[index].length);
            // hash the chunk's slice of the source to confirm its bounds
            let mut hasher = Md5::new();
            hasher.update(&contents[chunk.offset..chunk.offset + chunk.length]);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        // all five expected chunks must have been produced
        assert_eq!(index, 5);
    }

    // With normalization disabled (Level0) the chunk sizes spread much more
    // widely around the average than with the default normalization.
    #[test]
    fn test_cut_sekien_16k_nc_0() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 4096, 16384, 65535, Normalization::Level0);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (443122261039895162, 6634),
            (15733367461443853673, 59915),
            (10460176299449652894, 25597),
            (6197802202431009942, 5237),
            (6321136627705800457, 12083),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // With the strongest normalization (Level3) the chunk sizes cluster
    // tightly around the configured average.
    #[test]
    fn test_cut_sekien_16k_nc_3() {
        let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
        assert!(read_result.is_ok());
        let contents = read_result.unwrap();
        let chunker = FastCDC::with_level(&contents, 8192, 16384, 32768, Normalization::Level3);
        let mut cursor: usize = 0;
        let mut remaining: usize = contents.len();
        let expected: Vec<(u64, usize)> = vec![
            (10718006254707412376, 17350),
            (13104072099671895560, 19911),
            (12322483109039221194, 17426),
            (16009206469796846404, 17519),
            (2473608525189754172, 19940),
            (2504464741100432583, 17320),
        ];
        for (e_hash, e_length) in expected.iter() {
            let (hash, pos) = chunker.cut(cursor, remaining);
            assert_eq!(hash, *e_hash);
            assert_eq!(pos, cursor + e_length);
            cursor = pos;
            remaining -= e_length;
        }
        assert_eq!(remaining, 0);
    }

    // Verify the Display implementation for the chunker error type.
    #[test]
    fn test_error_fmt() {
        let err = Error::Empty;
        assert_eq!(format!("{err}"), "chunker error: Empty");
    }

    // Exercise StreamCDC (the Read-based streaming chunker) and confirm it
    // yields exactly the same chunks — offsets, lengths, hashes, and chunk
    // data (via MD5) — as the in-memory iterator test above.
    #[test]
    fn test_stream_sekien_16k_chunks() {
        let file_result = File::open("test/fixtures/SekienAkashita.jpg");
        assert!(file_result.is_ok());
        let file = file_result.unwrap();
        // The set of expected results should match the non-streaming version.
        let expected_chunks = vec![
            ExpectedChunk {
                hash: 17968276318003433923,
                offset: 0,
                length: 21325,
                digest: "2bb52734718194617c957f5e07ee6054".into(),
            },
            ExpectedChunk {
                hash: 8197189939299398838,
                offset: 21325,
                length: 17140,
                digest: "badfb0757fe081c20336902e7131f768".into(),
            },
            ExpectedChunk {
                hash: 13019990849178155730,
                offset: 38465,
                length: 28084,
                digest: "18412d7414de6eb42f638351711f729d".into(),
            },
            ExpectedChunk {
                hash: 4509236223063678303,
                offset: 66549,
                length: 18217,
                digest: "04fe1405fc5f960363bfcd834c056407".into(),
            },
            ExpectedChunk {
                hash: 2504464741100432583,
                offset: 84766,
                length: 24700,
                digest: "1aa7ad95f274d6ba34a983946ebc5af3".into(),
            },
        ];
        let chunker = StreamCDC::new(file, 4096, 16384, 65535);
        let mut index = 0;
        for result in chunker {
            assert!(result.is_ok());
            let chunk = result.unwrap();
            assert_eq!(chunk.hash, expected_chunks[index].hash);
            assert_eq!(chunk.offset, expected_chunks[index].offset);
            assert_eq!(chunk.length, expected_chunks[index].length);
            // hash the returned chunk data itself; the streaming version
            // must hand back the actual bytes, not just offsets
            let mut hasher = Md5::new();
            hasher.update(&chunk.data);
            let table = hasher.finalize();
            let digest = format!("{:x}", table);
            assert_eq!(digest, expected_chunks[index].digest);
            index += 1;
        }
        assert_eq!(index, 5);
    }
}