1pub const MINIMUM_MIN: usize = 64;
30pub const MINIMUM_MAX: usize = 67_108_864;
32pub const AVERAGE_MIN: usize = 256;
34pub const AVERAGE_MAX: usize = 268_435_456;
36pub const MAXIMUM_MIN: usize = 1024;
38pub const MAXIMUM_MAX: usize = 1_073_741_824;
40
41#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
43pub struct Chunk {
44 pub hash: u32,
46 pub offset: usize,
48 pub length: usize,
50}
51
52#[derive(Debug, Clone, Eq, PartialEq)]
70pub struct FastCDC<'a> {
71 source: &'a [u8],
72 bytes_processed: usize,
73 bytes_remaining: usize,
74 min_size: usize,
75 avg_size: usize,
76 max_size: usize,
77 mask_s: u32,
78 mask_l: u32,
79 eof: bool,
80}
81
82impl<'a> FastCDC<'a> {
83 pub fn new(source: &'a [u8], min_size: usize, avg_size: usize, max_size: usize) -> Self {
91 FastCDC::with_eof(source, min_size, avg_size, max_size, true)
92 }
93
94 pub fn with_eof(
103 source: &'a [u8],
104 min_size: usize,
105 avg_size: usize,
106 max_size: usize,
107 eof: bool,
108 ) -> Self {
109 assert!(min_size >= MINIMUM_MIN);
110 assert!(min_size <= MINIMUM_MAX);
111 assert!(avg_size >= AVERAGE_MIN);
112 assert!(avg_size <= AVERAGE_MAX);
113 assert!(max_size >= MAXIMUM_MIN);
114 assert!(max_size <= MAXIMUM_MAX);
115 let bits = logarithm2(avg_size as u32);
116 let mask_s = mask(bits + 1);
117 let mask_l = mask(bits - 1);
118 Self {
119 source,
120 bytes_processed: 0,
121 bytes_remaining: source.len(),
122 min_size,
123 avg_size,
124 max_size,
125 mask_s,
126 mask_l,
127 eof,
128 }
129 }
130
131 fn cut(&self, mut source_offset: usize, mut source_size: usize) -> (u32, usize) {
133 if source_size <= self.min_size {
134 if !self.eof {
135 (0, 0)
136 } else {
137 (0, source_size)
138 }
139 } else {
140 if source_size > self.max_size {
141 source_size = self.max_size;
142 }
143 let source_start: usize = source_offset;
144 let source_len1: usize =
145 source_offset + center_size(self.avg_size, self.min_size, source_size);
146 let source_len2: usize = source_offset + source_size;
147 let mut hash: u32 = 0;
148 source_offset += self.min_size;
149 while source_offset < source_len1 {
152 let index = self.source[source_offset] as usize;
153 source_offset += 1;
154 hash = (hash >> 1) + TABLE[index];
155 if (hash & self.mask_s) == 0 {
156 return (hash, source_offset - source_start);
157 }
158 }
159 while source_offset < source_len2 {
162 let index = self.source[source_offset] as usize;
163 source_offset += 1;
164 hash = (hash >> 1) + TABLE[index];
165 if (hash & self.mask_l) == 0 {
166 return (hash, source_offset - source_start);
167 }
168 }
169 if !self.eof && source_size < self.max_size {
172 (hash, 0)
173 } else {
174 (hash, source_size)
177 }
178 }
179 }
180}
181
182impl<'a> Iterator for FastCDC<'a> {
183 type Item = Chunk;
184
185 fn next(&mut self) -> Option<Chunk> {
186 if self.bytes_remaining == 0 {
187 None
188 } else {
189 let (chunk_hash, chunk_size) = self.cut(self.bytes_processed, self.bytes_remaining);
190 if chunk_size == 0 {
191 None
192 } else {
193 let chunk_start = self.bytes_processed;
194 self.bytes_processed += chunk_size;
195 self.bytes_remaining -= chunk_size;
196 Some(Chunk {
197 hash: chunk_hash,
198 offset: chunk_start,
199 length: chunk_size,
200 })
201 }
202 }
203 }
204
205 fn size_hint(&self) -> (usize, Option<usize>) {
206 let upper_bound = self.bytes_remaining / self.min_size;
210 (upper_bound, Some(upper_bound))
211 }
212}
213
214fn logarithm2(value: u32) -> u32 {
218 let fvalue: f64 = f64::from(value);
219 fvalue.log2().round() as u32
220}
221
222fn ceil_div(x: usize, y: usize) -> usize {
226 (x + y - 1) / y
227}
228
229fn center_size(average: usize, minimum: usize, source_size: usize) -> usize {
234 let mut offset: usize = minimum + ceil_div(minimum, 2);
235 if offset > average {
236 offset = average;
237 }
238 let size: usize = average - offset;
239 if size > source_size {
240 source_size
241 } else {
242 size
243 }
244}
245
246fn mask(bits: u32) -> u32 {
251 debug_assert!(bits >= 1);
252 debug_assert!(bits <= 31);
253 2u32.pow(bits) - 1
254}
255
256#[rustfmt::skip]
270const TABLE: [u32; 256] = [
271 0x5c95c078, 0x22408989, 0x2d48a214, 0x12842087, 0x530f8afb, 0x474536b9, 0x2963b4f1, 0x44cb738b,
272 0x4ea7403d, 0x4d606b6e, 0x074ec5d3, 0x3af39d18, 0x726003ca, 0x37a62a74, 0x51a2f58e, 0x7506358e,
273 0x5d4ab128, 0x4d4ae17b, 0x41e85924, 0x470c36f7, 0x4741cbe1, 0x01bb7f30, 0x617c1de3, 0x2b0c3a1f,
274 0x50c48f73, 0x21a82d37, 0x6095ace0, 0x419167a0, 0x3caf49b0, 0x40cea62d, 0x66bc1c66, 0x545e1dad,
275 0x2bfa77cd, 0x6e85da24, 0x5fb0bdc5, 0x652cfc29, 0x3a0ae1ab, 0x2837e0f3, 0x6387b70e, 0x13176012,
276 0x4362c2bb, 0x66d8f4b1, 0x37fce834, 0x2c9cd386, 0x21144296, 0x627268a8, 0x650df537, 0x2805d579,
277 0x3b21ebbd, 0x7357ed34, 0x3f58b583, 0x7150ddca, 0x7362225e, 0x620a6070, 0x2c5ef529, 0x7b522466,
278 0x768b78c0, 0x4b54e51e, 0x75fa07e5, 0x06a35fc6, 0x30b71024, 0x1c8626e1, 0x296ad578, 0x28d7be2e,
279 0x1490a05a, 0x7cee43bd, 0x698b56e3, 0x09dc0126, 0x4ed6df6e, 0x02c1bfc7, 0x2a59ad53, 0x29c0e434,
280 0x7d6c5278, 0x507940a7, 0x5ef6ba93, 0x68b6af1e, 0x46537276, 0x611bc766, 0x155c587d, 0x301ba847,
281 0x2cc9dda7, 0x0a438e2c, 0x0a69d514, 0x744c72d3, 0x4f326b9b, 0x7ef34286, 0x4a0ef8a7, 0x6ae06ebe,
282 0x669c5372, 0x12402dcb, 0x5feae99d, 0x76c7f4a7, 0x6abdb79c, 0x0dfaa038, 0x20e2282c, 0x730ed48b,
283 0x069dac2f, 0x168ecf3e, 0x2610e61f, 0x2c512c8e, 0x15fb8c06, 0x5e62bc76, 0x69555135, 0x0adb864c,
284 0x4268f914, 0x349ab3aa, 0x20edfdb2, 0x51727981, 0x37b4b3d8, 0x5dd17522, 0x6b2cbfe4, 0x5c47cf9f,
285 0x30fa1ccd, 0x23dedb56, 0x13d1f50a, 0x64eddee7, 0x0820b0f7, 0x46e07308, 0x1e2d1dfd, 0x17b06c32,
286 0x250036d8, 0x284dbf34, 0x68292ee0, 0x362ec87c, 0x087cb1eb, 0x76b46720, 0x104130db, 0x71966387,
287 0x482dc43f, 0x2388ef25, 0x524144e1, 0x44bd834e, 0x448e7da3, 0x3fa6eaf9, 0x3cda215c, 0x3a500cf3,
288 0x395cb432, 0x5195129f, 0x43945f87, 0x51862ca4, 0x56ea8ff1, 0x201034dc, 0x4d328ff5, 0x7d73a909,
289 0x6234d379, 0x64cfbf9c, 0x36f6589a, 0x0a2ce98a, 0x5fe4d971, 0x03bc15c5, 0x44021d33, 0x16c1932b,
290 0x37503614, 0x1acaf69d, 0x3f03b779, 0x49e61a03, 0x1f52d7ea, 0x1c6ddd5c, 0x062218ce, 0x07e7a11a,
291 0x1905757a, 0x7ce00a53, 0x49f44f29, 0x4bcc70b5, 0x39feea55, 0x5242cee8, 0x3ce56b85, 0x00b81672,
292 0x46beeccc, 0x3ca0ad56, 0x2396cee8, 0x78547f40, 0x6b08089b, 0x66a56751, 0x781e7e46, 0x1e2cf856,
293 0x3bc13591, 0x494a4202, 0x520494d7, 0x2d87459a, 0x757555b6, 0x42284cc1, 0x1f478507, 0x75c95dff,
294 0x35ff8dd7, 0x4e4757ed, 0x2e11f88c, 0x5e1b5048, 0x420e6699, 0x226b0695, 0x4d1679b4, 0x5a22646f,
295 0x161d1131, 0x125c68d9, 0x1313e32e, 0x4aa85724, 0x21dc7ec1, 0x4ffa29fe, 0x72968382, 0x1ca8eef3,
296 0x3f3b1c28, 0x39c2fb6c, 0x6d76493f, 0x7a22a62e, 0x789b1c2a, 0x16e0cb53, 0x7deceeeb, 0x0dc7e1c6,
297 0x5c75bf3d, 0x52218333, 0x106de4d6, 0x7dc64422, 0x65590ff4, 0x2c02ec30, 0x64a9ac67, 0x59cab2e9,
298 0x4a21d2f3, 0x0f616e57, 0x23b54ee8, 0x02730aaa, 0x2f3c634d, 0x7117fc6c, 0x01ac6f05, 0x5a9ed20c,
299 0x158c4e2a, 0x42b699f0, 0x0c7c14b3, 0x02bd9641, 0x15ad56fc, 0x1c722f60, 0x7da1af91, 0x23e0dbcb,
300 0x0e93e12b, 0x64b2791d, 0x440d2476, 0x588ea8dd, 0x4665a658, 0x7446c418, 0x1877a774, 0x5626407e,
301 0x7f63bd46, 0x32d2dbd8, 0x3c790f4a, 0x772b7239, 0x6f8b2826, 0x677ff609, 0x0dc82c11, 0x23ffe354,
302 0x2eac53a6, 0x16139e09, 0x0afd0dbc, 0x2a4d4237, 0x56a368c7, 0x234325e4, 0x2dce9187, 0x32e8ea7e
303];
304
305#[cfg(test)]
306mod tests {
307 use super::*;
308 use std::fs;
309
310 #[test]
311 fn test_logarithm2() {
312 assert_eq!(logarithm2(0), 0);
313 assert_eq!(logarithm2(1), 0);
314 assert_eq!(logarithm2(2), 1);
315 assert_eq!(logarithm2(3), 2);
316 assert_eq!(logarithm2(5), 2);
317 assert_eq!(logarithm2(6), 3);
318 assert_eq!(logarithm2(11), 3);
319 assert_eq!(logarithm2(12), 4);
320 assert_eq!(logarithm2(19), 4);
321 assert_eq!(logarithm2(16383), 14);
322 assert_eq!(logarithm2(16384), 14);
323 assert_eq!(logarithm2(16385), 14);
324 assert_eq!(logarithm2(32767), 15);
325 assert_eq!(logarithm2(32768), 15);
326 assert_eq!(logarithm2(32769), 15);
327 assert_eq!(logarithm2(65535), 16);
328 assert_eq!(logarithm2(65536), 16);
329 assert_eq!(logarithm2(65537), 16);
330 assert_eq!(logarithm2(1048575), 20);
331 assert_eq!(logarithm2(1048576), 20);
332 assert_eq!(logarithm2(1048577), 20);
333 assert_eq!(logarithm2(4294967286), 32);
334 assert_eq!(logarithm2(4294967295), 32);
335 assert!(logarithm2(AVERAGE_MIN as u32) >= 8);
337 assert!(logarithm2(AVERAGE_MAX as u32) <= 28);
338 }
339
340 #[test]
341 fn test_ceil_div() {
342 assert_eq!(ceil_div(10, 5), 2);
343 assert_eq!(ceil_div(11, 5), 3);
344 assert_eq!(ceil_div(10, 3), 4);
345 assert_eq!(ceil_div(9, 3), 3);
346 assert_eq!(ceil_div(6, 2), 3);
347 assert_eq!(ceil_div(5, 2), 3);
348 }
349
350 #[test]
351 fn test_center_size() {
352 assert_eq!(center_size(50, 100, 50), 0);
353 assert_eq!(center_size(200, 100, 50), 50);
354 assert_eq!(center_size(200, 100, 40), 40);
355 }
356
357 #[test]
358 #[should_panic]
359 fn test_mask_low() {
360 mask(0);
361 }
362
363 #[test]
364 #[should_panic]
365 fn test_mask_high() {
366 mask(32);
367 }
368
369 #[test]
370 fn test_mask() {
371 assert_eq!(mask(24), 16_777_215);
372 assert_eq!(mask(16), 65535);
373 assert_eq!(mask(10), 1023);
374 assert_eq!(mask(8), 255);
375 }
376
377 #[test]
378 #[should_panic]
379 fn test_minimum_too_low() {
380 let array = [0u8; 2048];
381 FastCDC::new(&array, 63, 256, 1024);
382 }
383
384 #[test]
385 #[should_panic]
386 fn test_minimum_too_high() {
387 let array = [0u8; 2048];
388 FastCDC::new(&array, 67_108_867, 256, 1024);
389 }
390
391 #[test]
392 #[should_panic]
393 fn test_average_too_low() {
394 let array = [0u8; 2048];
395 FastCDC::new(&array, 64, 255, 1024);
396 }
397
398 #[test]
399 #[should_panic]
400 fn test_average_too_high() {
401 let array = [0u8; 2048];
402 FastCDC::new(&array, 64, 268_435_457, 1024);
403 }
404
405 #[test]
406 #[should_panic]
407 fn test_maximum_too_low() {
408 let array = [0u8; 2048];
409 FastCDC::new(&array, 64, 256, 1023);
410 }
411
412 #[test]
413 #[should_panic]
414 fn test_maximum_too_high() {
415 let array = [0u8; 2048];
416 FastCDC::new(&array, 64, 256, 1_073_741_825);
417 }
418
419 #[test]
420 fn test_all_zeros() {
421 let array = [0u8; 10240];
423 let chunker = FastCDC::new(&array, 64, 256, 1024);
424 let results: Vec<Chunk> = chunker.collect();
425 assert_eq!(results.len(), 10);
426 for entry in results {
427 assert_eq!(entry.hash, 3106636015);
428 assert_eq!(entry.offset % 1024, 0);
429 assert_eq!(entry.length, 1024);
430 }
431 }
432
433 #[test]
434 fn test_sekien_16k_chunks() {
435 let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
436 assert!(read_result.is_ok());
437 let contents = read_result.unwrap();
438 let chunker = FastCDC::new(&contents, 8192, 16384, 32768);
439 let results: Vec<Chunk> = chunker.collect();
440 assert_eq!(results.len(), 6);
441 assert_eq!(results[0].hash, 1527472128);
442 assert_eq!(results[0].offset, 0);
443 assert_eq!(results[0].length, 22366);
444 assert_eq!(results[1].hash, 1174757376);
445 assert_eq!(results[1].offset, 22366);
446 assert_eq!(results[1].length, 8282);
447 assert_eq!(results[2].hash, 2687197184);
448 assert_eq!(results[2].offset, 30648);
449 assert_eq!(results[2].length, 16303);
450 assert_eq!(results[3].hash, 1210105856);
451 assert_eq!(results[3].offset, 46951);
452 assert_eq!(results[3].length, 18696);
453 assert_eq!(results[4].hash, 2984739645);
454 assert_eq!(results[4].offset, 65647);
455 assert_eq!(results[4].length, 32768);
456 assert_eq!(results[5].hash, 1121740051);
457 assert_eq!(results[5].offset, 98415);
458 assert_eq!(results[5].length, 11051);
459 }
460
461 #[test]
462 fn test_sekien_16k_chunks_streaming() {
463 let filepath = "test/fixtures/SekienAkashita.jpg";
464 let read_result = fs::read(filepath);
465 assert!(read_result.is_ok());
466 let contents = read_result.unwrap();
467
468 let chunk_offsets = [0, 22366, 30648, 46951, 65647, 98415];
471 let chunk_sizes = [22366, 8282, 16303, 18696, 32768, 11051];
472
473 const BUF_SIZE: usize = 32768;
477
478 let attr = fs::metadata(filepath).unwrap();
481 let file_size = attr.len();
482 let mut file_pos = 0;
483 let mut chunk_index = 0;
484
485 for group_size in &[2, 1, 1, 1, 1] {
488 let upper_bound = file_pos + BUF_SIZE;
489 let (eof, slice) = if upper_bound >= file_size as usize {
490 (true, &contents[file_pos..])
491 } else {
492 (false, &contents[file_pos..upper_bound])
493 };
494 let chunker = FastCDC::with_eof(slice, 8192, 16384, 32768, eof);
495 let results: Vec<Chunk> = chunker.collect();
496 assert_eq!(results.len(), *group_size);
497 for idx in 0..*group_size {
498 assert_eq!(results[idx].offset + file_pos, chunk_offsets[chunk_index]);
499 assert_eq!(results[idx].length, chunk_sizes[chunk_index]);
500 chunk_index += 1;
501 }
502 for result in results {
504 file_pos += result.length;
505 }
506 }
507 assert_eq!(file_pos as u64, file_size);
509 }
510
511 #[test]
512 fn test_sekien_32k_chunks() {
513 let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
514 assert!(read_result.is_ok());
515 let contents = read_result.unwrap();
516 let chunker = FastCDC::new(&contents, 16384, 32768, 65536);
517 let results: Vec<Chunk> = chunker.collect();
518 assert_eq!(results.len(), 3);
519 assert_eq!(results[0].hash, 2772598784);
520 assert_eq!(results[0].offset, 0);
521 assert_eq!(results[0].length, 32857);
522 assert_eq!(results[1].hash, 1651589120);
523 assert_eq!(results[1].offset, 32857);
524 assert_eq!(results[1].length, 16408);
525 assert_eq!(results[2].hash, 1121740051);
526 assert_eq!(results[2].offset, 49265);
527 assert_eq!(results[2].length, 60201);
528 }
529
530 #[test]
531 fn test_sekien_64k_chunks() {
532 let read_result = fs::read("test/fixtures/SekienAkashita.jpg");
533 assert!(read_result.is_ok());
534 let contents = read_result.unwrap();
535 let chunker = FastCDC::new(&contents, 32768, 65536, 131_072);
536 let results: Vec<Chunk> = chunker.collect();
537 assert_eq!(results.len(), 2);
538 assert_eq!(results[0].hash, 2772598784);
539 assert_eq!(results[0].offset, 0);
540 assert_eq!(results[0].length, 32857);
541 assert_eq!(results[1].hash, 1121740051);
542 assert_eq!(results[1].offset, 32857);
543 assert_eq!(results[1].length, 76609);
544 }
545}