Skip to main content

snix_castore/blobservice/object_store/
mod.rs

1use std::{
2    collections::{HashMap, hash_map},
3    io::{self, Cursor},
4    pin::pin,
5    sync::Arc,
6    task::Poll,
7};
8
9use data_encoding::HEXLOWER;
10use fastcdc::v2020::AsyncStreamCDC;
11use futures::{Future, TryStreamExt};
12use object_store::{ObjectStore, ObjectStoreExt, ObjectStoreScheme, path::Path};
13use pin_project_lite::pin_project;
14use prost::Message;
15use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
16use tonic::async_trait;
17use tracing::{Level, debug, instrument, trace};
18use url::Url;
19
20use crate::{
21    B3Digest, B3HashingReader,
22    composition::{CompositionContext, ServiceBuilder},
23    proto::{StatBlobResponse, stat_blob_response::ChunkMeta},
24};
25
26use super::{BlobReader, BlobService, BlobWriter, ChunkedReader};
27
28#[cfg(feature = "cloud")]
29mod aws;
30
/// Upper bound on how many chunk uploads may be in flight at the same time
/// while a single blob is being written.
const CONCURRENT_CHUNK_UPLOADS: usize = 64;
33
34/// Uses any object storage supported by the [object_store] crate to provide a
35/// snix-castore [BlobService].
36///
37/// # Data format
38/// Data is organized in "blobs" and "chunks".
39/// Blobs don't hold the actual data, but instead contain a list of more
40/// granular chunks that assemble to the contents requested.
41/// This allows clients to seek, and not download chunks they already have
42/// locally, as it's referred to from other files.
43/// Check `rpc_blobstore` and more general BlobStore docs on that.
44///
45/// ## Blobs
46/// Stored at `${base_path}/blobs/b3/$digest_key`. They contains the serialized
47/// StatBlobResponse for the blob with the digest.
48///
49/// ## Chunks
50/// Chunks are stored at `${base_path}/chunks/b3/$digest_key`. They contain
51/// the literal contents of the chunk, but are zstd-compressed.
52///
53/// ## Digest key sharding
54/// The blake3 digest encoded in lower hex, and sharded after the second
55/// character.
56/// The blob for "Hello World" is stored at
57/// `${base_path}/blobs/b3/41/41f8394111eb713a22165c46c90ab8f0fd9399c92028fd6d288944b23ff5bf76`.
58///
59/// This reduces the number of files in the same directory, which would be a
60/// problem at least when using [object_store::local::LocalFileSystem].
61///
62/// # Future changes
63/// There's no guarantees about this being a final format yet.
64/// Once object_store gets support for additional metadata / content-types,
65/// we can eliminate some requests (small blobs only consisting of a single
66/// chunk can be stored as-is, without the blob index file).
67/// It also allows signalling any compression of chunks in the content-type.
68/// Migration *should* be possible by simply adding the right content-types to
69/// all keys stored so far, but no promises ;-)
70#[derive(Clone)]
71pub struct ObjectStoreBlobService {
72    instance_name: String,
73    object_store: Arc<dyn ObjectStore>,
74    base_path: Path,
75
76    /// Average chunk size for FastCDC, in bytes.
77    /// min value is half, max value double of that number.
78    avg_chunk_size: u32,
79}
80
81#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))]
82fn derive_blob_path(base_path: &Path, digest: &B3Digest) -> Path {
83    base_path
84        .clone()
85        .join("blobs")
86        .join("b3")
87        .join(HEXLOWER.encode(&digest[..2]))
88        .join(HEXLOWER.encode(&digest[..]))
89}
90
91#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,chunk.digest=%digest),ret(Display))]
92fn derive_chunk_path(base_path: &Path, digest: &B3Digest) -> Path {
93    base_path
94        .clone()
95        .join("chunks")
96        .join("b3")
97        .join(HEXLOWER.encode(&digest[..2]))
98        .join(HEXLOWER.encode(&digest[..]))
99}
100
101#[async_trait]
102impl BlobService for ObjectStoreBlobService {
103    #[instrument(skip_all, ret(level = Level::TRACE), err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
104    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
105        // TODO: clarify if this should work for chunks or not, and explicitly
106        // document in the proto docs.
107        let p = derive_blob_path(&self.base_path, digest);
108
109        match self.object_store.head(&p).await {
110            Ok(_) => Ok(true),
111            Err(object_store::Error::NotFound { .. }) => {
112                let p = derive_chunk_path(&self.base_path, digest);
113                match self.object_store.head(&p).await {
114                    Ok(_) => Ok(true),
115                    Err(object_store::Error::NotFound { .. }) => Ok(false),
116                    Err(e) => Err(e)?,
117                }
118            }
119            Err(e) => Err(e)?,
120        }
121    }
122
123    #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
124    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
125        // handle reading the empty blob.
126        if digest.as_slice() == blake3::hash(b"").as_bytes() {
127            return Ok(Some(Box::new(Cursor::new(b"")) as Box<dyn BlobReader>));
128        }
129        match self
130            .object_store
131            .get(&derive_chunk_path(&self.base_path, digest))
132            .await
133        {
134            Ok(res) => {
135                // handle reading blobs that are small enough to fit inside a single chunk:
136                // fetch the entire chunk into memory, decompress, ensure the b3 digest matches,
137                // and return a io::Cursor over that data.
138                // FUTUREWORK: use zstd::bulk to prevent decompression bombs
139
140                let chunk_raw_bytes = res.bytes().await?;
141                let chunk_contents = zstd::stream::decode_all(Cursor::new(chunk_raw_bytes))?;
142
143                if *digest != blake3::hash(&chunk_contents).as_bytes().into() {
144                    Err(io::Error::other("chunk contents invalid"))?;
145                }
146
147                Ok(Some(Box::new(Cursor::new(chunk_contents))))
148            }
149            Err(object_store::Error::NotFound { .. }) => {
150                // NOTE: For public-facing things, we would want to stop here.
151                // Clients should fetch granularly, so they can make use of
152                // chunks they have locally.
153                // However, if this is used directly, without any caches, do the
154                // assembly here.
155                // This is subject to change, once we have store composition.
156                // TODO: make this configurable, and/or clarify behaviour for
157                // the gRPC server surface (explicitly document behaviour in the
158                // proto docs)
159                if let Some(chunks) = self.chunks(digest).await? {
160                    let chunked_reader = ChunkedReader::from_chunks(
161                        chunks.into_iter().map(|chunk| {
162                            (
163                                chunk.digest.try_into().expect("invalid b3 digest"),
164                                chunk.size,
165                            )
166                        }),
167                        Arc::new(self.clone()) as Arc<dyn BlobService>,
168                    );
169
170                    Ok(Some(Box::new(chunked_reader)))
171                } else {
172                    // This is neither a chunk nor a blob, return None.
173                    Ok(None)
174                }
175            }
176            Err(e) => Err(e.into()),
177        }
178    }
179
180    #[instrument(skip_all, fields(instance_name=%self.instance_name))]
181    async fn open_write(&self) -> Box<dyn BlobWriter> {
182        // ObjectStoreBlobWriter implements AsyncWrite, but all the chunking
183        // needs an AsyncRead, so we create a pipe here.
184        // In its `AsyncWrite` implementation, `ObjectStoreBlobWriter` delegates
185        // writes to w. It periodically polls the future that's reading from the
186        // other side.
187        let (w, r) = tokio::io::duplex(self.avg_chunk_size as usize * 10);
188
189        Box::new(ObjectStoreBlobWriter {
190            writer: Some(w),
191            fut: Some(Box::pin(chunk_and_upload(
192                r,
193                self.object_store.clone(),
194                self.base_path.clone(),
195                self.avg_chunk_size / 2,
196                self.avg_chunk_size,
197                self.avg_chunk_size * 2,
198            ))),
199            fut_output: None,
200        })
201    }
202
203    #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
204    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
205        match self
206            .object_store
207            .get(&derive_blob_path(&self.base_path, digest))
208            .await
209        {
210            Ok(get_result) => {
211                // fetch the data at the blob path
212                let blob_data = get_result.bytes().await?;
213                // parse into StatBlobResponse
214                let stat_blob_response: StatBlobResponse = StatBlobResponse::decode(blob_data)?;
215
216                debug!(
217                    chunk.count = stat_blob_response.chunks.len(),
218                    blob.size = stat_blob_response
219                        .chunks
220                        .iter()
221                        .map(|x| x.size)
222                        .sum::<u64>(),
223                    "found more granular chunks"
224                );
225
226                Ok(Some(stat_blob_response.chunks))
227            }
228            Err(object_store::Error::NotFound { .. }) => {
229                // If there's only a chunk, we must return the empty vec here, rather than None.
230                match self
231                    .object_store
232                    .head(&derive_chunk_path(&self.base_path, digest))
233                    .await
234                {
235                    Ok(_) => {
236                        // present, but no more chunks available
237                        debug!("found a single chunk");
238                        Ok(Some(vec![]))
239                    }
240                    Err(object_store::Error::NotFound { .. }) => {
241                        // Neither blob nor single chunk found
242                        debug!("not found");
243                        Ok(None)
244                    }
245                    // error checking for chunk
246                    Err(e) => Err(e.into()),
247                }
248            }
249            // error checking for blob
250            Err(err) => Err(err.into()),
251        }
252    }
253}
254
/// Average chunk size, in bytes, used when the configuration does not set
/// one: 256 KiB.
fn default_avg_chunk_size() -> u32 {
    const KIB: u32 = 1024;
    256 * KIB
}
258
259#[derive(serde::Deserialize)]
260#[serde(deny_unknown_fields)]
261pub struct ObjectStoreBlobServiceConfig {
262    object_store_url: String,
263    #[serde(default = "default_avg_chunk_size")]
264    avg_chunk_size: u32,
265    object_store_options: HashMap<String, String>,
266}
267
268impl TryFrom<url::Url> for ObjectStoreBlobServiceConfig {
269    type Error = Box<dyn std::error::Error + Send + Sync>;
270    /// Constructs a new [ObjectStoreBlobService] from a [Url] supported by
271    /// [object_store].
272    /// Any path suffix becomes the base path of the object store.
273    /// additional options, the same as in [object_store::parse_url_opts] can
274    /// be passed.
275    fn try_from(url: url::Url) -> Result<Self, Self::Error> {
276        // We need to convert the URL to string, strip the prefix there, and then
277        // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do.
278        let trimmed_url = {
279            let s = url.to_string();
280            let mut url = Url::parse(
281                s.strip_prefix("objectstore+")
282                    .ok_or("Missing objectstore uri")?,
283            )?;
284            // trim the query pairs, they might contain credentials or local settings we don't want to send as-is.
285            url.set_query(None);
286            url
287        };
288        Ok(ObjectStoreBlobServiceConfig {
289            object_store_url: trimmed_url.into(),
290            object_store_options: url
291                .query_pairs()
292                .into_iter()
293                .map(|(k, v)| (k.to_string(), v.to_string()))
294                .collect(),
295            avg_chunk_size: 256 * 1024,
296        })
297    }
298}
299
300#[async_trait]
301impl ServiceBuilder for ObjectStoreBlobServiceConfig {
302    type Output = dyn BlobService;
303    async fn build<'a>(
304        &'a self,
305        instance_name: &str,
306        _context: &CompositionContext,
307    ) -> Result<Arc<Self::Output>, Box<dyn std::error::Error + Send + Sync>> {
308        let opts = {
309            let mut opts: HashMap<&str, _> = self
310                .object_store_options
311                .iter()
312                .map(|(k, v)| (k.as_str(), v.as_str()))
313                .collect();
314
315            if let hash_map::Entry::Vacant(e) =
316                opts.entry(object_store::ClientConfigKey::UserAgent.as_ref())
317            {
318                e.insert(crate::USER_AGENT);
319            }
320
321            opts
322        };
323
324        // object_store doesn't sufficiently support the AWS credential chain.
325        let object_store_url: url::Url = self.object_store_url.parse()?;
326        let (object_store_scheme, path) =
327            object_store::ObjectStoreScheme::parse(&object_store_url)?;
328
329        let (object_store, path) = match object_store_scheme {
330            #[cfg(feature = "cloud")]
331            ObjectStoreScheme::AmazonS3 => {
332                // In the AWS case, we only support s3:// URLs.
333                if object_store_url.scheme() != "s3" {
334                    return Err(Box::new(std::io::Error::new(
335                        std::io::ErrorKind::InvalidInput,
336                        "only s3://-style URLs supported",
337                    )));
338                }
339
340                let store = aws::setup_aws_object_store(&object_store_url, opts).await?;
341                (Box::new(store) as Box<dyn ObjectStore>, path)
342            }
343            _ => object_store::parse_url_opts(&object_store_url, opts)?,
344        };
345
346        Ok(Arc::new(ObjectStoreBlobService {
347            instance_name: instance_name.to_string(),
348            object_store: Arc::new(object_store),
349            base_path: path,
350            avg_chunk_size: self.avg_chunk_size,
351        }))
352    }
353}
354
355/// Reads blob contents from a AsyncRead, chunks and uploads them.
356/// On success, returns a [StatBlobResponse] pointing to the individual chunks.
357#[instrument(skip_all, fields(base_path=%base_path, min_chunk_size, avg_chunk_size, max_chunk_size), err)]
358async fn chunk_and_upload<R: AsyncRead + Unpin>(
359    r: R,
360    object_store: Arc<dyn ObjectStore>,
361    base_path: Path,
362    min_chunk_size: u32,
363    avg_chunk_size: u32,
364    max_chunk_size: u32,
365) -> io::Result<B3Digest> {
366    // wrap reader with something calculating the blake3 hash of all data read.
367    let mut b3_r = B3HashingReader::from(r);
368    // set up a fastcdc chunker
369    let mut chunker =
370        AsyncStreamCDC::new(&mut b3_r, min_chunk_size, avg_chunk_size, max_chunk_size);
371
372    // Use the fastcdc chunker to produce a stream of chunks, and upload these
373    // that don't exist to the backend.
374    let chunks = chunker
375        .as_stream()
376        .err_into()
377        .map_ok(|chunk_data| {
378            let object_store = object_store.clone();
379            let chunk_digest: B3Digest = blake3::hash(&chunk_data.data).as_bytes().into();
380            let chunk_path = derive_chunk_path(&base_path, &chunk_digest);
381            upload_chunk(object_store, chunk_digest, chunk_path, chunk_data.data)
382        })
383        .try_buffered(CONCURRENT_CHUNK_UPLOADS)
384        .try_collect::<Vec<ChunkMeta>>()
385        .await?;
386
387    let chunks = if chunks.len() < 2 {
388        // The chunker returned only one chunk, which is the entire blob.
389        // According to the protocol, we must return an empty list of chunks
390        // when the blob is not split up further.
391        vec![]
392    } else {
393        chunks
394    };
395
396    let stat_blob_response = StatBlobResponse {
397        chunks,
398        bao: "".into(), // still todo
399    };
400
401    // check for Blob, if it doesn't exist, persist.
402    let blob_digest: B3Digest = b3_r.digest().into();
403    let blob_path = derive_blob_path(&base_path, &blob_digest);
404
405    match object_store.head(&blob_path).await {
406        // blob already exists, nothing to do
407        Ok(_) => {
408            trace!(
409                blob.digest = %blob_digest,
410                blob.path = %blob_path,
411                "blob already exists on backend"
412            );
413        }
414        // chunk does not yet exist, upload first
415        Err(object_store::Error::NotFound { .. }) => {
416            debug!(
417                blob.digest = %blob_digest,
418                blob.path = %blob_path,
419                "uploading blob"
420            );
421            object_store
422                .put(&blob_path, stat_blob_response.encode_to_vec().into())
423                .await?;
424        }
425        Err(err) => {
426            // other error
427            Err(err)?
428        }
429    }
430
431    Ok(blob_digest)
432}
433
434/// upload chunk if it doesn't exist yet.
435#[instrument(skip_all, fields(chunk.digest = %chunk_digest, chunk.size = chunk_data.len(), chunk.path = %chunk_path), err)]
436async fn upload_chunk(
437    object_store: Arc<dyn ObjectStore>,
438    chunk_digest: B3Digest,
439    chunk_path: Path,
440    chunk_data: Vec<u8>,
441) -> std::io::Result<ChunkMeta> {
442    let chunk_size = chunk_data.len();
443    match object_store.head(&chunk_path).await {
444        // chunk already exists, nothing to do
445        Ok(_) => {
446            debug!("chunk already exists");
447        }
448
449        // chunk does not yet exist, compress and upload.
450        Err(object_store::Error::NotFound { .. }) => {
451            let chunk_data_compressed =
452                zstd::encode_all(Cursor::new(chunk_data), zstd::DEFAULT_COMPRESSION_LEVEL)?;
453
454            debug!(chunk.compressed_size=%chunk_data_compressed.len(), "uploading chunk");
455
456            object_store
457                .as_ref()
458                .put(&chunk_path, chunk_data_compressed.into())
459                .await?;
460        }
461        // other error
462        Err(err) => Err(err)?,
463    }
464
465    Ok(ChunkMeta {
466        digest: chunk_digest.into(),
467        size: chunk_size as u64,
468    })
469}
470
pin_project! {
    /// Takes care of blob uploads.
    /// All writes are relayed to `self.writer`, and we continuously poll the
    /// future (which will internally read from the other side of the pipe and
    /// upload chunks).
    /// Our `BlobWriter::close()` takes `self.writer` and shuts it down, so the
    /// other side will read EOF and can finalize the blob.
    /// The future should then resolve and return the blob digest.
    pub struct ObjectStoreBlobWriter<W, Fut>
    where
        W: AsyncWrite,
        Fut: Future,
    {
        // Write half of the pipe; taken out (set to None) by close().
        #[pin]
        writer: Option<W>,

        // The upload future reading from the other end of the pipe; taken
        // out by close() once the writer has been shut down.
        #[pin]
        fut: Option<Fut>,

        // Result of the upload, kept so repeated close() calls can return it.
        fut_output: Option<io::Result<B3Digest>>
    }
}
493
494impl<W, Fut> tokio::io::AsyncWrite for ObjectStoreBlobWriter<W, Fut>
495where
496    W: AsyncWrite + Send + Unpin,
497    Fut: Future,
498{
499    fn poll_write(
500        self: std::pin::Pin<&mut Self>,
501        cx: &mut std::task::Context<'_>,
502        buf: &[u8],
503    ) -> std::task::Poll<Result<usize, io::Error>> {
504        let this = self.project();
505        // poll the future.
506        let fut = this.fut.as_pin_mut().expect("not future");
507        let fut_p = fut.poll(cx);
508        // if it's ready, the only way this could have happened is that the
509        // upload failed, because we're only closing `self.writer` after all
510        // writes happened.
511        if fut_p.is_ready() {
512            return Poll::Ready(Err(io::Error::other("upload failed")));
513        }
514
515        // write to the underlying writer
516        this.writer
517            .as_pin_mut()
518            .expect("writer must be some")
519            .poll_write(cx, buf)
520    }
521
522    fn poll_flush(
523        self: std::pin::Pin<&mut Self>,
524        cx: &mut std::task::Context<'_>,
525    ) -> std::task::Poll<Result<(), io::Error>> {
526        let this = self.project();
527        // poll the future.
528        let fut = this.fut.as_pin_mut().expect("not future");
529        let fut_p = fut.poll(cx);
530        // if it's ready, the only way this could have happened is that the
531        // upload failed, because we're only closing `self.writer` after all
532        // writes happened.
533        if fut_p.is_ready() {
534            return Poll::Ready(Err(io::Error::other("upload failed")));
535        }
536
537        // Call poll_flush on the writer
538        this.writer
539            .as_pin_mut()
540            .expect("writer must be some")
541            .poll_flush(cx)
542    }
543
544    fn poll_shutdown(
545        self: std::pin::Pin<&mut Self>,
546        _cx: &mut std::task::Context<'_>,
547    ) -> std::task::Poll<Result<(), io::Error>> {
548        // There's nothing to do on shutdown. We might have written some chunks
549        // that are nowhere else referenced, but cleaning them up here would be racy.
550        std::task::Poll::Ready(Ok(()))
551    }
552}
553
554#[async_trait]
555impl<W, Fut> BlobWriter for ObjectStoreBlobWriter<W, Fut>
556where
557    W: AsyncWrite + Send + Unpin,
558    Fut: Future<Output = io::Result<B3Digest>> + Send + Unpin,
559{
560    async fn close(&mut self) -> io::Result<B3Digest> {
561        match self.writer.take() {
562            Some(mut writer) => {
563                // shut down the writer, so the other side will read EOF.
564                writer.shutdown().await?;
565
566                // take out the future.
567                let fut = self.fut.take().expect("fut must be some");
568                // await it.
569                let resp = pin!(fut).await;
570
571                match resp.as_ref() {
572                    // In the case of an Ok value, we store it in self.fut_output,
573                    // so future calls to close can return that.
574                    Ok(b3_digest) => {
575                        self.fut_output = Some(Ok(*b3_digest));
576                    }
577                    Err(e) => {
578                        // for the error type, we need to cheat a bit, as
579                        // they're not clone-able.
580                        // Simply store a sloppy clone, with the same ErrorKind and message there.
581                        self.fut_output = Some(Err(std::io::Error::new(e.kind(), e.to_string())))
582                    }
583                }
584                resp
585            }
586            None => {
587                // called a second time, return self.fut_output.
588                match self.fut_output.as_ref().unwrap() {
589                    Ok(b3_digest) => Ok(*b3_digest),
590                    Err(e) => Err(std::io::Error::new(e.kind(), e.to_string())),
591                }
592            }
593        }
594    }
595}
596
#[cfg(test)]
mod test {
    use super::{chunk_and_upload, default_avg_chunk_size};
    use crate::{
        blobservice::{BlobService, ObjectStoreBlobService},
        fixtures::{BLOB_A, BLOB_A_DIGEST, BLOB_B, BLOB_B_DIGEST},
    };
    use std::{io::Cursor, sync::Arc};
    use url::Url;

    /// Exercises chunk_and_upload on its own, without going through the
    /// BlobWriter returned by open_write(), and then inspects the result via
    /// a blob service sharing the same in-memory object store.
    #[rstest::rstest]
    #[case::a(&BLOB_A, &BLOB_A_DIGEST)]
    #[case::b(&BLOB_B, &BLOB_B_DIGEST)]
    #[tokio::test]
    async fn test_chunk_and_upload(
        #[case] blob: &bytes::Bytes,
        #[case] blob_digest: &crate::B3Digest,
    ) {
        // Chunk size bounds used for this test, in bytes.
        const MIN_CHUNK_SIZE: u32 = 1024 / 2;
        const AVG_CHUNK_SIZE: u32 = 1024;
        const MAX_CHUNK_SIZE: u32 = 1024 * 2;

        let memory_url = Url::parse("memory:///").unwrap();
        let (object_store, base_path) = object_store::parse_url(&memory_url).unwrap();
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::from(object_store);
        let blobsvc = Arc::new(ObjectStoreBlobService {
            instance_name: "test".into(),
            object_store: object_store.clone(),
            avg_chunk_size: default_avg_chunk_size(),
            base_path,
        });

        let mut blob_reader = Cursor::new(blob.to_vec());
        let inserted_blob_digest = chunk_and_upload(
            &mut blob_reader,
            object_store,
            object_store::path::Path::from("/"),
            MIN_CHUNK_SIZE,
            AVG_CHUNK_SIZE,
            MAX_CHUNK_SIZE,
        )
        .await
        .expect("chunk_and_upload succeeds");

        assert_eq!(blob_digest.clone(), inserted_blob_digest);

        // The blob must be known to the service now.
        assert!(blobsvc.has(blob_digest).await.unwrap());

        // Check if it was chunked correctly
        let chunks = blobsvc.chunks(blob_digest).await.unwrap().unwrap();
        if blob.len() < MIN_CHUNK_SIZE as usize {
            // Smaller than the min chunk size: must have been inserted as a whole.
            assert!(chunks.is_empty());
        } else if blob.len() > MAX_CHUNK_SIZE as usize {
            // Larger than the max chunk size: must have been split up into at
            // least two chunks.
            assert!(chunks.len() >= 2);
        }
    }
}
653}