1use std::{
2 collections::{HashMap, hash_map},
3 io::{self, Cursor},
4 pin::pin,
5 sync::Arc,
6 task::Poll,
7};
8
9use data_encoding::HEXLOWER;
10use fastcdc::v2020::AsyncStreamCDC;
11use futures::{Future, TryStreamExt};
12use object_store::{ObjectStore, ObjectStoreExt, ObjectStoreScheme, path::Path};
13use pin_project_lite::pin_project;
14use prost::Message;
15use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
16use tonic::async_trait;
17use tracing::{Level, debug, instrument, trace};
18use url::Url;
19
20use crate::{
21 B3Digest, B3HashingReader,
22 composition::{CompositionContext, ServiceBuilder},
23 proto::{StatBlobResponse, stat_blob_response::ChunkMeta},
24};
25
26use super::{BlobReader, BlobService, BlobWriter, ChunkedReader};
27
28#[cfg(feature = "cloud")]
29mod aws;
30
/// Maximum number of chunk uploads that `chunk_and_upload` keeps in flight
/// concurrently while a blob is being written.
const CONCURRENT_CHUNK_UPLOADS: usize = 64;
33
34#[derive(Clone)]
71pub struct ObjectStoreBlobService {
72 instance_name: String,
73 object_store: Arc<dyn ObjectStore>,
74 base_path: Path,
75
76 avg_chunk_size: u32,
79}
80
81#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))]
82fn derive_blob_path(base_path: &Path, digest: &B3Digest) -> Path {
83 base_path
84 .clone()
85 .join("blobs")
86 .join("b3")
87 .join(HEXLOWER.encode(&digest[..2]))
88 .join(HEXLOWER.encode(&digest[..]))
89}
90
91#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,chunk.digest=%digest),ret(Display))]
92fn derive_chunk_path(base_path: &Path, digest: &B3Digest) -> Path {
93 base_path
94 .clone()
95 .join("chunks")
96 .join("b3")
97 .join(HEXLOWER.encode(&digest[..2]))
98 .join(HEXLOWER.encode(&digest[..]))
99}
100
101#[async_trait]
102impl BlobService for ObjectStoreBlobService {
103 #[instrument(skip_all, ret(level = Level::TRACE), err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
104 async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
105 let p = derive_blob_path(&self.base_path, digest);
108
109 match self.object_store.head(&p).await {
110 Ok(_) => Ok(true),
111 Err(object_store::Error::NotFound { .. }) => {
112 let p = derive_chunk_path(&self.base_path, digest);
113 match self.object_store.head(&p).await {
114 Ok(_) => Ok(true),
115 Err(object_store::Error::NotFound { .. }) => Ok(false),
116 Err(e) => Err(e)?,
117 }
118 }
119 Err(e) => Err(e)?,
120 }
121 }
122
123 #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
124 async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
125 if digest.as_slice() == blake3::hash(b"").as_bytes() {
127 return Ok(Some(Box::new(Cursor::new(b"")) as Box<dyn BlobReader>));
128 }
129 match self
130 .object_store
131 .get(&derive_chunk_path(&self.base_path, digest))
132 .await
133 {
134 Ok(res) => {
135 let chunk_raw_bytes = res.bytes().await?;
141 let chunk_contents = zstd::stream::decode_all(Cursor::new(chunk_raw_bytes))?;
142
143 if *digest != blake3::hash(&chunk_contents).as_bytes().into() {
144 Err(io::Error::other("chunk contents invalid"))?;
145 }
146
147 Ok(Some(Box::new(Cursor::new(chunk_contents))))
148 }
149 Err(object_store::Error::NotFound { .. }) => {
150 if let Some(chunks) = self.chunks(digest).await? {
160 let chunked_reader = ChunkedReader::from_chunks(
161 chunks.into_iter().map(|chunk| {
162 (
163 chunk.digest.try_into().expect("invalid b3 digest"),
164 chunk.size,
165 )
166 }),
167 Arc::new(self.clone()) as Arc<dyn BlobService>,
168 );
169
170 Ok(Some(Box::new(chunked_reader)))
171 } else {
172 Ok(None)
174 }
175 }
176 Err(e) => Err(e.into()),
177 }
178 }
179
180 #[instrument(skip_all, fields(instance_name=%self.instance_name))]
181 async fn open_write(&self) -> Box<dyn BlobWriter> {
182 let (w, r) = tokio::io::duplex(self.avg_chunk_size as usize * 10);
188
189 Box::new(ObjectStoreBlobWriter {
190 writer: Some(w),
191 fut: Some(Box::pin(chunk_and_upload(
192 r,
193 self.object_store.clone(),
194 self.base_path.clone(),
195 self.avg_chunk_size / 2,
196 self.avg_chunk_size,
197 self.avg_chunk_size * 2,
198 ))),
199 fut_output: None,
200 })
201 }
202
203 #[instrument(skip_all, err, fields(blob.digest=%digest, instance_name=%self.instance_name))]
204 async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
205 match self
206 .object_store
207 .get(&derive_blob_path(&self.base_path, digest))
208 .await
209 {
210 Ok(get_result) => {
211 let blob_data = get_result.bytes().await?;
213 let stat_blob_response: StatBlobResponse = StatBlobResponse::decode(blob_data)?;
215
216 debug!(
217 chunk.count = stat_blob_response.chunks.len(),
218 blob.size = stat_blob_response
219 .chunks
220 .iter()
221 .map(|x| x.size)
222 .sum::<u64>(),
223 "found more granular chunks"
224 );
225
226 Ok(Some(stat_blob_response.chunks))
227 }
228 Err(object_store::Error::NotFound { .. }) => {
229 match self
231 .object_store
232 .head(&derive_chunk_path(&self.base_path, digest))
233 .await
234 {
235 Ok(_) => {
236 debug!("found a single chunk");
238 Ok(Some(vec![]))
239 }
240 Err(object_store::Error::NotFound { .. }) => {
241 debug!("not found");
243 Ok(None)
244 }
245 Err(e) => Err(e.into()),
247 }
248 }
249 Err(err) => Err(err.into()),
251 }
252 }
253}
254
/// Average chunk size used when a configuration does not set
/// `avg_chunk_size`: 256 KiB.
fn default_avg_chunk_size() -> u32 {
    const KIB: u32 = 1024;

    256 * KIB
}
258
259#[derive(serde::Deserialize)]
260#[serde(deny_unknown_fields)]
261pub struct ObjectStoreBlobServiceConfig {
262 object_store_url: String,
263 #[serde(default = "default_avg_chunk_size")]
264 avg_chunk_size: u32,
265 object_store_options: HashMap<String, String>,
266}
267
268impl TryFrom<url::Url> for ObjectStoreBlobServiceConfig {
269 type Error = Box<dyn std::error::Error + Send + Sync>;
270 fn try_from(url: url::Url) -> Result<Self, Self::Error> {
276 let trimmed_url = {
279 let s = url.to_string();
280 let mut url = Url::parse(
281 s.strip_prefix("objectstore+")
282 .ok_or("Missing objectstore uri")?,
283 )?;
284 url.set_query(None);
286 url
287 };
288 Ok(ObjectStoreBlobServiceConfig {
289 object_store_url: trimmed_url.into(),
290 object_store_options: url
291 .query_pairs()
292 .into_iter()
293 .map(|(k, v)| (k.to_string(), v.to_string()))
294 .collect(),
295 avg_chunk_size: 256 * 1024,
296 })
297 }
298}
299
300#[async_trait]
301impl ServiceBuilder for ObjectStoreBlobServiceConfig {
302 type Output = dyn BlobService;
303 async fn build<'a>(
304 &'a self,
305 instance_name: &str,
306 _context: &CompositionContext,
307 ) -> Result<Arc<Self::Output>, Box<dyn std::error::Error + Send + Sync>> {
308 let opts = {
309 let mut opts: HashMap<&str, _> = self
310 .object_store_options
311 .iter()
312 .map(|(k, v)| (k.as_str(), v.as_str()))
313 .collect();
314
315 if let hash_map::Entry::Vacant(e) =
316 opts.entry(object_store::ClientConfigKey::UserAgent.as_ref())
317 {
318 e.insert(crate::USER_AGENT);
319 }
320
321 opts
322 };
323
324 let object_store_url: url::Url = self.object_store_url.parse()?;
326 let (object_store_scheme, path) =
327 object_store::ObjectStoreScheme::parse(&object_store_url)?;
328
329 let (object_store, path) = match object_store_scheme {
330 #[cfg(feature = "cloud")]
331 ObjectStoreScheme::AmazonS3 => {
332 if object_store_url.scheme() != "s3" {
334 return Err(Box::new(std::io::Error::new(
335 std::io::ErrorKind::InvalidInput,
336 "only s3://-style URLs supported",
337 )));
338 }
339
340 let store = aws::setup_aws_object_store(&object_store_url, opts).await?;
341 (Box::new(store) as Box<dyn ObjectStore>, path)
342 }
343 _ => object_store::parse_url_opts(&object_store_url, opts)?,
344 };
345
346 Ok(Arc::new(ObjectStoreBlobService {
347 instance_name: instance_name.to_string(),
348 object_store: Arc::new(object_store),
349 base_path: path,
350 avg_chunk_size: self.avg_chunk_size,
351 }))
352 }
353}
354
355#[instrument(skip_all, fields(base_path=%base_path, min_chunk_size, avg_chunk_size, max_chunk_size), err)]
358async fn chunk_and_upload<R: AsyncRead + Unpin>(
359 r: R,
360 object_store: Arc<dyn ObjectStore>,
361 base_path: Path,
362 min_chunk_size: u32,
363 avg_chunk_size: u32,
364 max_chunk_size: u32,
365) -> io::Result<B3Digest> {
366 let mut b3_r = B3HashingReader::from(r);
368 let mut chunker =
370 AsyncStreamCDC::new(&mut b3_r, min_chunk_size, avg_chunk_size, max_chunk_size);
371
372 let chunks = chunker
375 .as_stream()
376 .err_into()
377 .map_ok(|chunk_data| {
378 let object_store = object_store.clone();
379 let chunk_digest: B3Digest = blake3::hash(&chunk_data.data).as_bytes().into();
380 let chunk_path = derive_chunk_path(&base_path, &chunk_digest);
381 upload_chunk(object_store, chunk_digest, chunk_path, chunk_data.data)
382 })
383 .try_buffered(CONCURRENT_CHUNK_UPLOADS)
384 .try_collect::<Vec<ChunkMeta>>()
385 .await?;
386
387 let chunks = if chunks.len() < 2 {
388 vec![]
392 } else {
393 chunks
394 };
395
396 let stat_blob_response = StatBlobResponse {
397 chunks,
398 bao: "".into(), };
400
401 let blob_digest: B3Digest = b3_r.digest().into();
403 let blob_path = derive_blob_path(&base_path, &blob_digest);
404
405 match object_store.head(&blob_path).await {
406 Ok(_) => {
408 trace!(
409 blob.digest = %blob_digest,
410 blob.path = %blob_path,
411 "blob already exists on backend"
412 );
413 }
414 Err(object_store::Error::NotFound { .. }) => {
416 debug!(
417 blob.digest = %blob_digest,
418 blob.path = %blob_path,
419 "uploading blob"
420 );
421 object_store
422 .put(&blob_path, stat_blob_response.encode_to_vec().into())
423 .await?;
424 }
425 Err(err) => {
426 Err(err)?
428 }
429 }
430
431 Ok(blob_digest)
432}
433
434#[instrument(skip_all, fields(chunk.digest = %chunk_digest, chunk.size = chunk_data.len(), chunk.path = %chunk_path), err)]
436async fn upload_chunk(
437 object_store: Arc<dyn ObjectStore>,
438 chunk_digest: B3Digest,
439 chunk_path: Path,
440 chunk_data: Vec<u8>,
441) -> std::io::Result<ChunkMeta> {
442 let chunk_size = chunk_data.len();
443 match object_store.head(&chunk_path).await {
444 Ok(_) => {
446 debug!("chunk already exists");
447 }
448
449 Err(object_store::Error::NotFound { .. }) => {
451 let chunk_data_compressed =
452 zstd::encode_all(Cursor::new(chunk_data), zstd::DEFAULT_COMPRESSION_LEVEL)?;
453
454 debug!(chunk.compressed_size=%chunk_data_compressed.len(), "uploading chunk");
455
456 object_store
457 .as_ref()
458 .put(&chunk_path, chunk_data_compressed.into())
459 .await?;
460 }
461 Err(err) => Err(err)?,
463 }
464
465 Ok(ChunkMeta {
466 digest: chunk_digest.into(),
467 size: chunk_size as u64,
468 })
469}
470
471pin_project! {
472 pub struct ObjectStoreBlobWriter<W, Fut>
480 where
481 W: AsyncWrite,
482 Fut: Future,
483 {
484 #[pin]
485 writer: Option<W>,
486
487 #[pin]
488 fut: Option<Fut>,
489
490 fut_output: Option<io::Result<B3Digest>>
491 }
492}
493
494impl<W, Fut> tokio::io::AsyncWrite for ObjectStoreBlobWriter<W, Fut>
495where
496 W: AsyncWrite + Send + Unpin,
497 Fut: Future,
498{
499 fn poll_write(
500 self: std::pin::Pin<&mut Self>,
501 cx: &mut std::task::Context<'_>,
502 buf: &[u8],
503 ) -> std::task::Poll<Result<usize, io::Error>> {
504 let this = self.project();
505 let fut = this.fut.as_pin_mut().expect("not future");
507 let fut_p = fut.poll(cx);
508 if fut_p.is_ready() {
512 return Poll::Ready(Err(io::Error::other("upload failed")));
513 }
514
515 this.writer
517 .as_pin_mut()
518 .expect("writer must be some")
519 .poll_write(cx, buf)
520 }
521
522 fn poll_flush(
523 self: std::pin::Pin<&mut Self>,
524 cx: &mut std::task::Context<'_>,
525 ) -> std::task::Poll<Result<(), io::Error>> {
526 let this = self.project();
527 let fut = this.fut.as_pin_mut().expect("not future");
529 let fut_p = fut.poll(cx);
530 if fut_p.is_ready() {
534 return Poll::Ready(Err(io::Error::other("upload failed")));
535 }
536
537 this.writer
539 .as_pin_mut()
540 .expect("writer must be some")
541 .poll_flush(cx)
542 }
543
544 fn poll_shutdown(
545 self: std::pin::Pin<&mut Self>,
546 _cx: &mut std::task::Context<'_>,
547 ) -> std::task::Poll<Result<(), io::Error>> {
548 std::task::Poll::Ready(Ok(()))
551 }
552}
553
554#[async_trait]
555impl<W, Fut> BlobWriter for ObjectStoreBlobWriter<W, Fut>
556where
557 W: AsyncWrite + Send + Unpin,
558 Fut: Future<Output = io::Result<B3Digest>> + Send + Unpin,
559{
560 async fn close(&mut self) -> io::Result<B3Digest> {
561 match self.writer.take() {
562 Some(mut writer) => {
563 writer.shutdown().await?;
565
566 let fut = self.fut.take().expect("fut must be some");
568 let resp = pin!(fut).await;
570
571 match resp.as_ref() {
572 Ok(b3_digest) => {
575 self.fut_output = Some(Ok(*b3_digest));
576 }
577 Err(e) => {
578 self.fut_output = Some(Err(std::io::Error::new(e.kind(), e.to_string())))
582 }
583 }
584 resp
585 }
586 None => {
587 match self.fut_output.as_ref().unwrap() {
589 Ok(b3_digest) => Ok(*b3_digest),
590 Err(e) => Err(std::io::Error::new(e.kind(), e.to_string())),
591 }
592 }
593 }
594 }
595}
596
#[cfg(test)]
mod test {
    use super::{chunk_and_upload, default_avg_chunk_size};
    use crate::{
        blobservice::{BlobService, ObjectStoreBlobService},
        fixtures::{BLOB_A, BLOB_A_DIGEST, BLOB_B, BLOB_B_DIGEST},
    };
    use std::{io::Cursor, sync::Arc};
    use url::Url;

    // Calls `chunk_and_upload` directly with small chunk sizes against an
    // in-memory object store, then inspects the result through a blob service
    // that shares the same store.
    #[rstest::rstest]
    #[case::a(&BLOB_A, &BLOB_A_DIGEST)]
    #[case::b(&BLOB_B, &BLOB_B_DIGEST)]
    #[tokio::test]
    async fn test_chunk_and_upload(
        #[case] blob: &bytes::Bytes,
        #[case] blob_digest: &crate::B3Digest,
    ) {
        const MIN_CHUNK_SIZE: u32 = 1024 / 2;
        const AVG_CHUNK_SIZE: u32 = 1024;
        const MAX_CHUNK_SIZE: u32 = 1024 * 2;

        let memory_url = Url::parse("memory:///").unwrap();
        let (store, base_path) = object_store::parse_url(&memory_url).unwrap();
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::from(store);

        let blobsvc = Arc::new(ObjectStoreBlobService {
            instance_name: "test".into(),
            object_store: object_store.clone(),
            avg_chunk_size: default_avg_chunk_size(),
            base_path,
        });

        let inserted_blob_digest = chunk_and_upload(
            Cursor::new(blob.to_vec()),
            object_store,
            object_store::path::Path::from("/"),
            MIN_CHUNK_SIZE,
            AVG_CHUNK_SIZE,
            MAX_CHUNK_SIZE,
        )
        .await
        .expect("chunk_and_upload succeeds");

        assert_eq!(*blob_digest, inserted_blob_digest);

        // The blob must be visible through the service.
        assert!(blobsvc.has(blob_digest).await.unwrap());

        let chunks = blobsvc.chunks(blob_digest).await.unwrap().unwrap();
        let blob_len = blob.len();
        if blob_len < MIN_CHUNK_SIZE as usize {
            // Smaller than the minimum chunk size: no chunk list entries.
            assert!(chunks.is_empty());
        } else if blob_len > MAX_CHUNK_SIZE as usize {
            // Larger than the maximum chunk size: at least two chunks.
            assert!(chunks.len() >= 2);
        }
    }
}