object_store/path/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Path abstraction for Object Storage
19
20use itertools::Itertools;
21use percent_encoding::percent_decode;
22use snafu::{ensure, ResultExt, Snafu};
23use std::fmt::Formatter;
24#[cfg(not(target_arch = "wasm32"))]
25use url::Url;
26
/// The delimiter to separate object namespaces, creating a directory structure.
///
/// This is the separator used when splitting and joining the segments of a [`Path`].
pub const DELIMITER: &str = "/";

/// The path delimiter as a single byte
///
/// Taken from the first byte of [`DELIMITER`], so the two cannot drift apart.
pub const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0];
32
33mod parts;
34
35pub use parts::{InvalidPart, PathPart};
36
/// Error returned by [`Path::parse`] and the other fallible [`Path`] constructors
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
pub enum Error {
    /// The path contained an empty segment, e.g. `foo//bar`
    #[snafu(display("Path \"{}\" contained empty path segment", path))]
    EmptySegment { path: String },

    /// A segment of the path was rejected by [`PathPart::parse`]
    #[snafu(display("Error parsing Path \"{}\": {}", path, source))]
    BadSegment { path: String, source: InvalidPart },

    /// The filesystem path could not be canonicalized by [`std::fs::canonicalize`]
    #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))]
    Canonicalize {
        path: std::path::PathBuf,
        source: std::io::Error,
    },

    /// The filesystem path could not be converted to a file URL
    #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))]
    InvalidPath { path: std::path::PathBuf },

    /// The percent-decoded path was not valid UTF-8
    #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))]
    NonUnicode {
        path: String,
        source: std::str::Utf8Error,
    },

    /// The path did not start with the base prefix it was expected to be relative to
    #[snafu(display("Path {} does not start with prefix {}", path, prefix))]
    PrefixMismatch { path: String, prefix: String },
}
65
/// A parsed path representation that can be safely written to object storage
///
/// A [`Path`] maintains the following invariants:
///
/// * Paths are delimited by `/`
/// * Paths do not contain leading or trailing `/`
/// * Paths do not contain relative path segments, i.e. `.` or `..`
/// * Paths do not contain empty path segments
/// * Paths do not contain any ASCII control characters
///
/// The [`Default`] path is the empty path, which is also what [`Path::parse`]
/// returns for `""` and `"/"`.
///
/// There are no enforced restrictions on path length, however, it should be noted that most
/// object stores do not permit paths longer than 1024 bytes, and many filesystems do not
/// support path segments longer than 255 bytes.
///
/// # Encode
///
/// In theory object stores support any UTF-8 character sequence, however, certain character
/// sequences cause compatibility problems with some applications and protocols. Additionally
/// some filesystems may impose character restrictions, see [`LocalFileSystem`]. As such the
/// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a
/// limited character subset.
///
/// [S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// [GCS]: https://cloud.google.com/storage/docs/naming-objects
/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
///
/// A string containing potentially problematic path segments can therefore be encoded to a [`Path`]
/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any problematic
/// segments according to [RFC 1738].
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
/// assert_eq!(Path::from("/").as_ref(), "");
/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
/// ```
///
/// Note: if provided with an already percent encoded string, this will encode it again
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/foo%2Fbar").as_ref(), "foo/foo%252Fbar");
/// ```
///
/// # Parse
///
/// Alternatively a [`Path`] can be parsed from an existing string, returning an
/// error if it is invalid. Unlike the encoding methods above, this will permit
/// arbitrary unicode, including percent encoded sequences.
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar");
/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed
/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed
/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed
/// ```
///
/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
/// [`LocalFileSystem`]: crate::local::LocalFileSystem
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct Path {
    /// The raw path with no leading or trailing delimiters
    raw: String,
}
133
134impl Path {
135    /// Parse a string as a [`Path`], returning a [`Error`] if invalid,
136    /// as defined on the docstring for [`Path`]
137    ///
138    /// Note: this will strip any leading `/` or trailing `/`
139    pub fn parse(path: impl AsRef<str>) -> Result<Self, Error> {
140        let path = path.as_ref();
141
142        let stripped = path.strip_prefix(DELIMITER).unwrap_or(path);
143        if stripped.is_empty() {
144            return Ok(Default::default());
145        }
146
147        let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped);
148
149        for segment in stripped.split(DELIMITER) {
150            ensure!(!segment.is_empty(), EmptySegmentSnafu { path });
151            PathPart::parse(segment).context(BadSegmentSnafu { path })?;
152        }
153
154        Ok(Self {
155            raw: stripped.to_string(),
156        })
157    }
158
159    #[cfg(not(target_arch = "wasm32"))]
160    /// Convert a filesystem path to a [`Path`] relative to the filesystem root
161    ///
162    /// This will return an error if the path contains illegal character sequences
163    /// as defined on the docstring for [`Path`] or does not exist
164    ///
165    /// Note: this will canonicalize the provided path, resolving any symlinks
166    pub fn from_filesystem_path(path: impl AsRef<std::path::Path>) -> Result<Self, Error> {
167        let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu {
168            path: path.as_ref(),
169        })?;
170
171        Self::from_absolute_path(absolute)
172    }
173
    #[cfg(not(target_arch = "wasm32"))]
    /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root
    ///
    /// This will return an error if the path contains illegal character sequences,
    /// as defined on the docstring for [`Path`], or `path` is not an absolute path
    ///
    /// Unlike [`Path::from_filesystem_path`] this does not canonicalize `path`
    pub fn from_absolute_path(path: impl AsRef<std::path::Path>) -> Result<Self, Error> {
        // No base: the result is relative to the filesystem root
        Self::from_absolute_path_with_base(path, None)
    }
182
183    #[cfg(not(target_arch = "wasm32"))]
184    /// Convert a filesystem path to a [`Path`] relative to the provided base
185    ///
186    /// This will return an error if the path contains illegal character sequences,
187    /// as defined on the docstring for [`Path`], or `base` does not refer to a parent
188    /// path of `path`, or `base` is not an absolute path
189    pub(crate) fn from_absolute_path_with_base(
190        path: impl AsRef<std::path::Path>,
191        base: Option<&Url>,
192    ) -> Result<Self, Error> {
193        let url = absolute_path_to_url(path)?;
194        let path = match base {
195            Some(prefix) => {
196                url.path()
197                    .strip_prefix(prefix.path())
198                    .ok_or_else(|| Error::PrefixMismatch {
199                        path: url.path().to_string(),
200                        prefix: prefix.to_string(),
201                    })?
202            }
203            None => url.path(),
204        };
205
206        // Reverse any percent encoding performed by conversion to URL
207        Self::from_url_path(path)
208    }
209
210    /// Parse a url encoded string as a [`Path`], returning a [`Error`] if invalid
211    ///
212    /// This will return an error if the path contains illegal character sequences
213    /// as defined on the docstring for [`Path`]
214    pub fn from_url_path(path: impl AsRef<str>) -> Result<Self, Error> {
215        let path = path.as_ref();
216        let decoded = percent_decode(path.as_bytes())
217            .decode_utf8()
218            .context(NonUnicodeSnafu { path })?;
219
220        Self::parse(decoded)
221    }
222
223    /// Returns the [`PathPart`] of this [`Path`]
224    pub fn parts(&self) -> impl Iterator<Item = PathPart<'_>> {
225        self.raw
226            .split_terminator(DELIMITER)
227            .map(|s| PathPart { raw: s.into() })
228    }
229
230    /// Returns the last path segment containing the filename stored in this [`Path`]
231    pub fn filename(&self) -> Option<&str> {
232        match self.raw.is_empty() {
233            true => None,
234            false => self.raw.rsplit(DELIMITER).next(),
235        }
236    }
237
238    /// Returns the extension of the file stored in this [`Path`], if any
239    pub fn extension(&self) -> Option<&str> {
240        self.filename()
241            .and_then(|f| f.rsplit_once('.'))
242            .and_then(|(_, extension)| {
243                if extension.is_empty() {
244                    None
245                } else {
246                    Some(extension)
247                }
248            })
249    }
250
251    /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix`
252    ///
253    /// Returns `None` if the prefix does not match
254    pub fn prefix_match(&self, prefix: &Self) -> Option<impl Iterator<Item = PathPart<'_>> + '_> {
255        let mut stripped = self.raw.strip_prefix(&prefix.raw)?;
256        if !stripped.is_empty() && !prefix.raw.is_empty() {
257            stripped = stripped.strip_prefix(DELIMITER)?;
258        }
259        let iter = stripped
260            .split_terminator(DELIMITER)
261            .map(|x| PathPart { raw: x.into() });
262        Some(iter)
263    }
264
    /// Returns true if this [`Path`] starts with `prefix`
    ///
    /// This is `true` exactly when [`Path::prefix_match`] returns `Some`
    pub fn prefix_matches(&self, prefix: &Self) -> bool {
        self.prefix_match(prefix).is_some()
    }
269
270    /// Creates a new child of this [`Path`]
271    pub fn child<'a>(&self, child: impl Into<PathPart<'a>>) -> Self {
272        let raw = match self.raw.is_empty() {
273            true => format!("{}", child.into().raw),
274            false => format!("{}{}{}", self.raw, DELIMITER, child.into().raw),
275        };
276
277        Self { raw }
278    }
279}
280
/// Borrows the raw path string, which has no leading or trailing delimiter
impl AsRef<str> for Path {
    fn as_ref(&self) -> &str {
        &self.raw
    }
}
286
/// Builds a [`Path`] by splitting the string on [`DELIMITER`] and collecting the
/// segments with [`Path::from_iter`], which drops empty segments
impl From<&str> for Path {
    fn from(path: &str) -> Self {
        Self::from_iter(path.split(DELIMITER))
    }
}
292
/// Builds a [`Path`] by splitting the string on [`DELIMITER`] and collecting the
/// segments with [`Path::from_iter`], which drops empty segments
impl From<String> for Path {
    fn from(path: String) -> Self {
        Self::from_iter(path.split(DELIMITER))
    }
}
298
/// Consumes the [`Path`] and returns its raw string without copying
impl From<Path> for String {
    fn from(path: Path) -> Self {
        path.raw
    }
}
304
/// Formats the [`Path`] as its raw string, without a leading or trailing delimiter
impl std::fmt::Display for Path {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Delegate to the formatting of the underlying raw string
        self.raw.fmt(f)
    }
}
310
311impl<'a, I> FromIterator<I> for Path
312where
313    I: Into<PathPart<'a>>,
314{
315    fn from_iter<T: IntoIterator<Item = I>>(iter: T) -> Self {
316        let raw = T::into_iter(iter)
317            .map(|s| s.into())
318            .filter(|s| !s.raw.is_empty())
319            .map(|s| s.raw)
320            .join(DELIMITER);
321
322        Self { raw }
323    }
324}
325
326#[cfg(not(target_arch = "wasm32"))]
327/// Given an absolute filesystem path convert it to a URL representation without canonicalization
328pub(crate) fn absolute_path_to_url(path: impl AsRef<std::path::Path>) -> Result<Url, Error> {
329    Url::from_file_path(&path).map_err(|_| Error::InvalidPath {
330        path: path.as_ref().into(),
331    })
332}
333
334#[cfg(test)]
335mod tests {
336    use super::*;
337
    #[test]
    fn cloud_prefix_with_trailing_delimiter() {
        // Use case: files exist in object storage named `foo/bar.json` and
        // `foo_test.json`. A search for the prefix `foo/` should return
        // `foo/bar.json` but not `foo_test.json`.
        //
        // This test only pins down the representation: a single-segment prefix
        // is stored as the bare segment, without a trailing delimiter.
        let prefix = Path::from_iter(["test"]);
        assert_eq!(prefix.as_ref(), "test");
    }
346
    #[test]
    fn push_encodes() {
        // Each item is one segment: an embedded `/` becomes `%2F`, and the `%` of an
        // already encoded sequence is encoded again as `%25`
        let location = Path::from_iter(["foo/bar", "baz%2Ftest"]);
        assert_eq!(location.as_ref(), "foo%2Fbar/baz%252Ftest");
    }
352
353    #[test]
354    fn test_parse() {
355        assert_eq!(Path::parse("/").unwrap().as_ref(), "");
356        assert_eq!(Path::parse("").unwrap().as_ref(), "");
357
358        let err = Path::parse("//").unwrap_err();
359        assert!(matches!(err, Error::EmptySegment { .. }));
360
361        assert_eq!(Path::parse("/foo/bar/").unwrap().as_ref(), "foo/bar");
362        assert_eq!(Path::parse("foo/bar/").unwrap().as_ref(), "foo/bar");
363        assert_eq!(Path::parse("foo/bar").unwrap().as_ref(), "foo/bar");
364
365        let err = Path::parse("foo///bar").unwrap_err();
366        assert!(matches!(err, Error::EmptySegment { .. }));
367    }
368
    #[test]
    fn convert_raw_before_partial_eq() {
        // A path built from a delimited string must compare equal to the same
        // path built segment by segment

        // dir and file_name
        let cloud = Path::from("test_dir/test_file.json");
        let built = Path::from_iter(["test_dir", "test_file.json"]);

        assert_eq!(built, cloud);

        // dir and file_name w/o dot
        let cloud = Path::from("test_dir/test_file");
        let built = Path::from_iter(["test_dir", "test_file"]);

        assert_eq!(built, cloud);

        // dir, no file (the trailing delimiter is dropped)
        let cloud = Path::from("test_dir/");
        let built = Path::from_iter(["test_dir"]);
        assert_eq!(built, cloud);

        // file_name, no dir
        let cloud = Path::from("test_file.json");
        let built = Path::from_iter(["test_file.json"]);
        assert_eq!(built, cloud);

        // empty (empty segments are skipped, leaving the empty path)
        let cloud = Path::from("");
        let built = Path::from_iter(["", ""]);

        assert_eq!(built, cloud);
    }
399
    #[test]
    fn parts_after_prefix_behavior() {
        // `prefix_match` yields the segments that follow a matching prefix, and
        // `None` when the prefix does not match on whole segments
        let existing_path = Path::from("apple/bear/cow/dog/egg.json");

        // Prefix with one directory
        let prefix = Path::from("apple");
        let expected_parts: Vec<PathPart<'_>> = vec!["bear", "cow", "dog", "egg.json"]
            .into_iter()
            .map(Into::into)
            .collect();
        let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect();
        assert_eq!(parts, expected_parts);

        // Prefix with two directories
        let prefix = Path::from("apple/bear");
        let expected_parts: Vec<PathPart<'_>> = vec!["cow", "dog", "egg.json"]
            .into_iter()
            .map(Into::into)
            .collect();
        let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect();
        assert_eq!(parts, expected_parts);

        // Not a prefix
        let prefix = Path::from("cow");
        assert!(existing_path.prefix_match(&prefix).is_none());

        // Prefix with a partial directory
        let prefix = Path::from("ap");
        assert!(existing_path.prefix_match(&prefix).is_none());

        // Prefix matches but there aren't any parts after it
        let existing = Path::from("apple/bear/cow/dog");

        assert_eq!(existing.prefix_match(&existing).unwrap().count(), 0);
        // The empty path has no parts at all
        assert_eq!(Path::default().parts().count(), 0);
    }
436
    #[test]
    fn prefix_matches() {
        // The haystack has three segments; the first two contain characters that
        // are percent encoded when converted to path parts
        let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]);
        // self starts with self
        assert!(
            haystack.prefix_matches(&haystack),
            "{haystack:?} should have started with {haystack:?}"
        );

        // a longer prefix doesn't match
        let needle = haystack.child("longer now");
        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} shouldn't have started with {needle:?}"
        );

        // one dir prefix matches
        let needle = Path::from_iter(["foo/bar"]);
        assert!(
            haystack.prefix_matches(&needle),
            "{haystack:?} should have started with {needle:?}"
        );

        // two dir prefix matches
        let needle = needle.child("baz%2Ftest");
        assert!(
            haystack.prefix_matches(&needle),
            "{haystack:?} should have started with {needle:?}"
        );

        // partial dir prefix doesn't match
        let needle = Path::from_iter(["f"]);
        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );

        // one dir and one partial dir doesn't match
        let needle = Path::from_iter(["foo/bar", "baz"]);
        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );

        // empty prefix matches
        let needle = Path::from("");
        assert!(
            haystack.prefix_matches(&needle),
            "{haystack:?} should have started with {needle:?}"
        );
    }
488
    #[test]
    fn prefix_matches_with_file_name() {
        // None of the needles below ends on a segment boundary of the haystack,
        // so none of them may be reported as a prefix
        let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]);

        // All directories match and file name is a prefix
        let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]);

        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );

        // All directories match but file name is not a prefix
        let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "e"]);

        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );

        // Not all directories match; file name is a prefix of the next directory; this
        // does not match
        let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "s"]);

        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );

        // Not all directories match; file name is NOT a prefix of the next directory;
        // no match
        let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "p"]);

        assert!(
            !haystack.prefix_matches(&needle),
            "{haystack:?} should not have started with {needle:?}"
        );
    }
527
528    #[test]
529    fn path_containing_spaces() {
530        let a = Path::from_iter(["foo bar", "baz"]);
531        let b = Path::from("foo bar/baz");
532        let c = Path::parse("foo bar/baz").unwrap();
533
534        assert_eq!(a.raw, "foo bar/baz");
535        assert_eq!(a.raw, b.raw);
536        assert_eq!(b.raw, c.raw);
537    }
538
539    #[test]
540    fn from_url_path() {
541        let a = Path::from_url_path("foo%20bar").unwrap();
542        let b = Path::from_url_path("foo/%2E%2E/bar").unwrap_err();
543        let c = Path::from_url_path("foo%2F%252E%252E%2Fbar").unwrap();
544        let d = Path::from_url_path("foo/%252E%252E/bar").unwrap();
545        let e = Path::from_url_path("%48%45%4C%4C%4F").unwrap();
546        let f = Path::from_url_path("foo/%FF/as").unwrap_err();
547
548        assert_eq!(a.raw, "foo bar");
549        assert!(matches!(b, Error::BadSegment { .. }));
550        assert_eq!(c.raw, "foo/%2E%2E/bar");
551        assert_eq!(d.raw, "foo/%2E%2E/bar");
552        assert_eq!(e.raw, "HELLO");
553        assert!(matches!(f, Error::NonUnicode { .. }));
554    }
555
556    #[test]
557    fn filename_from_path() {
558        let a = Path::from("foo/bar");
559        let b = Path::from("foo/bar.baz");
560        let c = Path::from("foo.bar/baz");
561
562        assert_eq!(a.filename(), Some("bar"));
563        assert_eq!(b.filename(), Some("bar.baz"));
564        assert_eq!(c.filename(), Some("baz"));
565    }
566
567    #[test]
568    fn file_extension() {
569        let a = Path::from("foo/bar");
570        let b = Path::from("foo/bar.baz");
571        let c = Path::from("foo.bar/baz");
572        let d = Path::from("foo.bar/baz.qux");
573
574        assert_eq!(a.extension(), None);
575        assert_eq!(b.extension(), Some("baz"));
576        assert_eq!(c.extension(), None);
577        assert_eq!(d.extension(), Some("qux"));
578    }
579}