object_store/path/
parts.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use percent_encoding::{percent_encode, AsciiSet, CONTROLS};
19use std::borrow::Cow;
20
21use crate::path::DELIMITER_BYTE;
22use snafu::Snafu;
23
/// Error returned by [`PathPart::parse`]
///
/// Carries both the rejected segment and the specific character sequence
/// within it that made it invalid, so the rendered message can show both.
#[derive(Debug, Snafu)]
#[snafu(display(
    "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"",
    illegal,
    segment
))]
// NOTE(review): both fields are `String`, so this type cannot be `Copy`; this
// allow looks unnecessary — confirm before removing.
#[allow(missing_copy_implementations)]
pub struct InvalidPart {
    /// The full path segment that failed validation
    segment: String,
    /// The offending portion of `segment`: either a single illegal character,
    /// or the whole segment when it is `.` or `..`
    illegal: String,
}
36
/// The PathPart type exists to validate the directory/file names that form part
/// of a path.
///
/// A [`PathPart`] is guaranteed to:
///
/// * Contain no ASCII control characters or `/`
/// * Not be a relative path segment, i.e. `.` or `..`
///
/// These guarantees are upheld in one of two ways: [`PathPart::parse`] rejects
/// input that violates them, whereas the `From` conversions percent-encode the
/// offending input instead of failing.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
pub struct PathPart<'a> {
    /// The validated segment text; may borrow from the caller's input or own
    /// an encoded copy of it
    pub(super) raw: Cow<'a, str>,
}
48
49impl<'a> PathPart<'a> {
50    /// Parse the provided path segment as a [`PathPart`] returning an error if invalid
51    pub fn parse(segment: &'a str) -> Result<Self, InvalidPart> {
52        if segment == "." || segment == ".." {
53            return Err(InvalidPart {
54                segment: segment.to_string(),
55                illegal: segment.to_string(),
56            });
57        }
58
59        for c in segment.chars() {
60            if c.is_ascii_control() || c == '/' {
61                return Err(InvalidPart {
62                    segment: segment.to_string(),
63                    // This is correct as only single byte characters up to this point
64                    illegal: c.to_string(),
65                });
66            }
67        }
68
69        Ok(Self {
70            raw: segment.into(),
71        })
72    }
73}
74
75/// Characters we want to encode.
76const INVALID: &AsciiSet = &CONTROLS
77    // The delimiter we are reserving for internal hierarchy
78    .add(DELIMITER_BYTE)
79    // Characters AWS recommends avoiding for object keys
80    // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html
81    .add(b'\\')
82    .add(b'{')
83    .add(b'^')
84    .add(b'}')
85    .add(b'%')
86    .add(b'`')
87    .add(b']')
88    .add(b'"') // " <-- my editor is confused about double quotes within single quotes
89    .add(b'>')
90    .add(b'[')
91    .add(b'~')
92    .add(b'<')
93    .add(b'#')
94    .add(b'|')
95    // Characters Google Cloud Storage recommends avoiding for object names
96    // https://cloud.google.com/storage/docs/naming-objects
97    .add(b'\r')
98    .add(b'\n')
99    .add(b'*')
100    .add(b'?');
101
102impl<'a> From<&'a [u8]> for PathPart<'a> {
103    fn from(v: &'a [u8]) -> Self {
104        let inner = match v {
105            // We don't want to encode `.` generally, but we do want to disallow parts of paths
106            // to be equal to `.` or `..` to prevent file system traversal shenanigans.
107            b"." => "%2E".into(),
108            b".." => "%2E%2E".into(),
109            other => percent_encode(other, INVALID).into(),
110        };
111        Self { raw: inner }
112    }
113}
114
115impl<'a> From<&'a str> for PathPart<'a> {
116    fn from(v: &'a str) -> Self {
117        Self::from(v.as_bytes())
118    }
119}
120
121impl From<String> for PathPart<'static> {
122    fn from(s: String) -> Self {
123        Self {
124            raw: Cow::Owned(PathPart::from(s.as_str()).raw.into_owned()),
125        }
126    }
127}
128
129impl<'a> AsRef<str> for PathPart<'a> {
130    fn as_ref(&self) -> &str {
131        self.raw.as_ref()
132    }
133}
134
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn path_part_delimiter_gets_encoded() {
        let part: PathPart<'_> = "foo/bar".into();
        assert_eq!(part.raw, "foo%2Fbar");
    }

    #[test]
    fn path_part_given_already_encoded_string() {
        let part: PathPart<'_> = "foo%2Fbar".into();
        assert_eq!(part.raw, "foo%252Fbar");
    }

    #[test]
    fn path_part_cant_be_one_dot() {
        let part: PathPart<'_> = ".".into();
        assert_eq!(part.raw, "%2E");
    }

    #[test]
    fn path_part_cant_be_two_dots() {
        let part: PathPart<'_> = "..".into();
        assert_eq!(part.raw, "%2E%2E");
    }

    #[test]
    fn path_part_from_owned_string() {
        // The `String` conversion must encode exactly like the `&str` one
        let part = PathPart::from(String::from("foo/bar"));
        assert_eq!(part.raw, "foo%2Fbar");

        let part = PathPart::from(String::from(".."));
        assert_eq!(part.raw, "%2E%2E");
    }

    #[test]
    fn path_part_as_ref_exposes_raw() {
        let part: PathPart<'_> = "foo/bar".into();
        let text: &str = part.as_ref();
        assert_eq!(text, "foo%2Fbar");
    }

    #[test]
    fn path_part_parse() {
        PathPart::parse("foo").unwrap();
        PathPart::parse("foo/bar").unwrap_err();

        // Test percent-encoded path
        PathPart::parse("foo%2Fbar").unwrap();
        PathPart::parse("L%3ABC.parquet").unwrap();

        // Test path containing bad escape sequence
        PathPart::parse("%Z").unwrap();
        PathPart::parse("%%").unwrap();
    }

    #[test]
    fn path_part_parse_rejects_relative_segments() {
        PathPart::parse(".").unwrap_err();
        PathPart::parse("..").unwrap_err();

        // Only the exact relative segments are rejected, dots elsewhere are fine
        PathPart::parse("...").unwrap();
        PathPart::parse(".foo").unwrap();
        PathPart::parse("foo.bar").unwrap();
    }

    #[test]
    fn path_part_parse_rejects_control_characters() {
        PathPart::parse("foo\nbar").unwrap_err();
        PathPart::parse("foo\rbar").unwrap_err();
        PathPart::parse("\0").unwrap_err();
        PathPart::parse("foo\u{7f}").unwrap_err();
    }

    #[test]
    fn path_part_parse_reports_offender() {
        // An illegal character is reported on its own
        let err = PathPart::parse("foo/bar").unwrap_err();
        assert_eq!(err.segment, "foo/bar");
        assert_eq!(err.illegal, "/");

        // A relative segment is reported as a whole
        let err = PathPart::parse("..").unwrap_err();
        assert_eq!(err.segment, "..");
        assert_eq!(err.illegal, "..");
    }

    #[test]
    fn path_part_parse_keeps_input_verbatim() {
        // `parse` validates only, it never rewrites the segment
        let part = PathPart::parse("foo%2Fbar").unwrap();
        assert_eq!(part.raw, "foo%2Fbar");
    }
}