object_store/
parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#[cfg(not(target_arch = "wasm32"))]
19use crate::local::LocalFileSystem;
20use crate::memory::InMemory;
21use crate::path::Path;
22use crate::ObjectStore;
23use snafu::Snafu;
24use url::Url;
25
/// Errors that can occur while interpreting a [`Url`] as an object store location
#[derive(Debug, Snafu)]
pub enum Error {
    // The scheme / host combination of `url` did not match any supported store.
    // (Plain comment on purpose: snafu may use variant doc comments as Display text.)
    #[snafu(display("Unable to recognise URL \"{}\"", url))]
    Unrecognised { url: Url },

    // The path portion of the URL could not be turned into a `Path`.
    // `context(false)` is what allows `?` to convert a `crate::path::Error`
    // into this variant without an explicit context selector.
    #[snafu(context(false))]
    Path { source: crate::path::Error },
}
34
35impl From<Error> for super::Error {
36    fn from(e: Error) -> Self {
37        Self::Generic {
38            store: "URL",
39            source: Box::new(e),
40        }
41    }
42}
43
/// Recognizes various URL formats, identifying the relevant [`ObjectStore`]
///
/// See [`ObjectStoreScheme::parse`] for more details
///
/// # Supported formats:
/// - `file:///path/to/my/file` -> [`LocalFileSystem`]
/// - `memory:///` -> [`InMemory`]
/// - `s3://bucket/path` -> [`AmazonS3`](crate::aws::AmazonS3) (also supports `s3a`)
/// - `gs://bucket/path` -> [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage)
/// - `az://account/container/path` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure) (also supports `adl`, `azure`, `abfs`, `abfss`)
/// - `http://mydomain/path` -> [`HttpStore`](crate::http::HttpStore)
/// - `https://mydomain/path` -> [`HttpStore`](crate::http::HttpStore)
///
/// There are also special cases for AWS and Azure for `https://{host?}/path` paths,
/// selected by the suffix of the host name:
/// - `dfs.core.windows.net`, `blob.core.windows.net`, `dfs.fabric.microsoft.com`, `blob.fabric.microsoft.com` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure)
/// - `amazonaws.com` -> [`AmazonS3`](crate::aws::AmazonS3)
/// - `r2.cloudflarestorage.com` -> [`AmazonS3`](crate::aws::AmazonS3)
///
/// Any other `https` host is treated as an [`HttpStore`](crate::http::HttpStore).
#[non_exhaustive] // permit new variants
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum ObjectStoreScheme {
    /// Url corresponding to [`LocalFileSystem`]
    Local,
    /// Url corresponding to [`InMemory`]
    Memory,
    /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3)
    AmazonS3,
    /// Url corresponding to [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage)
    GoogleCloudStorage,
    /// Url corresponding to [`MicrosoftAzure`](crate::azure::MicrosoftAzure)
    MicrosoftAzure,
    /// Url corresponding to [`HttpStore`](crate::http::HttpStore)
    Http,
}
78
79impl ObjectStoreScheme {
80    /// Create an [`ObjectStoreScheme`] from the provided [`Url`]
81    ///
82    /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`]
83    ///
84    /// # Example
85    /// ```
86    /// # use url::Url;
87    /// # use object_store::ObjectStoreScheme;
88    /// let url: Url = "file:///path/to/my/file".parse().unwrap();
89    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
90    /// assert_eq!(scheme, ObjectStoreScheme::Local);
91    /// assert_eq!(path.as_ref(), "path/to/my/file");
92    ///
93    /// let url: Url = "https://blob.core.windows.net/path/to/my/file".parse().unwrap();
94    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
95    /// assert_eq!(scheme, ObjectStoreScheme::MicrosoftAzure);
96    /// assert_eq!(path.as_ref(), "path/to/my/file");
97    ///
98    /// let url: Url = "https://example.com/path/to/my/file".parse().unwrap();
99    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
100    /// assert_eq!(scheme, ObjectStoreScheme::Http);
101    /// assert_eq!(path.as_ref(), "path/to/my/file");
102    /// ```
103    pub fn parse(url: &Url) -> Result<(Self, Path), Error> {
104        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
105
106        let (scheme, path) = match (url.scheme(), url.host_str()) {
107            ("file", None) => (Self::Local, url.path()),
108            ("memory", None) => (Self::Memory, url.path()),
109            ("s3" | "s3a", Some(_)) => (Self::AmazonS3, url.path()),
110            ("gs", Some(_)) => (Self::GoogleCloudStorage, url.path()),
111            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => {
112                (Self::MicrosoftAzure, url.path())
113            }
114            ("http", Some(_)) => (Self::Http, url.path()),
115            ("https", Some(host)) => {
116                if host.ends_with("dfs.core.windows.net")
117                    || host.ends_with("blob.core.windows.net")
118                    || host.ends_with("dfs.fabric.microsoft.com")
119                    || host.ends_with("blob.fabric.microsoft.com")
120                {
121                    (Self::MicrosoftAzure, url.path())
122                } else if host.ends_with("amazonaws.com") {
123                    match host.starts_with("s3") {
124                        true => (Self::AmazonS3, strip_bucket().unwrap_or_default()),
125                        false => (Self::AmazonS3, url.path()),
126                    }
127                } else if host.ends_with("r2.cloudflarestorage.com") {
128                    (Self::AmazonS3, strip_bucket().unwrap_or_default())
129                } else {
130                    (Self::Http, url.path())
131                }
132            }
133            _ => return Err(Error::Unrecognised { url: url.clone() }),
134        };
135
136        Ok((scheme, Path::from_url_path(path)?))
137    }
138}
139
// Builds a boxed store from `$builder`: the builder is seeded with `$url`, then
// every `(key, value)` pair of `$options` whose key parses as a configuration
// key for that builder is applied. Keys that fail to parse are skipped.
// Expands to an expression using `?`, so it must be used in a fallible function.
#[cfg(feature = "cloud")]
macro_rules! builder_opts {
    ($builder:ty, $url:expr, $options:expr) => {{
        let mut builder = <$builder>::new().with_url($url.to_string());
        for (key, value) in $options {
            // Unknown keys are deliberately ignored rather than reported
            if let Ok(config_key) = key.as_ref().parse() {
                builder = builder.with_config(config_key, value);
            }
        }
        Box::new(builder.build()?) as _
    }};
}
153
154/// Create an [`ObjectStore`] based on the provided `url`
155///
156/// Returns
157/// - An [`ObjectStore`] of the corresponding type
158/// - The [`Path`] into the [`ObjectStore`] of the addressed resource
159pub fn parse_url(url: &Url) -> Result<(Box<dyn ObjectStore>, Path), super::Error> {
160    parse_url_opts(url, std::iter::empty::<(&str, &str)>())
161}
162
163/// Create an [`ObjectStore`] based on the provided `url` and options
164///
165/// Returns
166/// - An [`ObjectStore`] of the corresponding type
167/// - The [`Path`] into the [`ObjectStore`] of the addressed resource
168pub fn parse_url_opts<I, K, V>(
169    url: &Url,
170    options: I,
171) -> Result<(Box<dyn ObjectStore>, Path), super::Error>
172where
173    I: IntoIterator<Item = (K, V)>,
174    K: AsRef<str>,
175    V: Into<String>,
176{
177    let _options = options;
178    let (scheme, path) = ObjectStoreScheme::parse(url)?;
179    let path = Path::parse(path)?;
180
181    let store = match scheme {
182        #[cfg(not(target_arch = "wasm32"))]
183        ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _,
184        ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _,
185        #[cfg(feature = "aws")]
186        ObjectStoreScheme::AmazonS3 => {
187            builder_opts!(crate::aws::AmazonS3Builder, url, _options)
188        }
189        #[cfg(feature = "gcp")]
190        ObjectStoreScheme::GoogleCloudStorage => {
191            builder_opts!(crate::gcp::GoogleCloudStorageBuilder, url, _options)
192        }
193        #[cfg(feature = "azure")]
194        ObjectStoreScheme::MicrosoftAzure => {
195            builder_opts!(crate::azure::MicrosoftAzureBuilder, url, _options)
196        }
197        #[cfg(feature = "http")]
198        ObjectStoreScheme::Http => {
199            let url = &url[..url::Position::BeforePath];
200            builder_opts!(crate::http::HttpBuilder, url, _options)
201        }
202        #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))]
203        s => {
204            return Err(super::Error::Generic {
205                store: "parse_url",
206                source: format!("feature for {s:?} not enabled").into(),
207            })
208        }
209    };
210
211    Ok((store, path))
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks scheme detection and path extraction for every supported URL
    /// form, and that unsupported scheme / host combinations are rejected
    #[test]
    fn test_parse() {
        // (input URL, (expected scheme, expected path))
        let cases = [
            ("file:/path", (ObjectStoreScheme::Local, "path")),
            ("file:///path", (ObjectStoreScheme::Local, "path")),
            ("memory:/path", (ObjectStoreScheme::Memory, "path")),
            ("memory:///", (ObjectStoreScheme::Memory, "")),
            ("s3://bucket/path", (ObjectStoreScheme::AmazonS3, "path")),
            ("s3a://bucket/path", (ObjectStoreScheme::AmazonS3, "path")),
            (
                "https://s3.region.amazonaws.com/bucket",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://s3.region.amazonaws.com/bucket/path",
                (ObjectStoreScheme::AmazonS3, "path"),
            ),
            (
                "https://bucket.s3.region.amazonaws.com",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path",
                (ObjectStoreScheme::AmazonS3, "path"),
            ),
            (
                "abfs://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfs://file_system@account_name.dfs.core.windows.net/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfss://file_system@account_name.dfs.core.windows.net/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "https://account.dfs.core.windows.net",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.blob.core.windows.net",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "gs://bucket/path",
                (ObjectStoreScheme::GoogleCloudStorage, "path"),
            ),
            (
                "gs://test.example.com/path",
                (ObjectStoreScheme::GoogleCloudStorage, "path"),
            ),
            ("http://mydomain/path", (ObjectStoreScheme::Http, "path")),
            ("https://mydomain/path", (ObjectStoreScheme::Http, "path")),
            (
                "s3://bucket/foo%20bar",
                (ObjectStoreScheme::AmazonS3, "foo bar"),
            ),
            (
                "https://foo/bar%20baz",
                (ObjectStoreScheme::Http, "bar baz"),
            ),
            (
                "file:///bar%252Efoo",
                (ObjectStoreScheme::Local, "bar%2Efoo"),
            ),
            // This row previously duplicated the `abfss` case below; it now
            // covers the `abfs` scheme against the same Fabric host
            (
                "abfs://file_system@account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "abfss://file_system@account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.fabric.microsoft.com/container",
                (ObjectStoreScheme::MicrosoftAzure, "container"),
            ),
            (
                "https://account.blob.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.blob.fabric.microsoft.com/container",
                (ObjectStoreScheme::MicrosoftAzure, "container"),
            ),
        ];

        for (s, (expected_scheme, expected_path)) in cases {
            let url = Url::parse(s).unwrap();
            let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();

            assert_eq!(scheme, expected_scheme, "{s}");
            assert_eq!(path, Path::parse(expected_path).unwrap(), "{s}");
        }

        // Unknown scheme, and `file` / `memory` URLs that carry a host
        let neg_cases = [
            "unix:/run/foo.socket",
            "file://remote/path",
            "memory://remote/",
        ];
        for s in neg_cases {
            let url = Url::parse(s).unwrap();
            assert!(ObjectStoreScheme::parse(&url).is_err());
        }
    }

    /// Spaces are percent-encoded by `Url` and must be decoded again in the
    /// resulting `Path`
    #[test]
    fn test_url_spaces() {
        let url = Url::parse("file:///my file with spaces").unwrap();
        assert_eq!(url.path(), "/my%20file%20with%20spaces");
        let (_, path) = parse_url(&url).unwrap();
        assert_eq!(path.as_ref(), "my file with spaces");
    }

    /// Options passed to `parse_url_opts` must reach the HTTP store: the mock
    /// server asserts on the request path and the configured user agent
    #[tokio::test]
    #[cfg(feature = "http")]
    async fn test_url_http() {
        use crate::client::mock_server::MockServer;
        use hyper::{header::USER_AGENT, Response};

        let server = MockServer::new().await;

        server.push_fn(|r| {
            assert_eq!(r.uri().path(), "/foo/bar");
            assert_eq!(r.headers().get(USER_AGENT).unwrap(), "test_url");
            Response::new(String::new())
        });

        let test = format!("{}/foo/bar", server.url());
        let opts = [("user_agent", "test_url"), ("allow_http", "true")];
        let url = test.parse().unwrap();
        let (store, path) = parse_url_opts(&url, opts).unwrap();
        assert_eq!(path.as_ref(), "foo/bar");
        store.get(&path).await.unwrap();

        server.shutdown().await;
    }
}