From 8bb7d4c84c80ca21a724a629f33b743844a05db6 Mon Sep 17 00:00:00 2001 From: Christina Sioula Date: Mon, 6 Nov 2023 15:14:42 +0100 Subject: [PATCH 1/3] Support az:// prefix --- CMakeLists.txt | 8 ++++++-- src/azure_extension.cpp | 19 ++++++++++--------- src/include/azure_extension.hpp | 1 + test/sql/azure.test | 18 +++++++++++++----- test/sql/azure_glob.test | 16 ++++++++++++++++ 5 files changed, 46 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0d8eab..a3dc47f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,12 @@ add_library(${EXTENSION_NAME} STATIC ${EXTENSION_SOURCES}) set(PARAMETERS "-warnings") build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES}) -find_package(azure-identity-cpp CONFIG REQUIRED) -find_package(azure-storage-blobs-cpp CONFIG REQUIRED) +find_package(azure-identity-cpp CONFIG) +find_package(azure-storage-blobs-cpp CONFIG) + +if(NOT ${azure-identity-cpp_FOUND} OR NOT ${azure-storage-blobs-cpp_FOUND}) + message(FATAL_ERROR "Azure SDK not found, did you set up vcpkg correctly?") +endif() # Static lib target_link_libraries(${EXTENSION_NAME} Azure::azure-identity diff --git a/src/azure_extension.cpp b/src/azure_extension.cpp index d9fc34f..4f5fc34 100644 --- a/src/azure_extension.cpp +++ b/src/azure_extension.cpp @@ -159,9 +159,8 @@ time_t AzureStorageFileSystem::GetLastModifiedTime(FileHandle &handle) { return afh.last_modified; } -// TODO: this is currently a bit weird: it should be az:// but that shit dont work bool AzureStorageFileSystem::CanHandleFile(const string &fpath) { - return fpath.rfind("azure://", 0) == 0; + return fpath.rfind("azure://", 0) * fpath.rfind("az://", 0) == 0; } void AzureStorageFileSystem::Seek(FileHandle &handle, idx_t location) { @@ -273,7 +272,7 @@ vector AzureStorageFileSystem::Glob(const string &path, FileOpener *open bool is_match = Match(key_splits.begin(), key_splits.end(), pattern_splits.begin(), pattern_splits.end()); if (is_match) { - auto result_full_url = "azure://" + azure_url.container + "/" + key.Name; + auto result_full_url = azure_url.prefix + azure_url.container + "/" + key.Name; result.push_back(result_full_url); } } @@ -376,22 +375,24 @@ void AzureStorageFileSystem::ReadRange(FileHandle &handle, idx_t file_offset, ch } AzureParsedUrl AzureStorageFileSystem::ParseUrl(const string &url) { - string container, path; + string container, prefix, path; - if (url.rfind("azure://", 0) != 0) { - throw IOException("URL needs to start with s3://"); + if (url.rfind("azure://", 0) * url.rfind("az://", 0) != 0) { + throw IOException("URL needs to start with azure:// or az://"); } - auto slash_pos = url.find('/', 8); + auto prefix_end_pos = url.find("//") + 2; + auto slash_pos = url.find('/', prefix_end_pos); if (slash_pos == string::npos) { throw IOException("URL needs to contain a '/' after the host"); } - container = url.substr(8, slash_pos - 8); + container = url.substr(prefix_end_pos, slash_pos - prefix_end_pos); if (container.empty()) { throw IOException("URL needs to contain a bucket name"); } + prefix = url.substr(0, prefix_end_pos); path = url.substr(slash_pos + 1); - return {container, path}; + return {container, prefix, path}; } void AzureExtension::Load(DuckDB &db) { diff --git a/src/include/azure_extension.hpp b/src/include/azure_extension.hpp index 4d548e9..20e44b4 100644 --- a/src/include/azure_extension.hpp +++ b/src/include/azure_extension.hpp @@ -30,6 +30,7 @@ struct AzureAuthentication { struct AzureParsedUrl { string container; + string prefix; string path; }; diff --git a/test/sql/azure.test b/test/sql/azure.test index 6cc9564..42aaf4e 100644 --- a/test/sql/azure.test +++ b/test/sql/azure.test @@ -10,8 +10,10 @@ require parquet require-env AZURE_STORAGE_CONNECTION_STRING # We need a connection string to do requests +foreach prefix azure:// az:// + statement error -SELECT sum(l_orderkey) FROM 'azure://testing-private/l.parquet'; +SELECT sum(l_orderkey) FROM '${prefix}testing-private/l.parquet'; ---- Invalid Input Error: No valid Azure credentials found @@ -21,17 +23,23 @@ SET azure_storage_connection_string = '${AZURE_STORAGE_CONNECTION_STRING}'; # Read a column from a parquet file query I -SELECT sum(l_orderkey) FROM 'azure://testing-private/l.parquet'; +SELECT sum(l_orderkey) FROM '${prefix}testing-private/l.parquet'; ---- 1802759573 # Read from a csv file with no header query I -SELECT count(*) FROM 'azure://testing-private/lineitem.csv'; +SELECT count(*) FROM '${prefix}testing-private/lineitem.csv'; ---- 60175 query I -SELECT count(*) FROM 'azure://testing-private/l.csv'; +SELECT count(*) FROM '${prefix}testing-private/l.csv'; ---- -60175 \ No newline at end of file +60175 + +# Unset the connection string var +statement ok +SET azure_storage_connection_string = ''; + +endloop diff --git a/test/sql/azure_glob.test b/test/sql/azure_glob.test index ed0ad13..0f98217 100644 --- a/test/sql/azure_glob.test +++ b/test/sql/azure_glob.test @@ -59,3 +59,19 @@ query I SELECT * from GLOB("azure://testing-public/lineitem.*") order by file; ---- azure://testing-public/lineitem.csv + +# Testing private blobs with az:// prefix +query I +SELECT * from GLOB("az://testing-private/*.*") order by file; +---- +az://testing-private/l.csv +az://testing-private/l.parquet +az://testing-private/lineitem.csv + +# Testing public blobs with az:// prefix +query I +SELECT * from GLOB("az://testing-public/*.*") order by file; +---- +az://testing-public/l.csv +az://testing-public/l.parquet +az://testing-public/lineitem.csv \ No newline at end of file From 244bd30c53739eab6a5443e5af9561c65a7ad62d Mon Sep 17 00:00:00 2001 From: Christina Sioula Date: Tue, 7 Nov 2023 10:18:44 +0100 Subject: [PATCH 2/3] Change the tag of duckdb distribution to a commit hash --- .github/workflows/MainDistributionPipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a416ee3..06d2833 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -14,7 +14,7 @@ concurrency: jobs: duckdb-stable-build: name: Build extension binaries - uses: duckdb/duckdb/.github/workflows/_extension_distribution.yml@v0.9.1 + uses: duckdb/duckdb/.github/workflows/_extension_distribution.yml@60ddc316ca0c1585f14d55aa73f9db59d8fc05d1 with: duckdb_version: v0.9.1 extension_name: azure From cfc5bf9e9cd37fb67405dee2c2d2ed8a4f4219b8 Mon Sep 17 00:00:00 2001 From: Christina Sioula Date: Tue, 7 Nov 2023 12:35:21 +0100 Subject: [PATCH 3/3] Pin run-vcpkg action to older version --- .github/workflows/Linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Linux.yml b/.github/workflows/Linux.yml index 028de6c..3e35aad 100644 --- a/.github/workflows/Linux.yml +++ b/.github/workflows/Linux.yml @@ -103,7 +103,7 @@ jobs: aarch64_cross_compile: 1 - name: Setup vcpkg - uses: lukka/run-vcpkg@v11 + uses: lukka/run-vcpkg@v11.1 with: vcpkgGitCommitId: 9edb1b8e590cc086563301d735cae4b6e732d2d2