Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support az:// scheme #21

Merged
merged 3 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/Linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
aarch64_cross_compile: 1

- name: Setup vcpkg
uses: lukka/run-vcpkg@v11
uses: lukka/run-vcpkg@v11.1
with:
vcpkgGitCommitId: 9edb1b8e590cc086563301d735cae4b6e732d2d2

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/MainDistributionPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ concurrency:
jobs:
duckdb-stable-build:
name: Build extension binaries
uses: duckdb/duckdb/.github/workflows/_extension_distribution.yml@v0.9.1
uses: duckdb/duckdb/.github/workflows/_extension_distribution.yml@60ddc316ca0c1585f14d55aa73f9db59d8fc05d1
with:
duckdb_version: v0.9.1
extension_name: azure
Expand Down
8 changes: 6 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
set(PARAMETERS "-warnings")
build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES})

# Locate the Azure SDK packages. REQUIRED is intentionally omitted so that a
# missing package reaches the check below and produces an actionable hint
# about vcpkg instead of CMake's generic "package not found" error.
find_package(azure-identity-cpp CONFIG)
find_package(azure-storage-blobs-cpp CONFIG)

# Test the variables by name rather than through ${...} expansion: an empty
# expansion would leave if() with a malformed condition.
if(NOT azure-identity-cpp_FOUND OR NOT azure-storage-blobs-cpp_FOUND)
  message(FATAL_ERROR "Azure SDK not found, did you set up vcpkg correctly?")
endif()

# Static lib
target_link_libraries(${EXTENSION_NAME} Azure::azure-identity
Expand Down
19 changes: 10 additions & 9 deletions src/azure_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,8 @@ time_t AzureStorageFileSystem::GetLastModifiedTime(FileHandle &handle) {
return afh.last_modified;
}

// Returns true when fpath starts with one of the URL schemes served by this
// file system: "azure://" or "az://".
bool AzureStorageFileSystem::CanHandleFile(const string &fpath) {
	// rfind(prefix, 0) can only match at position 0, so "== 0" is a starts-with test.
	// Each scheme is checked explicitly instead of multiplying the two rfind results,
	// which only worked because npos * npos happens to wrap to a non-zero value.
	return fpath.rfind("azure://", 0) == 0 || fpath.rfind("az://", 0) == 0;
}

void AzureStorageFileSystem::Seek(FileHandle &handle, idx_t location) {
Expand Down Expand Up @@ -273,7 +272,7 @@ vector<string> AzureStorageFileSystem::Glob(const string &path, FileOpener *open
bool is_match = Match(key_splits.begin(), key_splits.end(), pattern_splits.begin(), pattern_splits.end());

if (is_match) {
auto result_full_url = "azure://" + azure_url.container + "/" + key.Name;
auto result_full_url = azure_url.prefix + azure_url.container + "/" + key.Name;
result.push_back(result_full_url);
}
}
Expand Down Expand Up @@ -376,22 +375,24 @@ void AzureStorageFileSystem::ReadRange(FileHandle &handle, idx_t file_offset, ch
}

// Splits an Azure blob URL into its scheme prefix, container name and blob path.
//
// Accepted schemes are "azure://" and "az://". For example
// "az://container/dir/file.parquet" yields
//   container = "container", prefix = "az://", path = "dir/file.parquet".
//
// Throws IOException when the URL does not start with a supported scheme, when no
// '/' follows the container, or when the container name is empty.
AzureParsedUrl AzureStorageFileSystem::ParseUrl(const string &url) {
	// Identify the scheme. rfind(s, 0) can only match at position 0, so "== 0"
	// is a starts-with test.
	string prefix;
	if (url.rfind("azure://", 0) == 0) {
		prefix = "azure://";
	} else if (url.rfind("az://", 0) == 0) {
		prefix = "az://";
	} else {
		throw IOException("URL needs to start with azure:// or az://");
	}

	// The container starts right after the scheme; its length depends on which
	// scheme matched, so it must not be hard-coded.
	const auto prefix_end_pos = prefix.size();
	const auto slash_pos = url.find('/', prefix_end_pos);
	if (slash_pos == string::npos) {
		throw IOException("URL needs to contain a '/' after the host");
	}

	string container = url.substr(prefix_end_pos, slash_pos - prefix_end_pos);
	if (container.empty()) {
		throw IOException("URL needs to contain a bucket name");
	}

	// Everything after the slash that terminates the container is the blob path.
	string path = url.substr(slash_pos + 1);
	return {container, prefix, path};
}

void AzureExtension::Load(DuckDB &db) {
Expand Down
1 change: 1 addition & 0 deletions src/include/azure_extension.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ struct AzureAuthentication {

// Components of an Azure blob URL as produced by AzureStorageFileSystem::ParseUrl.
// The field order matters: ParseUrl brace-initialises this struct positionally.
struct AzureParsedUrl {
// Text between the scheme prefix and the next '/'.
string container;
// Scheme portion of the URL up to and including "//" ("azure://" or "az://");
// Glob prepends it when rebuilding result URLs.
string prefix;
// Remainder of the URL after the '/' that follows the container.
string path;
};

Expand Down
18 changes: 13 additions & 5 deletions test/sql/azure.test
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ require parquet
require-env AZURE_STORAGE_CONNECTION_STRING

# We need a connection string to do requests
foreach prefix azure:// az://

statement error
SELECT sum(l_orderkey) FROM 'azure://testing-private/l.parquet';
SELECT sum(l_orderkey) FROM '${prefix}testing-private/l.parquet';
----
Invalid Input Error: No valid Azure credentials found

Expand All @@ -21,17 +23,23 @@ SET azure_storage_connection_string = '${AZURE_STORAGE_CONNECTION_STRING}';

# Read a column from a parquet file
query I
SELECT sum(l_orderkey) FROM 'azure://testing-private/l.parquet';
SELECT sum(l_orderkey) FROM '${prefix}testing-private/l.parquet';
----
1802759573

# Read from a csv file with no header
query I
SELECT count(*) FROM 'azure://testing-private/lineitem.csv';
SELECT count(*) FROM '${prefix}testing-private/lineitem.csv';
----
60175

query I
SELECT count(*) FROM 'azure://testing-private/l.csv';
SELECT count(*) FROM '${prefix}testing-private/l.csv';
----
60175
60175

# Unset the connection string var
statement ok
SET azure_storage_connection_string = '';

endloop
16 changes: 16 additions & 0 deletions test/sql/azure_glob.test
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,19 @@ query I
SELECT * from GLOB("azure://testing-public/lineitem.*") order by file;
----
azure://testing-public/lineitem.csv

# Testing private blobs with az:// prefix
query I
SELECT * from GLOB("az://testing-private/*.*") order by file;
----
az://testing-private/l.csv
az://testing-private/l.parquet
az://testing-private/lineitem.csv

# Testing public blobs with az:// prefix
query I
SELECT * from GLOB("az://testing-public/*.*") order by file;
----
az://testing-public/l.csv
az://testing-public/l.parquet
az://testing-public/lineitem.csv
Loading