Commit

Merge branch 'main' into universalmind303/cyclic-deps
universalmind303 authored Feb 20, 2024
2 parents 633a0fa + c5d6959 commit b5fa59b
Showing 35 changed files with 1,790 additions and 44 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/ci.yaml
@@ -561,6 +561,13 @@ jobs:
./scripts/prepare-testdata.sh
source ./scripts/ci-install-clickhouse.sh
export CLICKHOUSE_CONN_STRING=$(./scripts/create-test-clickhouse-db.sh)
- name: SQLite
path: "sqllogictests_sqlite/*"
prepare: |
./scripts/prepare-testdata.sh
# If there's an old data-set in the cache, remove it so we can create a new one.
test -f testdata/sqllogictests_sqlite/data/db.sqlite3 && rm testdata/sqllogictests_sqlite/data/db.sqlite3
- name: Cassandra
path: "sqllogictests_cassandra/*"
prepare: |
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,6 +8,8 @@ glaredb_image
pgsrv_image
# Downloaded artifacts from GCS
gcs-artifacts/
# Prepared sqlite database
testdata/sqllogictests_sqlite/data/db.sqlite3
# Outputs from cargo flamegraph
flamegraph.svg
# Benchmark artifacts
@@ -38,4 +40,4 @@ bench_data/
# dbt compiled models
tests/fixtures/dbt_project/target/
tests/fixtures/dbt_project/.user.yml
tests/fixtures/dbt_project/logs/
tests/fixtures/dbt_project/logs/
82 changes: 69 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion Cargo.toml
@@ -43,7 +43,6 @@ tracing = "0.1"
url = "2.5.0"

[workspace.dependencies.deltalake]

git = "https://github.com/delta-io/delta-rs.git"
rev = "993e2c202936719855f8831513bcbab1b9930b94"
features = ["s3", "gcs", "azure", "datafusion"]
1 change: 1 addition & 0 deletions crates/datasources/Cargo.toml
@@ -74,6 +74,7 @@ bson = "2.9.0"
scylla = { version = "0.12.0" }
glob = "0.3.1"
indexmap = "2.2.3"
async-sqlite = "0.2.2"

# SSH tunnels
[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies]
1 change: 1 addition & 0 deletions crates/datasources/src/common/util.rs
@@ -34,6 +34,7 @@ pub enum Datasource {
Snowflake,
Clickhouse,
SqlServer,
Sqlite,
}

/// Returns true if the literal expression encoding should be wrapped inside
34 changes: 15 additions & 19 deletions crates/datasources/src/excel/mod.rs
@@ -35,30 +35,25 @@ fn infer_value_type(v: &calamine::Data) -> Result<DataType, Error> {

fn infer_schema(
r: &Range<calamine::Data>,
has_header: Option<bool>,
has_header: bool,
infer_schema_length: usize,
) -> Result<(Schema, bool), Error> {
) -> Result<Schema, Error> {
let mut col_types: HashMap<&str, HashSet<DataType>> = HashMap::new();
let mut rows = r.rows();
let mut skip_first = false;
let col_names: Vec<String> = rows
.next()
.unwrap()
.iter()
.enumerate()
.map(|(i, c)| {
let s = c.get_string().map(|s| s.to_string());
match (has_header, s) {
(Some(true), Some(s)) => {
skip_first = true;
Ok(s)
}
(Some(true), None) => Err(Error::Load {
.map(
|(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) {
(true, Some(s)) => Ok(s),
(true, None) => Err(Error::Load {
msg: "failed to parse header".to_string(),
}),
_ => Ok(format!("col{}", i)),
}
})
(false, _) => Ok(format!("col{}", i)),
},
)
.collect::<Result<_, _>>()?;

for row in rows.take(infer_schema_length) {
@@ -87,23 +82,24 @@ fn infer_schema(
Field::new(col_name.replace(' ', "_"), dt, true)
})
.collect();
Ok((Schema::new(fields), skip_first))

Ok(Schema::new(fields))
}

// TODO: vectorize this to improve performance
// Ideally we can iterate over the columns instead of iterating over the rows
fn xlsx_sheet_value_to_record_batch(
r: Range<calamine::Data>,
has_header: Option<bool>,
has_header: bool,
infer_schema_length: usize,
) -> Result<RecordBatch, Error> {
let (schema, should_skip) = infer_schema(&r, has_header, infer_schema_length)?;
let schema = infer_schema(&r, has_header, infer_schema_length)?;
let arrays = schema
.fields()
.iter()
.enumerate()
.map(|(i, field)| {
let rows = if should_skip {
let rows = if has_header {
r.rows().skip(1)
} else {
// Rows doesn't behave like a normal iterator here, so we need to skip `0` rows
@@ -150,7 +146,7 @@ fn xlsx_sheet_value_to_record_batch(
pub async fn read_excel_impl(
path: &PathBuf,
sheet_name: Option<&str>,
has_header: Option<bool>,
has_header: bool,
infer_schema_length: usize,
) -> Result<datafusion::datasource::MemTable, Error> {
let mut workbook: Xlsx<_> = open_workbook(path)?;
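With this change the Excel reader takes `has_header` as a plain bool and `infer_schema` returns just the `Schema`, deriving header skipping from the flag instead of threading a separate `skip_first` value back to the caller. A minimal sketch of a call site under the new signature (the module path, file path, and sheet name are assumptions for illustration, not taken from this commit):

use std::path::PathBuf;

use datasources::excel::read_excel_impl;

// Hypothetical call site: `has_header` is now a plain bool rather than
// Option<bool>, and the reader handles skipping the header row itself.
async fn load_sheet() -> datafusion::datasource::MemTable {
    let path = PathBuf::from("testdata/example.xlsx");
    // Infer the schema from up to 100 rows, treating the first row as a header.
    read_excel_impl(&path, Some("Sheet1"), true, 100)
        .await
        .expect("read excel sheet")
}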
1 change: 1 addition & 0 deletions crates/datasources/src/lib.rs
@@ -15,4 +15,5 @@ pub mod native;
pub mod object_store;
pub mod postgres;
pub mod snowflake;
pub mod sqlite;
pub mod sqlserver;
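The new `sqlite` module presumably builds on the `async-sqlite` dependency added to `crates/datasources/Cargo.toml` above. For orientation, this is roughly what opening and querying the test database under `testdata/sqllogictests_sqlite/data/` looks like with that crate: a sketch assuming async-sqlite's `ClientBuilder`/`conn` API, with a placeholder table name, and not taken from the new module itself.

use async_sqlite::ClientBuilder;

// Sketch only: open the SQLite file referenced by the CI and .gitignore
// changes above, then run a query; async-sqlite executes the closure against
// a rusqlite connection on a background thread.
async fn count_rows() -> Result<i64, async_sqlite::Error> {
    let client = ClientBuilder::new()
        .path("testdata/sqllogictests_sqlite/data/db.sqlite3")
        .open()
        .await?;

    client
        .conn(|conn| {
            // The closure receives a `rusqlite::Connection`; `some_table` is
            // a placeholder name.
            conn.query_row("SELECT count(*) FROM some_table", [], |row| row.get(0))
        })
        .await
}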
3 changes: 2 additions & 1 deletion crates/datasources/src/object_store/mod.rs
@@ -431,7 +431,8 @@ pub fn init_session_registry<'a>(
| TableOptions::SqlServer(_)
| TableOptions::Clickhouse(_)
| TableOptions::Cassandra(_)
| TableOptions::Excel(_) => continue,
| TableOptions::Excel(_)
| TableOptions::Sqlite(_) => continue,
};

let base_url = access.base_url()?;