-
Notifications
You must be signed in to change notification settings - Fork 216
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat!: change the default data storage version to "stable" (e.g. v2.0) (#2829)

Closes #2394

This PR changes a few remaining tests. Also, by changing the default to v2 we exposed a few minor inconsistencies with v1 that we fixed.

* When creating a fragment we reported progress before adding the filename to the fragment. We now add the filename to the fragment before reporting progress.
* Nested projection was broken (existing nested projection tests passed by luck). This required some slight change to how we calculate projection.

BREAKING CHANGE: new datasets will no longer be readable by versions older than 0.16
- Loading branch information
1 parent
f4e3300
commit c0e1f15
Showing
35 changed files
with
801 additions
and
342 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
# This script generates Lance files that are read by test_forward_compat.py | ||
|
||
from pathlib import Path | ||
|
||
import pyarrow as pa | ||
from lance.file import LanceFileWriter | ||
|
||
|
||
def get_path(name: str):
    """Return the location of a forward-compat test data file.

    The result is ``<base>/test_data/forward_compat/<name>``, where
    ``<base>`` is the directory five levels above this source file.

    Args:
        name: File name (e.g. ``"basic_types.lance"``) inside the
            ``forward_compat`` data directory.

    Returns:
        A ``pathlib.Path`` pointing at the requested file.  The path is
        only computed; nothing is created or checked on disk.
    """
    base = Path(__file__)
    # Climb five directory levels, one ``.parent`` hop at a time.
    for _ in range(5):
        base = base.parent
    return base.joinpath("test_data", "forward_compat", name)
|
||
|
||
def build_basic_types():
    """Build the 1000-row table that is stored as ``basic_types.lance``.

    Columns, in order:

    * ``int``         -- 0..999 as int64
    * ``float``       -- 0..999 as float32
    * ``str``         -- the decimal string of each row index
    * ``list_int``    -- row ``i`` holds ``[0, 1, ..., i - 1]``
    * ``list_str``    -- row ``i`` holds the one-element list ``[str(i)]``
    * ``struct``      -- ``{"a": i}``
    * ``dict``        -- ``str(i % 10)`` as an int16-indexed dictionary
    * ``str_as_dict`` -- plain strings: 500 ``"a"`` then 500 ``"b"``

    Returns:
        A ``pyarrow.Table`` with the schema described above.
    """
    num_rows = 1000
    half = 500
    dict_type = pa.dictionary(pa.int16(), pa.string())

    schema = pa.schema(
        [
            ("int", pa.int64()),
            ("float", pa.float32()),
            ("str", pa.string()),
            ("list_int", pa.list_(pa.int64())),
            ("list_str", pa.list_(pa.string())),
            ("struct", pa.struct([("a", pa.int64())])),
            ("dict", dict_type),
            ("str_as_dict", pa.string()),
        ]
    )

    ints = pa.array(range(num_rows))
    floats = pa.array(range(num_rows), pa.float32())
    strs = pa.array([str(i) for i in range(num_rows)])
    int_lists = pa.array([list(range(i)) for i in range(num_rows)])
    str_lists = pa.array([[str(i)] for i in range(num_rows)])
    structs = pa.array([{"a": i} for i in range(num_rows)])
    dicts = pa.array([str(i % 10) for i in range(num_rows)], dict_type)
    low_cardinality = pa.array(["a"] * half + ["b"] * half)

    columns = [
        ints,
        floats,
        strs,
        int_lists,
        str_lists,
        structs,
        dicts,
        low_cardinality,
    ]
    return pa.table(columns, schema=schema)
|
||
|
||
def write_basic_types():
    """Write the ``build_basic_types()`` table to ``basic_types.lance``.

    The file is placed at ``get_path("basic_types.lance")`` and written as
    a single batch through ``LanceFileWriter``.
    """
    destination = str(get_path("basic_types.lance"))
    with LanceFileWriter(destination) as lance_writer:
        table = build_basic_types()
        lance_writer.write_batch(table)
|
||
|
||
def build_large():
    """Build the 10,000-row table that is stored as ``large.lance``.

    Columns, in order:

    * ``int`` -- 0..9999 as int32
    * ``fsl`` -- 1024 float32 values per row (about 40MB in total)
    * ``bin`` -- one 4KiB all-zero binary value per row (about 40MiB)

    The result must be reproducible: the forward-compat test rebuilds this
    table and compares it with the file stored on disk.

    Returns:
        A ``pyarrow.Table`` with the schema described above.
    """
    num_rows = 10000
    vector_dim = 1024
    chunk_size = 4 * 1024
    bin_bytes = num_rows * chunk_size

    # ~40MB of vector embedding data (10K 1024-float32)
    fsl_data = pa.array(range(num_rows * vector_dim), pa.float32())
    fsls = pa.FixedSizeListArray.from_arrays(fsl_data, vector_dim)

    # ~40 MiB of binary data (10k 4KiB chunks)
    # Use an explicitly zero-filled buffer.  pa.allocate_buffer() hands back
    # uninitialised memory, so the column contents would depend on whatever
    # the allocator returned and could differ between runs.
    bindata = pa.py_buffer(bytes(bin_bytes))
    # num_rows + 1 offsets: 0, 4096, ..., bin_bytes.
    offsets = pa.array(
        range(0, bin_bytes + chunk_size, chunk_size), pa.int32()
    ).buffers()[1]
    bins = pa.BinaryArray.from_buffers(
        pa.binary(), num_rows, [None, offsets, bindata]
    )

    # NOTE(review): "fsl" is declared as a variable-size list although the
    # array built above is a FixedSizeListArray; pa.table() presumably casts
    # it to match the schema.  Left unchanged because the checked-in data
    # files depend on this schema -- confirm before altering.
    schema = pa.schema(
        [
            pa.field("int", pa.int32()),
            pa.field("fsl", pa.list_(pa.float32())),
            pa.field("bin", pa.binary()),
        ]
    )

    return pa.table(
        [
            pa.array(range(num_rows), pa.int32()),
            fsls,
            bins,
        ],
        schema=schema,
    )
|
||
|
||
def write_large():
    """Write the ``build_large()`` table to ``large.lance``.

    The file is placed at ``get_path("large.lance")`` and written as a
    single batch through ``LanceFileWriter``.
    """
    destination = str(get_path("large.lance"))
    with LanceFileWriter(destination) as lance_writer:
        table = build_large()
        lance_writer.write_batch(table)
|
||
|
||
if __name__ == "__main__":
    # Generate each forward-compat data file when run as a script.
    for _write_file in (write_basic_types, write_large):
        _write_file()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import pytest | ||
from lance.file import LanceFileReader | ||
|
||
from .datagen import build_basic_types, build_large, get_path | ||
|
||
|
||
@pytest.mark.forward
def test_scans():
    """Check that each stored Lance file reads back as its generated table.

    For every data file the expected table is rebuilt in memory, the file is
    read in full with ``LanceFileReader``, and the two tables are compared
    with ``Table.equals``.
    """
    cases = (
        ("basic_types.lance", build_basic_types),
        ("large.lance", build_large),
    )
    for file_name, build_expected in cases:
        expected = build_expected()
        reader = LanceFileReader(str(get_path(file_name)))
        actual = reader.read_all().to_table()
        assert actual.equals(expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.