Skip to content

Commit

Permalink
refactor: rename use_experimental_writer to use_legacy_format (#2433)
Browse files Browse the repository at this point in the history
As the v2 format becomes the default, we don't want to call it
"experimental" any longer.
  • Loading branch information
westonpace authored Jun 4, 2024
1 parent f8efc85 commit 0b18a44
Show file tree
Hide file tree
Showing 15 changed files with 216 additions and 252 deletions.
10 changes: 5 additions & 5 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2385,7 +2385,7 @@ def write_dataset(
commit_lock: Optional[CommitLock] = None,
progress: Optional[FragmentWriteProgress] = None,
storage_options: Optional[Dict[str, str]] = None,
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
) -> LanceDataset:
"""Write a given data_obj to the given uri
Expand Down Expand Up @@ -2425,9 +2425,9 @@ def write_dataset(
storage_options : optional, dict
Extra options that make sense for a particular storage connection. This is
used to store connection parameters like credentials, endpoint, etc.
use_experimental_writer : optional, bool
Use the Lance v2 writer to write Lance v2 files. This is not recommended
at this time as there are several known limitations in the v2 writer.
use_legacy_format : optional, bool, default True
Use the Lance v1 writer to write Lance v1 files. The default is currently
True but will change as we roll out the v2 format.
"""
if _check_for_hugging_face(data_obj):
# Huggingface datasets
Expand All @@ -2449,7 +2449,7 @@ def write_dataset(
"max_bytes_per_file": max_bytes_per_file,
"progress": progress,
"storage_options": storage_options,
"use_experimental_writer": use_experimental_writer,
"use_legacy_format": use_legacy_format,
}

if commit_lock:
Expand Down
17 changes: 10 additions & 7 deletions python/python/lance/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def create(
progress: Optional[FragmentWriteProgress] = None,
mode: str = "append",
*,
use_experimental_writer=False,
use_legacy_format=True,
) -> FragmentMetadata:
"""Create a :class:`FragmentMetadata` from the given data.
Expand Down Expand Up @@ -177,6 +177,9 @@ def create(
The write mode. If "append" is specified, the data will be checked
against the existing dataset's schema. Otherwise, pass "create" or
"overwrite" to assign new field ids to the schema.
use_legacy_format : bool, default True
Use the legacy format to write Lance files. The default is True
while the v2 format is still in beta.
See Also
--------
Expand Down Expand Up @@ -215,7 +218,7 @@ def create(
max_rows_per_group=max_rows_per_group,
progress=progress,
mode=mode,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
)
return FragmentMetadata(inner_meta.json())

Expand Down Expand Up @@ -504,7 +507,7 @@ def write_fragments(
max_rows_per_group: int = 1024,
max_bytes_per_file: int = DEFAULT_MAX_BYTES_PER_FILE,
progress: Optional[FragmentWriteProgress] = None,
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
storage_options: Optional[Dict[str, str]] = None,
) -> List[FragmentMetadata]:
"""
Expand Down Expand Up @@ -542,9 +545,9 @@ def write_fragments(
*Experimental API*. Progress tracking for writing the fragment. Pass
a custom class that defines hooks to be called when each fragment is
starting to write and finishing writing.
use_experimental_writer : optional, bool
Use the Lance v2 writer to write Lance v2 files. This is not recommended
at this time as there are several known limitations in the v2 writer.
use_legacy_format : optional, bool, default True
Use the Lance v1 writer to write Lance v1 files. The default is currently
True while the v2 format is in beta.
storage_options : Optional[Dict[str, str]]
Extra options that make sense for a particular storage connection. This is
used to store connection parameters like credentials, endpoint, etc.
Expand Down Expand Up @@ -578,7 +581,7 @@ def write_fragments(
max_rows_per_group=max_rows_per_group,
max_bytes_per_file=max_bytes_per_file,
progress=progress,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
storage_options=storage_options,
)
return [FragmentMetadata.from_metadata(frag) for frag in fragments]
25 changes: 12 additions & 13 deletions python/python/lance/ray/sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _write_fragment(
max_rows_per_file: int = 1024 * 1024,
max_bytes_per_file: Optional[int] = None,
max_rows_per_group: int = 1024, # Only useful for v1 writer.
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
storage_options: Optional[Dict[str, Any]] = None,
) -> Tuple[FragmentMetadata, pa.Schema]:
from ..dependencies import _PANDAS_AVAILABLE
Expand Down Expand Up @@ -88,7 +88,7 @@ def record_batch_converter():
max_rows_per_file=max_rows_per_file,
max_rows_per_group=max_rows_per_group,
max_bytes_per_file=max_bytes_per_file,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
storage_options=storage_options,
)
return [(fragment, schema) for fragment in fragments]
Expand Down Expand Up @@ -161,9 +161,8 @@ class LanceDatasink(_BaseLanceDatasink):
Choices are 'append', 'create', 'overwrite'.
max_rows_per_file : int, optional
The maximum number of rows per file. Default is 1024 * 1024.
use_experimental_writer : bool, optional
Set true to use v2 writer. Default is False now. Will be removed once
v2 writer become the default.
use_legacy_format : bool, optional
Set True to use the legacy v1 format. Default is False.
"""

NAME = "Lance"
Expand All @@ -174,14 +173,14 @@ def __init__(
schema: Optional[pa.Schema] = None,
mode: Literal["create", "append", "overwrite"] = "create",
max_rows_per_file: int = 1024 * 1024,
use_experimental_writer: bool = True,
use_legacy_format: bool = False,
*args,
**kwargs,
):
super().__init__(uri, schema=schema, mode=mode, *args, **kwargs)

self.max_rows_per_file = max_rows_per_file
self.use_experimental_writer = use_experimental_writer
self.use_legacy_format = use_legacy_format
# if mode is append, read_version is read from existing dataset.
self.read_version: int | None = None

Expand All @@ -206,7 +205,7 @@ def write(
self.uri,
schema=self.schema,
max_rows_per_file=self.max_rows_per_file,
use_experimental_writer=self.use_experimental_writer,
use_legacy_format=self.use_legacy_format,
)
return [
(pickle.dumps(fragment), pickle.dumps(schema))
Expand Down Expand Up @@ -235,8 +234,8 @@ class LanceFragmentWriter:
max_rows_per_group : int, optional
The maximum number of rows per group. Default is 1024.
Only useful for v1 writer.
use_experimental_writer : bool, optional
Set true to use v2 writer. Default is True.
use_legacy_format : bool, optional
Set True to use the legacy v1 writer. Default is False.
storage_options : Dict[str, Any], optional
The storage options for the writer. Default is None.
Expand All @@ -251,7 +250,7 @@ def __init__(
max_rows_per_file: int = 1024 * 1024,
max_bytes_per_file: Optional[int] = None,
max_rows_per_group: Optional[int] = None, # Only useful for v1 writer.
use_experimental_writer: bool = True,
use_legacy_format: bool = False,
storage_options: Optional[Dict[str, Any]] = None,
):
self.uri = uri
Expand All @@ -261,7 +260,7 @@ def __init__(
self.max_rows_per_group = max_rows_per_group
self.max_rows_per_file = max_rows_per_file
self.max_bytes_per_file = max_bytes_per_file
self.use_experimental_writer = use_experimental_writer
self.use_legacy_format = use_legacy_format
self.storage_options = storage_options

def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
Expand All @@ -277,7 +276,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
schema=self.schema,
max_rows_per_file=self.max_rows_per_file,
max_rows_per_group=self.max_rows_per_group,
use_experimental_writer=self.use_experimental_writer,
use_legacy_format=self.use_legacy_format,
storage_options=self.storage_options,
)
return pa.Table.from_pydict(
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1728,7 +1728,7 @@ def test_migrate_manifest(tmp_path: Path):

def test_v2_dataset(tmp_path: Path):
table = pa.table({"a": range(100), "b": range(100)})
dataset = lance.write_dataset(table, tmp_path, use_experimental_writer=True)
dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False)
batches = list(dataset.to_batches())
assert len(batches) == 1
assert pa.Table.from_batches(batches) == table
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def test_fragment_v2(tmp_path):
fragments = write_fragments(
tab,
tmp_path,
use_experimental_writer=True,
use_legacy_format=False,
)
assert len(fragments) == 1
ds = lance.dataset(dataset_uri)
Expand Down
6 changes: 2 additions & 4 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1171,10 +1171,8 @@ pub fn get_write_params(options: &PyDict) -> PyResult<Option<WriteParams>> {
if let Some(maybe_nbytes) = get_dict_opt::<usize>(options, "max_bytes_per_file")? {
p.max_bytes_per_file = maybe_nbytes;
}
if let Some(use_experimental_writer) =
get_dict_opt::<bool>(options, "use_experimental_writer")?
{
p.use_experimental_writer = use_experimental_writer;
if let Some(use_legacy_format) = get_dict_opt::<bool>(options, "use_legacy_format")? {
p.use_legacy_format = use_legacy_format;
}
if let Some(progress) = get_dict_opt::<PyObject>(options, "progress")? {
p.progress = Arc::new(PyWriteProgress::new(progress.to_object(options.py())));
Expand Down
Loading

0 comments on commit 0b18a44

Please sign in to comment.