refactor: rename use_experimental_writer to use_legacy_format (#2433)

As the v2 format becomes the default, we don't want to call it "experimental" any longer.
lancedb · Jun 4, 2024 · 0b18a44 · 0b18a44
1 parent f8efc85
commit 0b18a44
Show file tree

Hide file tree

Showing 15 changed files with 216 additions and 252 deletions.
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -2385,7 +2385,7 @@ def write_dataset(
     commit_lock: Optional[CommitLock] = None,
     progress: Optional[FragmentWriteProgress] = None,
     storage_options: Optional[Dict[str, str]] = None,
-    use_experimental_writer: bool = False,
+    use_legacy_format: bool = True,
 ) -> LanceDataset:
     """Write a given data_obj to the given uri
 
@@ -2425,9 +2425,9 @@ def write_dataset(
     storage_options : optional, dict
         Extra options that make sense for a particular storage connection. This is
         used to store connection parameters like credentials, endpoint, etc.
-    use_experimental_writer : optional, bool
-        Use the Lance v2 writer to write Lance v2 files.  This is not recommended
-        at this time as there are several known limitations in the v2 writer.
+    use_legacy_format : optional, bool, default True
+        Use the Lance v1 writer to write Lance v1 files.  The default is currently
+        True but will change as we roll out the v2 format.
     """
     if _check_for_hugging_face(data_obj):
         # Huggingface datasets
@@ -2449,7 +2449,7 @@ def write_dataset(
         "max_bytes_per_file": max_bytes_per_file,
         "progress": progress,
         "storage_options": storage_options,
-        "use_experimental_writer": use_experimental_writer,
+        "use_legacy_format": use_legacy_format,
     }
 
     if commit_lock:

diff --git a/python/python/lance/fragment.py b/python/python/lance/fragment.py
@@ -146,7 +146,7 @@ def create(
         progress: Optional[FragmentWriteProgress] = None,
         mode: str = "append",
         *,
-        use_experimental_writer=False,
+        use_legacy_format=True,
     ) -> FragmentMetadata:
         """Create a :class:`FragmentMetadata` from the given data.
 
@@ -177,6 +177,9 @@ def create(
             The write mode. If "append" is specified, the data will be checked
             against the existing dataset's schema. Otherwise, pass "create" or
             "overwrite" to assign new field ids to the schema.
+        use_legacy_format: bool, default True
+            Use the legacy format to write Lance files. The default is True
+            while the v2 format is still in beta.
 
         See Also
         --------
@@ -215,7 +218,7 @@ def create(
             max_rows_per_group=max_rows_per_group,
             progress=progress,
             mode=mode,
-            use_experimental_writer=use_experimental_writer,
+            use_legacy_format=use_legacy_format,
         )
         return FragmentMetadata(inner_meta.json())
 
@@ -504,7 +507,7 @@ def write_fragments(
     max_rows_per_group: int = 1024,
     max_bytes_per_file: int = DEFAULT_MAX_BYTES_PER_FILE,
     progress: Optional[FragmentWriteProgress] = None,
-    use_experimental_writer: bool = False,
+    use_legacy_format: bool = True,
     storage_options: Optional[Dict[str, str]] = None,
 ) -> List[FragmentMetadata]:
     """
@@ -542,9 +545,9 @@ def write_fragments(
         *Experimental API*. Progress tracking for writing the fragment. Pass
         a custom class that defines hooks to be called when each fragment is
         starting to write and finishing writing.
-    use_experimental_writer : optional, bool
-        Use the Lance v2 writer to write Lance v2 files.  This is not recommended
-        at this time as there are several known limitations in the v2 writer.
+    use_legacy_format : optional, bool, default True
+        Use the Lance v1 writer to write Lance v1 files.  The default is currently
+        True while the v2 format is in beta.
     storage_options : Optional[Dict[str, str]]
         Extra options that make sense for a particular storage connection. This is
         used to store connection parameters like credentials, endpoint, etc.
@@ -578,7 +581,7 @@ def write_fragments(
         max_rows_per_group=max_rows_per_group,
         max_bytes_per_file=max_bytes_per_file,
         progress=progress,
-        use_experimental_writer=use_experimental_writer,
+        use_legacy_format=use_legacy_format,
         storage_options=storage_options,
     )
     return [FragmentMetadata.from_metadata(frag) for frag in fragments]
diff --git a/python/python/lance/ray/sink.py b/python/python/lance/ray/sink.py
@@ -54,7 +54,7 @@ def _write_fragment(
     max_rows_per_file: int = 1024 * 1024,
     max_bytes_per_file: Optional[int] = None,
     max_rows_per_group: int = 1024,  # Only useful for v1 writer.
-    use_experimental_writer: bool = False,
+    use_legacy_format: bool = True,
     storage_options: Optional[Dict[str, Any]] = None,
 ) -> Tuple[FragmentMetadata, pa.Schema]:
     from ..dependencies import _PANDAS_AVAILABLE
@@ -88,7 +88,7 @@ def record_batch_converter():
         max_rows_per_file=max_rows_per_file,
         max_rows_per_group=max_rows_per_group,
         max_bytes_per_file=max_bytes_per_file,
-        use_experimental_writer=use_experimental_writer,
+        use_legacy_format=use_legacy_format,
         storage_options=storage_options,
     )
     return [(fragment, schema) for fragment in fragments]
@@ -161,9 +161,8 @@ class LanceDatasink(_BaseLanceDatasink):
         Choices are 'append', 'create', 'overwrite'.
     max_rows_per_file : int, optional
         The maximum number of rows per file. Default is 1024 * 1024.
-    use_experimental_writer : bool, optional
-        Set true to use v2 writer. Default is False now. Will be removed once
-        v2 writer become the default.
+    use_legacy_format : bool, optional
+        Set True to use the legacy v1 format. Default is False
     """
 
     NAME = "Lance"
@@ -174,14 +173,14 @@ def __init__(
         schema: Optional[pa.Schema] = None,
         mode: Literal["create", "append", "overwrite"] = "create",
         max_rows_per_file: int = 1024 * 1024,
-        use_experimental_writer: bool = True,
+        use_legacy_format: bool = False,
         *args,
         **kwargs,
     ):
         super().__init__(uri, schema=schema, mode=mode, *args, **kwargs)
 
         self.max_rows_per_file = max_rows_per_file
-        self.use_experimental_writer = use_experimental_writer
+        self.use_legacy_format = use_legacy_format
         # if mode is append, read_version is read from existing dataset.
         self.read_version: int | None = None
 
@@ -206,7 +205,7 @@ def write(
             self.uri,
             schema=self.schema,
             max_rows_per_file=self.max_rows_per_file,
-            use_experimental_writer=self.use_experimental_writer,
+            use_legacy_format=self.use_legacy_format,
         )
         return [
             (pickle.dumps(fragment), pickle.dumps(schema))
@@ -235,8 +234,8 @@ class LanceFragmentWriter:
     max_rows_per_group : int, optional
         The maximum number of rows per group. Default is 1024.
         Only useful for v1 writer.
-    use_experimental_writer : bool, optional
-        Set true to use v2 writer. Default is True.
+    use_legacy_format : bool, optional
+        Set True to use the legacy v1 writer. Default is False
     storage_options : Dict[str, Any], optional
         The storage options for the writer. Default is None.
 
@@ -251,7 +250,7 @@ def __init__(
         max_rows_per_file: int = 1024 * 1024,
         max_bytes_per_file: Optional[int] = None,
         max_rows_per_group: Optional[int] = None,  # Only useful for v1 writer.
-        use_experimental_writer: bool = True,
+        use_legacy_format: bool = False,
         storage_options: Optional[Dict[str, Any]] = None,
     ):
         self.uri = uri
@@ -261,7 +260,7 @@ def __init__(
         self.max_rows_per_group = max_rows_per_group
         self.max_rows_per_file = max_rows_per_file
         self.max_bytes_per_file = max_bytes_per_file
-        self.use_experimental_writer = use_experimental_writer
+        self.use_legacy_format = use_legacy_format
         self.storage_options = storage_options
 
     def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
@@ -277,7 +276,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
             schema=self.schema,
             max_rows_per_file=self.max_rows_per_file,
             max_rows_per_group=self.max_rows_per_group,
-            use_experimental_writer=self.use_experimental_writer,
+            use_legacy_format=self.use_legacy_format,
             storage_options=self.storage_options,
         )
         return pa.Table.from_pydict(

diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py
@@ -1728,7 +1728,7 @@ def test_migrate_manifest(tmp_path: Path):
 
 def test_v2_dataset(tmp_path: Path):
     table = pa.table({"a": range(100), "b": range(100)})
-    dataset = lance.write_dataset(table, tmp_path, use_experimental_writer=True)
+    dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False)
     batches = list(dataset.to_batches())
     assert len(batches) == 1
     assert pa.Table.from_batches(batches) == table

diff --git a/python/python/tests/test_fragment.py b/python/python/tests/test_fragment.py
@@ -261,7 +261,7 @@ def test_fragment_v2(tmp_path):
     fragments = write_fragments(
         tab,
         tmp_path,
-        use_experimental_writer=True,
+        use_legacy_format=False,
     )
     assert len(fragments) == 1
     ds = lance.dataset(dataset_uri)

diff --git a/python/src/dataset.rs b/python/src/dataset.rs
@@ -1171,10 +1171,8 @@ pub fn get_write_params(options: &PyDict) -> PyResult<Option<WriteParams>> {
         if let Some(maybe_nbytes) = get_dict_opt::<usize>(options, "max_bytes_per_file")? {
             p.max_bytes_per_file = maybe_nbytes;
         }
-        if let Some(use_experimental_writer) =
-            get_dict_opt::<bool>(options, "use_experimental_writer")?
-        {
-            p.use_experimental_writer = use_experimental_writer;
+        if let Some(use_legacy_format) = get_dict_opt::<bool>(options, "use_legacy_format")? {
+            p.use_legacy_format = use_legacy_format;
         }
         if let Some(progress) = get_dict_opt::<PyObject>(options, "progress")? {
             p.progress = Arc::new(PyWriteProgress::new(progress.to_object(options.py())));