Commit 7b47717

feat: Added DataFrameWriteOptions option when writing as csv, json, parquet.

allinux committed Sep 6, 2024
1 parent fe0738a · commit 7b47717
Showing 2 changed files with 88 additions and 11 deletions.
python/datafusion/dataframe.py (35 changes: 30 additions & 5 deletions)

@@ -409,37 +409,62 @@ def except_all(self, other: DataFrame) -> DataFrame:
"""
return DataFrame(self.df.except_all(other.df))

def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None:
def write_csv(
self,

Check failure on line 413 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (W291)

python/datafusion/dataframe.py:413:14: W291 Trailing whitespace
path: str | pathlib.Path,

Check failure on line 414 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (W291)

python/datafusion/dataframe.py:414:34: W291 Trailing whitespace
with_header: bool = False,
write_options_overwrite: bool = False,
write_options_single_file_output: bool = False,
write_options_partition_by: List = [],
) -> None:
"""Execute the :py:class:`DataFrame` and write the results to a CSV file.
Args:
path: Path of the CSV file to write.
with_header: If true, output the CSV header row.
write_options_overwrite: Controls if existing data should be overwritten
write_options_single_file_output: Controls if all partitions should be coalesced into a single output file. Generally will have slower performance when set to true.

Check failure on line 426 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (W505)

python/datafusion/dataframe.py:426:89: W505 Doc line too long (176 > 88)
write_options_partition_by: Sets which columns should be used for hive-style partitioned writes by name. Can be set to empty vec![] for non-partitioned writes.

Check failure on line 427 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (W505)

python/datafusion/dataframe.py:427:89: W505 Doc line too long (171 > 88)
"""
self.df.write_csv(str(path), with_header)
self.df.write_csv(str(path), with_header, write_options_overwrite, write_options_single_file_output, write_options_partition_by)
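A minimal usage sketch of the new CSV options (the in-memory data and output path are hypothetical; the keyword names follow the diff above):

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # Replace any existing output and coalesce all partitions into one file.
    df.write_csv(
        "out.csv",
        with_header=True,
        write_options_overwrite=True,
        write_options_single_file_output=True,
    )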

     def write_parquet(
         self,
         path: str | pathlib.Path,
         compression: str = "uncompressed",
         compression_level: int | None = None,
+        write_options_overwrite: bool = False,
+        write_options_single_file_output: bool = False,
+        write_options_partition_by: List[str] = [],
     ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a Parquet file.
 
         Args:
             path: Path of the Parquet file to write.
             compression: Compression type to use.
             compression_level: Compression level to use.
+            write_options_overwrite: Controls if existing data should be
+                overwritten.
+            write_options_single_file_output: Controls if all partitions should
+                be coalesced into a single output file. Generally will have
+                slower performance when set to true.
+            write_options_partition_by: Sets which columns should be used for
+                hive-style partitioned writes by name. Can be set to an empty
+                list for non-partitioned writes.
         """
-        self.df.write_parquet(str(path), compression, compression_level)
+        self.df.write_parquet(
+            str(path),
+            compression,
+            compression_level,
+            write_options_overwrite,
+            write_options_single_file_output,
+            write_options_partition_by,
+        )
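Hive-style partitioning pairs naturally with Parquet. A hedged sketch reusing the df from the CSV example above (the column name "b" and the zstd settings are illustrative):

    # Produces directories such as out_parquet/b=x/ and out_parquet/b=y/,
    # each holding only the rows for that partition value.
    df.write_parquet(
        "out_parquet",
        compression="zstd",
        compression_level=4,
        write_options_partition_by=["b"],
    )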

-    def write_json(self, path: str | pathlib.Path) -> None:
+    def write_json(
+        self,
+        path: str | pathlib.Path,
+        write_options_overwrite: bool = False,
+        write_options_single_file_output: bool = False,
+        write_options_partition_by: List[str] = [],
+    ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a JSON file.
 
         Args:
             path: Path of the JSON file to write.
+            write_options_overwrite: Controls if existing data should be
+                overwritten.
+            write_options_single_file_output: Controls if all partitions should
+                be coalesced into a single output file. Generally will have
+                slower performance when set to true.
+            write_options_partition_by: Sets which columns should be used for
+                hive-style partitioned writes by name. Can be set to an empty
+                list for non-partitioned writes.
         """
-        self.df.write_json(str(path))
+        self.df.write_json(
+            str(path),
+            write_options_overwrite,
+            write_options_single_file_output,
+            write_options_partition_by,
+        )
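The JSON writer takes the same three options; a brief sketch (the output path is illustrative):

    # Write newline-delimited JSON, overwriting any previous run's output.
    df.write_json(
        "out_json",
        write_options_overwrite=True,
    )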

     def to_arrow_table(self) -> pa.Table:
         """Execute the :py:class:`DataFrame` and convert it into an Arrow Table.
src/dataframe.rs (64 changes: 58 additions & 6 deletions)

@@ -402,7 +402,24 @@ impl PyDataFrame {
     }
 
     /// Write a `DataFrame` to a CSV file.
-    fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyResult<()> {
+    #[pyo3(signature = (
+        path,
+        with_header=false,
+        write_options_overwrite=false,
+        write_options_single_file_output=false,
+        write_options_partition_by=vec![],
+    ))]
+    fn write_csv(
+        &self,
+        path: &str,
+        with_header: bool,
+        write_options_overwrite: bool,
+        write_options_single_file_output: bool,
+        write_options_partition_by: Vec<String>,
+        py: Python,
+    ) -> PyResult<()> {
         let csv_options = CsvOptions {
             has_header: Some(with_header),
             ..Default::default()
@@ -411,7 +428,10 @@ impl PyDataFrame {
             py,
             self.df.as_ref().clone().write_csv(
                 path,
-                DataFrameWriteOptions::new(),
+                DataFrameWriteOptions::default()
+                    .with_overwrite(write_options_overwrite)
+                    .with_single_file_output(write_options_single_file_output)
+                    .with_partition_by(write_options_partition_by),
                 Some(csv_options),
             ),
         )?;
@@ -422,13 +442,21 @@ impl PyDataFrame {
     #[pyo3(signature = (
         path,
         compression="uncompressed",
-        compression_level=None
+        compression_level=None,
+        write_options_overwrite=false,
+        write_options_single_file_output=false,
+        write_options_partition_by=vec![],
     ))]
     fn write_parquet(
         &self,
         path: &str,
         compression: &str,
         compression_level: Option<u32>,
+        write_options_overwrite: bool,
+        write_options_single_file_output: bool,
+        write_options_partition_by: Vec<String>,
         py: Python,
     ) -> PyResult<()> {
         fn verify_compression_level(cl: Option<u32>) -> Result<u32, PyErr> {
@@ -472,21 +500,45 @@ impl PyDataFrame {
             py,
             self.df.as_ref().clone().write_parquet(
                 path,
-                DataFrameWriteOptions::new(),
+                DataFrameWriteOptions::default()
+                    .with_overwrite(write_options_overwrite)
+                    .with_single_file_output(write_options_single_file_output)
+                    .with_partition_by(write_options_partition_by),
                 Option::from(options),
             ),
         )?;
         Ok(())
     }

     /// Executes a query and writes the results to a partitioned JSON file.
-    fn write_json(&self, path: &str, py: Python) -> PyResult<()> {
+    #[pyo3(signature = (
+        path,
+        write_options_overwrite=false,
+        write_options_single_file_output=false,
+        write_options_partition_by=vec![],
+    ))]
+    fn write_json(
+        &self,
+        path: &str,
+        write_options_overwrite: bool,
+        write_options_single_file_output: bool,
+        write_options_partition_by: Vec<String>,
+        py: Python,
+    ) -> PyResult<()> {
         wait_for_future(
             py,
             self.df
                 .as_ref()
                 .clone()
-                .write_json(path, DataFrameWriteOptions::new(), None),
+                .write_json(
+                    path,
+                    DataFrameWriteOptions::default()
+                        .with_overwrite(write_options_overwrite)
+                        .with_single_file_output(write_options_single_file_output)
+                        .with_partition_by(write_options_partition_by),
+                    None,
+                ),
         )?;
         Ok(())
     }
