Skip to content

Commit

Permalink
Merge pull request #2849 from finos/tkp/polars
Browse files Browse the repository at this point in the history
Support input/output to/from polars via python (not native rust)
  • Loading branch information
texodus authored Nov 18, 2024
2 parents efec1b0 + d40b1c8 commit 456bd39
Show file tree
Hide file tree
Showing 9 changed files with 407 additions and 1 deletion.
1 change: 1 addition & 0 deletions rust/perspective-python/docs/client/to_pandas.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Serialize the data to a `pandas.DataFrame`.
1 change: 1 addition & 0 deletions rust/perspective-python/docs/client/to_polars.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Serialize the data to a `polars.DataFrame`.
1 change: 1 addition & 0 deletions rust/perspective-python/docs/lib.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ guide. In Python, however, Perspective supports additional data types that are
commonly used when processing data:

- `pandas.DataFrame`
- `polars.DataFrame`
- `bytes` (encoding an Apache Arrow)
- `objects` (either extracting a repr or via reference)
- `str` (encoding as a CSV)
Expand Down
251 changes: 251 additions & 0 deletions rust/perspective-python/perspective/tests/table/test_table_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
# ┃ ██████ ██████ ██████ █ █ █ █ █ █▄ ▀███ █ ┃
# ┃ ▄▄▄▄▄█ █▄▄▄▄▄ ▄▄▄▄▄█ ▀▀▀▀▀█▀▀▀▀▀ █ ▀▀▀▀▀█ ████████▌▐███ ███▄ ▀█ █ ▀▀▀▀▀ ┃
# ┃ █▀▀▀▀▀ █▀▀▀▀▀ █▀██▀▀ ▄▄▄▄▄ █ ▄▄▄▄▄█ ▄▄▄▄▄█ ████████▌▐███ █████▄ █ ▄▄▄▄▄ ┃
# ┃ █ ██████ █ ▀█▄ █ ██████ █ ███▌▐███ ███████▄ █ ┃
# ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫
# ┃ Copyright (c) 2017, the Perspective Authors. ┃
# ┃ ╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌ ┃
# ┃ This file is part of the Perspective library, distributed under the terms ┃
# ┃ of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). ┃
# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛

from datetime import date, datetime
import numpy as np
import polars as pl
from pytest import mark
import perspective as psp

client = psp.Server().new_local_client()
Table = client.table


def arrow_bytes_to_polars(view):
import pyarrow

with pyarrow.ipc.open_stream(pyarrow.BufferReader(view.to_arrow())) as reader:
return pl.from_dataframe(reader.read_pandas())


class TestTablePolars(object):
def test_empty_table(self):
tbl = Table([])
assert tbl.size() == 0
assert tbl.schema() == {}

def test_table_dataframe(self):
d = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
data = pl.DataFrame(d)
tbl = Table(data)
assert tbl.size() == 2
assert tbl.schema() == {"a": "integer", "b": "integer"}
assert tbl.view().to_records() == [
{"a": 1, "b": 2},
{"a": 3, "b": 4},
]

def test_table_lazyframe(self):
d = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
data = pl.DataFrame(d).lazy()
tbl = Table(data)
assert tbl.size() == 2
assert tbl.schema() == {"a": "integer", "b": "integer"}
assert tbl.view().to_records() == [
{"a": 1, "b": 2},
{"a": 3, "b": 4},
]

def test_table_dataframe_column_order(self):
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
data = pl.DataFrame(d).select(["b", "c", "a", "d"])
tbl = Table(data)
assert tbl.size() == 2
assert tbl.columns() == ["b", "c", "a", "d"]

def test_table_dataframe_selective_column_order(self):
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
data = pl.DataFrame(d).select(["b", "c", "a"])
tbl = Table(data)
assert tbl.size() == 2
assert tbl.columns() == ["b", "c", "a"]

def test_table_dataframe_does_not_mutate(self):
# make sure we don't mutate the dataframe that a user passes in
data = pl.DataFrame(
{
"a": [None, 1, None, 2],
"b": [1.5, None, 2.5, None],
}
)
assert data["a"].to_list() == [None, 1, None, 2]
assert data["b"].to_list() == [1.5, None, 2.5, None]

tbl = Table(data)
assert tbl.size() == 4
assert tbl.schema() == {"a": "integer", "b": "float"}

assert data["a"].to_list() == [None, 1, None, 2]
assert data["b"].to_list() == [1.5, None, 2.5, None]

def test_table_polars_from_schema_int(self):
data = [None, 1, None, 2, None, 3, 4]
df = pl.DataFrame({"a": data})
table = Table({"a": "integer"})
table.update(df)
assert table.view().to_columns()["a"] == data

def test_table_polars_from_schema_bool(self):
data = [True, False, True, False]
df = pl.DataFrame({"a": data})
table = Table({"a": "boolean"})
table.update(df)
assert table.view().to_columns()["a"] == data

def test_table_polars_from_schema_float(self):
data = [None, 1.5, None, 2.5, None, 3.5, 4.5]
df = pl.DataFrame({"a": data})
table = Table({"a": "float"})
table.update(df)
assert table.view().to_columns()["a"] == data

def test_table_polars_from_schema_float_all_nan(self):
data = [np.nan, np.nan, np.nan, np.nan]
df = pl.DataFrame({"a": data})
table = Table({"a": "float"})
table.update(df)
assert table.view().to_columns()["a"] == [None, None, None, None]

def test_table_polars_from_schema_float_to_int(self):
data = [None, 1.5, None, 2.5, None, 3.5, 4.5]
df = pl.DataFrame({"a": data})
table = Table({"a": "integer"})
table.update(df)
# truncates decimal
assert table.view().to_columns()["a"] == [None, 1, None, 2, None, 3, 4]

def test_table_polars_from_schema_int_to_float(self):
data = [None, 1, None, 2, None, 3, 4]
df = pl.DataFrame({"a": data})
table = Table({"a": "float"})
table.update(df)
assert table.view().to_columns()["a"] == [None, 1.0, None, 2.0, None, 3.0, 4.0]

def test_table_polars_from_schema_date(self, util):
data = [date(2019, 8, 15), None, date(2019, 8, 16)]
df = pl.DataFrame({"a": data})
table = Table({"a": "date"})
table.update(df)
assert table.view().to_columns()["a"] == [
util.to_timestamp(datetime(2019, 8, 15)),
None,
util.to_timestamp(datetime(2019, 8, 16)),
]

def test_table_polars_from_schema_str(self):
data = ["a", None, "b", None, "c"]
df = pl.DataFrame({"a": data})
table = Table({"a": "string"})
table.update(df)
assert table.view().to_columns()["a"] == data

def test_table_polars_none(self):
data = [None, None, None]
df = pl.DataFrame({"a": data})
table = Table(df)
assert table.view().to_columns()["a"] == data

def test_table_polars_symmetric_table(self):
# make sure that updates are symmetric to table creation
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [1.5, 2.5, 3.5, 4.5]})
t1 = Table(df)
t2 = Table({"a": "integer", "b": "float"})
t2.update(df)
assert t1.view().to_columns() == {
"a": [1, 2, 3, 4],
"b": [1.5, 2.5, 3.5, 4.5],
}

def test_table_polars_symmetric_stacked_updates(self):
# make sure that updates are symmetric to table creation
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [1.5, 2.5, 3.5, 4.5]})

t1 = Table(df)
t1.update(df)

t2 = Table({"a": "integer", "b": "float"})
t2.update(df)
t2.update(df)

assert t1.view().to_columns() == {
"a": [1, 2, 3, 4, 1, 2, 3, 4],
"b": [1.5, 2.5, 3.5, 4.5, 1.5, 2.5, 3.5, 4.5],
}

@mark.skip(reason="Not supported, polars doesnt like input")
def test_table_polars_transitive(self):
# serialized output -> table -> serialized output
records = {
"a": [1, 2, 3, 4],
"b": [1.5, 2.5, 3.5, 4.5],
"c": [np.nan, np.nan, "abc", np.nan],
"d": [None, True, None, False],
"e": [
float("nan"),
datetime(2019, 7, 11, 12, 30),
float("nan"),
datetime(2019, 7, 11, 12, 30),
],
}

df = pl.DataFrame(records, strict=False)
t1 = Table(df)
out1 = arrow_bytes_to_polars(t1.view(columns=["a", "b", "c", "d", "e"]))
t2 = Table(out1)
assert t1.schema() == t2.schema()
out2 = t2.view().to_columns()
assert t1.view().to_columns() == out2

# dtype=object should have correct inferred types

def test_table_polars_object_to_int(self):
df = pl.DataFrame({"a": [1, 2, None, 2, None, 3, 4]})
table = Table(df)
assert table.schema() == {"a": "integer"}
assert table.view().to_columns()["a"] == [1, 2, None, 2, None, 3, 4]

def test_table_polars_object_to_float(self):
df = pl.DataFrame({"a": [None, 1, None, 2, None, 3, 4]})
table = Table(df)
assert table.schema() == {"a": "integer"}
assert table.view().to_columns()["a"] == [None, 1.0, None, 2.0, None, 3.0, 4.0]

def test_table_polars_object_to_bool(self):
df = pl.DataFrame({"a": [True, False, True, False, True, False]})
table = Table(df)
assert table.schema() == {"a": "boolean"}
assert table.view().to_columns()["a"] == [True, False, True, False, True, False]

def test_table_polars_object_to_datetime(self):
df = pl.DataFrame(
{
"a": [
datetime(2019, 7, 11, 1, 2, 3),
datetime(2019, 7, 12, 1, 2, 3),
None,
]
}
)

table = Table(df)
assert table.schema() == {"a": "datetime"}
assert table.view().to_columns()["a"] == [
datetime(2019, 7, 11, 1, 2, 3).timestamp() * 1000,
datetime(2019, 7, 12, 1, 2, 3).timestamp() * 1000,
None,
]

def test_table_polars_object_to_str(self):
df = pl.DataFrame({"a": np.array(["abc", "def", None, "ghi"], dtype=object)})
table = Table(df)
assert table.schema() == {"a": "string"}
assert table.view().to_columns()["a"] == ["abc", "def", None, "ghi"]
1 change: 1 addition & 0 deletions rust/perspective-python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ maturin==1.6.0
numpy==2.0.0
packaging==24.1
pandas==2.2.2
polars==1.13.1
pyarrow==16.1.0
psutil==6.0.0
pytest==8.2.2
Expand Down
14 changes: 14 additions & 0 deletions rust/perspective-python/src/client/client_sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,25 @@ impl View {
self.0.to_csv(window).py_block_on(py)
}

#[doc = include_str!("../../docs/client/to_pandas.md")]
#[pyo3(signature = (**window))]
// #[deprecated(since="3.2.0", note="Please use `View::to_pandas`")]
pub fn to_dataframe(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyAny>> {
self.0.to_dataframe(window).py_block_on(py)
}

#[doc = include_str!("../../docs/client/to_pandas.md")]
#[pyo3(signature = (**window))]
pub fn to_pandas(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyAny>> {
self.0.to_dataframe(window).py_block_on(py)
}

#[doc = include_str!("../../docs/client/to_polars.md")]
#[pyo3(signature = (**window))]
pub fn to_polars(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyAny>> {
self.0.to_polars(window).py_block_on(py)
}

#[doc = crate::inherit_docs!("view/to_arrow.md")]
#[pyo3(signature = (**window))]
pub fn to_arrow(&self, py: Python<'_>, window: Option<Py<PyDict>>) -> PyResult<Py<PyBytes>> {
Expand Down
1 change: 1 addition & 0 deletions rust/perspective-python/src/client/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

pub mod client_sync;
mod pandas;
mod polars;
mod pyarrow;
pub mod python;
pub mod table_data;
Expand Down
Loading

0 comments on commit 456bd39

Please sign in to comment.