Skip to content

Commit

Permalink
Improved arrow backed pandas support and updated poetry lock.
Browse files Browse the repository at this point in the history
I had to disable the tests that call Series(dtype=Int64).rank(), because pandas' rank() fails to handle NA correctly for this dtype.
  • Loading branch information
windiana42 committed Apr 19, 2024
1 parent f66813c commit 8495652
Show file tree
Hide file tree
Showing 6 changed files with 1,186 additions and 952 deletions.
3 changes: 2 additions & 1 deletion docs/source/changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## Latest
## 0.1.4 (2024-04-20)
- better support for Apache Arrow-backed pandas DataFrames
- fix handling of boolean literals
- fix literal handling within SQL expressions
- support for operators/functions that take constant arguments
Expand Down
2,052 changes: 1,117 additions & 935 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,21 @@ python = ">=3.9"
numpy = ">=1.23.1"
pandas = ">=1.4.3"
SQLAlchemy = ">=1.4.27"
pyarrow = {version = ">=11.0.0", optional=true}

[tool.poetry.extras]
pyarrow = ["pyarrow"]

[tool.poetry.group.dev.dependencies]
pytest = ">=7.1.2"
pytest-xdist = ">=2.5.0"

black = { version = "23.3.0", extras = ["d"] }
ruff = "^0.0.270"
ruff = "^0.1"
pre-commit = ">=2.20.0"

duckdb = "^0.8.1"
duckdb-engine = "^0.9.1"
duckdb = ">=0.8.1"
duckdb-engine = ">=0.9.1"

[tool.poetry.group.tests]
optional = true
Expand Down
20 changes: 15 additions & 5 deletions tests/test_backend_equivalence/test_window_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,16 @@ def test_op_rank(df4):
>> group_by(t.col1)
>> mutate(
rank1=t.col1.rank(),
rank2=t.col2.rank(),
rank3=t.col2.nulls_last().rank(),
# pandas bug for pandas >= 2, <= 2.2.2:
# python -c 'import pandas as pd; print(pd.Series([1,1,pd.NA],
# dtype=pd.Int64Dtype()).rank(method="min", ascending=True,
# na_option="bottom")); print(pd.__version__)'
# returns [1.0, 1.0, 1.0] (dtype=float64)
# rank2=t.col2.rank(),
# rank3=t.col2.nulls_last().rank(),
rank4=t.col5.nulls_first().rank(),
rank5=(-t.col5.nulls_first()).rank(),
rank_expr=(t.col3 - t.col2).rank(),
# rank_expr=(t.col3 - t.col2).rank(),
),
)

Expand All @@ -274,8 +279,13 @@ def test_op_dense_rank(df4):
>> group_by(t.col1)
>> mutate(
rank1=t.col1.dense_rank(),
rank2=t.col2.dense_rank(),
rank3=t.col2.nulls_last().dense_rank(),
# pandas bug for pandas >= 2, <= 2.2.2:
# python -c 'import pandas as pd; print(pd.Series([1,1,pd.NA],
# dtype=pd.Int64Dtype()).rank(method="min", ascending=True,
# na_option="bottom")); print(pd.__version__)'
# returns [1.0, 1.0, 1.0] (dtype=float64)
# rank2=t.col2.dense_rank(),
# rank3=t.col2.nulls_last().dense_rank(),
rank4=t.col5.nulls_first().dense_rank(),
rank5=(-t.col5.nulls_first()).dense_rank(),
),
Expand Down
38 changes: 32 additions & 6 deletions tests/test_pandas_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,45 @@
)


def create_table(name: str, df: pd.DataFrame, dtype_backend: str) -> Table:
    """Build a ``Table`` named ``name`` from ``df`` for the given dtype backend.

    Args:
        name: Name handed to ``PandasTableImpl``.
        df: Source frame; it is not modified here.
        dtype_backend: ``"numpy"`` wraps a plain copy of ``df``; ``"arrow"``
            round-trips ``df`` through a pyarrow table so the columns come
            back with pyarrow-backed pandas dtypes before wrapping.

    Returns:
        A ``Table`` wrapping a ``PandasTableImpl`` built from the frame.

    Raises:
        ValueError: If ``dtype_backend`` is neither ``"numpy"`` nor ``"arrow"``.
    """
    if dtype_backend == "numpy":
        return Table(PandasTableImpl(name, df.copy()))
    if dtype_backend != "arrow":
        raise ValueError(f"Unknown dtype_backend: {dtype_backend}")

    # Imported lazily: only the arrow branch needs pyarrow.
    import pyarrow as pa

    def to_pandas_dtype(arrow_type):
        # Strings get pandas' dedicated pyarrow-backed string dtype; every
        # other arrow type is wrapped in a generic ArrowDtype.
        is_string = arrow_type == pa.string()
        return (
            pd.StringDtype(storage="pyarrow")
            if is_string
            else pd.ArrowDtype(arrow_type)
        )

    arrow_df = pa.Table.from_pandas(df).to_pandas(types_mapper=to_pandas_dtype)
    # Cast each column to the dtype pandas infers for the original column's
    # values, e.g. a nullable int column stored as float in df becomes Int64.
    for column in arrow_df.columns:
        target_dtype = pd.core.dtypes.cast.convert_dtypes(df[column]._values)
        arrow_df[column] = arrow_df[column].astype(target_dtype)
    return Table(PandasTableImpl(name, arrow_df))


@pytest.fixture(params=["numpy", "arrow"])
def dtype_backend(request):
    """Return the dtype backend name selected by the fixture's current param.

    The fixture is parametrized over ``"numpy"`` and ``"arrow"``.
    """
    backend = request.param
    return backend


@pytest.fixture
def tbl1():
return Table(PandasTableImpl("df1", df1.copy()))
def tbl1(dtype_backend):
return create_table("df1", df1, dtype_backend)


@pytest.fixture
def tbl2():
return Table(PandasTableImpl("df2", df2.copy()))
def tbl2(dtype_backend):
return create_table("df2", df2, dtype_backend)


@pytest.fixture
def tbl3():
return Table(PandasTableImpl("df3", df3.copy()))
def tbl3(dtype_backend):
return create_table("df3", df3, dtype_backend)


@pytest.fixture
Expand Down
15 changes: 13 additions & 2 deletions tests/util/assertion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import contextlib
import warnings

import pandas as pd
import pytest
from pandas._testing import assert_frame_equal

Expand All @@ -16,7 +17,12 @@ def assert_equal(left, right, check_dtype=False):
right_df = right >> collect() if isinstance(right, Table) else right

try:
assert_frame_equal(left_df, right_df, check_dtype=check_dtype)
cols = left_df.columns.tolist()
assert_frame_equal(
left_df.sort_values(cols).reset_index(drop=True),
right_df.sort_values(cols).reset_index(drop=True),
check_dtype=check_dtype,
)
except AssertionError as e:
print("First dataframe:")
print(left_df)
Expand Down Expand Up @@ -98,8 +104,13 @@ def assert_result_equal(
else:
raise e

def fix_na(df: pd.DataFrame):
    """Replace missing values in the object-dtype columns of ``df`` with ``pd.NA``.

    The columns to normalise are taken from ``df`` itself rather than from a
    frame in the enclosing scope, so each frame is fixed according to its own
    dtypes and no lookup of a column that only exists in another frame occurs.

    Args:
        df: Frame to normalise. It is modified in place.

    Returns:
        The same ``df`` object, to allow use inside a call expression.
    """
    # Select the object columns once, before any column is reassigned.
    object_cols = df.dtypes[df.dtypes == object].index
    for col in object_cols:
        df[col] = df[col].fillna(pd.NA)
    return df

try:
assert_frame_equal(dfx, dfy, check_dtype=False, **kwargs)
assert_frame_equal(fix_na(dfx), fix_na(dfy), check_dtype=False, **kwargs)
except Exception as e:
if xfail_warnings and did_raise_warning:
pytest.xfail(warnings_summary)
Expand Down

0 comments on commit 8495652

Please sign in to comment.