Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#6016: make sure eval_general doesn't expect exceptions by default #6954

Merged
merged 61 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
1bcb09e
TEST-#6016: make sure 'eval_general' doesn't expect exceptions by def…
anmyachev Feb 21, 2024
22207df
update 'test_general'
anmyachev Feb 21, 2024
b281235
update 'test_binary.py'
anmyachev Feb 22, 2024
c014163
fixes
anmyachev Feb 26, 2024
a5db6a3
fixes
anmyachev Feb 27, 2024
2971259
update 'test_indexing.py'
anmyachev Feb 27, 2024
cb1f705
fixes
anmyachev Feb 27, 2024
d52e6b9
update 'test_reduce.py'
anmyachev Feb 27, 2024
4d2c028
update 'test_udf.py'
anmyachev Feb 27, 2024
c9122d4
fixes
anmyachev Feb 27, 2024
46cfb38
fixes
anmyachev Feb 27, 2024
8eb878b
update 'test_series.py'
anmyachev Feb 28, 2024
27e1444
update 'test_map_metadata.py'
anmyachev Feb 29, 2024
0e7e206
fixes
anmyachev Feb 29, 2024
5ec5984
update 'test_io.py'
anmyachev Mar 1, 2024
74f3ebf
fixes
anmyachev Mar 1, 2024
a40b71f
fixes
anmyachev Mar 3, 2024
93af135
fixes
anmyachev Mar 3, 2024
e7e30f0
fixes
anmyachev Mar 3, 2024
54910b4
fixes
anmyachev Mar 3, 2024
d0a0867
fixes
anmyachev Mar 3, 2024
d5ae03c
fies
anmyachev Mar 3, 2024
73a0d69
Merge branch 'master' into issue6016
anmyachev Mar 3, 2024
04ecf14
fixes
anmyachev Mar 3, 2024
89327bc
fixes
anmyachev Mar 3, 2024
8b59bcb
fixes
anmyachev Mar 3, 2024
4af874e
fixes
anmyachev Mar 3, 2024
3693b60
fixes
anmyachev Mar 3, 2024
f9ec678
fixes
anmyachev Mar 3, 2024
08f0a28
REVERT ME; use all tests
anmyachev Mar 3, 2024
023cf96
fixes
anmyachev Mar 3, 2024
12c8337
fixes
anmyachev Mar 3, 2024
0f9172b
fixes
anmyachev Mar 3, 2024
728987d
fixes
anmyachev Mar 4, 2024
d65bd21
fixes
anmyachev Mar 4, 2024
7fb4ba1
Merge branch 'master' of https://github.com/modin-project/modin into …
anmyachev Mar 5, 2024
779c7f8
fix after merge
anmyachev Mar 5, 2024
b6fb0cc
Merge branch 'master' of https://github.com/modin-project/modin into …
anmyachev Mar 6, 2024
7d2f6cb
fixes after merge
anmyachev Mar 6, 2024
999ce44
REFACTOR-#7013: Move `to_pandas` and `to_ray_dataset` into modin name…
anmyachev Mar 6, 2024
47a886c
FEAT-#7001: Do not force materialization in MetaList.__getitem__() (#…
AndreyPavlenko Mar 6, 2024
a64e5d1
REFACTOR-#7017: Align 'to_hdf' and 'hist' signatures to pandas (#7018)
anmyachev Mar 6, 2024
5f00068
fixes
anmyachev Mar 6, 2024
d03ebc6
Merge branch 'master' of https://github.com/modin-project/modin into …
anmyachev Mar 6, 2024
2d5e139
fixes
anmyachev Mar 6, 2024
abf89fb
fixes
anmyachev Mar 6, 2024
65db771
fixes
anmyachev Mar 6, 2024
9baeb87
fixes
anmyachev Mar 6, 2024
760f040
fixes
anmyachev Mar 6, 2024
3937854
fixes
anmyachev Mar 6, 2024
b16edab
fixes
anmyachev Mar 7, 2024
def475e
fixes
anmyachev Mar 7, 2024
2fe8086
fix 'test_loc_iloc_2064'
anmyachev Mar 7, 2024
925b4ce
fixes
anmyachev Mar 7, 2024
af29f5b
fixes
anmyachev Mar 7, 2024
cd04805
fixes
anmyachev Mar 7, 2024
d3ce0c2
fixes
anmyachev Mar 7, 2024
6abeba7
Merge branch 'master' of https://github.com/modin-project/modin into …
anmyachev Mar 13, 2024
6bcd51d
fixes
anmyachev Mar 13, 2024
9a340ec
fixes
anmyachev Mar 13, 2024
46c4abe
address review comments and remove debug stuff
anmyachev Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,10 @@ def _read_csv_check_support(
+ "'infer' header values",
)
if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
raise ValueError("Missing column provided to 'parse_dates'")
missing_columns = set(parse_dates) - set(names)
raise ValueError(
f"Missing column provided to 'parse_dates': '{', '.join(missing_columns)}'"
)

empty_pandas_df = pandas.read_csv(
**dict(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ def test_read_csv_datetime(
engine,
parse_dates,
names,
request,
):
parse_dates_unsupported = isinstance(parse_dates, dict) or (
isinstance(parse_dates, list)
Expand All @@ -311,9 +312,27 @@ def test_read_csv_datetime(
skip_exc_type_check = parse_dates_unsupported and engine == "arrow"
if skip_exc_type_check:
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7012")

raising_exceptions = None
if "names1-parse_dates2" in request.node.callspec.id:
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'col2'"
)
elif (
"names1-parse_dates5-None" in request.node.callspec.id
or "names1-parse_dates4-None" in request.node.callspec.id
):
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'col2, col3'"
)
elif "None-parse_dates3" in request.node.callspec.id:
raising_exceptions = ValueError(
"Missing column provided to 'parse_dates': 'c2'"
)
eval_io(
fn_name="read_csv",
md_extra_kwargs={"engine": engine},
raising_exceptions=raising_exceptions,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
parse_dates=parse_dates,
Expand Down Expand Up @@ -509,7 +528,7 @@ def test_dup_names(self, names):
["i1", "i2", "a"],
],
)
def test_reset_index(self, names):
def test_reset_index(self, names, request):
index = pandas.MultiIndex.from_tuples(
[(i, j, k) for i in range(2) for j in range(3) for k in range(4)],
names=names,
Expand All @@ -519,7 +538,12 @@ def applier(lib):
df = lib.DataFrame(self.data, index=index) + 1
return df.reset_index()

eval_general(pd, pandas, applier)
raising_exceptions = None
if "names3" in request.node.callspec.id:
raising_exceptions = ValueError("cannot insert i1, already exists")
elif "names4" in request.node.callspec.id:
raising_exceptions = ValueError("cannot insert a, already exists")
eval_general(pd, pandas, applier, raising_exceptions=raising_exceptions)

@pytest.mark.parametrize("is_multiindex", [True, False])
def test_reset_index_multicolumns(self, is_multiindex):
Expand Down Expand Up @@ -1327,7 +1351,7 @@ def groupby(df, **kwargs):
@pytest.mark.parametrize("n", [10, -10])
@pytest.mark.parametrize("invert", [True, False])
@pytest.mark.parametrize("select", [True, False])
@pytest.mark.parametrize("ascending", [None, True, False])
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pandas no longer uses this value as default.

@pytest.mark.parametrize("ascending", [True, False])
def test_head_tail(self, op, n, invert, select, ascending):
def head(df, **kwargs):
if invert:
Expand Down
4 changes: 4 additions & 0 deletions modin/experimental/pandas/test/test_io_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ def _pandas_read_csv_glob(path, storage_options):
).reset_index(drop=True)
return pandas_df

raising_exceptions = None
if "anon" in storage_options_extra:
raising_exceptions = PermissionError("Forbidden")
eval_general(
pd,
pandas,
Expand All @@ -237,6 +240,7 @@ def _pandas_read_csv_glob(path, storage_options):
else _pandas_read_csv_glob(s3_path, **kwargs)
),
storage_options=s3_storage_options | storage_options_extra,
raising_exceptions=raising_exceptions,
)


Expand Down
54 changes: 50 additions & 4 deletions modin/pandas/test/dataframe/test_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
@pytest.mark.parametrize(
"other",
[
lambda df: 4,
lambda df, axis: 4,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lambda was used incorrectly.

lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]),
lambda df, axis: {
label: idx + 1
Expand Down Expand Up @@ -114,14 +114,19 @@ def test___rdivmod__():
*("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"),
],
)
def test_math_functions_fill_value(other, fill_value, op):
def test_math_functions_fill_value(other, fill_value, op, request):
data = test_data["int_data"]
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

raising_exceptions = None
if "check_different_index" in request.node.callspec.id and fill_value == 3.0:
raising_exceptions = NotImplementedError("fill_value 3.0 not supported.")

eval_general(
modin_df,
pandas_df,
lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value),
raising_exceptions=raising_exceptions,
# This test causes an empty slice to be generated thus triggering:
# https://github.com/modin-project/modin/issues/5974
comparator_kwargs={"check_dtypes": get_current_execution() != "BaseOnPython"},
Expand Down Expand Up @@ -178,7 +183,7 @@ def test_math_alias(math_op, alias):
@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"])
@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_comparison(data, op, other):
def test_comparison(data, op, other, request):
def operation(df):
df = getattr(df, op)(df if other == "as_left" else other)
if other == "as_left" and StorageFormat.get() == "Hdk":
Expand All @@ -187,9 +192,20 @@ def operation(df):
df = df.sort_index(axis=1)
return df

raising_exceptions = None
if "int_data" in request.node.callspec.id and other == "a":
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
elif "float_nan_data" in request.node.callspec.id and other == "a":
raising_exceptions = TypeError(
"Invalid comparison between dtype=float64 and str"
)
if StorageFormat.get() == "Hdk":
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")

eval_general(
*create_test_dfs(data),
operation=operation,
raising_exceptions=raising_exceptions,
)


Expand Down Expand Up @@ -344,6 +360,9 @@ def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partit
lambda df: (
df / modin_df1.a if isinstance(df, pd.DataFrame) else df / pandas_df1.a
),
raising_exceptions=ValueError(
"cannot reindex on an axis with duplicate labels"
),
)
return

Expand Down Expand Up @@ -492,13 +511,40 @@ def test_non_commutative_multiply():
pytest.param(3.5, id="float scalar"),
],
)
def test_arithmetic_with_tricky_dtypes(val1, val2, op):
def test_arithmetic_with_tricky_dtypes(val1, val2, op, request):
modin_df1, pandas_df1 = create_test_dfs(val1)
modin_df2, pandas_df2 = (
create_test_dfs(val2) if isinstance(val2, list) else (val2, val2)
)

raising_exceptions = None
if (
"bool-bool" in request.node.callspec.id
or "bool scalar-bool" in request.node.callspec.id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we are going to proceed using request.* to identify the exact errors and how difficult it is for developers to find the exact cases for those errors using request.*?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we are going to proceed using request.* to identify the exact errors and how difficult it is for developers to find the exact cases for those errors using request.*?

Well, this is quite inconvenient, but this is the price we need to pay in order to be more confident that what is actually being tested is what is intended.

I would also like to note that to avoid this inconvenience, when writing a test, we need to separate test cases that should work correctly from those that should not.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also like to note that to avoid this inconvenience, when writing a test, we need to separate test cases that should work correctly from those that should not.

Yes, separating tests would probably make sense here. Matching error-prone parameters with exceptions would help I think.

) and op in [
"pow",
"rpow",
"truediv",
"rtruediv",
"floordiv",
"rfloordiv",
]:
op_name = op[1:] if op.startswith("r") else op
raising_exceptions = NotImplementedError(
f"operator '{op_name}' not implemented for bool dtypes"
)
elif (
"bool-bool" in request.node.callspec.id
or "bool scalar-bool" in request.node.callspec.id
) and op in ["sub", "rsub"]:
raising_exceptions = TypeError(
"numpy boolean subtract, the `-` operator, is not supported, "
+ "use the bitwise_xor, the `^` operator, or the logical_xor function instead."
)

eval_general(
(modin_df1, modin_df2),
(pandas_df1, pandas_df2),
lambda dfs: getattr(dfs[0], op)(dfs[1]),
raising_exceptions=raising_exceptions,
)
11 changes: 6 additions & 5 deletions modin/pandas/test/dataframe/test_join_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ def test_merge(test_data, test_data2):
pandas_df,
lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
comparator=comparator,
raising_exceptions=ValueError("Cannot merge a Series without a name"),
)

# merge a Series with a name
Expand Down Expand Up @@ -538,9 +539,7 @@ def test_merge_on_single_index(left_index, right_index):


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys)
)
@pytest.mark.parametrize("ascending", [False, True])
@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
def test_sort_index(axis, ascending, na_position):
data = test_data["float_nan_data"]
Expand Down Expand Up @@ -620,8 +619,10 @@ def test_sort_multiindex(sort_remaining):
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize(
"ascending",
bool_arg_values + ["list_first_True", "list_first_False"],
ids=arg_keys("ascending", bool_arg_keys + ["list_first_True", "list_first_False"]),
[False, True] + ["list_first_True", "list_first_False"],
ids=arg_keys(
"ascending", ["False", "True"] + ["list_first_True", "list_first_False"]
),
)
@pytest.mark.parametrize(
"inplace", bool_arg_values, ids=arg_keys("inplace", bool_arg_keys)
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ def test_insert(data):
eval_insert(
pd.DataFrame(columns=list("ab")),
pandas.DataFrame(columns=list("ab")),
col=lambda df: df.columns[0],
col="Series insert",
YarShev marked this conversation as resolved.
Show resolved Hide resolved
value=lambda df: df[df.columns[0]],
)
eval_insert(
Expand Down
50 changes: 30 additions & 20 deletions modin/pandas/test/dataframe/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import numpy as np
import pandas
import pytest
from pandas._libs.lib import no_default
from pandas.core.dtypes.common import is_list_like

import modin.pandas as pd
Expand Down Expand Up @@ -60,13 +59,13 @@
def test_agg_dict():
md_df, pd_df = create_test_dfs(test_data_values[0])
agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")}
eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)
YarShev marked this conversation as resolved.
Show resolved Hide resolved
eval_general(md_df, pd_df, lambda df: df.agg(agg_dict))

agg_dict = {
"new_col1": (pd_df.columns[0], "sum"),
"new_col2": (pd_df.columns[-1], "count"),
}
eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict), raising_exceptions=True)
eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict))


@pytest.mark.parametrize("axis", [0, 1])
Expand All @@ -76,10 +75,19 @@ def test_agg_dict():
ids=agg_func_keys + agg_func_except_keys,
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply(axis, func, op):
def test_agg_apply(axis, func, op, request):
raising_exceptions = None
if "sum sum" in request.node.callspec.id:
raising_exceptions = pandas.errors.SpecificationError(
"Function names must be unique if there is no new column names assigned"
)
elif "should raise AssertionError" in request.node.callspec.id:
# FIXME: https://github.com/modin-project/modin/issues/7031
raising_exceptions = False
eval_general(
*create_test_dfs(test_data["float_nan_data"]),
lambda df: getattr(df, op)(func, axis),
raising_exceptions=raising_exceptions,
)


Expand All @@ -90,10 +98,19 @@ def test_agg_apply(axis, func, op):
ids=agg_func_keys + agg_func_except_keys,
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply_axis_names(axis, func, op):
def test_agg_apply_axis_names(axis, func, op, request):
raising_exceptions = None
if "sum sum" in request.node.callspec.id:
raising_exceptions = pandas.errors.SpecificationError(
"Function names must be unique if there is no new column names assigned"
)
elif "should raise AssertionError" in request.node.callspec.id:
# FIXME: https://github.com/modin-project/modin/issues/7031
raising_exceptions = False
eval_general(
*create_test_dfs(test_data["int_data"]),
lambda df: getattr(df, op)(func, axis),
raising_exceptions=raising_exceptions,
)


Expand Down Expand Up @@ -130,23 +147,22 @@ def test_apply_key_error(func):
eval_general(
*create_test_dfs(test_data["int_data"]),
lambda df: df.apply({"row": func}, axis=1),
raising_exceptions=KeyError("Column(s) ['row'] do not exist"),
)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("level", [no_default, None, -1, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", ["kurt", "count", "sum", "mean", "all", "any"])
def test_apply_text_func_with_level(level, data, func, axis):
func_kwargs = {"level": level, "axis": axis}
def test_apply_text_func(data, func, axis):
func_kwargs = {"axis": axis}
rows_number = len(next(iter(data.values()))) # length of the first data column
level_0 = np.random.choice([0, 1, 2], rows_number)
level_1 = np.random.choice([3, 4, 5], rows_number)
index = pd.MultiIndex.from_arrays([level_0, level_1])

eval_general(
pd.DataFrame(data, index=index),
pandas.DataFrame(data, index=index),
*create_test_dfs(data, index=index),
lambda df, *args, **kwargs: df.apply(func, *args, **kwargs),
**func_kwargs,
)
Expand Down Expand Up @@ -448,8 +464,6 @@ def test_query_named_index():
eval_general(
*(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])),
lambda df: df.query("col1 % 2 == 0 | col3 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand All @@ -460,8 +474,6 @@ def test_query_named_multiindex():
for df in create_test_dfs(test_data["int_data"])
),
lambda df: df.query("col1 % 2 == 1 | col3 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand All @@ -474,8 +486,6 @@ def make_df(without_index):
eval_general(
*(make_df(df) for df in create_test_dfs(test_data["int_data"])),
lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col4 % 2 == 1"),
# work around https://github.com/modin-project/modin/issues/6016
raising_exceptions=(Exception,),
)


Expand Down Expand Up @@ -514,9 +524,9 @@ def test_query_with_element_access_issue_4580(engine):

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
"func",
agg_func_values + agg_func_except_values,
ids=agg_func_keys + agg_func_except_keys,
"func", [lambda x: x + 1, [np.sqrt, np.exp]], ids=["lambda", "list_udfs"]
)
def test_transform(data, func):
def test_transform(data, func, request):
if "list_udfs" in request.node.callspec.id:
pytest.xfail(reason="https://github.com/modin-project/modin/issues/6998")
eval_general(*create_test_dfs(data), lambda df: df.transform(func))
Loading
Loading