modin-project · YarShev · Mar 14, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 22, 2024
@@ -457,7 +457,12 @@ def _read_csv_check_support(
                     + "'infer' header values",
                 )
             if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
-                raise ValueError("Missing column provided to 'parse_dates'")
+                # Sort the labels so the message is deterministic (set order is
+                # not); `str` keeps non-string labels joinable.
+                missing_columns = sorted(map(str, set(parse_dates) - set(names)))
+                raise ValueError(
+                    f"Missing column provided to 'parse_dates': '{', '.join(missing_columns)}'"
+                )
 
             empty_pandas_df = pandas.read_csv(
                 **dict(

@@ -296,6 +296,7 @@ def test_read_csv_datetime(
         engine,
         parse_dates,
         names,
+        request,
     ):
         parse_dates_unsupported = isinstance(parse_dates, dict) or (
             isinstance(parse_dates, list)
@@ -311,9 +312,27 @@ def test_read_csv_datetime(
         skip_exc_type_check = parse_dates_unsupported and engine == "arrow"
         if skip_exc_type_check:
             pytest.xfail(reason="https://github.com/modin-project/modin/issues/7012")
+
+        raising_exceptions = None
+        if "names1-parse_dates2" in request.node.callspec.id:
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'col2'"
+            )
+        elif (
+            "names1-parse_dates5-None" in request.node.callspec.id
+            or "names1-parse_dates4-None" in request.node.callspec.id
+        ):
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'col2, col3'"
+            )
+        elif "None-parse_dates3" in request.node.callspec.id:
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'c2'"
+            )
         eval_io(
             fn_name="read_csv",
             md_extra_kwargs={"engine": engine},
+            raising_exceptions=raising_exceptions,
             # read_csv kwargs
             filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
             parse_dates=parse_dates,
@@ -509,7 +528,7 @@ def test_dup_names(self, names):
             ["i1", "i2", "a"],
         ],
     )
-    def test_reset_index(self, names):
+    def test_reset_index(self, names, request):
         index = pandas.MultiIndex.from_tuples(
             [(i, j, k) for i in range(2) for j in range(3) for k in range(4)],
             names=names,
@@ -519,7 +538,12 @@ def applier(lib):
             df = lib.DataFrame(self.data, index=index) + 1
             return df.reset_index()
 
-        eval_general(pd, pandas, applier)
+        raising_exceptions = None
+        if "names3" in request.node.callspec.id:
+            raising_exceptions = ValueError("cannot insert i1, already exists")
+        elif "names4" in request.node.callspec.id:
+            raising_exceptions = ValueError("cannot insert a, already exists")
+        eval_general(pd, pandas, applier, raising_exceptions=raising_exceptions)
 
     @pytest.mark.parametrize("is_multiindex", [True, False])
     def test_reset_index_multicolumns(self, is_multiindex):
@@ -1327,7 +1351,7 @@ def groupby(df, **kwargs):
     @pytest.mark.parametrize("n", [10, -10])
     @pytest.mark.parametrize("invert", [True, False])
     @pytest.mark.parametrize("select", [True, False])
-    @pytest.mark.parametrize("ascending", [None, True, False])
+    @pytest.mark.parametrize("ascending", [True, False])
     def test_head_tail(self, op, n, invert, select, ascending):
         def head(df, **kwargs):
             if invert:

@@ -228,6 +228,9 @@ def _pandas_read_csv_glob(path, storage_options):
         ).reset_index(drop=True)
         return pandas_df
 
+    raising_exceptions = None
+    if "anon" in storage_options_extra:
+        raising_exceptions = PermissionError("Forbidden")
     eval_general(
         pd,
         pandas,
@@ -237,6 +240,7 @@ def _pandas_read_csv_glob(path, storage_options):
             else _pandas_read_csv_glob(s3_path, **kwargs)
         ),
         storage_options=s3_storage_options | storage_options_extra,
+        raising_exceptions=raising_exceptions,
     )
 
 

@@ -49,7 +49,7 @@
 @pytest.mark.parametrize(
     "other",
     [
-        lambda df: 4,
+        lambda df, axis: 4,
         lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]),
         lambda df, axis: {
             label: idx + 1
@@ -114,14 +114,19 @@ def test___rdivmod__():
         *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"),
     ],
 )
-def test_math_functions_fill_value(other, fill_value, op):
+def test_math_functions_fill_value(other, fill_value, op, request):
     data = test_data["int_data"]
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
 
+    raising_exceptions = None
+    if "check_different_index" in request.node.callspec.id and fill_value == 3.0:
+        raising_exceptions = NotImplementedError("fill_value 3.0 not supported.")
+
     eval_general(
         modin_df,
         pandas_df,
         lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value),
+        raising_exceptions=raising_exceptions,
         # This test causes an empty slice to be generated thus triggering:
         # https://github.com/modin-project/modin/issues/5974
         comparator_kwargs={"check_dtypes": get_current_execution() != "BaseOnPython"},
@@ -178,7 +183,7 @@ def test_math_alias(math_op, alias):
 @pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"])
 @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test_comparison(data, op, other):
+def test_comparison(data, op, other, request):
     def operation(df):
         df = getattr(df, op)(df if other == "as_left" else other)
         if other == "as_left" and StorageFormat.get() == "Hdk":
@@ -187,9 +192,20 @@ def operation(df):
             df = df.sort_index(axis=1)
         return df
 
+    raising_exceptions = None
+    if "int_data" in request.node.callspec.id and other == "a":
+        pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
+    elif "float_nan_data" in request.node.callspec.id and other == "a":
+        raising_exceptions = TypeError(
+            "Invalid comparison between dtype=float64 and str"
+        )
+        if StorageFormat.get() == "Hdk":
+            pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
+
     eval_general(
         *create_test_dfs(data),
         operation=operation,
+        raising_exceptions=raising_exceptions,
     )
 
 
@@ -344,6 +360,9 @@ def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partit
             lambda df: (
                 df / modin_df1.a if isinstance(df, pd.DataFrame) else df / pandas_df1.a
             ),
+            raising_exceptions=ValueError(
+                "cannot reindex on an axis with duplicate labels"
+            ),
         )
         return
 
@@ -492,13 +511,40 @@ def test_non_commutative_multiply():
         pytest.param(3.5, id="float scalar"),
     ],
 )
-def test_arithmetic_with_tricky_dtypes(val1, val2, op):
+def test_arithmetic_with_tricky_dtypes(val1, val2, op, request):
     modin_df1, pandas_df1 = create_test_dfs(val1)
     modin_df2, pandas_df2 = (
         create_test_dfs(val2) if isinstance(val2, list) else (val2, val2)
     )
+
+    raising_exceptions = None
+    if (
+        "bool-bool" in request.node.callspec.id
+        or "bool scalar-bool" in request.node.callspec.id
+    ) and op in [
+        "pow",
+        "rpow",
+        "truediv",
+        "rtruediv",
+        "floordiv",
+        "rfloordiv",
+    ]:
+        op_name = op[1:] if op.startswith("r") else op
+        raising_exceptions = NotImplementedError(
+            f"operator '{op_name}' not implemented for bool dtypes"
+        )
+    elif (
+        "bool-bool" in request.node.callspec.id
+        or "bool scalar-bool" in request.node.callspec.id
+    ) and op in ["sub", "rsub"]:
+        raising_exceptions = TypeError(
+            "numpy boolean subtract, the `-` operator, is not supported, "
+            + "use the bitwise_xor, the `^` operator, or the logical_xor function instead."
+        )
+
     eval_general(
         (modin_df1, modin_df2),
         (pandas_df1, pandas_df2),
         lambda dfs: getattr(dfs[0], op)(dfs[1]),
+        raising_exceptions=raising_exceptions,
     )
@@ -384,6 +384,7 @@ def test_merge(test_data, test_data2):
         pandas_df,
         lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
         comparator=comparator,
+        raising_exceptions=ValueError("Cannot merge a Series without a name"),
     )
 
     # merge a Series with a name
@@ -538,9 +539,7 @@ def test_merge_on_single_index(left_index, right_index):
 
 
 @pytest.mark.parametrize("axis", [0, 1])
-@pytest.mark.parametrize(
-    "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys)
-)
+@pytest.mark.parametrize("ascending", [False, True])
 @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
 def test_sort_index(axis, ascending, na_position):
     data = test_data["float_nan_data"]
@@ -620,8 +619,10 @@ def test_sort_multiindex(sort_remaining):
 @pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
 @pytest.mark.parametrize(
     "ascending",
-    bool_arg_values + ["list_first_True", "list_first_False"],
-    ids=arg_keys("ascending", bool_arg_keys + ["list_first_True", "list_first_False"]),
+    [False, True] + ["list_first_True", "list_first_False"],
+    ids=arg_keys(
+        "ascending", ["False", "True"] + ["list_first_True", "list_first_False"]
+    ),
 )
 @pytest.mark.parametrize(
     "inplace", bool_arg_values, ids=arg_keys("inplace", bool_arg_keys)

@@ -1333,7 +1333,7 @@ def test_insert(data):
         eval_insert(
             pd.DataFrame(columns=list("ab")),
             pandas.DataFrame(columns=list("ab")),
-            col=lambda df: df.columns[0],
+            col="Series insert",
             value=lambda df: df[df.columns[0]],
         )
     eval_insert(

@@ -15,7 +15,6 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._libs.lib import no_default
 from pandas.core.dtypes.common import is_list_like
 
 import modin.pandas as pd
@@ -60,13 +59,13 @@
 def test_agg_dict():
     md_df, pd_df = create_test_dfs(test_data_values[0])
     agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")}
-    eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)
+    eval_general(md_df, pd_df, lambda df: df.agg(agg_dict))
 
     agg_dict = {
         "new_col1": (pd_df.columns[0], "sum"),
         "new_col2": (pd_df.columns[-1], "count"),
     }
-    eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict), raising_exceptions=True)
+    eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict))
 
 
 @pytest.mark.parametrize("axis", [0, 1])
@@ -76,10 +75,19 @@ def test_agg_dict():
     ids=agg_func_keys + agg_func_except_keys,
 )
 @pytest.mark.parametrize("op", ["agg", "apply"])
-def test_agg_apply(axis, func, op):
+def test_agg_apply(axis, func, op, request):
+    raising_exceptions = None
+    if "sum sum" in request.node.callspec.id:
+        raising_exceptions = pandas.errors.SpecificationError(
+            "Function names must be unique if there is no new column names assigned"
+        )
+    elif "should raise AssertionError" in request.node.callspec.id:
+        # FIXME: https://github.com/modin-project/modin/issues/7031
+        raising_exceptions = False
     eval_general(
         *create_test_dfs(test_data["float_nan_data"]),
         lambda df: getattr(df, op)(func, axis),
+        raising_exceptions=raising_exceptions,
     )
 
 
@@ -90,10 +98,19 @@ def test_agg_apply(axis, func, op):
     ids=agg_func_keys + agg_func_except_keys,
 )
 @pytest.mark.parametrize("op", ["agg", "apply"])
-def test_agg_apply_axis_names(axis, func, op):
+def test_agg_apply_axis_names(axis, func, op, request):
+    raising_exceptions = None
+    if "sum sum" in request.node.callspec.id:
+        raising_exceptions = pandas.errors.SpecificationError(
+            "Function names must be unique if there is no new column names assigned"
+        )
+    elif "should raise AssertionError" in request.node.callspec.id:
+        # FIXME: https://github.com/modin-project/modin/issues/7031
+        raising_exceptions = False
     eval_general(
         *create_test_dfs(test_data["int_data"]),
         lambda df: getattr(df, op)(func, axis),
+        raising_exceptions=raising_exceptions,
     )
 
 
@@ -130,23 +147,22 @@ def test_apply_key_error(func):
     eval_general(
         *create_test_dfs(test_data["int_data"]),
         lambda df: df.apply({"row": func}, axis=1),
+        raising_exceptions=KeyError("Column(s) ['row'] do not exist"),
     )
 
 
 @pytest.mark.parametrize("axis", [0, 1])
-@pytest.mark.parametrize("level", [no_default, None, -1, 0, 1])
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("func", ["kurt", "count", "sum", "mean", "all", "any"])
-def test_apply_text_func_with_level(level, data, func, axis):
-    func_kwargs = {"level": level, "axis": axis}
+def test_apply_text_func(data, func, axis):
+    func_kwargs = {"axis": axis}
     rows_number = len(next(iter(data.values())))  # length of the first data column
     level_0 = np.random.choice([0, 1, 2], rows_number)
     level_1 = np.random.choice([3, 4, 5], rows_number)
     index = pd.MultiIndex.from_arrays([level_0, level_1])
 
     eval_general(
-        pd.DataFrame(data, index=index),
-        pandas.DataFrame(data, index=index),
+        *create_test_dfs(data, index=index),
         lambda df, *args, **kwargs: df.apply(func, *args, **kwargs),
         **func_kwargs,
     )
@@ -448,8 +464,6 @@ def test_query_named_index():
     eval_general(
         *(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])),
         lambda df: df.query("col1 % 2 == 0 | col3 % 2 == 1"),
-        # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=(Exception,),
     )
 
 
@@ -460,8 +474,6 @@ def test_query_named_multiindex():
             for df in create_test_dfs(test_data["int_data"])
         ),
         lambda df: df.query("col1 % 2 == 1 | col3 % 2 == 1"),
-        # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=(Exception,),
     )
 
 
@@ -474,8 +486,6 @@ def make_df(without_index):
     eval_general(
         *(make_df(df) for df in create_test_dfs(test_data["int_data"])),
         lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col4 % 2 == 1"),
-        # work around https://github.com/modin-project/modin/issues/6016
-        raising_exceptions=(Exception,),
     )
 
 
@@ -514,9 +524,9 @@ def test_query_with_element_access_issue_4580(engine):
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize(
-    "func",
-    agg_func_values + agg_func_except_values,
-    ids=agg_func_keys + agg_func_except_keys,
+    "func", [lambda x: x + 1, [np.sqrt, np.exp]], ids=["lambda", "list_udfs"]
 )
-def test_transform(data, func):
+def test_transform(data, func, request):
+    if "list_udfs" in request.node.callspec.id:
+        pytest.xfail(reason="https://github.com/modin-project/modin/issues/6998")
     eval_general(*create_test_dfs(data), lambda df: df.transform(func))