modin-project · YarShev · Mar 14, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 22, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -395,7 +395,7 @@ jobs:
           - ubuntu
           - windows
         python-version: ["3.9"]
-        engine: ${{ fromJSON( github.event_name == 'push' && '["python", "ray", "dask"]' || needs.execution-filter.outputs.engines ) }}
+        engine: ["python", "ray", "dask"]
         test_task:
           - group_1
           - group_2

@@ -457,7 +457,10 @@ def _read_csv_check_support(
                     + "'infer' header values",
                 )
             if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
-                raise ValueError("Missing column provided to 'parse_dates'")
+                missing_columns = sorted(set(parse_dates) - set(names))  # stable order
+                raise ValueError(
+                    f"Missing column provided to 'parse_dates': '{', '.join(missing_columns)}'"
+                )
 
             empty_pandas_df = pandas.read_csv(
                 **dict(

@@ -296,6 +296,7 @@ def test_read_csv_datetime(
         engine,
         parse_dates,
         names,
+        request,
     ):
         parse_dates_unsupported = isinstance(parse_dates, dict) or (
             isinstance(parse_dates, list)
@@ -311,9 +312,27 @@ def test_read_csv_datetime(
         skip_exc_type_check = parse_dates_unsupported and engine == "arrow"
         if skip_exc_type_check:
             pytest.xfail(reason="https://github.com/modin-project/modin/issues/7012")
+
+        raising_exceptions = None
+        if "names1-parse_dates2" in request.node.callspec.id:
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'col2'"
+            )
+        elif (
+            "names1-parse_dates5-None" in request.node.callspec.id
+            or "names1-parse_dates4-None" in request.node.callspec.id
+        ):
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'col2, col3'"
+            )
+        elif "None-parse_dates3" in request.node.callspec.id:
+            raising_exceptions = ValueError(
+                "Missing column provided to 'parse_dates': 'c2'"
+            )
         eval_io(
             fn_name="read_csv",
             md_extra_kwargs={"engine": engine},
+            raising_exceptions=raising_exceptions,
             # read_csv kwargs
             filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
             parse_dates=parse_dates,
@@ -509,7 +528,7 @@ def test_dup_names(self, names):
             ["i1", "i2", "a"],
         ],
     )
-    def test_reset_index(self, names):
+    def test_reset_index(self, names, request):
         index = pandas.MultiIndex.from_tuples(
             [(i, j, k) for i in range(2) for j in range(3) for k in range(4)],
             names=names,
@@ -519,7 +538,12 @@ def applier(lib):
             df = lib.DataFrame(self.data, index=index) + 1
             return df.reset_index()
 
-        eval_general(pd, pandas, applier)
+        raising_exceptions = None
+        if "names3" in request.node.callspec.id:
+            raising_exceptions = ValueError("cannot insert i1, already exists")
+        elif "names4" in request.node.callspec.id:
+            raising_exceptions = ValueError("cannot insert a, already exists")
+        eval_general(pd, pandas, applier, raising_exceptions=raising_exceptions)
 
     @pytest.mark.parametrize("is_multiindex", [True, False])
     def test_reset_index_multicolumns(self, is_multiindex):
@@ -1327,7 +1351,7 @@ def groupby(df, **kwargs):
     @pytest.mark.parametrize("n", [10, -10])
     @pytest.mark.parametrize("invert", [True, False])
     @pytest.mark.parametrize("select", [True, False])
-    @pytest.mark.parametrize("ascending", [None, True, False])
+    @pytest.mark.parametrize("ascending", [True, False])
     def test_head_tail(self, op, n, invert, select, ascending):
         def head(df, **kwargs):
             if invert:

@@ -228,6 +228,9 @@ def _pandas_read_csv_glob(path, storage_options):
         ).reset_index(drop=True)
         return pandas_df
 
+    raising_exceptions = None
+    if "anon" in storage_options_extra:
+        raising_exceptions = PermissionError("Forbidden")
     eval_general(
         pd,
         pandas,
@@ -237,6 +240,7 @@ def _pandas_read_csv_glob(path, storage_options):
             else _pandas_read_csv_glob(s3_path, **kwargs)
         ),
         storage_options=s3_storage_options | storage_options_extra,
+        raising_exceptions=raising_exceptions,
     )
 
 

@@ -49,7 +49,7 @@
 @pytest.mark.parametrize(
     "other",
     [
-        lambda df: 4,
+        lambda df, axis: 4,
         lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]),
         lambda df, axis: {
             label: idx + 1
@@ -114,14 +114,19 @@ def test___rdivmod__():
         *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"),
     ],
 )
-def test_math_functions_fill_value(other, fill_value, op):
+def test_math_functions_fill_value(other, fill_value, op, request):
     data = test_data["int_data"]
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
 
+    raising_exceptions = None
+    if "check_different_index" in request.node.callspec.id and fill_value == 3.0:
+        raising_exceptions = NotImplementedError("fill_value 3.0 not supported.")
+
     eval_general(
         modin_df,
         pandas_df,
         lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value),
+        raising_exceptions=raising_exceptions,
         # This test causes an empty slice to be generated thus triggering:
         # https://github.com/modin-project/modin/issues/5974
         comparator_kwargs={"check_dtypes": get_current_execution() != "BaseOnPython"},
@@ -178,7 +183,7 @@ def test_math_alias(math_op, alias):
 @pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"])
 @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test_comparison(data, op, other):
+def test_comparison(data, op, other, request):
     def operation(df):
         df = getattr(df, op)(df if other == "as_left" else other)
         if other == "as_left" and StorageFormat.get() == "Hdk":
@@ -187,9 +192,20 @@ def operation(df):
             df = df.sort_index(axis=1)
         return df
 
+    raising_exceptions = None
+    if "int_data" in request.node.callspec.id and other == "a":
+        pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
+    elif "float_nan_data" in request.node.callspec.id and other == "a":
+        raising_exceptions = TypeError(
+            "Invalid comparison between dtype=float64 and str"
+        )
+        if StorageFormat.get() == "Hdk":
+            pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
+
     eval_general(
         *create_test_dfs(data),
         operation=operation,
+        raising_exceptions=raising_exceptions,
     )
 
 
@@ -319,7 +335,9 @@ def test_equals_with_nans():
 @pytest.mark.parametrize(
     "is_idx_aligned", [True, False], ids=["idx_aligned", "idx_not_aligned"]
 )
-def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partitions):
+def test_mismatched_row_partitions(
+    is_idx_aligned, op_type, is_more_other_partitions, request
+):
     data = [0, 1, 2, 3, 4, 5]
     modin_df1, pandas_df1 = create_test_dfs({"a": data, "b": data})
     modin_df, pandas_df = modin_df1.loc[:2], pandas_df1.loc[:2]
@@ -344,6 +362,9 @@ def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partit
             lambda df: (
                 df / modin_df1.a if isinstance(df, pd.DataFrame) else df / pandas_df1.a
             ),
+            raising_exceptions=ValueError(
+                "cannot reindex on an axis with duplicate labels"
+            ),
         )
         return
 
@@ -492,13 +513,40 @@ def test_non_commutative_multiply():
         pytest.param(3.5, id="float scalar"),
     ],
 )
-def test_arithmetic_with_tricky_dtypes(val1, val2, op):
+def test_arithmetic_with_tricky_dtypes(val1, val2, op, request):
     modin_df1, pandas_df1 = create_test_dfs(val1)
     modin_df2, pandas_df2 = (
         create_test_dfs(val2) if isinstance(val2, list) else (val2, val2)
     )
+
+    raising_exceptions = None
+    if (
+        "bool-bool" in request.node.callspec.id
+        or "bool scalar-bool" in request.node.callspec.id
+    ) and op in [
+        "pow",
+        "rpow",
+        "truediv",
+        "rtruediv",
+        "floordiv",
+        "rfloordiv",
+    ]:
+        op_name = op[1:] if op.startswith("r") else op
+        raising_exceptions = NotImplementedError(
+            f"operator '{op_name}' not implemented for bool dtypes"
+        )
+    elif (
+        "bool-bool" in request.node.callspec.id
+        or "bool scalar-bool" in request.node.callspec.id
+    ) and op in ["sub", "rsub"]:
+        raising_exceptions = TypeError(
+            "numpy boolean subtract, the `-` operator, is not supported, "
+            + "use the bitwise_xor, the `^` operator, or the logical_xor function instead."
+        )
+
     eval_general(
         (modin_df1, modin_df2),
         (pandas_df1, pandas_df2),
         lambda dfs: getattr(dfs[0], op)(dfs[1]),
+        raising_exceptions=raising_exceptions,
     )
@@ -384,6 +384,7 @@ def test_merge(test_data, test_data2):
         pandas_df,
         lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
         comparator=comparator,
+        raising_exceptions=ValueError("Cannot merge a Series without a name"),
     )
 
     # merge a Series with a name
@@ -538,9 +539,7 @@ def test_merge_on_single_index(left_index, right_index):
 
 
 @pytest.mark.parametrize("axis", [0, 1])
-@pytest.mark.parametrize(
-    "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys)
-)
+@pytest.mark.parametrize("ascending", [False, True])
 @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
 def test_sort_index(axis, ascending, na_position):
     data = test_data["float_nan_data"]
@@ -620,8 +619,10 @@ def test_sort_multiindex(sort_remaining):
 @pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
 @pytest.mark.parametrize(
     "ascending",
-    bool_arg_values + ["list_first_True", "list_first_False"],
-    ids=arg_keys("ascending", bool_arg_keys + ["list_first_True", "list_first_False"]),
+    [False, True] + ["list_first_True", "list_first_False"],
+    ids=arg_keys(
+        "ascending", ["False", "True"] + ["list_first_True", "list_first_False"]
+    ),
 )
 @pytest.mark.parametrize(
     "inplace", bool_arg_values, ids=arg_keys("inplace", bool_arg_keys)

@@ -1333,7 +1333,7 @@ def test_insert(data):
         eval_insert(
             pd.DataFrame(columns=list("ab")),
             pandas.DataFrame(columns=list("ab")),
-            col=lambda df: df.columns[0],
+            col="Series insert",
             value=lambda df: df[df.columns[0]],
         )
     eval_insert(