From cf91b8d63b6574b2a5b48741a90fd5a7e8fbef16 Mon Sep 17 00:00:00 2001
From: Jianzhun Du
Date: Mon, 21 Oct 2024 15:50:04 -0700
Subject: [PATCH 1/3] Enable CTE optimization and query compilation stage in
 integration tests

---
 tests/integ/conftest.py                     |  3 ++-
 tests/integ/modin/crosstab/test_crosstab.py |  2 +-
 tests/integ/modin/frame/test_loc.py         | 18 +++++++++---------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tests/integ/conftest.py b/tests/integ/conftest.py
index 60e078160b..2c7c1cfd1b 100644
--- a/tests/integ/conftest.py
+++ b/tests/integ/conftest.py
@@ -217,7 +217,8 @@ def session(
         .create()
     )
     session.sql_simplifier_enabled = sql_simplifier_enabled
-    session._cte_optimization_enabled = cte_optimization_enabled
+    session._cte_optimization_enabled = True
+    session._query_compilation_stage_enabled = True
     if os.getenv("GITHUB_ACTIONS") == "true" and not local_testing_mode:
         set_up_external_access_integration_resources(
             session, rule1, rule2, key1, key2, integration1, integration2
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index bff11bf311..5b26dc282c 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -438,7 +438,7 @@ def eval_func(lib):
         with SqlCounter(
             query_count=1,
-            join_count=7 if dropna else 10,
+            join_count=3 if dropna else 4,
             union_count=1,
         ):
             eval_snowpark_pandas_result(
diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py
index 4ff162abce..eb1884fb5c 100644
--- a/tests/integ/modin/frame/test_loc.py
+++ b/tests/integ/modin/frame/test_loc.py
@@ -973,7 +973,7 @@ def loc_set_helper(df):
     )


-@sql_count_checker(query_count=2, join_count=8)
+@sql_count_checker(query_count=2, join_count=6)
 def test_df_loc_set_series_and_list_like_row_key_negative(key_type):
     # This test verifies pandas raises ValueError when row key is out-of-bounds but Snowpark pandas will ignore the
     # out-of-bound index
@@ -1301,7 +1301,7 @@ def loc_set_helper(df):
         else:
             df.loc[row_key, :] = pd.DataFrame(item)

-    with SqlCounter(query_count=1, join_count=4):
+    with SqlCounter(query_count=1, join_count=3):
         if item.index.has_duplicates:
             # pandas fails to update duplicated rows with duplicated item
             with pytest.raises(
@@ -1322,7 +1322,7 @@ def loc_set_helper(df):
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_df_loc_set_duplicate_cols_in_df_and_col_key():
     df = native_pd.DataFrame(
         [[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]], columns=["D", "B", "B", "A"]
@@ -1922,7 +1922,7 @@ def test_df_loc_get_key_scalar(
 ):
     key = random.choice(range(0, len(default_index_native_df)))
     # squeeze and to_pandas trigger additional queries
-    with SqlCounter(query_count=2, join_count=3):
+    with SqlCounter(query_count=2, join_count=2):
         eval_snowpark_pandas_result(
             default_index_snowpark_pandas_df,
             default_index_native_df,
@@ -3202,7 +3202,7 @@ def test_df_loc_set_boolean_series_with_non_default_index_key_and_scalar_item():
         ["duplicate", [1, 1, 2, 3]],
     ],
 )
-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_df_loc_set_duplicate_index(
     self_index_type, self_index_val, index, columns, item
 ):
@@ -3440,7 +3440,7 @@ def loc_helper(df):
         else:
             df.loc[["z", "y"], :] = val

-    with SqlCounter(query_count=1, join_count=4):
+    with SqlCounter(query_count=1, join_count=3):
         eval_snowpark_pandas_result(
             snow_df,
             native_df,
@@ -3460,7 +3460,7 @@ def loc_helper(df):
     def loc_helper(df):
         df.loc[["x", "y"], :] = val[:-2]

-    with SqlCounter(query_count=1, join_count=4):
+    with SqlCounter(query_count=1, join_count=3):
         eval_snowpark_pandas_result(
             snow_df,
             native_df,
@@ -3487,7 +3487,7 @@ def loc_helper(df):
         else:
             snow_df.loc[["x", "y", "z", "w"], :] = val[:2]

-    with SqlCounter(query_count=1, join_count=4):
+    with SqlCounter(query_count=1, join_count=3):
         eval_snowpark_pandas_result(
             snow_df,
             native_df,
@@ -4073,7 +4073,7 @@ def test_df_loc_get_with_timedelta_and_none_key():
     assert_frame_equal(snow_df.loc[None], expected_df, check_column_type=False)


-@sql_count_checker(query_count=2, join_count=6)
+@sql_count_checker(query_count=2, join_count=4)
 @pytest.mark.parametrize("index", [list("ABC"), [0, 1, 2]])
 def test_df_loc_set_row_from_series(index):
     native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC"))

From 7751757278680af98674053af1e18dc54b60ce96 Mon Sep 17 00:00:00 2001
From: Jianzhun Du
Date: Tue, 22 Oct 2024 13:29:18 -0700
Subject: [PATCH 2/3] Fix expected SQL query, join, and union counts in
 affected tests

---
 tests/integ/modin/crosstab/test_crosstab.py   | 22 ++++++-------
 tests/integ/modin/frame/test_assign.py        |  8 ++---
 tests/integ/modin/frame/test_cache_result.py  |  4 +--
 tests/integ/modin/frame/test_describe.py      | 26 +++++++--------
 tests/integ/modin/frame/test_loc.py           | 16 +++++-----
 tests/integ/modin/frame/test_name.py          |  2 +-
 tests/integ/modin/frame/test_setitem.py       |  8 ++---
 .../modin/groupby/test_groupby_transform.py   |  2 +-
 .../modin/pivot/test_pivot_table_dropna.py    | 22 ++++++-------
 .../pivot/test_pivot_table_fill_value.py      | 18 +++++------
 .../modin/pivot/test_pivot_table_margins.py   | 32 ++++++++++---------
 tests/integ/modin/series/test_cache_result.py |  2 +-
 tests/integ/modin/series/test_describe.py     | 24 +++++++++-----
 tests/integ/modin/series/test_fillna.py       |  2 +-
 tests/integ/modin/series/test_loc.py          | 14 ++++----
 tests/integ/modin/series/test_mask.py         |  2 +-
 tests/integ/modin/series/test_setitem.py      |  4 +--
 tests/integ/modin/series/test_where.py        |  2 +-
 .../modin/types/test_timedelta_indexing.py    |  2 +-
 19 files changed, 111 insertions(+), 101 deletions(-)

diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 5b26dc282c..faa46a829a 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -319,7 +319,7 @@ def test_margins(self, dropna, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize(self, dropna, normalize, a, b, c):
         query_count = 1 if normalize in (0, "index") else 2
-        join_count = 3 if normalize in (0, "index") else 2
+        join_count = 3 if normalize in (0, "index") and dropna else 2

         if dropna:
             join_count -= 2
@@ -340,9 +340,9 @@ def test_normalize(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize_and_margins(self, dropna, normalize, a, b, c):
         counts = {
-            "columns": [3, 5 if dropna else 9, 4],
-            "index": [1, 5 if dropna else 8, 3],
-            "all": [3, 12 if dropna else 19, 7],
+            "columns": [3, 4 if dropna else 7, 3],
+            "index": [1, 3 if dropna else 4, 1],
+            "all": [3, 7 if dropna else 10, 3],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -374,8 +374,8 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [3, 29 if dropna else 41, 4],
-            "index": [1, 23 if dropna else 32, 3],
+            "columns": [3, 10 if dropna else 13, 3],
+            "index": [1, 5 if dropna else 6, 1],
             "all": [3, 54 if dropna else 75, 7],
         }
         counts[0] = counts["index"]
@@ -451,9 +451,9 @@ def eval_func(lib):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [2, 4 if dropna else 10],
-            "index": [1, 5 if dropna else 11],
-            "all": [2, 4 if dropna else 10],
+            "columns": [2, 4 if dropna else 6],
+            "index": [1, 3 if dropna else 4],
+            "all": [2, 4 if dropna else 6],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -520,7 +520,7 @@ def test_normalize_margins_and_values_not_supported(
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 1
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df = basic_crosstab_dfs[0]

         with SqlCounter(query_count=query_count, join_count=join_count):
@@ -539,7 +539,7 @@ def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 5
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df, snow_df = basic_crosstab_dfs

         def eval_func(df):
diff --git a/tests/integ/modin/frame/test_assign.py b/tests/integ/modin/frame/test_assign.py
index 9cda3b5f59..3bfea77d70 100644
--- a/tests/integ/modin/frame/test_assign.py
+++ b/tests/integ/modin/frame/test_assign.py
@@ -60,7 +60,7 @@ def assign_func(df):

 @pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"])
 def test_assign_basic_non_pandas_object(new_col_value):
-    join_count = 4 if isinstance(new_col_value, list) else 1
+    join_count = 3 if isinstance(new_col_value, list) else 1
     with SqlCounter(query_count=1, join_count=join_count):
         snow_df, native_df = create_test_dfs(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
@@ -74,7 +74,7 @@ def test_assign_basic_non_pandas_object(new_col_value):
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_long_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just truncates the last element of the new column so that it is the correct length. If we wanted
@@ -98,7 +98,7 @@ def test_assign_invalid_long_column_length_negative():
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_short_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just broadcasts the last element of the new column so that it is filled. If we wanted
@@ -226,7 +226,7 @@ def test_assign_self_columns():
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_overwrite_columns_via_assign():
     snow_df, native_df = create_test_dfs(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py
index 5fb094f4ed..2129d776fe 100644
--- a/tests/integ/modin/frame/test_cache_result.py
+++ b/tests/integ/modin/frame/test_cache_result.py
@@ -130,7 +130,7 @@ def test_cache_result_simple(self, inplace):
         native_df = perform_chained_operations(
             native_pd.DataFrame(np.arange(15).reshape((3, 5))), native_pd
         )
-        with SqlCounter(query_count=1, union_count=29):
+        with SqlCounter(query_count=1, union_count=11):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -213,7 +213,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
         with SqlCounter(
             query_count=11,
             union_count=9,
-            udf_count=2,
+            udf_count=1,
             high_count_expected=True,
             high_count_reason="applymap requires additional queries to setup the UDF.",
         ):
diff --git a/tests/integ/modin/frame/test_describe.py b/tests/integ/modin/frame/test_describe.py
index e89f53ac59..117f3c78d4 100644
--- a/tests/integ/modin/frame/test_describe.py
+++ b/tests/integ/modin/frame/test_describe.py
@@ -49,11 +49,11 @@ def test_describe_numeric_only(data):
     # In total, we thus have 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs for an N-column frame.
     [
         # If there are multiple modes, return the value that appears first
-        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 9),
+        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 5),
         # Empty columns are numeric by default (df constructor must explicitly specify object dtype)
-        ({"a": [], "b": []}, 9),
+        ({"a": [], "b": []}, 5),
         # Heterogeneous data is considered non-numeric
-        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 13),
+        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 6),
         (
             [
                 [None, "quick", None],
@@ -61,7 +61,7 @@ def test_describe_numeric_only(data):
                 ["dog", "dog", "lazy"],
                 [None, None, None],
             ],
-            13,
+            6,
         ),
     ],
 )
@@ -107,7 +107,7 @@ def test_describe_empty_cols():
     # 4K-1 UNIONs to compute top/freq for K object-dtype columns (see comment on
     # test_describe_obj_only for reasoning).
     # Since we have K=2 object columns, the result is 9 + (4 * 2 - 1) = 16 UNIONs.
-    ([int, object], None, None, 16),
+    ([int, object], None, None, 12),
     (np.number, [], None, 7),
     # Including only datetimes has 7 statistics since std is not computed.
    # Since there is only 1 column, all quantiles are computed in a single QC.
@@ -127,8 +127,8 @@
     # include and exclude cannot directly overlap
     ([int, "O"], [float, "O"], ValueError, 0),
     # Like select_dtypes, a dtype in include/exclude can be a subtype of a dtype in the other
-    ([int, "O"], [float, np.number, np.datetime64], None, 9),
-    ("O", None, None, 9),
+    ([int, "O"], [float, np.number, np.datetime64], None, 5),
+    ("O", None, None, 5),
 ],
 )
 def test_describe_include_exclude(
@@ -285,9 +285,9 @@ def timestamp_describe_comparator(snow_res, native_res):
     # Don't need to test all permutations of include/exclude with MultiIndex -- this is covered by
     # tests for select_dtypes, as well as other tests in this file
     [
-        ("all", 16),
+        ("all", 12),
         (np.number, 7),
-        (object, 9),
+        (object, 5),
     ],
 )
 def test_describe_multiindex(index, columns, include, expected_union_count):
@@ -312,10 +312,10 @@ def test_describe_multiindex(index, columns, include, expected_union_count):
     "include, exclude, expected_union_count",
     [
         (None, None, 7),
-        ("all", None, 12),
+        ("all", None, 11),
         (np.number, None, 7),
-        (None, float, 10),
-        (object, None, 5),
+        (None, float, 9),
+        (object, None, 4),
         (None, object, 7),
         (int, float, 5),
         (float, int, 5),
@@ -350,7 +350,7 @@ def helper(df):

 @sql_count_checker(
     query_count=3,
-    union_count=21,
+    union_count=8,
 )
 # SNOW-1320296 - pd.concat SQL Compilation ambiguous __row_position__ issue
 def test_describe_object_file(resources_path):
diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py
index eb1884fb5c..05beb6e6de 100644
--- a/tests/integ/modin/frame/test_loc.py
+++ b/tests/integ/modin/frame/test_loc.py
@@ -811,7 +811,7 @@ def loc_set_helper(df):
         else:
             df.loc[pd.Series(row_key), :] = pd.DataFrame(item)

-    expected_join_count = 4 if not row_key.dtype == bool else 2
+    expected_join_count = 3 if not row_key.dtype == bool else 2
     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(pd.DataFrame(df), df, loc_set_helper, inplace=True)

@@ -851,7 +851,7 @@ def loc_set_helper(df):
             df.loc[row_key, col_key] = item

     expected_join_count = (
-        6 if isinstance(col_key, str) and isinstance(item, list) else 1
+        4 if isinstance(col_key, str) and isinstance(item, list) else 1
     )

     with SqlCounter(query_count=1, join_count=expected_join_count):
@@ -914,7 +914,7 @@ def test_df_loc_set_list_like_row_key(row_key, key_type):
     )

     expected_join_count = (
-        2 if all(isinstance(i, bool) for i in row_key) and len(row_key) > 0 else 4
+        2 if all(isinstance(i, bool) for i in row_key) and len(row_key) > 0 else 3
     )

     # test case for df.loc[row_key] = item
@@ -1220,7 +1220,7 @@ def loc_set_helper(df):
         # otherwise, pandas raise ValueError: cannot reindex on an axis with duplicate labels
         or (df.columns.equals(df.columns.union(col_key)))
     ):
-        query_count, join_count, expect_exception = 1, 4, False
+        query_count, join_count, expect_exception = 1, 3, False
         if isinstance(col_key, native_pd.Series):
             query_count += 1

@@ -2696,7 +2696,7 @@ def test_empty_df_loc_set_series_and_list(native_item):
         else native_item
     )

-    expected_join_count = 2 if isinstance(native_item, native_pd.Series) else 4
+    expected_join_count = 2 if isinstance(native_item, native_pd.Series) else 3

     def setitem_op(df):
         item = native_item if isinstance(df, native_pd.DataFrame) else snow_item
@@ -2761,7 +2761,7 @@ def set_loc_helper(df):
         else:
             df.loc[key] = native_item_df

-    expected_join_count = 1 if key == slice(None, None, None) else 4
+    expected_join_count = 1 if key == slice(None, None, None) else 3
     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(snow_df, native_df, set_loc_helper, inplace=True)

@@ -3010,7 +3010,7 @@ def loc_set_helper(df):
             len(row_key) - len(native_item)
         )

-    expected_join_count = 4 if len(item) > 1 else 2
+    expected_join_count = 3 if len(item) > 1 else 2
     # 4 extra queries for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist
     with SqlCounter(
         query_count=5 if item_type_name == "index" else 1,
@@ -3340,7 +3340,7 @@ def loc_set_helper(df):
         else:
             df.loc[snow_indexers] = item

-    expected_join_count = 4
+    expected_join_count = 3
     if isinstance(indexer[0], slice):
         expected_join_count = 1
diff --git a/tests/integ/modin/frame/test_name.py b/tests/integ/modin/frame/test_name.py
index 39e04d862b..54c053e0cf 100644
--- a/tests/integ/modin/frame/test_name.py
+++ b/tests/integ/modin/frame/test_name.py
@@ -39,7 +39,7 @@ def test_create_dataframe_from_object_with_name(sample):
     )


-@sql_count_checker(query_count=1, join_count=2, union_count=1)
+@sql_count_checker(query_count=1, join_count=1, union_count=1)
 def test_create_dataframe_from_snowpark_pandas_series():
     df = pd.DataFrame([[2, 3, 4], [5, 6, 7]], columns=["X", "Y", "Z"])
     df = pd.DataFrame([df.X, df.iloc[:, 2]])
diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py
index 78f537b9c9..e74dbca348 100644
--- a/tests/integ/modin/frame/test_setitem.py
+++ b/tests/integ/modin/frame/test_setitem.py
@@ -145,7 +145,7 @@ def setitem(df):
         else:
             df[key] = val

-    expected_join_count = 3 if isinstance(key.start, int) else 4
+    expected_join_count = 3
     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(snow_df, native_df, setitem, inplace=True)

@@ -246,7 +246,7 @@ def func_insert_new_column(df):

 # matching_item_row_by_label is False here.
-@sql_count_checker(query_count=2, join_count=8)
+@sql_count_checker(query_count=2, join_count=6)
 def test_df_setitem_array_value():
     # Case: setting an array as a new column (df[col] = arr) copies that data
     data = {"a": [1, 2, 3], "b": [4, 5, 6]}
@@ -376,7 +376,7 @@ def func_insert_new_column(df, column):
     elif isinstance(column, native_pd.Index) and not isinstance(
         column, native_pd.DatetimeIndex
     ):
-        expected_join_count = 4
+        expected_join_count = 3

         if (
             key == "a"
@@ -672,7 +672,7 @@ def helper(df):
     def helper(df):
         df["x"] = df.loc[df.b < 0, "b"]

-    with SqlCounter(query_count=1, join_count=3):
+    with SqlCounter(query_count=1, join_count=2):
         eval_snowpark_pandas_result(snow_df, native_df, helper, inplace=True)
diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py
index af77687bbf..a994dc36c4 100644
--- a/tests/integ/modin/groupby/test_groupby_transform.py
+++ b/tests/integ/modin/groupby/test_groupby_transform.py
@@ -134,7 +134,7 @@ def test_dataframe_groupby_transform_conflicting_labels_negative():

 @sql_count_checker(
     query_count=11,
-    join_count=10,
+    join_count=8,
     udtf_count=2,
     high_count_expected=True,
     high_count_reason="performing two groupby transform operations that use UDTFs and compare with pandas",
diff --git a/tests/integ/modin/pivot/test_pivot_table_dropna.py b/tests/integ/modin/pivot/test_pivot_table_dropna.py
index cbb5d55447..69e10572c8 100644
--- a/tests/integ/modin/pivot/test_pivot_table_dropna.py
+++ b/tests/integ/modin/pivot/test_pivot_table_dropna.py
@@ -32,11 +32,11 @@ def test_pivot_table_single_value_with_dropna(df_data_with_nulls, dropna, column
 @pytest.mark.parametrize(
     "aggfunc, expected_join_count",
     [
-        ("mean", 5),
-        ({"D": "max", "E": "sum"}, 3),
-        ({"D": ["count", "max"], "E": ["mean", "sum"]}, 7),
-        ({"D": "min", "E": ["mean"]}, 3),
-        (["min", "max"], 11),
+        ("mean", 3),
+        ({"D": "max", "E": "sum"}, 2),
+        ({"D": ["count", "max"], "E": ["mean", "sum"]}, 4),
+        ({"D": "min", "E": ["mean"]}, 2),
+        (["min", "max"], 6),
     ],
 )
 def test_pivot_table_multiple_values_dropna_nonnull_data(
@@ -60,11 +60,11 @@ def test_pivot_table_multiple_values_dropna_nonnull_data(
 @pytest.mark.parametrize(
     "aggfunc, expected_join_count",
     [
-        ({"E": "count", "F": ["mean", "sum"]}, 5),
-        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 7),
-        (["min", "max"], 7),
-        ({"E": "min", "F": "mean"}, 3),
-        ({"E": "max", "F": "max"}, 3),
+        ({"E": "count", "F": ["mean", "sum"]}, 3),
+        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 4),
+        (["min", "max"], 4),
+        ({"E": "min", "F": "mean"}, 2),
+        ({"E": "max", "F": "max"}, 2),
     ],
 )
 def test_pivot_table_multiple_pivot_values_dropna_null_data(
@@ -106,7 +106,7 @@ def test_pivot_table_single_all_aggfuncs_dropna_and_null_data(
     df_data_with_nulls_2,
     values,
 ):
-    expected_join_count = 19 if len(values) > 1 else 9
+    expected_join_count = 10 if len(values) > 1 else 5
     with SqlCounter(query_count=1, join_count=expected_join_count):
         pivot_table_test_helper(
             df_data_with_nulls_2,
diff --git a/tests/integ/modin/pivot/test_pivot_table_fill_value.py b/tests/integ/modin/pivot/test_pivot_table_fill_value.py
index 2aeeafcad9..019216ba68 100644
--- a/tests/integ/modin/pivot/test_pivot_table_fill_value.py
+++ b/tests/integ/modin/pivot/test_pivot_table_fill_value.py
@@ -46,7 +46,7 @@ def test_pivot_table_single_with_dropna_type_incompatible_fill_value(
     ).to_pandas()


-@sql_count_checker(query_count=1, join_count=9)
+@sql_count_checker(query_count=1, join_count=5)
 def test_pivot_table_multiple_values_fill_value_nonnull_data(
     df_data,
 ):
@@ -66,11 +66,11 @@ def test_pivot_table_multiple_values_fill_value_nonnull_data(
 @pytest.mark.parametrize(
     "aggfunc, expected_join_count",
     [
-        ({"E": "count", "F": ["mean", "sum"]}, 5),
-        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 7),
-        (["min", "max"], 7),
-        ({"E": "min", "F": "mean"}, 3),
-        ({"E": "max", "F": "max"}, 3),
+        ({"E": "count", "F": ["mean", "sum"]}, 3),
+        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 4),
+        (["min", "max"], 4),
+        ({"E": "min", "F": "mean"}, 2),
+        ({"E": "max", "F": "max"}, 2),
     ],
 )
 def test_pivot_table_multiple_pivot_values_fill_value_null_data(
@@ -90,7 +90,7 @@ def test_pivot_table_multiple_pivot_values_fill_value_null_data(
     )


-@sql_count_checker(query_count=1, join_count=11)
+@sql_count_checker(query_count=1, join_count=5)
 def test_pivot_table_multiple_index_single_pivot_values_dfill_value_null_data(
     df_data_with_nulls_2,
 ):
@@ -109,7 +109,7 @@ def test_pivot_table_multiple_index_single_pivot_values_dfill_value_null_data(

 @pytest.mark.parametrize(
     "values, expected_join_count",
-    [(["D"], 9), (["E"], 9), (["F"], 9), (["E", "F"], 19)],
+    [(["D"], 5), (["E"], 5), (["F"], 5), (["E", "F"], 10)],
 )
 def test_pivot_table_single_all_aggfuncs_fill_value_and_null_data(
     df_data_with_nulls_2,
@@ -130,7 +130,7 @@ def test_pivot_table_single_all_aggfuncs_fill_value_and_null_data(
     )


-@sql_count_checker(query_count=1, join_count=7)
+@sql_count_checker(query_count=1, join_count=4)
 def test_pivot_table_single_nuance_aggfuncs_fill_value_and_null_data(
     df_data_with_nulls_2,
 ):
diff --git a/tests/integ/modin/pivot/test_pivot_table_margins.py b/tests/integ/modin/pivot/test_pivot_table_margins.py
index f591e21444..0ac0ee1b32 100644
--- a/tests/integ/modin/pivot/test_pivot_table_margins.py
+++ b/tests/integ/modin/pivot/test_pivot_table_margins.py
@@ -91,7 +91,7 @@ def test_pivot_table_multiple_columns_values_with_margins(
     if isinstance(aggfunc, list):
         expected_join_count += 2
     if not dropna:
-        expected_join_count += expected_join_count
+        expected_join_count += 1
     with SqlCounter(query_count=1, join_count=expected_join_count):
         pivot_table_test_helper(
             df_data,
@@ -134,7 +134,7 @@ def test_pivot_table_multiple_columns_values_with_margins(
     ),
     ],
 )
-@sql_count_checker(query_count=1, join_count=9, union_count=1)
+@sql_count_checker(query_count=1, join_count=5, union_count=1)
 def test_pivot_table_multiple_pivot_values_null_data_with_margins(
     df_data_with_nulls, index, fill_value
 ):
@@ -173,7 +173,7 @@ def test_pivot_table_multiple_pivot_values_null_data_with_margins(
 def test_pivot_table_multiple_pivot_values_null_data_with_margins_nan_blocked(
     df_data_with_nulls, index, fill_value
 ):
-    join_count = 7 if index is None and fill_value is None else 6
+    join_count = 5 if index is None and fill_value is None else 4
     union_count = 0 if index is None and fill_value is None else 1
     with SqlCounter(query_count=1, join_count=join_count, union_count=union_count):
         pivot_table_test_helper(
@@ -208,7 +208,7 @@ def test_pivot_table_mixed_index_types_with_margins(
     )


-@sql_count_checker(query_count=1, join_count=8, union_count=1)
+@sql_count_checker(query_count=1, join_count=5, union_count=1)
 def test_pivot_table_single_aggfuncs_dropna_and_null_data_pandas_drops_columns(
     df_data_with_nulls_2,
 ):
@@ -262,18 +262,20 @@ def test_single_value_single_aggfunc(self, columns, df_data):
             },
         )

-    @sql_count_checker(query_count=1, join_count=1, union_count=2)
     def test_multiple_value_single_aggfunc(self, columns, df_data):
-        pivot_table_test_helper(
-            df_data,
-            {
-                "columns": columns,
-                "values": ["D", "E"],
-                "aggfunc": ["sum"],
-                "dropna": True,
-                "margins": True,
-            },
-        )
+        with SqlCounter(
+            query_count=1, join_count=1, union_count=2 if len(columns) > 1 else 1
+        ):
+            pivot_table_test_helper(
+                df_data,
+                {
+                    "columns": columns,
+                    "values": ["D", "E"],
+                    "aggfunc": ["sum"],
+                    "dropna": True,
+                    "margins": True,
+                },
+            )

     @sql_count_checker(query_count=1, join_count=3)
     def test_single_value_multiple_aggfunc(self, columns, df_data):
diff --git a/tests/integ/modin/series/test_cache_result.py b/tests/integ/modin/series/test_cache_result.py
index 5216bf26d7..3306d5b35a 100644
--- a/tests/integ/modin/series/test_cache_result.py
+++ b/tests/integ/modin/series/test_cache_result.py
@@ -104,7 +104,7 @@ def test_cache_result_simple(self, inplace):
         native_series = perform_chained_operations(
             native_pd.Series(np.arange(50)), native_pd
         )
-        with SqlCounter(query_count=1, union_count=99):
+        with SqlCounter(query_count=1, union_count=18):
             snow_series = perform_chained_operations(snow_series, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_series, native_series
diff --git a/tests/integ/modin/series/test_describe.py b/tests/integ/modin/series/test_describe.py
index c28ad38ab7..180815f327 100644
--- a/tests/integ/modin/series/test_describe.py
+++ b/tests/integ/modin/series/test_describe.py
@@ -14,7 +14,7 @@
     create_test_series,
     eval_snowpark_pandas_result,
 )
-from tests.integ.utils.sql_counter import sql_count_checker
+from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker


 @pytest.mark.parametrize(
@@ -37,9 +37,14 @@
         [1.1, 2.2, "hello", None],
     ],
 )
-@sql_count_checker(query_count=1, union_count=5)
 def test_describe(data):
-    eval_snowpark_pandas_result(*create_test_series(data), lambda ser: ser.describe())
+    with SqlCounter(
+        query_count=1,
+        union_count=4 if not data or any(isinstance(x, str) for x in data) else 5,
+    ):
+        eval_snowpark_pandas_result(
+            *create_test_series(data), lambda ser: ser.describe()
+        )


 @pytest.mark.parametrize(
@@ -78,7 +83,7 @@ def test_describe_percentiles(percentiles):
     ),  # Specifying non-None exclude with include="all" is invalid for dataframes
     ],
 )
-@sql_count_checker(query_count=1, union_count=5)
+@sql_count_checker(query_count=1, union_count=4)
 def test_describe_ignore_include_exclude(include, exclude):
     data = [f"data{i}" for i in range(10)]
     eval_snowpark_pandas_result(
@@ -152,11 +157,14 @@ def timestamp_describe_comparator(snow_res, native_res):
     ],
     ids=["ints", "floats", "objects"],
 )
-@sql_count_checker(query_count=1, union_count=5)
 def test_describe_multiindex(data, index):
-    eval_snowpark_pandas_result(
-        *create_test_series(data, index=index), lambda ser: ser.describe()
-    )
+    with SqlCounter(
+        query_count=1,
+        union_count=4 if not data or any(isinstance(x, str) for x in data) else 5,
+    ):
+        eval_snowpark_pandas_result(
+            *create_test_series(data, index=index), lambda ser: ser.describe()
+        )


 @sql_count_checker(query_count=0)
diff --git a/tests/integ/modin/series/test_fillna.py b/tests/integ/modin/series/test_fillna.py
index e4b894f48e..53c5377148 100644
--- a/tests/integ/modin/series/test_fillna.py
+++ b/tests/integ/modin/series/test_fillna.py
@@ -206,7 +206,7 @@ def inplace_fillna(df):

 @pytest.mark.parametrize("index", [list(range(8)), list(string.ascii_lowercase[:8])])
-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_inplace_fillna_from_series(index):
     def inplace_fillna(series):
        series.iloc[:4].fillna(14, inplace=True)
diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py
index 3826282e0d..96c003646a 100644
--- a/tests/integ/modin/series/test_loc.py
+++ b/tests/integ/modin/series/test_loc.py
@@ -706,7 +706,7 @@ def loc_set_helper(s):
             s.loc[pd.Series(row_key)] = pd.Series(item)

     expected_join_count = (
-        2 if len(row_key) > 0 and all(isinstance(i, bool) for i in row_key) else 4
+        2 if len(row_key) > 0 and all(isinstance(i, bool) for i in row_key) else 3
     )
     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(
@@ -736,12 +736,12 @@ def test_series_loc_set_series_and_list_like_row_key_and_item(
     series = native_pd.Series([1, 2, 3], name="abc")
     item = [10, 20, 30]

-    expected_join_count = 4
+    expected_join_count = 3
     if all(isinstance(i, bool) for i in row_key):
         if item_type.startswith("series"):
             expected_join_count = 2
         else:
-            expected_join_count = 6
+            expected_join_count = 4

     # With a boolean key, the number of items provided must match the number of True values in the key in pandas.
     if is_bool(row_key[0]):
@@ -964,7 +964,7 @@ def set_loc_helper(ser):
 @pytest.mark.parametrize(
     "start, stop, step, pandas_fail", [[1, -1, None, True], [10, None, None, False]]
 )
-@sql_count_checker(query_count=2, join_count=4)
+@sql_count_checker(query_count=2, join_count=3)
 def test_series_loc_set_key_slice_with_series_item_pandas_bug(
     start, stop, step, pandas_fail
 ):
@@ -1056,7 +1056,7 @@ def test_series_loc_set_with_empty_key_and_empty_item_negative(
     assert_series_equal(snowpark_ser, native_ser)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 @pytest.mark.parametrize("key", EMPTY_LIST_LIKE_VALUES)
 def test_series_loc_set_with_empty_key_and_empty_series_item(
     key,
@@ -1177,7 +1177,7 @@ def test_series_loc_set_with_empty_key_and_list_like_item_negative(
     assert_series_equal(snowpark_ser, native_ser)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 @pytest.mark.parametrize("key", EMPTY_LIST_LIKE_VALUES)
 @pytest.mark.parametrize(
     "item", [native_pd.Series([random.randint(0, 6) for _ in range(7)])]
 )
@@ -1486,7 +1486,7 @@ def test_series_loc_set_lambda_key(key, item):

     # Join is performed when the item is list-like - join index and list-like item for assignment.
     # If item is scalar, no join is performed.
-    with SqlCounter(query_count=1, join_count=4 if isinstance(item, list) else 0):
+    with SqlCounter(query_count=1, join_count=3 if isinstance(item, list) else 0):
         assert_series_equal(snowpark_ser, native_ser, check_dtype=False)
diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py
index b9dfad3541..487f82e2a1 100644
--- a/tests/integ/modin/series/test_mask.py
+++ b/tests/integ/modin/series/test_mask.py
@@ -196,7 +196,7 @@ def test_series_mask_with_lambda_returns_singleton_should_fail():

 @pytest.mark.parametrize(
     "other, sql_count, join_count",
-    [(lambda x: -x.iloc[0], 4, 7), (lambda x: x**2, 3, 8)],
+    [(lambda x: -x.iloc[0], 4, 6), (lambda x: x**2, 3, 6)],
 )
 def test_series_mask_with_lambda_other(other, sql_count, join_count):
     # Multiple joins since multiple Series are created with non-Snowpark pandas data
diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py
index f4eff79d01..2e4b9c6610 100644
--- a/tests/integ/modin/series/test_setitem.py
+++ b/tests/integ/modin/series/test_setitem.py
@@ -1091,7 +1091,7 @@ def test_series_setitem_series_key_and_scalar_item(
     "item",
     SERIES_AND_LIST_LIKE_KEY_AND_ITEM_VALUES_WITH_DUPLICATES,
 )
-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_series_setitem_series_list_like_item_key_and_item_with_duplicates(
     key, item, default_index_native_series
 ):
@@ -1124,7 +1124,7 @@ def test_series_setitem_series_list_like_item_key_and_item_with_duplicates(
     "item",
     SERIES_AND_LIST_LIKE_KEY_AND_ITEM_VALUES_NO_DUPLICATES,
 )
-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_series_setitem_series_list_like_item_key_and_item_no_duplicates(
     key, item, default_index_native_series
 ):
diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py
index f5d2b2eb27..3e0cffd263 100644
--- a/tests/integ/modin/series/test_where.py
+++ b/tests/integ/modin/series/test_where.py
@@ -196,7 +196,7 @@ def test_series_where_with_lambda_cond_returns_singleton_should_fail():

 @pytest.mark.parametrize(
     "other, sql_count, join_count",
-    [(lambda x: -x.iloc[0], 4, 7), (lambda x: x**2, 3, 8)],
+    [(lambda x: -x.iloc[0], 4, 6), (lambda x: x**2, 3, 6)],
 )
 def test_series_where_with_lambda_other(other, sql_count, join_count):
     # High join count due to creating a Series with non-Snowpark pandas data
diff --git a/tests/integ/modin/types/test_timedelta_indexing.py b/tests/integ/modin/types/test_timedelta_indexing.py
index e4b5803047..c600678616 100644
--- a/tests/integ/modin/types/test_timedelta_indexing.py
+++ b/tests/integ/modin/types/test_timedelta_indexing.py
@@ -78,7 +78,7 @@ def run_test(api, query_count, join_count):
         [
             (2, ["b", "a"]),
             2,
-            3,
+            2,
             True,
         ],  # require transpose and keep result column type as timedelta
         [(2, ...), 1, 0, False],  # require transpose but lose the type

From 803e1bba849850a0fb049afe0fd20a34d929c0ae Mon Sep 17 00:00:00 2001
From: Jianzhun Du
Date: Tue, 22 Oct 2024 14:43:09 -0700
Subject: [PATCH 3/3] Fix remaining expected SQL counter values

---
 tests/integ/modin/crosstab/test_crosstab.py  |  8 +++---
 tests/integ/modin/frame/test_cache_result.py |  2 +-
 tests/integ/modin/frame/test_describe.py     |  4 +--
 tests/integ/modin/frame/test_loc.py          |  8 +++---
 .../modin/pivot/test_pivot_table_dropna.py   |  4 +--
 .../modin/pivot/test_pivot_table_margins.py  | 28 ++++++++++---------
 .../modin/types/test_timedelta_indexing.py   |  4 +--
 7 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index faa46a829a..cc3ede508d 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -53,7 +53,7 @@ def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b,
     def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c):
         # In this case, all indexes are identical - hence "full" overlap.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5

         def eval_func(lib):
             if lib is pd:
@@ -80,7 +80,7 @@ def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c):
         # of the Series objects. This test case passes because we pass in arrays that
         # are the length of the intersection rather than the length of each of the Series.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5
         b = native_pd.Series(
             b,
             index=list(range(len(a))),
@@ -206,7 +206,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns(
         self, dropna, a, b, c
     ):
         query_count = 4
-        join_count = 1 if dropna else 3
+        join_count = 1 if dropna else 2
        a = native_pd.Series(
            a,
            dtype=object,
@@ -252,7 +252,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index(
         self, dropna, a, b, c
     ):
         query_count = 6
-        join_count = 5 if dropna else 17
+        join_count = 5 if dropna else 11
         a = native_pd.Series(
             a,
             dtype=object,
diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py
index 2129d776fe..1828344185 100644
--- a/tests/integ/modin/frame/test_cache_result.py
+++ b/tests/integ/modin/frame/test_cache_result.py
@@ -160,7 +160,7 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data):
         native_df = perform_chained_operations(
             native_df.pivot_table(**pivot_kwargs), native_pd
         )
-        with SqlCounter(query_count=1, join_count=10, union_count=9):
+        with SqlCounter(query_count=1, join_count=1, union_count=9):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
diff --git a/tests/integ/modin/frame/test_describe.py b/tests/integ/modin/frame/test_describe.py
index 117f3c78d4..ca6b47b3a0 100644
--- a/tests/integ/modin/frame/test_describe.py
+++ b/tests/integ/modin/frame/test_describe.py
@@ -73,7 +73,7 @@ def test_describe_obj_only(data, expected_union_count):

 @pytest.mark.parametrize(
-    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 9)]
+    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 5)]
 )
 def test_describe_empty_rows(dtype, expected_union_count):
     with SqlCounter(query_count=1, union_count=expected_union_count):
@@ -181,7 +181,7 @@ def test_describe_include_exclude_obj_only(include, exclude, expected_exception)
     }
     with SqlCounter(
         query_count=1 if expected_exception is None else 0,
-        union_count=9 if expected_exception is None else 0,
+        union_count=5 if expected_exception is None else 0,
     ):
         eval_snowpark_pandas_result(
             *create_test_dfs(data),
diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py
index 05beb6e6de..47a3755cee 100644
--- a/tests/integ/modin/frame/test_loc.py
+++ b/tests/integ/modin/frame/test_loc.py
@@ -1144,7 +1144,7 @@ def loc_set_helper(df):

     query_count, join_count = 1, 2
     if not all(isinstance(rk_val, bool) for rk_val in row_key):
-        join_count += 2
+        join_count += 1
     if isinstance(col_key, native_pd.Series):
         query_count += 1
     with SqlCounter(query_count=query_count, join_count=join_count):
@@ -4219,9 +4219,9 @@ def test_df_loc_set_series_value(key, convert_key_to_series, row_loc):
     key_sorted = key == list("ABC")
     if row_loc is not None:
         if convert_key_to_series:
-            join_count = 9
-        else:
             join_count = 6
+        else:
+            join_count = 4
     else:
         if convert_key_to_series:
             join_count = 3
@@ -4278,7 +4278,7 @@ def test_df_loc_set_series_value_slice_key(key, row_loc):
     snow_df = pd.DataFrame(native_df)
     query_count = 2
     if row_loc is not None:
-        join_count = 6
+        join_count = 4
     else:
         join_count = 1

diff --git a/tests/integ/modin/pivot/test_pivot_table_dropna.py b/tests/integ/modin/pivot/test_pivot_table_dropna.py
index 69e10572c8..65e68ff638 100644
--- a/tests/integ/modin/pivot/test_pivot_table_dropna.py
+++ b/tests/integ/modin/pivot/test_pivot_table_dropna.py
@@ -85,7 +85,7 @@ def test_pivot_table_multiple_pivot_values_dropna_null_data(
 )


-@sql_count_checker(query_count=1, join_count=11)
+@sql_count_checker(query_count=1, join_count=5)
 def test_pivot_table_multiple_index_single_pivot_values_dropna_null_data(
     df_data_with_nulls_2,
 ):
@@ -120,7 +120,7 @@ def test_pivot_table_single_all_aggfuncs_dropna_and_null_data(
     )


-@sql_count_checker(query_count=1, join_count=7)
+@sql_count_checker(query_count=1, join_count=4)
 def test_pivot_table_single_nuance_aggfuncs_dropna_and_null_data(
     df_data_with_nulls_2,
 ):
diff --git a/tests/integ/modin/pivot/test_pivot_table_margins.py b/tests/integ/modin/pivot/test_pivot_table_margins.py
index 0ac0ee1b32..fc48a0f19b 100644
--- a/tests/integ/modin/pivot/test_pivot_table_margins.py
+++ b/tests/integ/modin/pivot/test_pivot_table_margins.py
@@ -191,7 +191,7 @@ def test_pivot_table_multiple_pivot_values_null_data_with_margins_nan_blocked(
 )


-@sql_count_checker(query_count=1, join_count=12, union_count=1)
+@sql_count_checker(query_count=1, join_count=6, union_count=1)
 def test_pivot_table_mixed_index_types_with_margins(
     df_data,
 ):
@@ -352,21 +352,23 @@ def test_single_value_single_aggfunc(
         named_columns=named_columns,
     )

-    @sql_count_checker(query_count=1, join_count=1, union_count=2)
     def test_multiple_value_single_aggfunc(
         self, columns, named_columns, df_data_more_pivot_values
     ):
-        pivot_table_test_helper(
-            df_data_more_pivot_values,
-            {
-                "columns": columns,
-                "values": ["D", "E"],
-                "aggfunc": "sum",
-                "dropna": True,
-                "margins": True,
-            },
-            named_columns=named_columns,
-        )
+        with SqlCounter(
+            query_count=1, join_count=1, union_count=2 if len(columns) > 1 else 1
+        ):
+            pivot_table_test_helper(
+                df_data_more_pivot_values,
+                {
+                    "columns": columns,
+                    "values": ["D", "E"],
+                    "aggfunc": "sum",
+                    "dropna": True,
+                    "margins": True,
+                },
+                named_columns=named_columns,
+            )

     @sql_count_checker(query_count=1, join_count=3)
     def test_single_value_multiple_aggfunc(
diff --git a/tests/integ/modin/types/test_timedelta_indexing.py b/tests/integ/modin/types/test_timedelta_indexing.py
index c600678616..932af46341 100644
--- a/tests/integ/modin/types/test_timedelta_indexing.py
+++ b/tests/integ/modin/types/test_timedelta_indexing.py
@@ -466,7 +466,7 @@ def test_index_get_timedelta(key, join_count):
     [
         [2, "iat", 1, 1],
         [native_pd.Timedelta("1 days 1 hour"), "at", 2, 4],
-        [[2, 1], "iloc", 1, 4],
+        [[2, 1], "iloc", 1, 3],
         [
             [
                 native_pd.Timedelta("1 days 1 hour"),
@@ -510,7 +510,7 @@ def test_series_with_timedelta_index(key, api, query_count, join_count):
     [
         [2, "iat", 1, 1],
         [native_pd.Timedelta("1 days 1 hour"), "at", 2, 4],
-        [[2, 1], "iloc", 1, 4],
+        [[2, 1], "iloc", 1, 3],
         [
             [
                 native_pd.Timedelta("1 days 1 hour"),
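
For context on the pattern these patches rely on, the sketch below illustrates the two SqlCounter usage styles that appear throughout the diffs: the static @sql_count_checker decorator, and the SqlCounter context-manager form that several tests are converted to when the expected count depends on a parametrized value. Both names come from tests/integ/utils/sql_counter in this repo; the test bodies and the specific counts here are illustrative assumptions, not values taken from the patches.

    import pytest

    from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker


    # Decorator form: the expected counts are fixed for every case of the test.
    @sql_count_checker(query_count=1, join_count=3)
    def test_fixed_counts():
        ...  # body is assumed to issue one query with three joins (illustrative)


    # Context-manager form: when the expectation varies per parametrized case,
    # compute it inline instead of hard-coding it in the decorator.
    @pytest.mark.parametrize("dropna", [True, False])
    def test_parameter_dependent_counts(dropna):
        with SqlCounter(query_count=1, join_count=2 if dropna else 3):
            ...  # exercise the Snowpark pandas operation under test

The conversion from the first form to the second is what patches 2 and 3 do in test_describe.py and test_pivot_table_margins.py, where the expected union count depends on the parametrized data or columns.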