Jdu snow 1733956 fix sql count #2491

Draft · wants to merge 3 commits into base: main
3 changes: 2 additions & 1 deletion tests/integ/conftest.py
@@ -217,7 +217,8 @@ def session(
         .create()
     )
     session.sql_simplifier_enabled = sql_simplifier_enabled
-    session._cte_optimization_enabled = cte_optimization_enabled
+    session._cte_optimization_enabled = True
+    session._query_compilation_stage_enabled = True
     if os.getenv("GITHUB_ACTIONS") == "true" and not local_testing_mode:
         set_up_external_access_integration_resources(
             session, rule1, rule2, key1, key2, integration1, integration2
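
Note: this conftest change appears to be the root of every count update below. It hard-codes CTE optimization and the query compilation stage to on for integration-test sessions (the `cte_optimization_enabled` fixture parameter is no longer consulted), so a repeated query subtree is emitted once as a WITH clause rather than inlined at each reference. A minimal, hypothetical illustration of why that lowers JOIN counts in the generated SQL; the statements below are illustrative, not Snowpark's actual output:

import re

# A join-bearing subtree referenced twice contributes its JOIN keywords at
# every reference when inlined, but only once when hoisted into a CTE.
inlined = """
SELECT * FROM (SELECT k, v FROM a JOIN b USING (k)) lhs
JOIN (SELECT k, v FROM a JOIN b USING (k)) rhs USING (k)
"""
with_cte = """
WITH s AS (SELECT k, v FROM a JOIN b USING (k))
SELECT * FROM s lhs JOIN s rhs USING (k)
"""

def join_count(sql):
    return len(re.findall(r"\bJOIN\b", sql))

assert join_count(inlined) == 3 and join_count(with_cte) == 2
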
32 changes: 16 additions & 16 deletions tests/integ/modin/crosstab/test_crosstab.py
@@ -53,7 +53,7 @@ def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b,
     def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c):
         # In this case, all indexes are identical - hence "full" overlap.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5

         def eval_func(lib):
             if lib is pd:
@@ -80,7 +80,7 @@ def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c):
         # of the Series objects. This test case passes because we pass in arrays that
         # are the length of the intersection rather than the length of each of the Series.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5
         b = native_pd.Series(
             b,
             index=list(range(len(a))),
@@ -206,7 +206,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns(
         self, dropna, a, b, c
     ):
         query_count = 4
-        join_count = 1 if dropna else 3
+        join_count = 1 if dropna else 2
         a = native_pd.Series(
             a,
             dtype=object,
@@ -252,7 +252,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index(
         self, dropna, a, b, c
     ):
         query_count = 6
-        join_count = 5 if dropna else 17
+        join_count = 5 if dropna else 11
         a = native_pd.Series(
             a,
             dtype=object,
@@ -319,7 +319,7 @@ def test_margins(self, dropna, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize(self, dropna, normalize, a, b, c):
         query_count = 1 if normalize in (0, "index") else 2
-        join_count = 3 if normalize in (0, "index") else 2
+        join_count = 3 if normalize in (0, "index") and dropna else 2
         if dropna:
             join_count -= 2

@@ -340,9 +340,9 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize_and_margins(self, dropna, normalize, a, b, c):
         counts = {
-            "columns": [3, 5 if dropna else 9, 4],
-            "index": [1, 5 if dropna else 8, 3],
-            "all": [3, 12 if dropna else 19, 7],
+            "columns": [3, 4 if dropna else 7, 3],
+            "index": [1, 3 if dropna else 4, 1],
+            "all": [3, 7 if dropna else 10, 3],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -374,8 +374,8 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [3, 29 if dropna else 41, 4],
-            "index": [1, 23 if dropna else 32, 3],
+            "columns": [3, 10 if dropna else 13, 3],
+            "index": [1, 5 if dropna else 6, 1],
             "all": [3, 54 if dropna else 75, 7],
         }
         counts[0] = counts["index"]
@@ -438,7 +438,7 @@ def eval_func(lib):

         with SqlCounter(
             query_count=1,
-            join_count=7 if dropna else 10,
+            join_count=3 if dropna else 4,
             union_count=1,
         ):
             eval_snowpark_pandas_result(
@@ -451,9 +451,9 @@ def eval_func(lib):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [2, 4 if dropna else 10],
-            "index": [1, 5 if dropna else 11],
-            "all": [2, 4 if dropna else 10],
+            "columns": [2, 4 if dropna else 6],
+            "index": [1, 3 if dropna else 4],
+            "all": [2, 4 if dropna else 6],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -520,7 +520,7 @@ def test_normalize_margins_and_values_not_supported(
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 1
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df = basic_crosstab_dfs[0]

         with SqlCounter(query_count=query_count, join_count=join_count):
@@ -539,7 +539,7 @@ def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 5
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df, snow_df = basic_crosstab_dfs

         def eval_func(df):
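
Every expected count touched in this file comes from the SqlCounter harness, which runs the enclosed Snowpark pandas code and asserts on how many queries, joins, and unions the emitted SQL contains. A rough, hypothetical sketch of that style of check, not the repo's actual implementation:

import re

class SimpleSqlCounter:
    """Hypothetical stand-in for SqlCounter: collect emitted SQL, then
    compare keyword tallies against the expected counts on exit."""

    def __init__(self, query_count, join_count=0, union_count=0):
        self.expected = {"query": query_count, "join": join_count, "union": union_count}
        self.queries = []

    def record(self, sql):
        self.queries.append(sql)

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        actual = {
            "query": len(self.queries),
            "join": sum(len(re.findall(r"\bJOIN\b", q, re.I)) for q in self.queries),
            "union": sum(len(re.findall(r"\bUNION\b", q, re.I)) for q in self.queries),
        }
        assert actual == self.expected, f"expected {self.expected}, got {actual}"

# Usage mirroring the tests above (SQL supplied by hand here):
with SimpleSqlCounter(query_count=1, join_count=2) as counter:
    counter.record("SELECT * FROM a JOIN b USING (k) JOIN c USING (k)")
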
8 changes: 4 additions & 4 deletions tests/integ/modin/frame/test_assign.py
@@ -60,7 +60,7 @@ def assign_func(df):

 @pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"])
 def test_assign_basic_non_pandas_object(new_col_value):
-    join_count = 4 if isinstance(new_col_value, list) else 1
+    join_count = 3 if isinstance(new_col_value, list) else 1
     with SqlCounter(query_count=1, join_count=join_count):
         snow_df, native_df = create_test_dfs(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
@@ -74,7 +74,7 @@ def test_assign_basic_non_pandas_object(new_col_value):
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_long_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just truncates the last element of the new column so that it is the correct length. If we wanted
@@ -98,7 +98,7 @@ def test_assign_invalid_long_column_length_negative():
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_short_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just broadcasts the last element of the new column so that it is filled. If we wanted
@@ -226,7 +226,7 @@ def test_assign_self_columns():
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_overwrite_columns_via_assign():
     snow_df, native_df = create_test_dfs(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
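
The comments in the two `_negative` tests above describe the native pandas side of the behavior gap; it is easy to confirm that pandas rejects the length mismatch that Snowpark pandas truncates or broadcasts instead:

import pandas as native_pd

df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
try:
    # Assign a length-5 column to a length-3 frame, as in
    # test_assign_invalid_long_column_length_negative.
    df.assign(new_col=[10, 11, 12, 13, 14])
except ValueError as err:
    print(err)  # Length of values (5) does not match length of index (3)
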
6 changes: 3 additions & 3 deletions tests/integ/modin/frame/test_cache_result.py
@@ -130,7 +130,7 @@ def test_cache_result_simple(self, inplace):
         native_df = perform_chained_operations(
             native_pd.DataFrame(np.arange(15).reshape((3, 5))), native_pd
         )
-        with SqlCounter(query_count=1, union_count=29):
+        with SqlCounter(query_count=1, union_count=11):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -160,7 +160,7 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data):
         native_df = perform_chained_operations(
             native_df.pivot_table(**pivot_kwargs), native_pd
         )
-        with SqlCounter(query_count=1, join_count=10, union_count=9):
+        with SqlCounter(query_count=1, join_count=1, union_count=9):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -213,7 +213,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
         with SqlCounter(
             query_count=11,
             union_count=9,
-            udf_count=2,
+            udf_count=1,
             high_count_expected=True,
             high_count_reason="applymap requires additional queries to setup the UDF.",
         ):
30 changes: 15 additions & 15 deletions tests/integ/modin/frame/test_describe.py
@@ -49,19 +49,19 @@ def test_describe_numeric_only(data):
     # In total, we thus have 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs for an N-column frame.
     [
         # If there are multiple modes, return the value that appears first
-        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 9),
+        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 5),
         # Empty columns are numeric by default (df constructor must explicitly specify object dtype)
-        ({"a": [], "b": []}, 9),
+        ({"a": [], "b": []}, 5),
         # Heterogeneous data is considered non-numeric
-        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 13),
+        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 6),
         (
             [
                 [None, "quick", None],
                 ["fox", "quick", "lazy"],
                 ["dog", "dog", "lazy"],
                 [None, None, None],
             ],
-            13,
+            6,
         ),
     ],
 )
@@ -73,7 +73,7 @@ def test_describe_obj_only(data, expected_union_count):


 @pytest.mark.parametrize(
-    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 9)]
+    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 5)]
 )
 def test_describe_empty_rows(dtype, expected_union_count):
     with SqlCounter(query_count=1, union_count=expected_union_count):
@@ -107,7 +107,7 @@ def test_describe_empty_cols():
         # 4K-1 UNIONs to compute top/freq for K object-dtype columns (see comment on
         # test_describe_obj_only for reasoning).
         # Since we have K=2 object columns, the result is 9 + (4 * 2 - 1) = 16 UNIONs.
-        ([int, object], None, None, 16),
+        ([int, object], None, None, 12),
         (np.number, [], None, 7),
         # Including only datetimes has 7 statistics since std is not computed.
         # Since there is only 1 column, all quantiles are computed in a single QC.
@@ -127,8 +127,8 @@ def test_describe_empty_cols():
         # include and exclude cannot directly overlap
        ([int, "O"], [float, "O"], ValueError, 0),
         # Like select_dtypes, a dtype in include/exclude can be a subtype of a dtype in the other
-        ([int, "O"], [float, np.number, np.datetime64], None, 9),
-        ("O", None, None, 9),
+        ([int, "O"], [float, np.number, np.datetime64], None, 5),
+        ("O", None, None, 5),
     ],
 )
 def test_describe_include_exclude(
@@ -181,7 +181,7 @@ def test_describe_include_exclude_obj_only(include, exclude, expected_exception)
     }
     with SqlCounter(
         query_count=1 if expected_exception is None else 0,
-        union_count=9 if expected_exception is None else 0,
+        union_count=5 if expected_exception is None else 0,
     ):
         eval_snowpark_pandas_result(
             *create_test_dfs(data),
@@ -285,9 +285,9 @@ def timestamp_describe_comparator(snow_res, native_res):
     # Don't need to test all permutations of include/exclude with MultiIndex -- this is covered by
     # tests for select_dtypes, as well as other tests in this file
     [
-        ("all", 16),
+        ("all", 12),
         (np.number, 7),
-        (object, 9),
+        (object, 5),
     ],
 )
 def test_describe_multiindex(index, columns, include, expected_union_count):
@@ -312,10 +312,10 @@ def test_describe_multiindex(index, columns, include, expected_union_count):
     "include, exclude, expected_union_count",
     [
         (None, None, 7),
-        ("all", None, 12),
+        ("all", None, 11),
         (np.number, None, 7),
-        (None, float, 10),
-        (object, None, 5),
+        (None, float, 9),
+        (object, None, 4),
         (None, object, 7),
         (int, float, 5),
         (float, int, 5),
@@ -350,7 +350,7 @@ def helper(df):

 @sql_count_checker(
     query_count=3,
-    union_count=21,
+    union_count=8,
 )
 # SNOW-1320296 - pd.concat SQL Compilation ambigious __row_position__ issue
 def test_describe_object_file(resources_path):
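
Sanity check on the union-count comment preserved in this file's first hunk: for an N-column object frame it derives 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs, which matches the old expectations (9 for the two-column cases). The new expectations (5 for N=2, 12 instead of 16 for the mixed int/object case) undercut the formula, presumably because the per-column subplans it counts separately are now shared as CTEs; that reading is an inference from the conftest change above, not something stated in the diff.

# The old expectation followed the comment's derivation (N = column count):
N = 2
assert 2 + 2 * (N - 1 + N) + 1 == 4 * N + 1 == 9  # pre-PR value for N=2
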