Jdu snow 1733956 fix sql count #2491

Draft · wants to merge 3 commits into base: main
3 changes: 2 additions & 1 deletion tests/integ/conftest.py
@@ -217,7 +217,8 @@ def session(
         .create()
     )
     session.sql_simplifier_enabled = sql_simplifier_enabled
-    session._cte_optimization_enabled = cte_optimization_enabled
+    session._cte_optimization_enabled = True
+    session._query_compilation_stage_enabled = True
     if os.getenv("GITHUB_ACTIONS") == "true" and not local_testing_mode:
         set_up_external_access_integration_resources(
             session, rule1, rule2, key1, key2, integration1, integration2
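
Note: this conftest change appears to be the root of every count update below. It hard-codes CTE optimization and the query compilation stage to on for integration-test sessions (the `cte_optimization_enabled` fixture parameter is no longer consulted), so a repeated query subtree is emitted once as a WITH clause rather than inlined at each reference. A minimal, hypothetical illustration of why that lowers JOIN counts in the generated SQL; the statements below are illustrative, not Snowpark's actual output:

import re

# A join-bearing subtree referenced twice contributes its JOIN keywords at
# every reference when inlined, but only once when hoisted into a CTE.
inlined = """
SELECT * FROM (SELECT k, v FROM a JOIN b USING (k)) lhs
JOIN (SELECT k, v FROM a JOIN b USING (k)) rhs USING (k)
"""
with_cte = """
WITH s AS (SELECT k, v FROM a JOIN b USING (k))
SELECT * FROM s lhs JOIN s rhs USING (k)
"""

def join_count(sql):
    return len(re.findall(r"\bJOIN\b", sql))

assert join_count(inlined) == 3 and join_count(with_cte) == 2
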
32 changes: 16 additions & 16 deletions tests/integ/modin/crosstab/test_crosstab.py
@@ -53,7 +53,7 @@ def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b,
     def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c):
         # In this case, all indexes are identical - hence "full" overlap.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5

         def eval_func(lib):
             if lib is pd:
@@ -80,7 +80,7 @@ def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c):
         # of the Series objects. This test case passes because we pass in arrays that
         # are the length of the intersection rather than the length of each of the Series.
         query_count = 2
-        join_count = 5 if dropna else 10
+        join_count = 4 if dropna else 5
         b = native_pd.Series(
             b,
             index=list(range(len(a))),
@@ -206,7 +206,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns(
         self, dropna, a, b, c
     ):
         query_count = 4
-        join_count = 1 if dropna else 3
+        join_count = 1 if dropna else 2
         a = native_pd.Series(
             a,
             dtype=object,
@@ -252,7 +252,7 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index(
         self, dropna, a, b, c
     ):
         query_count = 6
-        join_count = 5 if dropna else 17
+        join_count = 5 if dropna else 11
         a = native_pd.Series(
             a,
             dtype=object,
@@ -319,7 +319,7 @@ def test_margins(self, dropna, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize(self, dropna, normalize, a, b, c):
         query_count = 1 if normalize in (0, "index") else 2
-        join_count = 3 if normalize in (0, "index") else 2
+        join_count = 3 if normalize in (0, "index") and dropna else 2
         if dropna:
             join_count -= 2

@@ -340,9 +340,9 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize_and_margins(self, dropna, normalize, a, b, c):
         counts = {
-            "columns": [3, 5 if dropna else 9, 4],
-            "index": [1, 5 if dropna else 8, 3],
-            "all": [3, 12 if dropna else 19, 7],
+            "columns": [3, 4 if dropna else 7, 3],
+            "index": [1, 3 if dropna else 4, 1],
+            "all": [3, 7 if dropna else 10, 3],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -374,8 +374,8 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [3, 29 if dropna else 41, 4],
-            "index": [1, 23 if dropna else 32, 3],
+            "columns": [3, 10 if dropna else 13, 3],
+            "index": [1, 5 if dropna else 6, 1],
             "all": [3, 54 if dropna else 75, 7],
         }
         counts[0] = counts["index"]
@@ -438,7 +438,7 @@ def eval_func(lib):

         with SqlCounter(
             query_count=1,
-            join_count=7 if dropna else 10,
+            join_count=3 if dropna else 4,
             union_count=1,
         ):
             eval_snowpark_pandas_result(
@@ -451,9 +451,9 @@ def eval_func(lib):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [2, 4 if dropna else 10],
-            "index": [1, 5 if dropna else 11],
-            "all": [2, 4 if dropna else 10],
+            "columns": [2, 4 if dropna else 6],
+            "index": [1, 3 if dropna else 4],
+            "all": [2, 4 if dropna else 6],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -520,7 +520,7 @@ def test_normalize_margins_and_values_not_supported(
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 1
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df = basic_crosstab_dfs[0]

         with SqlCounter(query_count=query_count, join_count=join_count):
@@ -539,7 +539,7 @@ def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 5
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df, snow_df = basic_crosstab_dfs

         def eval_func(df):
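
Every expected count touched in this file comes from the SqlCounter harness, which runs the enclosed Snowpark pandas code and asserts on how many queries, joins, and unions the emitted SQL contains. A rough, hypothetical sketch of that style of check, not the repo's actual implementation:

import re

class SimpleSqlCounter:
    """Hypothetical stand-in for SqlCounter: collect emitted SQL, then
    compare keyword tallies against the expected counts on exit."""

    def __init__(self, query_count, join_count=0, union_count=0):
        self.expected = {"query": query_count, "join": join_count, "union": union_count}
        self.queries = []

    def record(self, sql):
        self.queries.append(sql)

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        actual = {
            "query": len(self.queries),
            "join": sum(len(re.findall(r"\bJOIN\b", q, re.I)) for q in self.queries),
            "union": sum(len(re.findall(r"\bUNION\b", q, re.I)) for q in self.queries),
        }
        assert actual == self.expected, f"expected {self.expected}, got {actual}"

# Usage mirroring the tests above (SQL supplied by hand here):
with SimpleSqlCounter(query_count=1, join_count=2) as counter:
    counter.record("SELECT * FROM a JOIN b USING (k) JOIN c USING (k)")
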
8 changes: 4 additions & 4 deletions tests/integ/modin/frame/test_assign.py
@@ -60,7 +60,7 @@ def assign_func(df):

 @pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"])
 def test_assign_basic_non_pandas_object(new_col_value):
-    join_count = 4 if isinstance(new_col_value, list) else 1
+    join_count = 3 if isinstance(new_col_value, list) else 1
     with SqlCounter(query_count=1, join_count=join_count):
         snow_df, native_df = create_test_dfs(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
@@ -74,7 +74,7 @@ def test_assign_basic_non_pandas_object(new_col_value):
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_long_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just truncates the last element of the new column so that it is the correct length. If we wanted
@@ -98,7 +98,7 @@ def test_assign_invalid_long_column_length_negative():
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_short_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just broadcasts the last element of the new column so that it is filled. If we wanted
@@ -226,7 +226,7 @@ def test_assign_self_columns():
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_overwrite_columns_via_assign():
     snow_df, native_df = create_test_dfs(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
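
The comments in the two `_negative` tests above describe the native pandas side of the behavior gap; it is easy to confirm that pandas rejects the length mismatch that Snowpark pandas truncates or broadcasts instead:

import pandas as native_pd

df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
try:
    # Assign a length-5 column to a length-3 frame, as in
    # test_assign_invalid_long_column_length_negative.
    df.assign(new_col=[10, 11, 12, 13, 14])
except ValueError as err:
    print(err)  # Length of values (5) does not match length of index (3)
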
6 changes: 3 additions & 3 deletions tests/integ/modin/frame/test_cache_result.py
@@ -130,7 +130,7 @@ def test_cache_result_simple(self, inplace):
         native_df = perform_chained_operations(
             native_pd.DataFrame(np.arange(15).reshape((3, 5))), native_pd
         )
-        with SqlCounter(query_count=1, union_count=29):
+        with SqlCounter(query_count=1, union_count=11):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -160,7 +160,7 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data):
         native_df = perform_chained_operations(
             native_df.pivot_table(**pivot_kwargs), native_pd
         )
-        with SqlCounter(query_count=1, join_count=10, union_count=9):
+        with SqlCounter(query_count=1, join_count=1, union_count=9):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -213,7 +213,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
         with SqlCounter(
             query_count=11,
             union_count=9,
-            udf_count=2,
+            udf_count=1,
             high_count_expected=True,
             high_count_reason="applymap requires additional queries to setup the UDF.",
         ):
30 changes: 15 additions & 15 deletions tests/integ/modin/frame/test_describe.py
@@ -49,19 +49,19 @@ def test_describe_numeric_only(data):
     # In total, we thus have 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs for an N-column frame.
     [
         # If there are multiple modes, return the value that appears first
-        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 9),
+        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 5),
         # Empty columns are numeric by default (df constructor must explicitly specify object dtype)
-        ({"a": [], "b": []}, 9),
+        ({"a": [], "b": []}, 5),
         # Heterogeneous data is considered non-numeric
-        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 13),
+        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 6),
         (
             [
                 [None, "quick", None],
                 ["fox", "quick", "lazy"],
                 ["dog", "dog", "lazy"],
                 [None, None, None],
             ],
-            13,
+            6,
         ),
     ],
 )
@@ -73,7 +73,7 @@ def test_describe_obj_only(data, expected_union_count):


 @pytest.mark.parametrize(
-    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 9)]
+    "dtype, expected_union_count", [(int, 7), (float, 7), (object, 5)]
 )
 def test_describe_empty_rows(dtype, expected_union_count):
     with SqlCounter(query_count=1, union_count=expected_union_count):
@@ -107,7 +107,7 @@ def test_describe_empty_cols():
         # 4K-1 UNIONs to compute top/freq for K object-dtype columns (see comment on
         # test_describe_obj_only for reasoning).
         # Since we have K=2 object columns, the result is 9 + (4 * 2 - 1) = 16 UNIONs.
-        ([int, object], None, None, 16),
+        ([int, object], None, None, 12),
         (np.number, [], None, 7),
         # Including only datetimes has 7 statistics since std is not computed.
         # Since there is only 1 column, all quantiles are computed in a single QC.
@@ -127,8 +127,8 @@ def test_describe_empty_cols():
         # include and exclude cannot directly overlap
        ([int, "O"], [float, "O"], ValueError, 0),
         # Like select_dtypes, a dtype in include/exclude can be a subtype of a dtype in the other
-        ([int, "O"], [float, np.number, np.datetime64], None, 9),
-        ("O", None, None, 9),
+        ([int, "O"], [float, np.number, np.datetime64], None, 5),
+        ("O", None, None, 5),
     ],
 )
 def test_describe_include_exclude(
@@ -181,7 +181,7 @@ def test_describe_include_exclude_obj_only(include, exclude, expected_exception)
     }
     with SqlCounter(
         query_count=1 if expected_exception is None else 0,
-        union_count=9 if expected_exception is None else 0,
+        union_count=5 if expected_exception is None else 0,
     ):
         eval_snowpark_pandas_result(
             *create_test_dfs(data),
@@ -285,9 +285,9 @@ def timestamp_describe_comparator(snow_res, native_res):
     # Don't need to test all permutations of include/exclude with MultiIndex -- this is covered by
     # tests for select_dtypes, as well as other tests in this file
     [
-        ("all", 16),
+        ("all", 12),
         (np.number, 7),
-        (object, 9),
+        (object, 5),
     ],
 )
 def test_describe_multiindex(index, columns, include, expected_union_count):
@@ -312,10 +312,10 @@ def test_describe_multiindex(index, columns, include, expected_union_count):
     "include, exclude, expected_union_count",
     [
         (None, None, 7),
-        ("all", None, 12),
+        ("all", None, 11),
         (np.number, None, 7),
-        (None, float, 10),
-        (object, None, 5),
+        (None, float, 9),
+        (object, None, 4),
         (None, object, 7),
         (int, float, 5),
         (float, int, 5),
@@ -350,7 +350,7 @@ def helper(df):

 @sql_count_checker(
     query_count=3,
-    union_count=21,
+    union_count=8,
 )
 # SNOW-1320296 - pd.concat SQL Compilation ambigious __row_position__ issue
 def test_describe_object_file(resources_path):
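
Sanity check on the union-count comment preserved in this file's first hunk: for an N-column object frame it derives 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs, which matches the old expectations (9 for the two-column cases). The new expectations (5 for N=2, 12 instead of 16 for the mixed int/object case) undercut the formula, presumably because the per-column subplans it counts separately are now shared as CTEs; that reading is an inference from the conftest change above, not something stated in the diff.

# The old expectation followed the comment's derivation (N = column count):
N = 2
assert 2 + 2 * (N - 1 + N) + 1 == 4 * N + 1 == 9  # pre-PR value for N=2
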