From e8d29e73e68630620908eaf28589f0882c294e53 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 17 Oct 2018 05:36:19 -0700
Subject: [PATCH] REF: Fuse all the types (#23022)

---
 pandas/_libs/algos_common_helper.pxi.in |  31 ++-
 pandas/_libs/algos_rank_helper.pxi.in   |  57 ++---
 pandas/_libs/groupby_helper.pxi.in      | 287 +++++++++++++-----------
 pandas/_libs/join_func_helper.pxi.in    | 112 +++++----
 pandas/_libs/sparse_op_helper.pxi.in    |  86 +++----
 5 files changed, 295 insertions(+), 278 deletions(-)

diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index 6bcc735656c6b..b39b5eaced8fd 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -16,33 +16,30 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 {{py:
 
-# name, c_type, dest_type, dest_dtype
-dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
-          ('float32', 'float32_t', 'float32_t', 'np.float32'),
-          ('int8',  'int8_t',  'float32_t', 'np.float32'),
-          ('int16', 'int16_t', 'float32_t', 'np.float32'),
-          ('int32', 'int32_t', 'float64_t', 'np.float64'),
-          ('int64', 'int64_t', 'float64_t', 'np.float64')]
+# name, c_type, dest_type
+dtypes = [('float64', 'float64_t', 'float64_t'),
+          ('float32', 'float32_t', 'float32_t'),
+          ('int8',  'int8_t',  'float32_t'),
+          ('int16', 'int16_t', 'float32_t'),
+          ('int32', 'int32_t', 'float64_t'),
+          ('int64', 'int64_t', 'float64_t')]
 
 def get_dispatch(dtypes):
 
-    for name, c_type, dest_type, dest_dtype, in dtypes:
-
-        dest_type2 = dest_type
-        dest_type = dest_type.replace('_t', '')
-
-        yield name, c_type, dest_type, dest_type2, dest_dtype
+    for name, c_type, dest_type, in dtypes:
+        dest_name = dest_type[:-2]  # i.e. strip "_t"
+        yield name, c_type, dest_type, dest_name
 
 }}
 
-{{for name, c_type, dest_type, dest_type2, dest_dtype
+{{for name, c_type, dest_type, dest_name
       in get_dispatch(dtypes)}}
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
-                     ndarray[{{dest_type2}}, ndim=2] out,
+                     ndarray[{{dest_type}}, ndim=2] out,
                      Py_ssize_t periods, int axis):
     cdef:
         Py_ssize_t i, j, sx, sy
@@ -84,9 +81,9 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
                     out[i, j] = arr[i, j] - arr[i, j - periods]
 
 
-def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
+def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
                                  ndarray[int64_t] indexer, Py_ssize_t loc,
-                                 ndarray[{{dest_type2}}] out):
+                                 ndarray[{{dest_type}}] out):
     cdef:
         Py_ssize_t i, j, k
 
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index 130276ae0e73c..bb4aec75ed567 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -131,45 +131,20 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
     argsorted = _as.astype('i8')
 
     {{if dtype == 'object'}}
-    for i in range(n):
-        sum_ranks += i + 1
-        dups += 1
-        isnan = sorted_mask[i]
-        val = util.get_value_at(sorted_data, i)
-
-        if isnan and keep_na:
-            ranks[argsorted[i]] = nan
-            continue
-        count += 1.0
-
-        if (i == n - 1 or
-                are_diff(util.get_value_at(sorted_data, i + 1), val) or
-                i == non_na_idx):
-            if tiebreak == TIEBREAK_AVERAGE:
-                for j in range(i - dups + 1, i + 1):
-                    ranks[argsorted[j]] = sum_ranks / dups
-            elif tiebreak == TIEBREAK_MIN:
-                for j in range(i - dups + 1, i + 1):
-                    ranks[argsorted[j]] = i - dups + 2
-            elif tiebreak == TIEBREAK_MAX:
-                for j in range(i - dups + 1, i + 1):
-                    ranks[argsorted[j]] = i + 1
-            elif tiebreak == TIEBREAK_FIRST:
-                raise ValueError('first not supported for non-numeric data')
-            elif tiebreak == TIEBREAK_FIRST_DESCENDING:
-                for j in range(i - dups + 1, i + 1):
-                    ranks[argsorted[j]] = 2 * i - j - dups + 2
-            elif tiebreak == TIEBREAK_DENSE:
-                total_tie_count += 1
-                for j in range(i - dups + 1, i + 1):
-                    ranks[argsorted[j]] = total_tie_count
-            sum_ranks = dups = 0
+    if True:
     {{else}}
     with nogil:
+    {{endif}}
+        # TODO: why does the 2d version not have a nogil block?
         for i in range(n):
             sum_ranks += i + 1
             dups += 1
+
+            {{if dtype == 'object'}}
+            val = util.get_value_at(sorted_data, i)
+            {{else}}
             val = sorted_data[i]
+            {{endif}}
 
             {{if dtype != 'uint64'}}
             isnan = sorted_mask[i]
@@ -180,8 +155,14 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
 
             count += 1.0
 
-            if (i == n - 1 or sorted_data[i + 1] != val or
-                i == non_na_idx):
+            {{if dtype == 'object'}}
+            if (i == n - 1 or
+                    are_diff(util.get_value_at(sorted_data, i + 1), val) or
+                    i == non_na_idx):
+            {{else}}
+            if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
+            {{endif}}
+
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = sum_ranks / dups
@@ -192,8 +173,13 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = i + 1
                 elif tiebreak == TIEBREAK_FIRST:
+                    {{if dtype == 'object'}}
+                    raise ValueError('first not supported for '
+                                     'non-numeric data')
+                    {{else}}
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = j + 1
+                    {{endif}}
                 elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = 2 * i - j - dups + 2
@@ -202,7 +188,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = total_tie_count
                 sum_ranks = dups = 0
-    {{endif}}
     if pct:
         if tiebreak == TIEBREAK_DENSE:
             return ranks / total_tie_count
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 5b01117381a27..addbb2b3e8165 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -14,26 +14,22 @@ _int64_max = np.iinfo(np.int64).max
 
 {{py:
 
-# name, c_type, dest_type, dest_dtype
-dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
-          ('float32', 'float32_t', 'float32_t', 'np.float32')]
+# name, c_type
+dtypes = [('float64', 'float64_t'),
+          ('float32', 'float32_t')]
 
 def get_dispatch(dtypes):
 
-    for name, c_type, dest_type, dest_dtype in dtypes:
-
-        dest_type2 = dest_type
-        dest_type = dest_type.replace('_t', '')
-
-        yield name, c_type, dest_type, dest_type2, dest_dtype
+    for name, c_type in dtypes:
+        yield name, c_type
 }}
 
-{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}}
+{{for name, c_type in get_dispatch(dtypes)}}
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
                        ndarray[int64_t] labels,
@@ -43,8 +39,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] sumx, nobs
+        {{c_type}} val, count
+        ndarray[{{c_type}}, ndim=2] sumx, nobs
 
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
@@ -80,7 +76,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
                         ndarray[int64_t] labels,
@@ -90,8 +86,8 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] prodx, nobs
+        {{c_type}} val, count
+        ndarray[{{c_type}}, ndim=2] prodx, nobs
 
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
@@ -127,15 +123,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 @cython.wraparound(False)
 @cython.boundscheck(False)
 @cython.cdivision(True)
-def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                        ndarray[int64_t] counts,
-                       ndarray[{{dest_type2}}, ndim=2] values,
+                       ndarray[{{c_type}}, ndim=2] values,
                        ndarray[int64_t] labels,
                        Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, ct, oldmean
-        ndarray[{{dest_type2}}, ndim=2] nobs, mean
+        {{c_type}} val, ct, oldmean
+        ndarray[{{c_type}}, ndim=2] nobs, mean
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -179,15 +175,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                         ndarray[int64_t] counts,
-                        ndarray[{{dest_type2}}, ndim=2] values,
+                        ndarray[{{c_type}}, ndim=2] values,
                         ndarray[int64_t] labels,
                         Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] sumx, nobs
+        {{c_type}} val, count
+        ndarray[{{c_type}}, ndim=2] sumx, nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -224,9 +220,9 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                   ndarray[int64_t] counts,
-                  ndarray[{{dest_type2}}, ndim=2] values,
+                  ndarray[{{c_type}}, ndim=2] values,
                   ndarray[int64_t] labels,
                   Py_ssize_t min_count=-1):
     """
@@ -234,7 +230,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab
-        {{dest_type2}} val, count
+        {{c_type}} val, count
         Py_ssize_t ngroups = len(counts)
 
     assert min_count == -1, "'min_count' only used in add and prod"
@@ -278,26 +274,26 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
 {{py:
 
-# name, c_type, dest_type2, nan_val
-dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
-          ('float32', 'float32_t', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'int64_t', 'iNaT'),
-          ('object', 'object', 'object', 'NAN')]
+# name, c_type, nan_val
+dtypes = [('float64', 'float64_t', 'NAN'),
+          ('float32', 'float32_t', 'NAN'),
+          ('int64', 'int64_t', 'iNaT'),
+          ('object', 'object', 'NAN')]
 
 def get_dispatch(dtypes):
 
-    for name, c_type, dest_type2, nan_val in dtypes:
+    for name, c_type, nan_val in dtypes:
 
-        yield name, c_type, dest_type2, nan_val
+        yield name, c_type, nan_val
 }}
 
 
-{{for name, c_type, dest_type2, nan_val in get_dispatch(dtypes)}}
+{{for name, c_type, nan_val in get_dispatch(dtypes)}}
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
                         ndarray[int64_t] labels,
@@ -307,8 +303,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val
-        ndarray[{{dest_type2}}, ndim=2] resx
+        {{c_type}} val
+        ndarray[{{c_type}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
@@ -354,7 +350,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
                        ndarray[int64_t] labels, int64_t rank,
@@ -364,8 +360,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val
-        ndarray[{{dest_type2}}, ndim=2] resx
+        {{c_type}} val
+        ndarray[{{c_type}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
@@ -473,7 +469,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     # with mask, without obfuscating location of missing data
     # in values array
     masked_vals = np.array(values[:, 0], copy=True)
-    {{if name=='int64'}}
+    {{if name == 'int64'}}
     mask = (masked_vals == {{nan_val}}).astype(np.uint8)
     {{else}}
     mask = np.isnan(masked_vals).astype(np.uint8)
@@ -597,41 +593,31 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
 {{endfor}}
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # group_min, group_max
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 
-{{py:
-
-# name, c_type, dest_type2, nan_val
-dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'),
-          ('float32', 'float32_t', 'NAN', 'np.inf'),
-          ('int64', 'int64_t', 'iNaT', '_int64_max')]
-
-def get_dispatch(dtypes):
-
-    for name, dest_type2, nan_val, inf_val in dtypes:
-        yield name, dest_type2, nan_val, inf_val
-}}
-
-
-{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}}
+# TODO: consider implementing for more dtypes
+ctypedef fused groupby_t:
+    float64_t
+    float32_t
+    int64_t
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels,
-                       Py_ssize_t min_count=-1):
+def group_max(ndarray[groupby_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[groupby_t, ndim=2] values,
+              ndarray[int64_t] labels,
+              Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] maxx, nobs
+        groupby_t val, count, nan_val
+        ndarray[groupby_t, ndim=2] maxx, nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -641,7 +627,13 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     nobs = np.zeros_like(out)
 
     maxx = np.empty_like(out)
-    maxx.fill(-{{inf_val}})
+    if groupby_t is int64_t:
+        # Note: this fused-type check is resolved at compile-time
+        maxx.fill(-_int64_max)
+        nan_val = iNaT
+    else:
+        maxx.fill(-np.inf)
+        nan_val = NAN
 
     N, K = (<object> values).shape
 
@@ -656,37 +648,44 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
-                if val == val and val != {{nan_val}}:
-                {{endif}}
-                    nobs[lab, j] += 1
-                    if val > maxx[lab, j]:
-                        maxx[lab, j] = val
+                if groupby_t is int64_t:
+                    if val != nan_val:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
+                else:
+                    if val == val and val != nan_val:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = {{nan_val}}
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = maxx[i, j]
 
 
+group_max_float64 = group_max["float64_t"]
+group_max_float32 = group_max["float32_t"]
+group_max_int64 = group_max["int64_t"]
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels,
-                       Py_ssize_t min_count=-1):
+def group_min(ndarray[groupby_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[groupby_t, ndim=2] values,
+              ndarray[int64_t] labels,
+              Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] minx, nobs
+        groupby_t val, count, nan_val
+        ndarray[groupby_t, ndim=2] minx, nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -696,7 +695,12 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     nobs = np.zeros_like(out)
 
     minx = np.empty_like(out)
-    minx.fill({{inf_val}})
+    if groupby_t is int64_t:
+        minx.fill(_int64_max)
+        nan_val = iNaT
+    else:
+        minx.fill(np.inf)
+        nan_val = NAN
 
     N, K = (<object> values).shape
 
@@ -711,41 +715,51 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
-                if val == val and val != {{nan_val}}:
-                {{endif}}
-                    nobs[lab, j] += 1
-                    if val < minx[lab, j]:
-                        minx[lab, j] = val
+                if groupby_t is int64_t:
+                    if val != nan_val:
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
+                else:
+                    if val == val and val != nan_val:
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = {{nan_val}}
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = minx[i, j]
 
 
+group_min_float64 = group_min["float64_t"]
+group_min_float32 = group_min["float32_t"]
+group_min_int64 = group_min["int64_t"]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                          ndarray[{{dest_type2}}, ndim=2] values,
-                          ndarray[int64_t] labels,
-                          bint is_datetimelike):
+def group_cummin(ndarray[groupby_t, ndim=2] out,
+                 ndarray[groupby_t, ndim=2] values,
+                 ndarray[int64_t] labels,
+                 bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, mval
-        ndarray[{{dest_type2}}, ndim=2] accum
+        groupby_t val, mval
+        ndarray[groupby_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
     accum = np.empty_like(values)
-    accum.fill({{inf_val}})
+    if groupby_t is int64_t:
+        accum.fill(_int64_max)
+    else:
+        accum.fill(np.inf)
 
     with nogil:
         for i in range(N):
@@ -757,37 +771,48 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # val = nan
-                {{if name == 'int64'}}
-                if is_datetimelike and val == {{nan_val}}:
-                    out[i, j] = {{nan_val}}
+                if groupby_t is int64_t:
+                    if is_datetimelike and val == iNaT:
+                        out[i, j] = iNaT
+                    else:
+                        mval = accum[lab, j]
+                        if val < mval:
+                            accum[lab, j] = mval = val
+                        out[i, j] = mval
                 else:
-                {{else}}
-                if val == val:
-                {{endif}}
-                    mval = accum[lab, j]
-                    if val < mval:
-                        accum[lab, j] = mval = val
-                    out[i, j] = mval
+                    if val == val:
+                        mval = accum[lab, j]
+                        if val < mval:
+                            accum[lab, j] = mval = val
+                        out[i, j] = mval
+
+
+group_cummin_float64 = group_cummin["float64_t"]
+group_cummin_float32 = group_cummin["float32_t"]
+group_cummin_int64 = group_cummin["int64_t"]
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                          ndarray[{{dest_type2}}, ndim=2] values,
-                          ndarray[int64_t] labels,
-                          bint is_datetimelike):
+def group_cummax(ndarray[groupby_t, ndim=2] out,
+                 ndarray[groupby_t, ndim=2] values,
+                 ndarray[int64_t] labels,
+                 bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, mval
-        ndarray[{{dest_type2}}, ndim=2] accum
+        groupby_t val, mval
+        ndarray[groupby_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
     accum = np.empty_like(values)
-    accum.fill(-{{inf_val}})
+    if groupby_t is int64_t:
+        accum.fill(-_int64_max)
+    else:
+        accum.fill(-np.inf)
 
     with nogil:
         for i in range(N):
@@ -798,16 +823,22 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
             for j in range(K):
                 val = values[i, j]
 
-                {{if name == 'int64'}}
-                if is_datetimelike and val == {{nan_val}}:
-                    out[i, j] = {{nan_val}}
+                if groupby_t is int64_t:
+                    if is_datetimelike and val == iNaT:
+                        out[i, j] = iNaT
+                    else:
+                        mval = accum[lab, j]
+                        if val > mval:
+                            accum[lab, j] = mval = val
+                        out[i, j] = mval
                 else:
-                {{else}}
-                if val == val:
-                {{endif}}
-                    mval = accum[lab, j]
-                    if val > mval:
-                        accum[lab, j] = mval = val
-                    out[i, j] = mval
+                    if val == val:
+                        mval = accum[lab, j]
+                        if val > mval:
+                            accum[lab, j] = mval = val
+                        out[i, j] = mval
 
-{{endfor}}
+
+group_cummax_float64 = group_cummax["float64_t"]
+group_cummax_float32 = group_cummax["float32_t"]
+group_cummax_int64 = group_cummax["int64_t"]
diff --git a/pandas/_libs/join_func_helper.pxi.in b/pandas/_libs/join_func_helper.pxi.in
index 72f24762838b4..b7f604d2fc951 100644
--- a/pandas/_libs/join_func_helper.pxi.in
+++ b/pandas/_libs/join_func_helper.pxi.in
@@ -210,34 +210,34 @@ def asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}(
 {{endfor}}
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # asof_join
-#----------------------------------------------------------------------
-
-{{py:
-
-# on_dtype
-dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
-          'int8_t', 'int16_t', 'int32_t', 'int64_t',
-          'float', 'double']
-
-}}
-
-{{for on_dtype in dtypes}}
-
-
-def asof_join_backward_{{on_dtype}}(
-        ndarray[{{on_dtype}}] left_values,
-        ndarray[{{on_dtype}}] right_values,
-        bint allow_exact_matches=1,
-        tolerance=None):
+# ----------------------------------------------------------------------
+
+ctypedef fused asof_t:
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    float
+    double
+
+
+def asof_join_backward(ndarray[asof_t] left_values,
+                       ndarray[asof_t] right_values,
+                       bint allow_exact_matches=1,
+                       tolerance=None):
 
     cdef:
         Py_ssize_t left_pos, right_pos, left_size, right_size
         ndarray[int64_t] left_indexer, right_indexer
         bint has_tolerance = 0
-        {{on_dtype}} tolerance_ = 0
-        {{on_dtype}} diff = 0
+        asof_t tolerance_ = 0
+        asof_t diff = 0
 
     # if we are using tolerance, set our objects
     if tolerance is not None:
@@ -280,18 +280,29 @@ def asof_join_backward_{{on_dtype}}(
     return left_indexer, right_indexer
 
 
-def asof_join_forward_{{on_dtype}}(
-        ndarray[{{on_dtype}}] left_values,
-        ndarray[{{on_dtype}}] right_values,
-        bint allow_exact_matches=1,
-        tolerance=None):
+asof_join_backward_uint8_t = asof_join_backward["uint8_t"]
+asof_join_backward_uint16_t = asof_join_backward["uint16_t"]
+asof_join_backward_uint32_t = asof_join_backward["uint32_t"]
+asof_join_backward_uint64_t = asof_join_backward["uint64_t"]
+asof_join_backward_int8_t = asof_join_backward["int8_t"]
+asof_join_backward_int16_t = asof_join_backward["int16_t"]
+asof_join_backward_int32_t = asof_join_backward["int32_t"]
+asof_join_backward_int64_t = asof_join_backward["int64_t"]
+asof_join_backward_float = asof_join_backward["float"]
+asof_join_backward_double = asof_join_backward["double"]
+
+
+def asof_join_forward(ndarray[asof_t] left_values,
+                      ndarray[asof_t] right_values,
+                      bint allow_exact_matches=1,
+                      tolerance=None):
 
     cdef:
         Py_ssize_t left_pos, right_pos, left_size, right_size
         ndarray[int64_t] left_indexer, right_indexer
         bint has_tolerance = 0
-        {{on_dtype}} tolerance_ = 0
-        {{on_dtype}} diff = 0
+        asof_t tolerance_ = 0
+        asof_t diff = 0
 
     # if we are using tolerance, set our objects
     if tolerance is not None:
@@ -335,16 +346,27 @@ def asof_join_forward_{{on_dtype}}(
     return left_indexer, right_indexer
 
 
-def asof_join_nearest_{{on_dtype}}(
-        ndarray[{{on_dtype}}] left_values,
-        ndarray[{{on_dtype}}] right_values,
-        bint allow_exact_matches=1,
-        tolerance=None):
+asof_join_forward_uint8_t = asof_join_forward["uint8_t"]
+asof_join_forward_uint16_t = asof_join_forward["uint16_t"]
+asof_join_forward_uint32_t = asof_join_forward["uint32_t"]
+asof_join_forward_uint64_t = asof_join_forward["uint64_t"]
+asof_join_forward_int8_t = asof_join_forward["int8_t"]
+asof_join_forward_int16_t = asof_join_forward["int16_t"]
+asof_join_forward_int32_t = asof_join_forward["int32_t"]
+asof_join_forward_int64_t = asof_join_forward["int64_t"]
+asof_join_forward_float = asof_join_forward["float"]
+asof_join_forward_double = asof_join_forward["double"]
+
+
+def asof_join_nearest(ndarray[asof_t] left_values,
+                      ndarray[asof_t] right_values,
+                      bint allow_exact_matches=1,
+                      tolerance=None):
 
     cdef:
         Py_ssize_t left_size, right_size, i
         ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
-        {{on_dtype}} bdiff, fdiff
+        asof_t bdiff, fdiff
 
     left_size = len(left_values)
     right_size = len(right_values)
@@ -353,10 +375,10 @@ def asof_join_nearest_{{on_dtype}}(
     right_indexer = np.empty(left_size, dtype=np.int64)
 
     # search both forward and backward
-    bli, bri = asof_join_backward_{{on_dtype}}(left_values, right_values,
-                                               allow_exact_matches, tolerance)
-    fli, fri = asof_join_forward_{{on_dtype}}(left_values, right_values,
-                                              allow_exact_matches, tolerance)
+    bli, bri = asof_join_backward(left_values, right_values,
+                                  allow_exact_matches, tolerance)
+    fli, fri = asof_join_forward(left_values, right_values,
+                                 allow_exact_matches, tolerance)
 
     for i in range(len(bri)):
         # choose timestamp from right with smaller difference
@@ -370,4 +392,14 @@ def asof_join_nearest_{{on_dtype}}(
 
     return left_indexer, right_indexer
 
-{{endfor}}
+
+asof_join_nearest_uint8_t = asof_join_nearest["uint8_t"]
+asof_join_nearest_uint16_t = asof_join_nearest["uint16_t"]
+asof_join_nearest_uint32_t = asof_join_nearest["uint32_t"]
+asof_join_nearest_uint64_t = asof_join_nearest["uint64_t"]
+asof_join_nearest_int8_t = asof_join_nearest["int8_t"]
+asof_join_nearest_int16_t = asof_join_nearest["int16_t"]
+asof_join_nearest_int32_t = asof_join_nearest["int32_t"]
+asof_join_nearest_int64_t = asof_join_nearest["int64_t"]
+asof_join_nearest_float = asof_join_nearest["float"]
+asof_join_nearest_double = asof_join_nearest["double"]
diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in
index 2843a3cf7dd28..d02a985de1d61 100644
--- a/pandas/_libs/sparse_op_helper.pxi.in
+++ b/pandas/_libs/sparse_op_helper.pxi.in
@@ -8,18 +8,12 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # Sparse op
 #----------------------------------------------------------------------
 
-{{py:
-
-# dtype, float_group
-dtypes = [('float64', True), ('int64', False)]
-
-}}
+ctypedef fused sparse_t:
+    float64_t
+    int64_t
 
-{{for dtype, float_group in dtypes}}
 
-{{if float_group}}
-
-cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
+cdef inline float64_t __div__(sparse_t a, sparse_t b):
     if b == 0:
         if a > 0:
             return INF
@@ -30,63 +24,41 @@ cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
     else:
         return float(a) / b
 
-cdef inline {{dtype}}_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
-    return __div_{{dtype}}(a, b)
 
-cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
-    if b == 0:
-        # numpy >= 1.11 returns NaN
-        # for a // 0, rather than +-inf
-        if _np_version_under1p11:
-            if a > 0:
-                return INF
-            elif a < 0:
-                return -INF
-        return NaN
-    else:
-        return a // b
+cdef inline float64_t __truediv__(sparse_t a, sparse_t b):
+    return __div__(a, b)
 
-cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
-    if b == 0:
-        return NaN
-    else:
-        return a % b
-
-{{else}}
 
-cdef inline float64_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
+cdef inline sparse_t __mod__(sparse_t a, sparse_t b):
     if b == 0:
-        if a > 0:
-            return INF
-        elif a < 0:
-            return -INF
-        else:
+        if sparse_t is float64_t:
             return NaN
+        else:
+            return 0
     else:
-        return float(a) / b
+        return a % b
 
-cdef inline float64_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
-    return __div_{{dtype}}(a, b)
 
-cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
+cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b):
     if b == 0:
-        return 0
+        if sparse_t is float64_t:
+            # numpy >= 1.11 returns NaN
+            # for a // 0, rather than +-inf
+            if _np_version_under1p11:
+                if a > 0:
+                    return INF
+                elif a < 0:
+                    return -INF
+            return NaN
+        else:
+            return 0
     else:
         return a // b
 
-cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b):
-    if b == 0:
-        return 0
-    else:
-        return a % b
 
-{{endif}}
-
-{{endfor}}
-
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # sparse array op
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 
 {{py:
 
@@ -106,10 +78,10 @@ def get_op(tup):
     ops_dict = {'add': '{0} + {1}',
                 'sub': '{0} - {1}',
                 'mul': '{0} * {1}',
-                'div': '__div_{2}({0}, {1})',
-                'mod': '__mod_{2}({0}, {1})',
-                'truediv': '__truediv_{2}({0}, {1})',
-                'floordiv': '__floordiv_{2}({0}, {1})',
+                'div': '__div__({0}, {1})',
+                'mod': '__mod__({0}, {1})',
+                'truediv': '__truediv__({0}, {1})',
+                'floordiv': '__floordiv__({0}, {1})',
                 'pow': '{0} ** {1}',
                 'eq': '{0} == {1}',
                 'ne': '{0} != {1}',