From 55b22c0c5da59b5df46dd1d41b85781d19cceed3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 4 Jun 2024 11:15:41 -0400 Subject: [PATCH] feat: Update ggml --- ggml/ggml.py | 298 ++++++++++++++++++++++++++++++++------------------- vendor/ggml | 2 +- 2 files changed, 189 insertions(+), 111 deletions(-) diff --git a/ggml/ggml.py b/ggml/ggml.py index 1deda36..0fc2efe 100644 --- a/ggml/ggml.py +++ b/ggml/ggml.py @@ -519,9 +519,7 @@ def ggml_fp32_to_bf16_row(fp32: CtypesPointer[ctypes.c_float], bf16: CtypesPoint # GGML_OP_ARGSORT, # GGML_OP_LEAKY_RELU, -# GGML_OP_FLASH_ATTN, # GGML_OP_FLASH_ATTN_EXT, -# GGML_OP_FLASH_FF, # GGML_OP_FLASH_ATTN_BACK, # GGML_OP_SSM_CONV, # GGML_OP_SSM_SCAN, @@ -603,28 +601,22 @@ def ggml_fp32_to_bf16_row(fp32: CtypesPointer[ctypes.c_float], bf16: CtypesPoint GGML_OP_TIMESTEP_EMBEDDING = 52 GGML_OP_ARGSORT = 53 GGML_OP_LEAKY_RELU = 54 -GGML_OP_FLASH_ATTN = 55 -GGML_OP_FLASH_ATTN_EXT = 56 -GGML_OP_FLASH_FF = 57 -GGML_OP_FLASH_ATTN_BACK = 58 -GGML_OP_SSM_CONV = 59 -GGML_OP_SSM_SCAN = 60 -GGML_OP_WIN_PART = 61 -GGML_OP_WIN_UNPART = 62 -GGML_OP_GET_REL_POS = 63 -GGML_OP_ADD_REL_POS = 64 -GGML_OP_UNARY = 65 -GGML_OP_MAP_UNARY = 66 -GGML_OP_MAP_BINARY = 67 -GGML_OP_MAP_CUSTOM1_F32 = 68 -GGML_OP_MAP_CUSTOM2_F32 = 69 -GGML_OP_MAP_CUSTOM3_F32 = 70 -GGML_OP_MAP_CUSTOM1 = 71 -GGML_OP_MAP_CUSTOM2 = 72 -GGML_OP_MAP_CUSTOM3 = 73 -GGML_OP_CROSS_ENTROPY_LOSS = 74 -GGML_OP_CROSS_ENTROPY_LOSS_BACK = 75 -GGML_OP_COUNT = 76 +GGML_OP_FLASH_ATTN_EXT = 55 +GGML_OP_FLASH_ATTN_BACK = 56 +GGML_OP_SSM_CONV = 57 +GGML_OP_SSM_SCAN = 58 +GGML_OP_WIN_PART = 59 +GGML_OP_WIN_UNPART = 60 +GGML_OP_GET_REL_POS = 61 +GGML_OP_ADD_REL_POS = 62 +GGML_OP_UNARY = 63 +GGML_OP_MAP_UNARY = 64 +GGML_OP_MAP_BINARY = 65 +GGML_OP_MAP_CUSTOM1_F32 = 66 +GGML_OP_MAP_CUSTOM2_F32 = 67 +GGML_OP_MAP_CUSTOM3_F32 = 68 +GGML_OP_MAP_CUSTOM1 = 69 +GGML_OP_MAP_CUSTOM2 = 70 # enum ggml_unary_op { @@ -736,7 +728,7 @@ class ggml_object(ctypes.Structure): # // n-dimensional tensor # struct ggml_tensor { # enum ggml_type type; -# enum ggml_backend_type backend; +# GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); # struct ggml_backend_buffer * buffer; @@ -2939,18 +2931,20 @@ def ggml_repeat_back( # GGML_API struct ggml_tensor * ggml_concat( # struct ggml_context * ctx, # struct ggml_tensor * a, -# struct ggml_tensor * b); +# struct ggml_tensor * b, +# int dim); @ggml_function( "ggml_concat", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.c_int, ], ctypes.POINTER(ggml_tensor), ) def ggml_concat( - ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, / + ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, dim: int, / ) -> ggml_tensor_p: """Concatenate two tensors along the second axis and return the result. @@ -2958,6 +2952,7 @@ def ggml_concat( ctx: ggml context a: first tensor b: second tensor + dim: dimension to concatenate along Returns: Pointer to ggml_tensor""" @@ -4740,11 +4735,12 @@ def ggml_soft_max_back_inplace( # // rotary position embedding -# // if mode & 1 == 1, skip n_past elements (DEPRECATED) +# // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) # // if mode & 2 == 1, GPT-NeoX style # // if mode & 4 == 1, ChatGLM style # // # // b is an int32 vector with size a->ne[2], it contains the positions +# // c is freq factors (e.g. 
phi3-128k), (optional) # GGML_API struct ggml_tensor * ggml_rope( # struct ggml_context * ctx, # struct ggml_tensor * a, @@ -4837,10 +4833,11 @@ def ggml_rope_inplace( # // custom RoPE -# GGML_API struct ggml_tensor * ggml_rope_custom( +# GGML_API struct ggml_tensor * ggml_rope_ext( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -4852,11 +4849,12 @@ def ggml_rope_inplace( # float beta_fast, # float beta_slow); @ggml_function( - "ggml_rope_custom", + "ggml_rope_ext", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -4870,10 +4868,11 @@ def ggml_rope_inplace( ], ctypes.POINTER(ggml_tensor), ) -def ggml_rope_custom( +def ggml_rope_ext( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -4886,15 +4885,15 @@ def ggml_rope_custom( beta_slow: Union[ctypes.c_float, float], /, ) -> ggml_tensor_p: - """Custom rotary position embedding""" ... # // in-place, returns view(a) -# GGML_API struct ggml_tensor * ggml_rope_custom_inplace( +# GGML_API struct ggml_tensor * ggml_rope_ext_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -4906,11 +4905,12 @@ def ggml_rope_custom( # float beta_fast, # float beta_slow); @ggml_function( - "ggml_rope_custom_inplace", + "ggml_rope_ext_inplace", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -4924,10 +4924,11 @@ def ggml_rope_custom( ], ctypes.POINTER(ggml_tensor), ) -def ggml_rope_custom_inplace( +def ggml_rope_ext_inplace( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -4940,46 +4941,123 @@ def ggml_rope_custom_inplace( beta_slow: Union[ctypes.c_float, float], /, ) -> ggml_tensor_p: - """Custom rotary position embedding inplace""" ... 
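
A minimal usage sketch for the new ggml_rope_ext binding introduced above. This is not part of the patch: the setup boilerplate follows the usual ggml-python pattern (ggml_init, ggml_new_tensor_*), the shapes and RoPE hyperparameters are illustrative only, and passing None for the frequency-factors tensor c is an assumption based on the header comment marking c as optional (NULL in the C API).

import ctypes
import ggml

# small scratch context, enough for a few tensors
params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None)
ctx = ggml.ggml_init(params)

n_embd_head, n_head, n_tokens = 64, 8, 4
# activations: [head_dim, n_head, n_tokens]
a = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, n_embd_head, n_head, n_tokens)
# positions: int32 vector of size a->ne[2], as the header comment requires
pos = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_I32, n_tokens)

# c (frequency factors) is optional; None maps to a NULL pointer here
rope = ggml.ggml_rope_ext(
    ctx, a, pos, None,
    n_embd_head,  # n_dims
    0,            # mode
    0,            # n_ctx
    0,            # n_orig_ctx
    10000.0,      # freq_base
    1.0,          # freq_scale
    0.0,          # ext_factor
    1.0,          # attn_factor
    32.0,         # beta_fast
    1.0,          # beta_slow
)

ggml.ggml_free(ctx)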
-# // compute correction dims for YaRN RoPE scaling -# GGML_CALL void ggml_rope_yarn_corr_dims( -# int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); +# GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int n_dims, +# int mode, +# int n_ctx, +# int n_orig_ctx, +# float freq_base, +# float freq_scale, +# float ext_factor, +# float attn_factor, +# float beta_fast, +# float beta_slow), +# "use ggml_rope_ext instead"); @ggml_function( - "ggml_rope_yarn_corr_dims", + "ggml_rope_custom", [ + ggml_context_p_ctypes, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_float, - ctypes.POINTER(ctypes.c_float), + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, ], - None, + ctypes.POINTER(ggml_tensor), ) -def ggml_rope_yarn_corr_dims( +def ggml_rope_custom( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], n_orig_ctx: Union[ctypes.c_int, int], freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], + ext_factor: Union[ctypes.c_float, float], + attn_factor: Union[ctypes.c_float, float], beta_fast: Union[ctypes.c_float, float], beta_slow: Union[ctypes.c_float, float], - dims: CtypesArray[ctypes.c_float], /, -) -> None: - """Compute correction dims for YaRN RoPE scaling""" +) -> ggml_tensor_p: + """Custom rotary position embedding""" ... -# // xPos RoPE, in-place, returns view(a) -# GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( +# GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, # int n_dims, -# float base, -# bool down); +# int mode, +# int n_ctx, +# int n_orig_ctx, +# float freq_base, +# float freq_scale, +# float ext_factor, +# float attn_factor, +# float beta_fast, +# float beta_slow), +# "use ggml_rope_ext_inplace instead"); +@ggml_function( + "ggml_rope_custom_inplace", + [ + ggml_context_p_ctypes, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ], + ctypes.POINTER(ggml_tensor), +) +def ggml_rope_custom_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], + n_orig_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], + ext_factor: Union[ctypes.c_float, float], + attn_factor: Union[ctypes.c_float, float], + beta_fast: Union[ctypes.c_float, float], + beta_slow: Union[ctypes.c_float, float], + /, +) -> ggml_tensor_p: + """Custom rotary position embedding inplace""" + ... + +# struct ggml_tensor * ggml_rope_xpos_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int n_dims, +# float base, +# bool down); @ggml_function( "ggml_rope_xpos_inplace", [ @@ -5001,7 +5079,33 @@ def ggml_rope_xpos_inplace( down: Union[ctypes.c_bool, bool], /, ) -> ggml_tensor_p: - """xPos RoPE, in-place, returns view(a)""" + ... 
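
The deprecation notices above point callers at ggml_rope_ext / ggml_rope_ext_inplace. A hypothetical compatibility shim (an assumption, not part of the patch) illustrates the migration: the only signature difference visible in the bindings is the new freq-factors tensor c, so forwarding with c=None preserves the old behaviour.

import ggml

def rope_custom_compat(ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx,
                       freq_base, freq_scale, ext_factor, attn_factor,
                       beta_fast, beta_slow):
    # c=None -> NULL frequency factors, i.e. the previous ggml_rope_custom behaviour
    return ggml.ggml_rope_ext(ctx, a, b, None, n_dims, mode, n_ctx, n_orig_ctx,
                              freq_base, freq_scale, ext_factor, attn_factor,
                              beta_fast, beta_slow)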
+ +# // compute correction dims for YaRN RoPE scaling +# GGML_CALL void ggml_rope_yarn_corr_dims( +# int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); +@ggml_function( + "ggml_rope_yarn_corr_dims", + [ + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.POINTER(ctypes.c_float), + ], + None, +) +def ggml_rope_yarn_corr_dims( + n_dims: Union[ctypes.c_int, int], + n_orig_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + beta_fast: Union[ctypes.c_float, float], + beta_slow: Union[ctypes.c_float, float], + dims: CtypesArray[ctypes.c_float], + /, +) -> None: + """Compute correction dims for YaRN RoPE scaling""" ... @@ -5011,6 +5115,7 @@ def ggml_rope_xpos_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -5029,6 +5134,7 @@ def ggml_rope_xpos_inplace( ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -5048,6 +5154,7 @@ def ggml_rope_back( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -5822,33 +5929,6 @@ def ggml_top_k( ... -# GGML_API struct ggml_tensor * ggml_flash_attn( -# struct ggml_context * ctx, -# struct ggml_tensor * q, -# struct ggml_tensor * k, -# struct ggml_tensor * v, -# bool masked); -@ggml_function( - "ggml_flash_attn", - [ - ggml_context_p_ctypes, - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.c_bool, - ], - ctypes.POINTER(ggml_tensor), -) -def ggml_flash_attn( - ctx: ggml_context_p, - q: ggml_tensor_p, - k: ggml_tensor_p, - v: ggml_tensor_p, - masked: Union[ctypes.c_bool, bool], - /, -) -> ggml_tensor_p: - ... - #define GGML_KQ_MASK_PAD 32 GGML_KQ_MASK_PAD = 32 @@ -5905,6 +5985,7 @@ def ggml_flash_attn_ext_set_prec( ... +# // TODO: needs to be adapted to ggml_flash_attn_ext # GGML_API struct ggml_tensor * ggml_flash_attn_back( # struct ggml_context * ctx, # struct ggml_tensor * q, @@ -5936,37 +6017,6 @@ def ggml_flash_attn_back( ... -# GGML_API struct ggml_tensor * ggml_flash_ff( -# struct ggml_context * ctx, -# struct ggml_tensor * a, -# struct ggml_tensor * b0, -# struct ggml_tensor * b1, -# struct ggml_tensor * c0, -# struct ggml_tensor * c1); -@ggml_function( - "ggml_flash_ff", - [ - ggml_context_p_ctypes, - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ], - ctypes.POINTER(ggml_tensor), -) -def ggml_flash_ff( - ctx: ggml_context_p, - a: ggml_tensor_p, - b0: ggml_tensor_p, - b1: ggml_tensor_p, - c0: ggml_tensor_p, - c1: ggml_tensor_p, - /, -) -> ggml_tensor_p: - ... - - # GGML_API struct ggml_tensor * ggml_ssm_conv( # struct ggml_context * ctx, # struct ggml_tensor * s, @@ -8684,6 +8734,12 @@ def ggml_cpu_has_avx512_vnni() -> int: ... +# GGML_API int ggml_cpu_has_avx512_bf16(void); +@ggml_function("ggml_cpu_has_avx512_bf16", [], ctypes.c_int) +def ggml_cpu_has_avx512_bf16() -> int: + ... + + # GGML_API int ggml_cpu_has_fma (void); @ggml_function("ggml_cpu_has_fma", [], ctypes.c_int) def ggml_cpu_has_fma() -> int: @@ -8696,6 +8752,12 @@ def ggml_cpu_has_neon() -> int: ... 
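
A short sketch of calling ggml_rope_yarn_corr_dims through the relocated binding above. The output buffer is a two-element ctypes float array, matching dims[2] in the C signature; the numeric inputs are illustrative only.

import ctypes
import ggml

dims = (ctypes.c_float * 2)()
ggml.ggml_rope_yarn_corr_dims(
    128,      # n_dims
    4096,     # n_orig_ctx
    10000.0,  # freq_base
    32.0,     # beta_fast
    1.0,      # beta_slow
    dims,
)
print(list(dims))  # [low, high] correction dims for YaRN RoPE scaling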
+# GGML_API int ggml_cpu_has_sve (void); +@ggml_function("ggml_cpu_has_sve", [], ctypes.c_int) +def ggml_cpu_has_sve() -> int: + ... + + # GGML_API int ggml_cpu_has_arm_fma (void); @ggml_function("ggml_cpu_has_arm_fma", [], ctypes.c_int) def ggml_cpu_has_arm_fma() -> int: @@ -10789,6 +10851,22 @@ def ggml_backend_cuda_unregister_host_buffer( ... +# GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); +@ggml_function( + "ggml_backend_cuda_log_set_callback", + [ + ggml_log_callback, + ctypes.c_void_p, + ], + None, + enabled=GGML_USE_CUDA, +) +def ggml_backend_cuda_log_set_callback( + log_callback, user_data: Union[ctypes.c_void_p, int, None], / # type: ignore +): + ... + + ##################################################### # GGML METAL API # source: src/ggml-metal.h @@ -11270,7 +11348,7 @@ def ggml_backend_vk_host_buffer_type() -> Optional[ggml_backend_buffer_type_t]: ##################################################### # GGML Vulkan API -# source: src/ggml-vulkan.h +# source: src/ggml-rpc.h ##################################################### diff --git a/vendor/ggml b/vendor/ggml index 7cf94a2..2aae01f 160000 --- a/vendor/ggml +++ b/vendor/ggml @@ -1 +1 @@ -Subproject commit 7cf94a2bb99eecfe7f55fa80e19b89e00bf7fe4d +Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c
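
A small sketch exercising the CPU feature checks added by this patch (ggml_cpu_has_avx512_bf16, ggml_cpu_has_sve) alongside a few existing ones; each binding takes no arguments and returns 0 or 1.

import ggml

features = {
    "avx512_vnni": ggml.ggml_cpu_has_avx512_vnni(),
    "avx512_bf16": ggml.ggml_cpu_has_avx512_bf16(),
    "fma": ggml.ggml_cpu_has_fma(),
    "neon": ggml.ggml_cpu_has_neon(),
    "sve": ggml.ggml_cpu_has_sve(),
}
for name, present in features.items():
    print(f"{name}: {'yes' if present else 'no'}")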