From 55b22c0c5da59b5df46dd1d41b85781d19cceed3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 4 Jun 2024 11:15:41 -0400 Subject: [PATCH] feat: Update ggml --- ggml/ggml.py | 298 ++++++++++++++++++++++++++++++++------------------- vendor/ggml | 2 +- 2 files changed, 189 insertions(+), 111 deletions(-) diff --git a/ggml/ggml.py b/ggml/ggml.py index 1deda36..0fc2efe 100644 --- a/ggml/ggml.py +++ b/ggml/ggml.py @@ -519,9 +519,7 @@ def ggml_fp32_to_bf16_row(fp32: CtypesPointer[ctypes.c_float], bf16: CtypesPoint # GGML_OP_ARGSORT, # GGML_OP_LEAKY_RELU, -# GGML_OP_FLASH_ATTN, # GGML_OP_FLASH_ATTN_EXT, -# GGML_OP_FLASH_FF, # GGML_OP_FLASH_ATTN_BACK, # GGML_OP_SSM_CONV, # GGML_OP_SSM_SCAN, @@ -603,28 +601,22 @@ def ggml_fp32_to_bf16_row(fp32: CtypesPointer[ctypes.c_float], bf16: CtypesPoint GGML_OP_TIMESTEP_EMBEDDING = 52 GGML_OP_ARGSORT = 53 GGML_OP_LEAKY_RELU = 54 -GGML_OP_FLASH_ATTN = 55 -GGML_OP_FLASH_ATTN_EXT = 56 -GGML_OP_FLASH_FF = 57 -GGML_OP_FLASH_ATTN_BACK = 58 -GGML_OP_SSM_CONV = 59 -GGML_OP_SSM_SCAN = 60 -GGML_OP_WIN_PART = 61 -GGML_OP_WIN_UNPART = 62 -GGML_OP_GET_REL_POS = 63 -GGML_OP_ADD_REL_POS = 64 -GGML_OP_UNARY = 65 -GGML_OP_MAP_UNARY = 66 -GGML_OP_MAP_BINARY = 67 -GGML_OP_MAP_CUSTOM1_F32 = 68 -GGML_OP_MAP_CUSTOM2_F32 = 69 -GGML_OP_MAP_CUSTOM3_F32 = 70 -GGML_OP_MAP_CUSTOM1 = 71 -GGML_OP_MAP_CUSTOM2 = 72 -GGML_OP_MAP_CUSTOM3 = 73 -GGML_OP_CROSS_ENTROPY_LOSS = 74 -GGML_OP_CROSS_ENTROPY_LOSS_BACK = 75 -GGML_OP_COUNT = 76 +GGML_OP_FLASH_ATTN_EXT = 55 +GGML_OP_FLASH_ATTN_BACK = 56 +GGML_OP_SSM_CONV = 57 +GGML_OP_SSM_SCAN = 58 +GGML_OP_WIN_PART = 59 +GGML_OP_WIN_UNPART = 60 +GGML_OP_GET_REL_POS = 61 +GGML_OP_ADD_REL_POS = 62 +GGML_OP_UNARY = 63 +GGML_OP_MAP_UNARY = 64 +GGML_OP_MAP_BINARY = 65 +GGML_OP_MAP_CUSTOM1_F32 = 66 +GGML_OP_MAP_CUSTOM2_F32 = 67 +GGML_OP_MAP_CUSTOM3_F32 = 68 +GGML_OP_MAP_CUSTOM1 = 69 +GGML_OP_MAP_CUSTOM2 = 70 # enum ggml_unary_op { @@ -736,7 +728,7 @@ class ggml_object(ctypes.Structure): # // n-dimensional tensor # struct ggml_tensor { # enum ggml_type type; -# enum ggml_backend_type backend; +# GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); # struct ggml_backend_buffer * buffer; @@ -2939,18 +2931,20 @@ def ggml_repeat_back( # GGML_API struct ggml_tensor * ggml_concat( # struct ggml_context * ctx, # struct ggml_tensor * a, -# struct ggml_tensor * b); +# struct ggml_tensor * b, +# int dim); @ggml_function( "ggml_concat", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.c_int, ], ctypes.POINTER(ggml_tensor), ) def ggml_concat( - ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, / + ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, dim: int, / ) -> ggml_tensor_p: """Concatenate two tensors along the second axis and return the result. @@ -2958,6 +2952,7 @@ def ggml_concat( ctx: ggml context a: first tensor b: second tensor + dim: dimension to concatenate along Returns: Pointer to ggml_tensor""" @@ -4740,11 +4735,12 @@ def ggml_soft_max_back_inplace( # // rotary position embedding -# // if mode & 1 == 1, skip n_past elements (DEPRECATED) +# // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) # // if mode & 2 == 1, GPT-NeoX style # // if mode & 4 == 1, ChatGLM style # // # // b is an int32 vector with size a->ne[2], it contains the positions +# // c is freq factors (e.g. 
phi3-128k), (optional) # GGML_API struct ggml_tensor * ggml_rope( # struct ggml_context * ctx, # struct ggml_tensor * a, @@ -4837,10 +4833,11 @@ def ggml_rope_inplace( # // custom RoPE -# GGML_API struct ggml_tensor * ggml_rope_custom( +# GGML_API struct ggml_tensor * ggml_rope_ext( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -4852,11 +4849,12 @@ def ggml_rope_inplace( # float beta_fast, # float beta_slow); @ggml_function( - "ggml_rope_custom", + "ggml_rope_ext", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -4870,10 +4868,11 @@ def ggml_rope_inplace( ], ctypes.POINTER(ggml_tensor), ) -def ggml_rope_custom( +def ggml_rope_ext( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -4886,15 +4885,15 @@ def ggml_rope_custom( beta_slow: Union[ctypes.c_float, float], /, ) -> ggml_tensor_p: - """Custom rotary position embedding""" ... # // in-place, returns view(a) -# GGML_API struct ggml_tensor * ggml_rope_custom_inplace( +# GGML_API struct ggml_tensor * ggml_rope_ext_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -4906,11 +4905,12 @@ def ggml_rope_custom( # float beta_fast, # float beta_slow); @ggml_function( - "ggml_rope_custom_inplace", + "ggml_rope_ext_inplace", [ ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -4924,10 +4924,11 @@ def ggml_rope_custom( ], ctypes.POINTER(ggml_tensor), ) -def ggml_rope_custom_inplace( +def ggml_rope_ext_inplace( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -4940,46 +4941,123 @@ def ggml_rope_custom_inplace( beta_slow: Union[ctypes.c_float, float], /, ) -> ggml_tensor_p: - """Custom rotary position embedding inplace""" ... 
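
A minimal usage sketch for the new ggml_rope_ext binding introduced above. This is not part of the patch: the setup boilerplate follows the usual ggml-python pattern (ggml_init, ggml_new_tensor_*), the shapes and RoPE hyperparameters are illustrative only, and passing None for the frequency-factors tensor c is an assumption based on the header comment marking c as optional (NULL in the C API).

import ctypes
import ggml

# small scratch context, enough for a few tensors
params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None)
ctx = ggml.ggml_init(params)

n_embd_head, n_head, n_tokens = 64, 8, 4
# activations: [head_dim, n_head, n_tokens]
a = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, n_embd_head, n_head, n_tokens)
# positions: int32 vector of size a->ne[2], as the header comment requires
pos = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_I32, n_tokens)

# c (frequency factors) is optional; None maps to a NULL pointer here
rope = ggml.ggml_rope_ext(
    ctx, a, pos, None,
    n_embd_head,  # n_dims
    0,            # mode
    0,            # n_ctx
    0,            # n_orig_ctx
    10000.0,      # freq_base
    1.0,          # freq_scale
    0.0,          # ext_factor
    1.0,          # attn_factor
    32.0,         # beta_fast
    1.0,          # beta_slow
)

ggml.ggml_free(ctx)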
-# // compute correction dims for YaRN RoPE scaling -# GGML_CALL void ggml_rope_yarn_corr_dims( -# int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); +# GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int n_dims, +# int mode, +# int n_ctx, +# int n_orig_ctx, +# float freq_base, +# float freq_scale, +# float ext_factor, +# float attn_factor, +# float beta_fast, +# float beta_slow), +# "use ggml_rope_ext instead"); @ggml_function( - "ggml_rope_yarn_corr_dims", + "ggml_rope_custom", [ + ggml_context_p_ctypes, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_float, - ctypes.POINTER(ctypes.c_float), + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, ], - None, + ctypes.POINTER(ggml_tensor), ) -def ggml_rope_yarn_corr_dims( +def ggml_rope_custom( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], n_orig_ctx: Union[ctypes.c_int, int], freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], + ext_factor: Union[ctypes.c_float, float], + attn_factor: Union[ctypes.c_float, float], beta_fast: Union[ctypes.c_float, float], beta_slow: Union[ctypes.c_float, float], - dims: CtypesArray[ctypes.c_float], /, -) -> None: - """Compute correction dims for YaRN RoPE scaling""" +) -> ggml_tensor_p: + """Custom rotary position embedding""" ... -# // xPos RoPE, in-place, returns view(a) -# GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( +# GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, # int n_dims, -# float base, -# bool down); +# int mode, +# int n_ctx, +# int n_orig_ctx, +# float freq_base, +# float freq_scale, +# float ext_factor, +# float attn_factor, +# float beta_fast, +# float beta_slow), +# "use ggml_rope_ext_inplace instead"); +@ggml_function( + "ggml_rope_custom_inplace", + [ + ggml_context_p_ctypes, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ], + ctypes.POINTER(ggml_tensor), +) +def ggml_rope_custom_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], + n_orig_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], + ext_factor: Union[ctypes.c_float, float], + attn_factor: Union[ctypes.c_float, float], + beta_fast: Union[ctypes.c_float, float], + beta_slow: Union[ctypes.c_float, float], + /, +) -> ggml_tensor_p: + """Custom rotary position embedding inplace""" + ... + +# struct ggml_tensor * ggml_rope_xpos_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int n_dims, +# float base, +# bool down); @ggml_function( "ggml_rope_xpos_inplace", [ @@ -5001,7 +5079,33 @@ def ggml_rope_xpos_inplace( down: Union[ctypes.c_bool, bool], /, ) -> ggml_tensor_p: - """xPos RoPE, in-place, returns view(a)""" + ... 
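
The deprecation notices above point callers at ggml_rope_ext / ggml_rope_ext_inplace. A hypothetical compatibility shim (an assumption, not part of the patch) illustrates the migration: the only signature difference visible in the bindings is the new freq-factors tensor c, so forwarding with c=None preserves the old behaviour.

import ggml

def rope_custom_compat(ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx,
                       freq_base, freq_scale, ext_factor, attn_factor,
                       beta_fast, beta_slow):
    # c=None -> NULL frequency factors, i.e. the previous ggml_rope_custom behaviour
    return ggml.ggml_rope_ext(ctx, a, b, None, n_dims, mode, n_ctx, n_orig_ctx,
                              freq_base, freq_scale, ext_factor, attn_factor,
                              beta_fast, beta_slow)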
+ +# // compute correction dims for YaRN RoPE scaling +# GGML_CALL void ggml_rope_yarn_corr_dims( +# int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); +@ggml_function( + "ggml_rope_yarn_corr_dims", + [ + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.POINTER(ctypes.c_float), + ], + None, +) +def ggml_rope_yarn_corr_dims( + n_dims: Union[ctypes.c_int, int], + n_orig_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + beta_fast: Union[ctypes.c_float, float], + beta_slow: Union[ctypes.c_float, float], + dims: CtypesArray[ctypes.c_float], + /, +) -> None: + """Compute correction dims for YaRN RoPE scaling""" ... @@ -5011,6 +5115,7 @@ def ggml_rope_xpos_inplace( # struct ggml_context * ctx, # struct ggml_tensor * a, # struct ggml_tensor * b, +# struct ggml_tensor * c, # int n_dims, # int mode, # int n_ctx, @@ -5029,6 +5134,7 @@ def ggml_rope_xpos_inplace( ggml_context_p_ctypes, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), ctypes.c_int, ctypes.c_int, ctypes.c_int, @@ -5048,6 +5154,7 @@ def ggml_rope_back( ctx: ggml_context_p, a: ggml_tensor_p, b: ggml_tensor_p, + c: ggml_tensor_p, n_dims: Union[ctypes.c_int, int], mode: Union[ctypes.c_int, int], n_ctx: Union[ctypes.c_int, int], @@ -5822,33 +5929,6 @@ def ggml_top_k( ... -# GGML_API struct ggml_tensor * ggml_flash_attn( -# struct ggml_context * ctx, -# struct ggml_tensor * q, -# struct ggml_tensor * k, -# struct ggml_tensor * v, -# bool masked); -@ggml_function( - "ggml_flash_attn", - [ - ggml_context_p_ctypes, - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.c_bool, - ], - ctypes.POINTER(ggml_tensor), -) -def ggml_flash_attn( - ctx: ggml_context_p, - q: ggml_tensor_p, - k: ggml_tensor_p, - v: ggml_tensor_p, - masked: Union[ctypes.c_bool, bool], - /, -) -> ggml_tensor_p: - ... - #define GGML_KQ_MASK_PAD 32 GGML_KQ_MASK_PAD = 32 @@ -5905,6 +5985,7 @@ def ggml_flash_attn_ext_set_prec( ... +# // TODO: needs to be adapted to ggml_flash_attn_ext # GGML_API struct ggml_tensor * ggml_flash_attn_back( # struct ggml_context * ctx, # struct ggml_tensor * q, @@ -5936,37 +6017,6 @@ def ggml_flash_attn_back( ... -# GGML_API struct ggml_tensor * ggml_flash_ff( -# struct ggml_context * ctx, -# struct ggml_tensor * a, -# struct ggml_tensor * b0, -# struct ggml_tensor * b1, -# struct ggml_tensor * c0, -# struct ggml_tensor * c1); -@ggml_function( - "ggml_flash_ff", - [ - ggml_context_p_ctypes, - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ctypes.POINTER(ggml_tensor), - ], - ctypes.POINTER(ggml_tensor), -) -def ggml_flash_ff( - ctx: ggml_context_p, - a: ggml_tensor_p, - b0: ggml_tensor_p, - b1: ggml_tensor_p, - c0: ggml_tensor_p, - c1: ggml_tensor_p, - /, -) -> ggml_tensor_p: - ... - - # GGML_API struct ggml_tensor * ggml_ssm_conv( # struct ggml_context * ctx, # struct ggml_tensor * s, @@ -8684,6 +8734,12 @@ def ggml_cpu_has_avx512_vnni() -> int: ... +# GGML_API int ggml_cpu_has_avx512_bf16(void); +@ggml_function("ggml_cpu_has_avx512_bf16", [], ctypes.c_int) +def ggml_cpu_has_avx512_bf16() -> int: + ... + + # GGML_API int ggml_cpu_has_fma (void); @ggml_function("ggml_cpu_has_fma", [], ctypes.c_int) def ggml_cpu_has_fma() -> int: @@ -8696,6 +8752,12 @@ def ggml_cpu_has_neon() -> int: ... 
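
A short sketch of calling ggml_rope_yarn_corr_dims through the relocated binding above. The output buffer is a two-element ctypes float array, matching dims[2] in the C signature; the numeric inputs are illustrative only.

import ctypes
import ggml

dims = (ctypes.c_float * 2)()
ggml.ggml_rope_yarn_corr_dims(
    128,      # n_dims
    4096,     # n_orig_ctx
    10000.0,  # freq_base
    32.0,     # beta_fast
    1.0,      # beta_slow
    dims,
)
print(list(dims))  # [low, high] correction dims for YaRN RoPE scaling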
+# GGML_API int ggml_cpu_has_sve (void); +@ggml_function("ggml_cpu_has_sve", [], ctypes.c_int) +def ggml_cpu_has_sve() -> int: + ... + + # GGML_API int ggml_cpu_has_arm_fma (void); @ggml_function("ggml_cpu_has_arm_fma", [], ctypes.c_int) def ggml_cpu_has_arm_fma() -> int: @@ -10789,6 +10851,22 @@ def ggml_backend_cuda_unregister_host_buffer( ... +# GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); +@ggml_function( + "ggml_backend_cuda_log_set_callback", + [ + ggml_log_callback, + ctypes.c_void_p, + ], + None, + enabled=GGML_USE_CUDA, +) +def ggml_backend_cuda_log_set_callback( + log_callback, user_data: Union[ctypes.c_void_p, int, None], / # type: ignore +): + ... + + ##################################################### # GGML METAL API # source: src/ggml-metal.h @@ -11270,7 +11348,7 @@ def ggml_backend_vk_host_buffer_type() -> Optional[ggml_backend_buffer_type_t]: ##################################################### # GGML Vulkan API -# source: src/ggml-vulkan.h +# source: src/ggml-rpc.h ##################################################### diff --git a/vendor/ggml b/vendor/ggml index 7cf94a2..2aae01f 160000 --- a/vendor/ggml +++ b/vendor/ggml @@ -1 +1 @@ -Subproject commit 7cf94a2bb99eecfe7f55fa80e19b89e00bf7fe4d +Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c
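
A small sketch exercising the CPU feature checks added by this patch (ggml_cpu_has_avx512_bf16, ggml_cpu_has_sve) alongside a few existing ones; each binding takes no arguments and returns 0 or 1.

import ggml

features = {
    "avx512_vnni": ggml.ggml_cpu_has_avx512_vnni(),
    "avx512_bf16": ggml.ggml_cpu_has_avx512_bf16(),
    "fma": ggml.ggml_cpu_has_fma(),
    "neon": ggml.ggml_cpu_has_neon(),
    "sve": ggml.ggml_cpu_has_sve(),
}
for name, present in features.items():
    print(f"{name}: {'yes' if present else 'no'}")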