Skip to content

Commit

Permalink
Merge pull request #1 from MagnusS0/noramistral-tokenizer
Browse files Browse the repository at this point in the history
feat: add compatibility with noramistral
  • Loading branch information
MagnusS0 authored Jun 23, 2024
2 parents 6a2f298 + 921e2c3 commit 4da25c0
Show file tree
Hide file tree
Showing 27 changed files with 1,359 additions and 10 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-normistral-7b-warm.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
Expand Down
1 change: 1 addition & 0 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "normistral-7b-warm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/norallm/normistral-7b-warm", },
]


Expand Down
9 changes: 3 additions & 6 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,18 +453,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
Expand All @@ -483,6 +477,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "a3ab9069a4c073804dfd16a852e6a0776cba5a46402ec3c7325851b57e0c4869":
# ref: https://huggingface.co/norallm/normistral-7b-warm
res = "normistral-7b-warm"

if res is None:
logger.warning("\n")
Expand Down
1 change: 0 additions & 1 deletion kompute
Submodule kompute deleted from 456519
12 changes: 12 additions & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4899,6 +4899,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else if (
tokenizer_pre == "normistral-7b-warm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_NORMISTRAL;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -13297,6 +13300,15 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_NORMISTRAL:
regex_exprs = {
"[^\\S ]{1}",
" {1}\\S+",
" {0,1}\\d{1}",
" {0,1}[^\\sA-Za-z0-9À-ÿĀ-ſḀ-ỿ]{1}",
" {2,8}",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_NORMISTRAL = 16,
};

// note: these values should be synchronized with ggml_rope
Expand Down
106 changes: 106 additions & 0 deletions models/ggml-vocab-jina-v2-code.gguf.inp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__


__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__

=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__











🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
43 changes: 43 additions & 0 deletions models/ggml-vocab-jina-v2-code.gguf.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
5187 879 59261 21535
42 6395 3776 266

225
261
264
202
203
420
3712
11208
10564 7550
28137 7550
10564 10288
28137 10288
28137 10288 5
10564 16 7550 5
28137 16 7550 5
472 453 9919 104 252 18 3029
91 13392 1577 54321 19498 364 46363 8437
36655 13633 1769 14501 54827 21893 3849 10107 13878 41078
13065 227 50218 13065 246 25763 238 13065 242 25763 229 13065 249 13065 120 13065 258 25763 228 13065 258 13065 100 50218 13065 232 13065 228 13065 254 13065 232 25763 228 13065 236
8000 253 227 301 4411 13 9919 251 119 2965 240 8000 239 109 26726 301 10186 3520 23869 302 45604 13 12284 255 232 301 2895 53752 810 1533 2920 4613 3565 13
10564
28137
225 28137
261 28137
264 28137
264 28137 287 28137
301
203 278
11 225 3742
10564 16 711 11 474 5 8294 1021 1212 9919 251 228 959 10133 23692 5928 9173 33543 1330 1254 13567 22873 44634 257
23
1103
9581
3303
20428
13652
3303 9581
8274
8274 23
319 655 7239 11489 274 6881 12642 16716 203 8000 253 227 301 4411 13 9919 251 119 2965 240 8000 239 109 26726 301 10186 3520 23869 302 45604 13 12284 255 232 9919 104 252 8000 104 252 795 8104 38292 795 9581 795 3303 795 20428 795 13652 795 3303 9581 795 18 23 795 419 23 795 1713 23 225 13065 227 50218 13065 246 25763 238 13065 242 25763 229 13065 249 13065 120 13065 258 25763 228 13065 258 13065 100 50218 13065 232 8000 251 228 959 10133 23692 5928 9173 33543 1330 1254 13567 22873 44634 257 36031 12434 16706 13633 1769 14501 54827 21893 3849 10107 13878 41078 7095 9107 30834 2678 1246 1246 40651 13911 5366 23681 7887 527 10105 3081 363 88 1505 1063 1476 2866 16 363 495 1212 4509 35 363 49 691 4509 527 9104 2554 605 16 363 40 1212 4156 2681 4594 69 35 2893 11 30247 323 11 80 48
106 changes: 106 additions & 0 deletions models/ggml-vocab-jina-v2-de.gguf.inp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__


__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__

=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__











🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
43 changes: 43 additions & 0 deletions models/ggml-vocab-jina-v2-de.gguf.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
1009 699 35137 3294
39832 261

225
6733
53448
202
203
203 203
203 203 203
202 203
17964 1568
29546 1568
17964 3519
29546 3519
29546 3519 5
17964 16 1568 5
29546 16 1568 5
555 337 5060 104 252 18 71 428
91 3079 28 964 30274 48013 267 87 6649 14811
9024 6983 146 236 6294 52261 4933 244 146 237 13905 32390 46632 51078
162 257 227 162 257 119 162 257 246 162 258 238 162 257 242 162 258 229 162 257 249 162 257 120 162 257 258 162 258 228 162 257 258 162 257 100 162 257 119 162 257 232 162 257 228 162 257 254 162 257 232 162 258 228 162 257 236
3753 253 227 406 17453 13 10278 119 54678 3753 239 109 16598 406 52806 1504 5752 78 276 2365 851 697 13 38607 406 20529 5752 12069 413 671 983 1469 30658 13
17964
29546
225 29546
6733 29546
53448 29546
53448 29546 203 53448 29546
406
203 3887
11 15453
17964 16 361 11 476 5 1953 459 426 10278 228 4985 167 235 244 167 230 116 57520 106 33974 166 120 103 46520 255 2281 2237 42047 47551 107 176 126 257
23
3837
45768
3837 3837
3837 45768
3837 3837 3837
3837 3837 45768
3837 3837 3837 3837
3837 3837 3837 45768
203 225 203 203 225 203 203 203 225 202 225 202 202 225 202 203 6733 203 53448 203 13607 203 13607 225 203 3753 253 227 406 17453 13 10278 119 54678 3753 239 109 16598 406 52806 1504 5752 78 276 2365 851 697 13 38607 5060 104 252 3753 104 252 589 8235 54381 589 45768 54381 3837 54381 45768 54381 3837 3837 54381 3837 45768 589 18 23 589 466 23 589 714 23 34376 257 227 162 257 119 162 257 246 162 258 238 162 257 242 162 258 229 162 257 249 162 257 120 162 257 258 162 258 228 162 257 258 162 257 100 162 257 119 162 257 232 32164 228 4985 167 235 244 167 230 116 57520 106 33974 166 120 103 46520 255 2281 2237 42047 47551 107 176 126 257 485 6624 17 30007 14589 33 36028 6983 146 236 6294 52261 4933 244 146 237 13905 32390 46632 51078 1268 12228 12228 11 51396 51396 51396 68 30699 30699 21828 11344 1844 20800 4300 324 1990 927 1268 88 939 540 507 899 16 1268 3136 426 2158 35 1268 49 586 2158 324 2202 1066 436 16 1268 40 426 917 822 11788 35 628 11 30868 264 11 80 48
Loading

0 comments on commit 4da25c0

Please sign in to comment.