Skip to content

Commit

Permalink
wpm : portable unicode tolower (#6305)
Browse files Browse the repository at this point in the history
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
  • Loading branch information
cebtenzzre authored Mar 26, 2024
1 parent 557410b commit 32c8486
Show file tree
Hide file tree
Showing 9 changed files with 1,699 additions and 1,425 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,7 @@ add_library(llama
llama.h
unicode.h
unicode.cpp
unicode-data.cpp
)

target_include_directories(llama PUBLIC .)
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@

OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@

OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
Expand Down
1 change: 1 addition & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ let package = Package(
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
Expand Down
15 changes: 8 additions & 7 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
Expand All @@ -127,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");

_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });

const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
Expand Down
22 changes: 8 additions & 14 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
Expand All @@ -71,7 +72,6 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cwctype>
#include <forward_list>
#include <fstream>
#include <functional>
Expand Down Expand Up @@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
continue;
}
code = to_lower(code);
code = unicode_tolower(code);
if (type == CODEPOINT_TYPE_WHITESPACE) {
code = ' ';
}
Expand All @@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
std::vector<std::string> words;
while (r < new_str.size()) {
// if is whitespace
if (isspace(new_str[r])) {
if (isspace(new_str[r], std::locale::classic())) {
if (r > l) words.push_back(new_str.substr(l, (r - l)));
l = r + 1;
r = l;
Expand All @@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
return words;
}

uint32_t to_lower(uint32_t code) {
static const std::locale locale("en_US.UTF-8");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;
}
#endif
return std::tolower(wchar_t(code), locale);
}

bool is_ascii_punct(uint32_t code) {
return code < 256 && ispunct(code);
if (code > 0xFF) {
return false;
}
auto c = char(static_cast<unsigned char>(code));
return ispunct(c, std::locale::classic());
}

bool is_chinese_char(uint32_t cpt) {
Expand Down
Loading

0 comments on commit 32c8486

Please sign in to comment.