From d4ad31a771a4f0900444d1da6815c553e3478306 Mon Sep 17 00:00:00 2001
From: zakkor
Date: Fri, 17 Nov 2023 17:36:44 +0200
Subject: [PATCH] examples : add tokenize (#4039)

---
 examples/CMakeLists.txt          |  1 +
 examples/tokenize/CMakeLists.txt |  5 ++++
 examples/tokenize/tokenize.cpp   | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 examples/tokenize/CMakeLists.txt
 create mode 100644 examples/tokenize/tokenize.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 75b8df676c52b..71bcb6893e20d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,6 +24,7 @@ else()
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
     add_subdirectory(main)
+    add_subdirectory(tokenize)
     add_subdirectory(parallel)
     add_subdirectory(perplexity)
     add_subdirectory(quantize)
diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt
new file mode 100644
index 0000000000000..5e6654d7e5988
--- /dev/null
+++ b/examples/tokenize/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET tokenize)
+add_executable(${TARGET} tokenize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
new file mode 100644
index 0000000000000..72b60f9a45808
--- /dev/null
+++ b/examples/tokenize/tokenize.cpp
@@ -0,0 +1,44 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    if (argc < 3 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
+        return 1;
+    }
+
+    auto model_path = argv[1];
+    auto prompt     = argv[2];
+
+    const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
+
+    llama_backend_init(false);
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.vocab_only = true;
+    llama_model * model = llama_load_model_from_file(model_path, model_params);
+
+    llama_context_params ctx_params = llama_context_default_params();
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    const bool add_bos = true;
+
+    std::vector<llama_token> tokens;
+
+    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+
+    for (int i = 0; i < (int) tokens.size(); i++) {
+        if (printing_ids) {
+            printf("%d\n", tokens[i]);
+        } else {
+            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+        }
+    }
+
+    return 0;
+}