From 9a7fcae43ee2fd04fddcda597fa777b92f3d5471 Mon Sep 17 00:00:00 2001 From: jwnz Date: Tue, 8 Oct 2024 15:23:53 +0900 Subject: [PATCH] adding tokenizer configuration (#19) ### Description This PR aims to add some better functionality for configuration of the tokenizer (#5). ### Todo - [x] Allow for selection of huggingface tokenizer. Downloads the model from the hub. - [x] Jieba tokenizer (chinese) - [x] tiktoken - [x] tiniestsegmenter (japanese) [optional] - [x] Allow switching of HF tokenizers - [x] Add Tests for tokenizing ### Notes ~~1. Currently, if a HuggingFace tokenizer is initialized, the tokenizer cannot be changed. e.g. Doing `SELECT tokenize('i have an apple', 'hf', 'sentence-transformers/LaBSE');` after `SELECT tokenize('i have an apple', 'hf', 'google-t5/t5-base');` would just use the LaBSE tokenizer as it has already been initialized. Need a better way to handle this.~~ ~~2. There is no official rust crate for tiktoken.~~ Used [tiktoken-rs]( https://github.com/zurawiki/tiktoken-rs ) ### Usage ```SQL SELECT tokenize('i have an apple', 'hf', 'google-bert/bert-base-uncased'); SELECT tokenize('i have an apple', 'hf', 'google-t5/t5-base'); SELECT tokenize('i have an apple', 'tiktoken', 'gpt2'); SELECT tokenize('i have an apple', 'tiniestsegmenter', ''); SELECT tokenize('i have an apple', 'jieba', ''); SELECT tokenize('i have an apple', 'ws', ''); ``` --------- Signed-off-by: jwnz --- Cargo.lock | 970 ++++++++++++++++++++++++++++++++++++++----- Cargo.toml | 4 + src/lib.rs | 23 +- src/sql/finalize.sql | 18 +- src/tokenizer/mod.rs | 278 +++++++++++++ 5 files changed, 1158 insertions(+), 135 deletions(-) create mode 100644 src/tokenizer/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 05137a6..7e9be00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "aho-corasick" version = "1.1.3" @@ -13,9 +19,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "atomic-traits" @@ -38,19 +44,37 @@ dependencies = [ "winapi", ] +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + [[package]] name = "base64" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bindgen" version = "0.69.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -64,6 +88,21 @@ dependencies = [ "syn", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -72,9 +111,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "bitvec" @@ -88,6 +127,17 @@ dependencies = [ "wyz", ] +[[package]] +name = "bstr" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -106,9 +156,21 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.98" +version = "1.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07b1695e2c7e8fc85310cde85aeaab7e3097f593c91d209d3f9df76c928100f0" +dependencies = [ + "shlex", +] + +[[package]] +name = "cedarwood" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] [[package]] name = "cee-scape" @@ -146,6 +208,19 @@ dependencies = [ "libloading", ] +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.52.0", +] + [[package]] name = "convert_case" version = "0.6.0" @@ -155,6 +230,31 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -182,9 +282,9 @@ checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "darling" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -192,9 +292,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", @@ -206,9 +306,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", @@ -217,18 +317,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" dependencies = [ "darling", "proc-macro2", @@ -238,19 +338,46 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core", "syn", ] +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + [[package]] name = "either" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "enum-map" @@ -278,6 +405,16 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "esaxx-rs" version = "0.1.10" @@ -294,18 +431,59 @@ dependencies = [ "once_cell", ] +[[package]] +name = "fancy-regex" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05" +dependencies = [ + "bit-set", + "regex", +] + +[[package]] +name = "fastrand" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" + [[package]] name = "fixedbitset" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flate2" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -321,6 +499,15 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -378,13 +565,30 @@ dependencies = [ "libc", ] +[[package]] +name = "hf-hub" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +dependencies = [ + "dirs", + "indicatif", + "log", + "native-tls", + "rand", + "serde", + "serde_json", + "thiserror", + "ureq", +] + [[package]] name = "home" version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -411,14 +615,36 @@ checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" [[package]] name = "indexmap" -version = "2.2.6" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "is_ci" version = "1.2.0" @@ -449,11 +675,26 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jieba-rs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1e2b0210dc78b49337af9e49d7ae41a39dceac6e5985613f1cf7763e2f76a25" +dependencies = [ + "cedarwood", + "derive_builder", + "fxhash", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lazycell" @@ -463,25 +704,51 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.155" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libloading" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.6.0", + "libc", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", ] [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "macro_rules_attribute" @@ -501,9 +768,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "minimal-lexical" @@ -511,6 +778,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "monostate" version = "0.1.13" @@ -532,6 +808,23 @@ dependencies = [ "syn", ] +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nom" version = "7.1.3" @@ -542,6 +835,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" version = "1.19.0" @@ -570,6 +869,56 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "openssl" +version = "0.10.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "owo-colors" version = "3.5.0" @@ -579,6 +928,29 @@ dependencies = [ "supports-color", ] +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", +] + [[package]] name = "paste" version = "1.0.15" @@ -603,9 +975,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" -version = "2.7.10" +version = "2.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "560131c633294438da9f7c4b08189194b20946c8274c6b9e38881a7874dc8ee8" +checksum = "fdbef9d1d47087a895abd220ed25eb4ad973a5e26f6a4367b038c25e28dfc2d9" dependencies = [ "memchr", "thiserror", @@ -626,9 +998,12 @@ dependencies = [ name = "pg_bestmatch" version = "0.0.0" dependencies = [ + "jieba-rs", "lazy_static", "pgrx", "rand", + "tiktoken-rs", + "tiniestsegmenter", "tokenizers", ] @@ -638,7 +1013,7 @@ version = "0.12.0-alpha.1" source = "git+https://github.com/tensorchord/pgrx.git?branch=v0.12.0-alpha.1-patch2#cd5a003ee34687c65b37c30a93bbda1c71f761ad" dependencies = [ "atomic-traits", - "bitflags 2.5.0", + "bitflags 2.6.0", "bitvec", "enum-map", "heapless", @@ -721,32 +1096,79 @@ dependencies = [ "unescape", ] +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pkg-config" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "portable-atomic" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d30538d42559de6b034bc76fd6dd4c38961b1ee5c6c56e3808c50128fdbc22ce" + [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" -version = "1.0.84" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -818,11 +1240,31 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355ae415ccd3a04315d3f8246e86d67689ea74d88d915576e1589a351062a13b" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + [[package]] name = "regex" -version = "1.10.4" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -832,9 +1274,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", @@ -843,9 +1285,24 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "ring" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] [[package]] name = "rustc-hash" @@ -862,6 +1319,51 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" + +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "ryu" version = "1.0.18" @@ -877,12 +1379,50 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "seahash" version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "0.11.0" @@ -903,9 +1443,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.203" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] @@ -922,9 +1462,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", @@ -933,20 +1473,21 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.117" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] [[package]] name = "serde_spanned" -version = "0.6.6" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d" dependencies = [ "serde", ] @@ -957,19 +1498,31 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "smallvec" version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "spm_precompiled" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" dependencies = [ - "base64", + "base64 0.13.1", "nom", "serde", "unicode-segmentation", @@ -993,6 +1546,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "supports-color" version = "1.3.1" @@ -1005,9 +1564,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.66" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -1020,31 +1579,65 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tempfile" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "tiktoken-rs" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234" +dependencies = [ + "anyhow", + "base64 0.21.7", + "bstr", + "fancy-regex", + "lazy_static", + "parking_lot", + "rustc-hash", +] + +[[package]] +name = "tiniestsegmenter" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f39721481fb54f0e9f3a1da5a6ac6063c61ec62ec828cd5e1860acce9458f40" + [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -1065,6 +1658,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom", + "hf-hub", "itertools 0.12.1", "lazy_static", "log", @@ -1088,9 +1682,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.13" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" dependencies = [ "serde", "serde_spanned", @@ -1100,18 +1694,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.22.13" +version = "0.22.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c" +checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" dependencies = [ "indexmap", "serde", @@ -1140,15 +1734,15 @@ checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -1164,9 +1758,15 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode_categories" @@ -1174,11 +1774,36 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "native-tls", + "once_cell", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "url", + "webpki-roots", +] + [[package]] name = "url" -version = "2.5.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna", @@ -1187,13 +1812,19 @@ dependencies = [ [[package]] name = "uuid" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "walkdir" version = "2.5.0" @@ -1210,6 +1841,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "webpki-roots" +version = "0.26.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1228,11 +1868,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1241,84 +1881,159 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.5" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.9" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86c949fede1d13936a99f14fafd3e76fd642b556dd2ce96287fbe2e0151bfac6" +checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" dependencies = [ "memchr", ] @@ -1331,3 +2046,30 @@ checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" dependencies = [ "tap", ] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/Cargo.toml b/Cargo.toml index 08ed331..b045541 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,10 +18,14 @@ pg15 = ["pgrx/pg15"] pg16 = ["pgrx/pg16"] [dependencies] +jieba-rs = "0.7.0" lazy_static = "1.4.0" pgrx = { version = "=0.12.0-alpha.1", default-features = false, features = [] } rand = "0.8.5" +"tiktoken-rs" = "0.5.9" +tiniestsegmenter = "0.3.0" tokenizers = { version = "0.19.1", default-features = false, features = [ + "http", "onig", ] } diff --git a/src/lib.rs b/src/lib.rs index 11024c0..3c3ed53 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +mod tokenizer; + pgrx::pg_module_magic!(); pgrx::extension_sql_file!("./sql/finalize.sql", finalize); @@ -17,18 +19,9 @@ compiler_error!("PostgreSQL version must be selected."); #[pgrx::pg_guard] unsafe extern "C" fn _PG_init() {} -static BERT_BASE_UNCASED_BYTES: &[u8] = include_bytes!("../tokenizer/bert_base_uncased.json"); -lazy_static::lazy_static! { - static ref BERT_BASE_UNCASED: tokenizers::Tokenizer = tokenizers::Tokenizer::from_bytes(BERT_BASE_UNCASED_BYTES).unwrap(); -} - #[pgrx::pg_extern(immutable, strict, parallel_safe)] -pub fn tokenize(t: &str) -> Vec { - BERT_BASE_UNCASED - .encode(t, false) - .expect("failed to tokenize") - .get_tokens() - .to_vec() +pub fn tokenize(t: &str, tokenizer: &str, model: Option<&str>) -> Vec { + tokenizer::tokenize(tokenizer, model, t) } #[derive(Debug)] @@ -52,9 +45,11 @@ pub fn bm25_document_to_svector_internal( dims: i32, t: &str, style: &str, + tokenizer: &str, + model: Option<&str>, ) -> String { use std::collections::BTreeMap; - let tokens = tokenize(t); + let tokens = tokenize(t, tokenizer, model); let mut x = BTreeMap::::new(); unsafe { use pgrx::pg_sys::*; @@ -150,9 +145,11 @@ pub fn bm25_query_to_svector_internal( dims: i32, t: &str, style: &str, + tokenizer: &str, + model: Option<&str>, ) -> String { use std::collections::BTreeMap; - let tokens = tokenize(t); + let tokens = tokenize(t, tokenizer, model); let mut x = BTreeMap::::new(); unsafe { use pgrx::pg_sys::*; diff --git a/src/sql/finalize.sql b/src/sql/finalize.sql index 5ecf64d..98ff73c 100644 --- a/src/sql/finalize.sql +++ b/src/sql/finalize.sql @@ -11,13 +11,15 @@ CREATE TABLE pg_bm25( -- props b REAL NOT NULL, k1 REAL NOT NULL, + tokenizer TEXT NOT NULL, + model TEXT NOT NULL, -- cached words INT NOT NULL, docs INT NOT NULL, dims INT NOT NULL ); -CREATE FUNCTION bm25_create(tab regclass, col TEXT, mat TEXT, b REAL DEFAULT 0.75, k1 REAL DEFAULT 1.2) RETURNS VOID AS $fn$ +CREATE FUNCTION bm25_create(tab regclass, col TEXT, mat TEXT, tokenizer TEXT DEFAULT 'hf', model TEXT DEFAULT 'google-bert/bert-base-uncased', b REAL DEFAULT 0.75, k1 REAL DEFAULT 1.2) RETURNS VOID AS $fn$ DECLARE test TEXT; ins_words INT; @@ -31,7 +33,7 @@ BEGIN EXECUTE format($$ CREATE MATERIALIZED VIEW %s AS WITH - inputs AS (SELECT bm_catalog.tokenize(%s) AS input FROM %s), + inputs AS (SELECT bm_catalog.tokenize(%s, %L, %L) AS input FROM %s), tokens AS (SELECT unnest(input)::NAME COLLATE "C" AS token FROM inputs GROUP BY token ORDER BY token), compute_how_many_tokens AS ( SELECT unnest(input)::NAME COLLATE "C" AS t, count(*)::INT AS how_many_tokens @@ -56,12 +58,12 @@ BEGIN JOIN compute_token_in_how_many_inputs ON compute_token_in_how_many_inputs.t = token CROSS JOIN var_docs; CREATE INDEX %s_index ON %s(token); - $$, mat, col, tab, mat, mat); + $$, mat, col, tokenizer, model, tab, mat, mat); EXECUTE format('SELECT sum(how_many_tokens) FROM %s', mat) INTO ins_words; EXECUTE format('SELECT count(%s) FROM %s', col, tab) INTO ins_docs; EXECUTE format('SELECT count(*) FROM %s', mat) INTO ins_dims; INSERT INTO bm_catalog.pg_bm25 - VALUES (tab, col, mat::regclass, (mat::text || '_index')::regclass, b, k1, ins_words, ins_docs, ins_dims); + VALUES (tab, col, mat::regclass, (mat::text || '_index')::regclass, b, k1, tokenizer, model, ins_words, ins_docs, ins_dims); END; $fn$ LANGUAGE plpgsql; @@ -92,7 +94,7 @@ BEGIN END; $fn$ LANGUAGE plpgsql; -CREATE FUNCTION bm25_document_to_svector(mat regclass, t TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$ +CREATE FUNCTION bm25_document_to_svector(mat regclass, t TEXT, tokenizer TEXT, model TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$ DECLARE idx regclass; p_b REAL; @@ -102,16 +104,16 @@ DECLARE p_dims INT; BEGIN SELECT indexrelid, b, k1, words, docs, dims INTO idx, p_b, p_k1, p_words, p_docs, p_dims FROM bm_catalog.pg_bm25 WHERE matrelid = mat; - RETURN bm_catalog.bm25_document_to_svector_internal(mat::oid, idx::oid, p_b, p_k1, p_words, p_docs, p_dims, t, style); + RETURN bm_catalog.bm25_document_to_svector_internal(mat::oid, idx::oid, p_b, p_k1, p_words, p_docs, p_dims, t, style, tokenizer, model); END; $fn$ LANGUAGE plpgsql; -CREATE FUNCTION bm25_query_to_svector(mat regclass, t TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$ +CREATE FUNCTION bm25_query_to_svector(mat regclass, t TEXT, tokenizer TEXT, model TEXT, style TEXT DEFAULT 'pgvecto.rs') RETURNS text STABLE STRICT PARALLEL SAFE AS $fn$ DECLARE idx regclass; p_dims INT; BEGIN SELECT indexrelid, dims INTO idx, p_dims FROM bm_catalog.pg_bm25 WHERE matrelid = mat; - RETURN bm_catalog.bm25_query_to_svector_internal(mat::oid, idx::oid, p_dims, t, style); + RETURN bm_catalog.bm25_query_to_svector_internal(mat::oid, idx::oid, p_dims, t, style, tokenizer, model); END; $fn$ LANGUAGE plpgsql; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs new file mode 100644 index 0000000..82ab483 --- /dev/null +++ b/src/tokenizer/mod.rs @@ -0,0 +1,278 @@ +use std::{ + collections::{HashMap, HashSet}, + sync::{Mutex, OnceLock}, +}; + +struct WhitespaceTokenizer; + +struct HFTokenizer { + tokenizer: tokenizers::Tokenizer, +} + +struct JiebaTokenizer { + jeiba: jieba_rs::Jieba, +} + +struct TiniestsegmenterTokenizer; + +struct TiktokenTokenizer { + tokenizer: tiktoken_rs::CoreBPE, +} + +impl JiebaTokenizer { + fn new() -> JiebaTokenizer { + JiebaTokenizer { + jeiba: jieba_rs::Jieba::new(), + } + } +} + +impl HFTokenizer { + pub fn new(model: &str) -> HFTokenizer { + HFTokenizer { + tokenizer: tokenizers::tokenizer::Tokenizer::from_pretrained(model, None).unwrap(), + } + } +} + +impl TiktokenTokenizer { + fn new(model: &str) -> TiktokenTokenizer { + let selected_model = tiktoken_rs::tokenizer::get_tokenizer(model) + .map(|tokenizer| match tokenizer { + tiktoken_rs::tokenizer::Tokenizer::O200kBase => "o200k_base", + tiktoken_rs::tokenizer::Tokenizer::Cl100kBase => "cl100k_base", + tiktoken_rs::tokenizer::Tokenizer::P50kBase => "p50k_base", + tiktoken_rs::tokenizer::Tokenizer::P50kEdit => "p50k_edit", + tiktoken_rs::tokenizer::Tokenizer::R50kBase => "r50k_base", + tiktoken_rs::tokenizer::Tokenizer::Gpt2 => "gpt2", + }) + .unwrap_or(model); + + TiktokenTokenizer { + tokenizer: match selected_model { + "o200k_base" => tiktoken_rs::o200k_base().unwrap(), + "cl100k_base" => tiktoken_rs::cl100k_base().unwrap(), + "p50k_base" => tiktoken_rs::p50k_base().unwrap(), + "p50k_edit" => tiktoken_rs::p50k_edit().unwrap(), + "r50k_base" | "gpt2" => tiktoken_rs::r50k_base().unwrap(), + _ => panic!("Unknown model"), + }, + } + } +} + +pub trait Tokenize { + // default, just tokenize on whitespace + fn tokenize(&self, s: &str) -> Vec { + s.split_whitespace().map(|s| s.to_string()).collect() + } +} + +impl Tokenize for WhitespaceTokenizer {} + +impl Tokenize for HFTokenizer { + fn tokenize(&self, s: &str) -> Vec { + self.tokenizer + .encode(s, false) + .expect("failed to tokenize") + .get_tokens() + .to_vec() + } +} + +impl Tokenize for JiebaTokenizer { + fn tokenize(&self, s: &str) -> Vec { + self.jeiba + .cut(s, true) + .iter() + .map(|s| s.to_string()) + .collect() + } +} + +impl Tokenize for TiniestsegmenterTokenizer { + fn tokenize(&self, s: &str) -> Vec { + tiniestsegmenter::tokenize(s) + .iter() + .map(|s| s.to_string()) + .collect() + } +} + +impl Tokenize for TiktokenTokenizer { + fn tokenize(&self, s: &str) -> Vec { + self.tokenizer + .encode(s, HashSet::new()) + .into_iter() + .map(|s| s.to_string()) + .collect() + } +} + +type PostgresTokenizer = Box; +type MultiOL = OnceLock>>; +type SingleOL = OnceLock; + +static HF_TOKENIZER: MultiOL = OnceLock::new(); +static TIKTOKEN_TOKENIZER: MultiOL = OnceLock::new(); +static JIEBA_TOKENIZER: SingleOL = OnceLock::new(); +static TINIESTSEGMENTER_TOKENIZER: SingleOL = OnceLock::new(); +static WHITESPACE_TOKENIZER: SingleOL = OnceLock::new(); + +fn _hashmap_tokenize( + lock: &MultiOL, + model: &str, + new_fn: impl Fn(&str) -> T, + s: &str, +) -> Vec +where + T: Tokenize + Sync + Send + 'static, +{ + let mut lock_guard = lock + .get_or_init(|| Mutex::new(HashMap::new())) + .lock() + // on panic, the mutex gets poisoned, so we need a way to handle it. + .unwrap_or_else(|e: std::sync::PoisonError<_>| e.into_inner()); + + lock_guard + .entry(model.to_string()) + .or_insert_with(|| Box::new(new_fn(model))) + .tokenize(s) +} + +pub fn tokenize(tokenizer: &str, model: Option<&str>, s: &str) -> Vec { + let selected_tokenizer = match tokenizer { + "hf" => { + let selected_model = model.expect("model must be provided for hf tokenizer"); + return _hashmap_tokenize(&HF_TOKENIZER, selected_model, HFTokenizer::new, s); + } + "tiktoken" => { + let selected_model = model.expect("model or encoding must be provided"); + return _hashmap_tokenize( + &TIKTOKEN_TOKENIZER, + selected_model, + TiktokenTokenizer::new, + s, + ); + } + "ws" => WHITESPACE_TOKENIZER.get_or_init(|| Box::new(WhitespaceTokenizer)), + "jieba" => JIEBA_TOKENIZER.get_or_init(|| Box::new(JiebaTokenizer::new())), + "tiniestsegmenter" => { + TINIESTSEGMENTER_TOKENIZER.get_or_init(|| Box::new(TiniestsegmenterTokenizer)) + } + _ => panic!("Unknown tokenizer"), + }; + + selected_tokenizer.tokenize(s) +} + +#[cfg(test)] +mod tests { + + #[test] + fn test_whitespace_tokenizer() { + assert_eq!( + *super::tokenize("ws", None, "i have an apple"), + vec!["i", "have", "an", "apple"] + ); + } + + #[test] + fn test_hftokenizer() { + assert_eq!( + super::tokenize("hf", Some("bert-base-uncased"), "i have an apple"), + vec!["i", "have", "an", "apple"] + ); + + assert_eq!( + super::tokenize("hf", Some("google-t5/t5-base"), "i have an apple"), + vec!["▁", "i", "▁have", "▁an", "▁apple"] + ); + + assert_eq!( + super::tokenize("hf", Some("bert-base-uncased"), "i have an apple"), + vec!["i", "have", "an", "apple"] + ); + } + + #[test] + fn test_jieba() { + assert_eq!( + super::tokenize("jieba", None, "测试版本将于秋季推出。"), + vec!["测试", "版本", "将", "于", "秋季", "推出", "。"] + ); + } + + #[test] + fn test_tiniestsegmenter() { + assert_eq!( + super::tokenize( + "tiniestsegmenter", + None, + "今作の主人公はリンクではなくゼルダ姫" + ), + vec![ + "今作", + "の", + "主人", + "公", + "は", + "リンク", + "で", + "は", + "なく", + "ゼルダ", + "姫" + ] + ); + } + + #[test] + fn test_tiktoken() { + // Test the encodings first + + // o200k_base + assert_eq!( + super::tokenize("tiktoken", Some("o200k_base"), "i want an apple"), + vec!["72", "1682", "448", "30366"] + ); + + // cl100k_base + assert_eq!( + super::tokenize("tiktoken", Some("cl100k_base"), "i want an apple"), + vec!["72", "1390", "459", "24149"] + ); + + // p50k_base + assert_eq!( + super::tokenize("tiktoken", Some("p50k_base"), "i want an apple"), + vec!["72", "765", "281", "17180"] + ); + + // p50k_edit + assert_eq!( + super::tokenize("tiktoken", Some("p50k_edit"), "i want an apple"), + vec!["72", "765", "281", "17180"] + ); + + // r50k_base + assert_eq!( + super::tokenize("tiktoken", Some("r50k_base"), "i want an apple"), + vec!["72", "765", "281", "17180"] + ); + + //gpt2 + assert_eq!( + super::tokenize("tiktoken", Some("gpt2"), "i want an apple"), + vec!["72", "765", "281", "17180"] + ); + } + + // panic + + #[test] + #[should_panic] + fn test_tiktoken_panic() { + super::tokenize("tiktoken", Some("foo"), "i want an apple"); + } +}