Refactor metaspace (#1476)
* version = "0.15.3-dev-0"

Improve the performance of Metaspace, and also fix its behavior. The two script runs below show token output and timings before and then after the change:

(transformers) ➜  transformers git:(refactor-default-llama) βœ— python ../scripts/gemma-dummy.py
Token indices sequence length is longer than the specified maximum sequence length for this model (14999 > 2048). Running this sequence through the model will result in indexing errors
['<REPR_END>', '▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.']
['▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.']
[0.0006330013275146484, 0.0014591217041015625, 0.015890836715698242, 0.18584918975830078, 2.1726326942443848]
(transformers) ➜  transformers git:(refactor-default-llama) βœ— python ../scripts/gemma-dummy.py
Token indices sequence length is longer than the specified maximum sequence length for this model (10000 > 2048). Running this sequence through the model will result in indexing errors
['<REPR_END>', 'in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.']
['in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.']
[0.0008409023284912109, 0.0008909702301025391, 0.00882411003112793, 0.10214710235595703, 1.187899112701416]
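
For illustration, a minimal TypeScript sketch of how the two behaviors above map onto the new Node options (the option semantics in the comments are my reading of this change, not taken from documentation):

    import { metaspacePreTokenizer } from 'tokenizers'

    // split = true: every '▁' opens a new piece, so a run of spaces comes out
    // as a sequence of single '▁' tokens, as in the first output above.
    const splitting = metaspacePreTokenizer('▁', 'always', true)

    // split = false: a run of spaces stays together as '▁▁▁▁▁▁', and with
    // prependScheme = 'first' only the first section gets a leading '▁',
    // so an inner word can come out as 'in', 'form' (second output above).
    const merged = metaspacePreTokenizer('▁', 'first', false)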

* well what do we have

* nit

* be BC with non-legacy

* unrelated change for clippy

* fix test

* splitting is a must for word_ids

* fmt and lint

* Fixing everything (hopefully better).

* Fixing node.

* Including yarn.lock

* Lint.

* Stubs.

* revert to use split

* fix merge issues

* fix tests

* finish fixing tests

* ruff

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
ArthurZucker and Narsil authored Mar 30, 2024
1 parent 6153126 commit 0906971
Showing 21 changed files with 1,660 additions and 1,503 deletions.
12 changes: 10 additions & 2 deletions bindings/node/index.d.ts
@@ -11,7 +11,11 @@ export function ctcDecoder(
   cleanup?: boolean | undefined | null,
 ): Decoder
 export function fuseDecoder(): Decoder
-export function metaspaceDecoder(replacement?: string = '▁', addPrefixSpace?: bool = true): Decoder
+export function metaspaceDecoder(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): Decoder
 export function replaceDecoder(pattern: string, content: string): Decoder
 export function sequenceDecoder(decoders: Array<Decoder>): Decoder
 export function stripDecoder(content: string, left: number, right: number): Decoder
@@ -89,7 +93,11 @@ export function byteLevelAlphabet(): Array<string>
 export function whitespacePreTokenizer(): PreTokenizer
 export function whitespaceSplitPreTokenizer(): PreTokenizer
 export function bertPreTokenizer(): PreTokenizer
-export function metaspacePreTokenizer(replacement?: string = '▁', addPrefixSpace?: bool = true): PreTokenizer
+export function metaspacePreTokenizer(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): PreTokenizer
 export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
 export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
 export function sequencePreTokenizer(preTokenizers: Array<PreTokenizer>): PreTokenizer
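
All three parameters are optional with the defaults shown in the signatures, so existing no-argument calls keep working; note, however, that the second positional argument changed from a boolean (addPrefixSpace) to a string (prependScheme). A small sketch under that reading:

    import { metaspaceDecoder, metaspacePreTokenizer } from 'tokenizers'

    // Equivalent to metaspaceDecoder('▁', 'always', true) per the defaults above.
    const dec = metaspaceDecoder()
    // Arguments are positional: override the scheme, keep split at its default.
    const preTok = metaspacePreTokenizer('▁', 'never')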
37 changes: 37 additions & 0 deletions bindings/node/index.js
@@ -219,6 +219,43 @@ switch (platform) {
            loadError = e
          }
          break
+        case 'riscv64':
+          if (isMusl()) {
+            localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-musl.node'))
+            try {
+              if (localFileExisted) {
+                nativeBinding = require('./tokenizers.linux-riscv64-musl.node')
+              } else {
+                nativeBinding = require('tokenizers-linux-riscv64-musl')
+              }
+            } catch (e) {
+              loadError = e
+            }
+          } else {
+            localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-gnu.node'))
+            try {
+              if (localFileExisted) {
+                nativeBinding = require('./tokenizers.linux-riscv64-gnu.node')
+              } else {
+                nativeBinding = require('tokenizers-linux-riscv64-gnu')
+              }
+            } catch (e) {
+              loadError = e
+            }
+          }
+          break
+        case 's390x':
+          localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-s390x-gnu.node'))
+          try {
+            if (localFileExisted) {
+              nativeBinding = require('./tokenizers.linux-s390x-gnu.node')
+            } else {
+              nativeBinding = require('tokenizers-linux-s390x-gnu')
+            }
+          } catch (e) {
+            loadError = e
+          }
+          break
         default:
           throw new Error(`Unsupported architecture on Linux: ${arch}`)
       }
2 changes: 1 addition & 1 deletion bindings/node/package.json
@@ -1,6 +1,6 @@
 {
   "name": "tokenizers",
-  "version": "0.14.0-dev0",
+  "version": "0.15.3-dev0",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/huggingface/tokenizers.git"
19 changes: 16 additions & 3 deletions bindings/node/src/decoders.rs
@@ -90,19 +90,32 @@ pub fn fuse_decoder() -> Decoder {
 #[napi]
 pub fn metaspace_decoder(
     #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-    #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+    #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+    #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<Decoder> {
-    let add_prefix_space = add_prefix_space.unwrap_or(true);
+    use tk::pre_tokenizers::metaspace::PrependScheme;
+    let split = split.unwrap_or(true);
     let replacement = replacement.unwrap_or("▁".to_string());
     if replacement.chars().count() != 1 {
         return Err(Error::from_reason(
             "replacement is supposed to be a single char",
         ));
     }
     let replacement = replacement.chars().next().unwrap();
+    let prepend_scheme: PrependScheme =
+        match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+            "always" => PrependScheme::Always,
+            "first" => PrependScheme::First,
+            "never" => PrependScheme::Never,
+            _ => {
+                return Err(Error::from_reason(
+                    "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+                ));
+            }
+        };
     Ok(Decoder {
         decoder: Some(Arc::new(RwLock::new(
-            tk::decoders::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+            tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
         ))),
     })
 }
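
On the JavaScript side, both Err paths above surface as thrown errors; a hedged TypeScript sketch (the message text is taken from the diff, the try/catch shape is an assumption):

    import { metaspaceDecoder } from 'tokenizers'

    try {
      // Rejected by the binding: the scheme must be 'always', 'first' or 'never'.
      metaspaceDecoder('▁', 'sometimes')
    } catch (e) {
      console.error(e) // prepend_scheme is supposed to be either 'always', 'first' or 'never'
    }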
19 changes: 16 additions & 3 deletions bindings/node/src/pre_tokenizers.rs
@@ -155,20 +155,33 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
 #[napi]
 pub fn metaspace_pre_tokenizer(
     #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-    #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+    #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+    #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<PreTokenizer> {
-    let add_prefix_space = add_prefix_space.unwrap_or(true);
+    use tk::pre_tokenizers::metaspace::PrependScheme;
+    let split = split.unwrap_or(true);
     let replacement = replacement.unwrap_or("▁".to_string());
     if replacement.chars().count() != 1 {
         return Err(Error::from_reason(
             "replacement is supposed to be a single char",
         ));
     }
     let replacement = replacement.chars().next().unwrap();
+    let prepend_scheme: PrependScheme =
+        match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+            "always" => PrependScheme::Always,
+            "first" => PrependScheme::First,
+            "never" => PrependScheme::Never,
+            _ => {
+                return Err(Error::from_reason(
+                    "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+                ));
+            }
+        };
 
     Ok(PreTokenizer {
         pretok: Some(Arc::new(RwLock::new(
-            tk::pre_tokenizers::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+            tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
         ))),
     })
 }
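
Since the pre-tokenizer and decoder now take the same (replacement, prependScheme, split) triple, one way to keep decoding aligned with pre-tokenization is to build both from the same values; a sketch of assumed typical usage:

    import { metaspacePreTokenizer, metaspaceDecoder } from 'tokenizers'

    const replacement = '▁'
    const scheme = 'always'
    const split = true

    // Mirror the settings so the decoder undoes exactly what the pre-tokenizer did.
    const preTok = metaspacePreTokenizer(replacement, scheme, split)
    const dec = metaspaceDecoder(replacement, scheme, split)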
[Diffs for the remaining 16 changed files not shown.]
