From 81605a2317cbb81095f474022a9e0307880d8001 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Fri, 25 Oct 2024 16:24:27 -0400 Subject: [PATCH] Fixing a multithreading bug in WordpieceTokenizer. (#382) --- .../util/tokens/impl/wordpiece/WordpieceTokenizer.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Util/Tokenization/src/main/java/org/tribuo/util/tokens/impl/wordpiece/WordpieceTokenizer.java b/Util/Tokenization/src/main/java/org/tribuo/util/tokens/impl/wordpiece/WordpieceTokenizer.java index b8b41c115..4b951802d 100644 --- a/Util/Tokenization/src/main/java/org/tribuo/util/tokens/impl/wordpiece/WordpieceTokenizer.java +++ b/Util/Tokenization/src/main/java/org/tribuo/util/tokens/impl/wordpiece/WordpieceTokenizer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2024, Oracle and/or its affiliates. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,7 @@ * and Chinese characters. The resulting tokens are then applied to the * wordpiece algorithm implemented in {@link Wordpiece} which is driven by an * input vocabulary which matches tokens and token suffixes as it can. Any - * tokens that are not found in the input vocbulary are marked as "unknown". + * tokens that are not found in the input vocabulary are marked as "unknown". */ public class WordpieceTokenizer implements Tokenizer { @@ -133,7 +133,7 @@ public boolean advance() { currentToken = this.whitespaceTokenizer.getToken(); getWordpieceTokens(); currentWordpieceIndex = 0; - if (currentWordpieceTokens.size() == 0) { + if (currentWordpieceTokens.isEmpty()) { return advance(); } return true; @@ -181,7 +181,7 @@ private void getWordpieceTokens() { List wordpieces = wordpiece.wordpiece(text); - if (wordpieces.size() == 0) { + if (wordpieces.isEmpty()) { return; } else if (wordpieces.size() == 1) { String wp = wordpieces.get(0); @@ -245,7 +245,7 @@ public WordpieceTokenizer clone() { copy.basicTokenizer = basicTokenizer.clone(); copy.reset = false; copy.currentToken = null; - copy.currentWordpieceTokens.clear(); + copy.currentWordpieceTokens = new ArrayList<>(); copy.currentWordpieceIndex = -1; return copy; } catch (CloneNotSupportedException e) {