Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Plexcalibur authored Jul 18, 2024
2 parents 35c71ab + ae9bd5d commit 799aa55
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-docs-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
run: npm run build

- name: Deploy
uses: peaceiris/actions-gh-pages@v3
uses: peaceiris/actions-gh-pages@v4
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./docs/build
Expand Down
4 changes: 2 additions & 2 deletions benchmark/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pyperf==2.6.3
tiktoken==0.6.0
pyperf==2.7.0
tiktoken==0.7.0
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.9-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
Expand Down
7 changes: 5 additions & 2 deletions gradlew
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
#

##############################################################################
#
Expand Down Expand Up @@ -55,7 +57,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
Expand Down Expand Up @@ -84,7 +86,8 @@ done
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s
' "$PWD" ) || exit

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
Expand Down
2 changes: 2 additions & 0 deletions gradlew.bat
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@rem SPDX-License-Identifier: Apache-2.0
@rem

@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
Expand Down
19 changes: 9 additions & 10 deletions lib/src/main/java/com/knuddels/jtokkit/GptBytePairEncoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.knuddels.jtokkit.api.EncodingResult;
import com.knuddels.jtokkit.api.GptBytePairEncodingParams;
import com.knuddels.jtokkit.api.IntArrayList;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -44,7 +45,7 @@ public EncodingResult encode(String text, int maxTokenCount) {

private InternalResult encodeInternal(String text, int maxTokenCount, boolean keepEncodings) {
if (text == null) {
return new InternalResult(new IntArrayList(0), false);
return new InternalResult(new IntArrayList(0), -1, false, -1);
}

specialEncoder.checkForSpecialTokens(text);
Expand All @@ -64,7 +65,7 @@ public EncodingResult encodeOrdinary(String text, int maxTokenCount) {

private InternalResult encodeOrdinaryInternal(String text, int maxTokenCount, boolean keepEncodings) {
if (text == null) {
return new InternalResult(new IntArrayList(0), false);
return new InternalResult(new IntArrayList(0), -1, false, -1);
}

IntArrayList out = new IntArrayList();
Expand All @@ -81,12 +82,12 @@ private InternalResult encodeOrdinaryInternal(String text, int maxTokenCount, bo
String decoded = decode(tokens);
if (text.startsWith(decoded)) {
// If decoded text is equal to the head of the original text, we can safely return the tokens
return new InternalResult(tokens, text.length() > decoded.length());
return new InternalResult(tokens, -1, text.length() > decoded.length(), decoded.length() - 1);
}
}
}

return new InternalResult(out, tokenCount, false);
return new InternalResult(out, tokenCount, false, text.length() - 1);
}

int encodeOrdinaryInternal(String text, int maxTokenCount, boolean keepEncodings, IntArrayList out) {
Expand Down Expand Up @@ -140,15 +141,13 @@ private static final class InternalResult {
private final IntArrayList tokens;
private final boolean truncated;
private final int tokenCount;
private final int lastProcessedCharacterIndex; // -1 == text was null or string was empty()

private InternalResult(IntArrayList tokens, boolean truncated) {
this(tokens, -1, truncated);
}

private InternalResult(IntArrayList tokens, int tokenCount, boolean truncated) {
private InternalResult(IntArrayList tokens, int tokenCount, boolean truncated, int lastProcessedCharacterIndex) {
this.tokens = tokens;
this.truncated = truncated;
this.tokenCount = tokenCount < 0 ? tokens.size() : tokenCount;
this.lastProcessedCharacterIndex = lastProcessedCharacterIndex;
}

private EncodingResult toEncodingResult() {
Expand All @@ -158,7 +157,7 @@ private EncodingResult toEncodingResult() {
);
}

return new EncodingResult(tokens, truncated);
return new EncodingResult(tokens, truncated, lastProcessedCharacterIndex);
}

private int toTokenCount() {
Expand Down
18 changes: 18 additions & 0 deletions lib/src/main/java/com/knuddels/jtokkit/api/EncodingResult.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@
public final class EncodingResult {
private final IntArrayList tokens;
private final boolean truncated;
private final int lastProcessedCharacterIndex;

/**
* Creates a result that carries no position information.
* Delegates to the three-argument constructor with a last processed character index of -1.
*
* @param tokens    the token list stored in this result
* @param truncated the truncation flag stored in this result
*/
public EncodingResult(final IntArrayList tokens, final boolean truncated) {
this(tokens, truncated, -1);
}

/**
* Creates a result from the given tokens, truncation flag and character position.
* The arguments are stored as given; no validation or copying is performed here.
*
* @param tokens                      the token list stored in this result
* @param truncated                   the truncation flag stored in this result
* @param lastProcessedCharacterIndex the value later returned by {@link #getLastProcessedCharacterIndex()}
*/
public EncodingResult(final IntArrayList tokens, final boolean truncated, final int lastProcessedCharacterIndex) {
this.tokens = tokens;
this.truncated = truncated;
this.lastProcessedCharacterIndex = lastProcessedCharacterIndex;
}

/**
Expand All @@ -30,11 +36,23 @@ public boolean isTruncated() {
return truncated;
}

/**
* Returns the index of the last processed character in the input string.
*
* @return the index of the last processed character in the input string; -1 if the text was
* null or empty, or if this result was created with the two-argument constructor, which
* supplies no index
*/
public int getLastProcessedCharacterIndex() {
return lastProcessedCharacterIndex;
}



/**
 * Renders this result as {@code EncodingResult{tokens=..., truncated=..., lastProcessedCharacterIndex=...}}.
 *
 * @return a textual representation listing all three fields
 */
@Override
public String toString() {
    final StringBuilder rendered = new StringBuilder("EncodingResult{");
    rendered.append("tokens=").append(tokens);
    rendered.append(", truncated=").append(truncated);
    rendered.append(", lastProcessedCharacterIndex=").append(lastProcessedCharacterIndex);
    rendered.append('}');
    return rendered.toString();
}
}
3 changes: 2 additions & 1 deletion lib/src/main/java/com/knuddels/jtokkit/api/ModelType.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ public enum ModelType {
GPT_4("gpt-4", EncodingType.CL100K_BASE, 8192),
GPT_4O("gpt-4o", EncodingType.O200K_BASE, 128000),
GPT_4_32K("gpt-4-32k", EncodingType.CL100K_BASE, 32768),
GPT_3_5_TURBO("gpt-3.5-turbo", EncodingType.CL100K_BASE, 4096),
GPT_4_TURBO("gpt-4-turbo", EncodingType.CL100K_BASE, 128000),
GPT_3_5_TURBO("gpt-3.5-turbo", EncodingType.CL100K_BASE, 16385),
GPT_3_5_TURBO_16K("gpt-3.5-turbo-16k", EncodingType.CL100K_BASE, 16385),

// text
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package com.knuddels.jtokkit;

import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingType;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Verifies the value reported by {@code getLastProcessedCharacterIndex()} on the result of
 * {@code encode(text, maxTokenCount)} for null, empty, short and long inputs.
 */
public class EncodingLastProcessedCharacterIndexTest {

    private static final Encoding ENCODING = Encodings.newDefaultEncodingRegistry().getEncoding(EncodingType.CL100K_BASE);

    /** A null text is expected to report index -1. */
    @Test
    void testNullInput() {
        var encodingResult = ENCODING.encode(null, 10);
        // JUnit's contract is assertEquals(expected, actual); keeping that order makes
        // failure messages read correctly ("expected: <-1> but was: <...>").
        assertEquals(-1, encodingResult.getLastProcessedCharacterIndex());
    }

    /** An empty text is expected to report index -1. */
    @Test
    void testEmptyInput() {
        String input = "";
        var encodingResult = ENCODING.encode(input, 10);
        assertEquals(-1, encodingResult.getLastProcessedCharacterIndex());
    }

    /** A 12-character text encoded with a limit of 10 tokens is expected to report index 11, its last character. */
    @Test
    void testShortInput() {
        String input = "Hello World!";
        var encodingResult = ENCODING.encode(input, 10);
        assertEquals(11, encodingResult.getLastProcessedCharacterIndex());
    }

    /** A longer text encoded with a limit of 10 tokens is expected to report index 55. */
    @Test
    void testLongInput() {
        String input = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce condimentum enim ac tellus malesuada, a consectetur nibh efficitur. 🚀🚀🚀";
        var encodingResult = ENCODING.encode(input, 10);
        assertEquals(55, encodingResult.getLastProcessedCharacterIndex());
    }
}

0 comments on commit 799aa55

Please sign in to comment.