From 98a8be563564510b3e4f792a271055b8866ab612 Mon Sep 17 00:00:00 2001 From: Wes Couch Date: Fri, 10 Feb 2023 13:34:55 -0500 Subject: [PATCH] Remove fs and path usage to allow gpt-3-encoder to work in the browser and in nodejs. Add countTokens function and tests. --- Encoder.js | 12 +++++++----- Encoder.test.js | 9 +++++++-- example.js | 3 ++- index.d.ts | 2 ++ index.js | 3 ++- package.json | 2 +- vocab.bpe => vocab.js | 17 +++++++++-------- 7 files changed, 30 insertions(+), 18 deletions(-) rename vocab.bpe => vocab.js (99%) diff --git a/Encoder.js b/Encoder.js index 8549150..76e18eb 100644 --- a/Encoder.js +++ b/Encoder.js @@ -1,9 +1,6 @@ // This file includes code which was modified from https://github.com/openai/gpt-2 -const fs = require('fs') -const path = require('path'); - -const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json'))); -const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8'); +const encoder = require('./encoder.json'); +const bpe_file = require('./vocab'); const range = (x, y) => { const res = Array.from(Array(y).keys()).slice(x) @@ -172,7 +169,12 @@ function decode(tokens) { return text } +function countTokens(text) { + return encode(text).length; +} + module.exports = { + countTokens, encode, decode }; \ No newline at end of file diff --git a/Encoder.test.js b/Encoder.test.js index 2556e10..0730646 100644 --- a/Encoder.test.js +++ b/Encoder.test.js @@ -1,4 +1,4 @@ -const {encode, decode} = require('./Encoder.js'); +const {countTokens, encode, decode} = require('./Encoder.js'); test('empty string', () => { const str = ""; @@ -41,4 +41,9 @@ test('properties of Object',()=>{ expect(encode(str)).toEqual([1462, 10100, 23772, 468, 23858, 21746, 1988, 5189]); expect(decode(encode(str))).toEqual(str); -}) \ No newline at end of file +}) + +test('token count of string', () => { + const str = 'This string should have 7 tokens.'; + expect(countTokens(str)).toEqual(7); +}); \ No newline at end of file diff --git a/example.js b/example.js index fcf64ce..5c40524 100644 --- a/example.js +++ b/example.js @@ -1,9 +1,10 @@ -const {encode, decode} = require('./encoder.js') +const {countTokens, encode, decode} = require('./encoder.js') const str = 'This is an example sentence to try encoding out on!' const encoded = encode(str) console.log('Encoded this string looks like: ', encoded) +console.log('Encoded string contains ' + countTokens(str) + ' tokens'); console.log('We can look at each token and what it represents') for(let token of encoded){ diff --git a/index.d.ts b/index.d.ts index 1441a40..217e96b 100644 --- a/index.d.ts +++ b/index.d.ts @@ -1,4 +1,6 @@ declare module "gpt-3-encoder" { + export function countTokens(text: string): number; + export function encode(text: string): number[]; export function decode(tokens: number[]): string; diff --git a/index.js b/index.js index db5d95b..eda2f03 100644 --- a/index.js +++ b/index.js @@ -1,6 +1,7 @@ -const { encode, decode } = require("./Encoder"); +const { countTokens, encode, decode } = require("./Encoder"); module.exports = { + countTokens, encode, decode, }; diff --git a/package.json b/package.json index 9bf45f8..0e595fc 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "files": [ "Encoder.js", "encoder.json", - "vocab.bpe", + "vocab.js", "index.d.ts" ], "scripts": { diff --git a/vocab.bpe b/vocab.js similarity index 99% rename from vocab.bpe rename to vocab.js index 226b075..3dcfeb1 100644 --- a/vocab.bpe +++ b/vocab.js @@ -1,4 +1,4 @@ -#version: 0.2 +module.exports = `#version: 0.2 Ġ t Ġ a h e @@ -4343,7 +4343,7 @@ ack s ad y d o ĠG ood -Ġ ` +Ġ \` Ġw ish Ġreve aled Âł Âł @@ -7302,7 +7302,7 @@ J ohn S m ĠF und Ġconst antly -Ġ` ` +Ġ\` \` Ġgener ated ĠA ction ĠP sych @@ -11335,7 +11335,7 @@ P ut Ġbrief ly ri ve Ġstim ul -Ġ`` ( +Ġ\`\` ( Ġ __ Ġch ip Ġha z @@ -15249,7 +15249,7 @@ ab a le tt Ġfol k Ġch ase -` ` +\` \` ĠBr us Ġte ens c ue @@ -32896,7 +32896,7 @@ ath s n atal =" " fl ags -`` `` +\`\` \`\` Ġs ul K h Ġpot assium @@ -44389,7 +44389,7 @@ ub en ĠT ight ind al ic as -` . +\` . C AST '' ; ĠF et @@ -47414,7 +47414,7 @@ CD C Ġsal ads F le Ġindustrial ized -` , +\` , ĠO WN Ġbec k ĠPart icularly @@ -49999,3 +49999,4 @@ om inated ĠColl ider Ġinform ants Ġg azed +`;