Skip to content

Commit

Permalink
Merge branch 'master' into latpull
Browse files Browse the repository at this point in the history
# Conflicts:
#	docs/Encoder.js.html
#	docs/global.html
#	docs/index.html
#	package-lock.json
  • Loading branch information
hashfox committed Mar 8, 2023
2 parents 6715083 + 02eaadd commit c5a3b63
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 14 deletions.
17 changes: 11 additions & 6 deletions Encoder.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@


const encoder = require("./encoder");

// This file includes code which was modified from https://github.com/openai/gpt-2
const bpe_ranks = require("./bpe_ranks");

// The old version used to include this, but I now prebuild it into a JS file to be loaded by browserify.
// TODO: delete the old comments below once they are no longer needed.
// const fs = require('fs')
// const path = require('path');
// const json-loader
// const loader = require("json-loader");

// const encoder = loader('./encoder.json');

// const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));
const encoder = require("./encoder");

const bpe_ranks = require("./bpe_ranks");
// const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8');

const range = (x, y) => {
Expand Down Expand Up @@ -67,7 +71,7 @@ function get_pairs(word) {
}

const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu
// The regular expression patis used to split a string into an array of tokens.
// The regular expression pat is used to split a string into an array of tokens.
//
// The regular expression consists of several parts:
// 's|'t|'re|'ve|'m|'ll|'d: These are all short forms of common English words (e.g. "is", "not", "have"). The | symbol means "or", so this part of the expression matches any of these short forms.
Expand Down Expand Up @@ -262,7 +266,8 @@ function tokenStats(input) {

// Sort the frequency object by frequency in descending order
stats.frequency = Object.fromEntries(
Object.entries(stats.frequency).sort((a, b) => b[1] - a[1])
Object.entries(stats.frequency)
.sort((a, b) => b[1] - a[1])
)

return stats
Expand Down
5 changes: 2 additions & 3 deletions browser.js

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion build_encoder.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const fs = require('fs');
const path = require('path');


const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));

console.log("Breaks stuff i think");
Expand Down
5 changes: 2 additions & 3 deletions docs/browser.js

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ declare module "gpt-3-encoder" {

export function countTokens(text: string): number;

export function tokenStats(input: string | number[]): any;
export function tokenStats(input: string | number[]): TokenStats;

/**
 * Shape of the value returned by `tokenStats`.
 *
 * NOTE(review): unless stated otherwise, the per-field descriptions below are
 * inferred from the field names and types only — confirm against the
 * `tokenStats` implementation in Encoder.js.
 */
export interface TokenStats {
/** Presumably the total number of tokens in the input — TODO confirm. */
count: number;
/** Presumably the number of distinct tokens in the input — TODO confirm. */
unique: number;
/**
 * Occurrence count per key. Encoder.js rebuilds this object from its entries
 * sorted by count in descending order before returning it.
 * NOTE(review): keys are presumably the tokens themselves — confirm.
 */
frequency: Record<string, number>;
/** Presumably the positions at which each key occurs in the input — TODO confirm. */
positions: Record<string, number[]>;
/** Presumably the tokens in input order — TODO confirm. */
tokens: string[];
}

}
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
"scripts": {
"docs": "jsdoc Encoder.js -r README.md -d docs && cp browser.* docs",
"build_bpe_ranks": "node build_bpe_ranks.js",
"build_encoder": "node build_encoder.js",

"build": "browserify index.js -s gpt3encoder -o browser.js",
"test": "jest",
"browser": "firefox example/browser.html",
Expand Down

0 comments on commit c5a3b63

Please sign in to comment.