Skip to content

Commit

Permalink
https://github.com/latitudegames/GPT-3-Encoder/pull/30#issuecomment-1…
Browse files Browse the repository at this point in the history
…460069823

Add types. Let me know if this is the correct way to implement it.

And here are some additional useful stats that could be added to the stats object:

    averageWordLength: The average length of the words in the tokens array.
    mostFrequentWords: An array of the most frequently occurring words in the tokens array, ordered by frequency.
    leastFrequentWords: An array of the least frequently occurring words in the tokens array, ordered by frequency.
    wordPositions: An object that maps each word in the tokens array to its positions (indices) in the array.
    wordLengths: An array of the lengths of the words in the tokens array.
  • Loading branch information
hashfox committed Mar 8, 2023
1 parent c3c2e25 commit 02eaadd
Show file tree
Hide file tree
Showing 10 changed files with 35 additions and 22 deletions.
17 changes: 11 additions & 6 deletions Encoder.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@


const encoder = require("./encoder");

// This file includes code which was modified from https://github.com/openai/gpt-2
const bpe_ranks = require("./bpe_ranks");

// The old version used to include this, but I prebuilt it into a JS file to be loaded by browserify.
// TODO: delete old comments when they are no longer needed
// const fs = require('fs')
// const path = require('path');
// const json-loder
// const loader = require("json-loader");

// const encoder = loader('./encoder.json');

// const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));
const encoder = require("./encoder");

const bpe_ranks = require("./bpe_ranks");
// const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8');

const range = (x, y) => {
Expand Down Expand Up @@ -67,7 +71,7 @@ function get_pairs(word) {
}

const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu
// The regular expression patis used to split a string into an array of tokens.
// The regular expression pat is used to split a string into an array of tokens.
//
// The regular expression consists of several parts:
// 's|'t|'re|'ve|'m|'ll|'d: These are all short forms of common English words (e.g. "is", "not", "have"). The | symbol means "or", so this part of the expression matches any of these short forms.
Expand Down Expand Up @@ -259,7 +263,8 @@ function tokenStats(input) {

// Sort the frequency object by frequency in descending order
stats.frequency = Object.fromEntries(
Object.entries(stats.frequency).sort((a, b) => b[1] - a[1])
Object.entries(stats.frequency)
.sort((a, b) => b[1] - a[1])
)

return stats
Expand Down
5 changes: 2 additions & 3 deletions browser.js

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion build_encoder.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const fs = require('fs');
const path = require('path');


const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));

console.log("Breaks stuff i think");
Expand Down
5 changes: 3 additions & 2 deletions docs/Encoder.js.html
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ <h1 class="page-title">Source: Encoder.js</h1>

// Sort the frequency object by frequency in descending order
stats.frequency = Object.fromEntries(
Object.entries(stats.frequency).sort((a, b) => b[1] - a[1])
Object.entries(stats.frequency)
.sort((a, b) => b[1] - a[1])
)

return stats
Expand Down Expand Up @@ -370,7 +371,7 @@ <h2><a href="index.html">Home</a></h2><h3>Global</h3><ul><li><a href="global.htm
<br class="clear">

<footer>
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Sun Jan 08 2023 03:34:35 GMT-0500 (Eastern Standard Time)
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Wed Mar 08 2023 07:47:02 GMT-0800 (Pacific Standard Time)
</footer>

<script> prettyPrint(); </script>
Expand Down
5 changes: 2 additions & 3 deletions docs/browser.js

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/global.html
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ <h5>Parameters:</h5>

<dt class="tag-source">Source:</dt>
<dd class="tag-source"><ul class="dummy"><li>
<a href="Encoder.js.html">Encoder.js</a>, <a href="Encoder.js.html#line277">line 277</a>
<a href="Encoder.js.html">Encoder.js</a>, <a href="Encoder.js.html#line278">line 278</a>
</li></ul></dd>


Expand Down Expand Up @@ -529,7 +529,7 @@ <h5>Parameters:</h5>

<dt class="tag-source">Source:</dt>
<dd class="tag-source"><ul class="dummy"><li>
<a href="Encoder.js.html">Encoder.js</a>, <a href="Encoder.js.html#line313">line 313</a>
<a href="Encoder.js.html">Encoder.js</a>, <a href="Encoder.js.html#line314">line 314</a>
</li></ul></dd>


Expand Down Expand Up @@ -1070,7 +1070,7 @@ <h2><a href="index.html">Home</a></h2><h3>Global</h3><ul><li><a href="global.htm
<br class="clear">

<footer>
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Sun Jan 08 2023 03:34:35 GMT-0500 (Eastern Standard Time)
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Wed Mar 08 2023 07:47:02 GMT-0800 (Pacific Standard Time)
</footer>

<script> prettyPrint(); </script>
Expand Down
2 changes: 1 addition & 1 deletion docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ <h2><a href="index.html">Home</a></h2><h3>Global</h3><ul><li><a href="global.htm
<br class="clear">

<footer>
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Sun Jan 08 2023 03:34:35 GMT-0500 (Eastern Standard Time)
Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.0</a> on Wed Mar 08 2023 07:47:02 GMT-0800 (Pacific Standard Time)
</footer>

<script> prettyPrint(); </script>
Expand Down
10 changes: 9 additions & 1 deletion index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ declare module "gpt-3-encoder" {

export function countTokens(text: string): number;

export function tokenStats(input: string | number[]): any;
export function tokenStats(input: string | number[]): TokenStats;

export interface TokenStats {
count: number;
unique: number;
frequency: Record<string, number>;
positions: Record<string, number[]>;
tokens: string[];
}

}
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
"scripts": {
"docs": "jsdoc Encoder.js -r README.md -d docs && cp browser.* docs",
"build_bpe_ranks": "node build_bpe_ranks.js",
"build_encoder": "node build_encoder.js",

"build": "browserify index.js -s gpt3encoder -o browser.js",
"test": "jest",
"browser": "firefox example/browser.html",
Expand Down

0 comments on commit 02eaadd

Please sign in to comment.