From 0e88f9488dc61cf5d96cd9577a10985607321419 Mon Sep 17 00:00:00 2001 From: Jay Wang Date: Thu, 5 Dec 2024 15:23:56 -0500 Subject: [PATCH] Init commit --- .c8rc.json | 5 ++ .eslintignore | 5 ++ .eslintrc.cjs | 43 ++++++++++ .gitattributes | 2 + .github/workflows/build.yml | 32 +++++++ .gitignore | 34 ++++++++ .npmignore | 30 +++++++ .prettierignore | 4 + .prettierrc | 6 ++ LICENSE | 21 +++++ README.md | 1 + package.json | 67 +++++++++++++++ src/index.ts | 1 + src/mememo.ts | 162 ++++++++++++++++++++++++++++++++++++ src/my-types.ts | 0 src/utils/utils.ts | 36 ++++++++ test/mememo.test.ts | 58 +++++++++++++ test/utils/utils.test.ts | 36 ++++++++ tsconfig.json | 19 +++++ typescript.eslintrc.json | 4 + vite.config.ts | 33 ++++++++ 21 files changed, 599 insertions(+) create mode 100644 .c8rc.json create mode 100644 .eslintignore create mode 100644 .eslintrc.cjs create mode 100644 .gitattributes create mode 100644 .github/workflows/build.yml create mode 100644 .gitignore create mode 100644 .npmignore create mode 100644 .prettierignore create mode 100644 .prettierrc create mode 100644 LICENSE create mode 100644 README.md create mode 100644 package.json create mode 100644 src/index.ts create mode 100644 src/mememo.ts create mode 100644 src/my-types.ts create mode 100644 src/utils/utils.ts create mode 100644 test/mememo.test.ts create mode 100644 test/utils/utils.test.ts create mode 100644 tsconfig.json create mode 100644 typescript.eslintrc.json create mode 100644 vite.config.ts diff --git a/.c8rc.json b/.c8rc.json new file mode 100644 index 0000000..f6d291c --- /dev/null +++ b/.c8rc.json @@ -0,0 +1,5 @@ +{ + "check-coverage": true, + "lines": 95, + "reporter": ["json-summary"] +} diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..e8a2221 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,5 @@ +node_modules +dist +coverage +**/*.d.ts +tests \ No newline at end of file diff --git a/.eslintrc.cjs b/.eslintrc.cjs new file mode 100644 index 
0000000..1e7a2a1 --- /dev/null +++ b/.eslintrc.cjs @@ -0,0 +1,43 @@ +module.exports = { + parser: '@typescript-eslint/parser', + extends: [ + 'eslint:recommended', + 'plugin:@typescript-eslint/recommended', + 'plugin:@typescript-eslint/recommended-requiring-type-checking', + 'prettier' + ], + parserOptions: { + ecmaVersion: 'latest', + sourceType: 'module', + project: ['./typescript.eslintrc.json'], + extraFileExtensions: ['.cjs'] + }, + plugins: ['@typescript-eslint', 'prettier'], + env: { + es6: true, + browser: true + }, + ignorePatterns: ['node_modules'], + rules: { + indent: ['error', 2, { SwitchCase: 1 }], + 'linebreak-style': ['error', 'unix'], + quotes: ['error', 'single'], + 'prefer-const': ['error'], + semi: ['error', 'always'], + 'max-len': [ + 'warn', + { + code: 80 + } + ], + 'prettier/prettier': 2, + '@typescript-eslint/no-floating-promises': 'off', + '@typescript-eslint/no-unsafe-return': 'off', + '@typescript-eslint/ban-ts-comment': 'off', + '@typescript-eslint/restrict-template-expressions': 'off', + '@typescript-eslint/no-non-null-assertion': 'off', + '@typescript-eslint/no-empty-function': 'off', + '@typescript-eslint/no-unused-vars': ['warn'], + 'no-self-assign': 'off' + } +}; diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f7b0270 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.ipynb linguist-detectable=false +*.yml linguist-language=typescript \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..72a4ec9 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,32 @@ +name: build + +on: + # Triggers the workflow on push or pull request events but only for the main branch + push: + branches: [ main ] + pull_request: + branches: [ main ] + #: Run the test every week + # schedule: + # - cron: "0 12 * * 1" + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + matrix: + node-version: [16] + os: [ubuntu-latest] + # os: 
[ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v3 + - name: Use Node.js + uses: actions/setup-node@v3 + with: + node-version: ${{ matrix.node-version }} + - name: Install dependencies + run: npm install + - name: Test + run: npm run test:run \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2f419d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +coverage +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? + +pnpm-lock.yaml + +test-generation +*.npz +gh-pages +examples/models/benchmark + +notebooks \ No newline at end of file diff --git a/.npmignore b/.npmignore new file mode 100644 index 0000000..a8b33da --- /dev/null +++ b/.npmignore @@ -0,0 +1,30 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist-ssr +coverage +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? 
+.github + +pnpm-lock.yaml +examples +test-generation +gh-pages \ No newline at end of file diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..b56605b --- /dev/null +++ b/.prettierignore @@ -0,0 +1,4 @@ +node_modules +**/node_modules +**/lib +**/package.json \ No newline at end of file diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..1f77a44 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,6 @@ +{ + "singleQuote": true, + "trailingComma": "none", + "svelteSortOrder": "options-scripts-styles-markup", + "arrowParens": "avoid" +} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..72f1d35 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023, Jay Wang and Polo Chau + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa01a4c --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Matmul \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..37f314e --- /dev/null +++ b/package.json @@ -0,0 +1,67 @@ +{ + "name": "mememo", + "version": "0.0.1", + "description": "On-device vector database", + "main": "./dist/index.umd.js", + "module": "./dist/index.es.js", + "type": "module", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "require": "./dist/index.umd.js", + "import": "./dist/index.es.js", + "types": "./dist/index.d.ts" + } + }, + "repository": { + "type": "git", + "url": "git+https://github.com/xiaohk/mememo" + }, + "keywords": [ + "machine-learning", + "machine", + "learning", + "ai", + "matrix", + "matrix multiplication", + "linear algebra", + "math" + ], + "author": "Jay Wang", + "license": "MIT", + "bugs": { + "url": "https://github.com/xiaohk/mememo/issues" + }, + "homepage": "https://github.com/xiaohk/mememo#readme", + "scripts": { + "test": "vitest", + "test:ui": "vitest --ui", + "test:run": "vitest run", + "coverage": "vitest run --coverage && c8 report && pnpm run coverage:badge", + "coverage:badge": "pnpx make-coverage-badge --output-path ./imgs/coverage-badge.svg", + "build": "pnpm run clean && vite build", + "build:doc": "pnpm typedoc ./src/index.ts --excludeExternals --externalPattern 'node_modules' --name 'WebSHAP' --out './gh-pages/doc/' '$SRC_DIR'", + "clean": "rimraf ./dist", + "publish": "pnpm publish --access=public" + }, + "devDependencies": { + "@rollup/plugin-typescript": "^11.1.6", + "@types/d3-random": "^3.0.3", + "@typescript-eslint/eslint-plugin": "^6.19.1", + "@typescript-eslint/parser": "^6.19.1", + "@vitest/browser": "^1.2.1", + "@vitest/coverage-v8": "^1.2.1", + "c8": "^9.1.0", + "d3-random": "^3.0.1", + "eslint": "^8.56.0", + "eslint-config-prettier": "^9.1.0", + "eslint-plugin-prettier": "^5.1.3", + 
"prettier": "^3.2.4", + "rimraf": "^5.0.5", + "tslib": "^2.6.2", + "typescript": "^5.3.3", + "vite": "^5.0.12", + "vitest": "^1.2.1", + "webdriverio": "^8.29.1" + } +} diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..b8f5805 --- /dev/null +++ b/src/index.ts @@ -0,0 +1 @@ +export { add } from './mememo'; diff --git a/src/mememo.ts b/src/mememo.ts new file mode 100644 index 0000000..c568afb --- /dev/null +++ b/src/mememo.ts @@ -0,0 +1,162 @@ +/** + * Mememo + * @author: Jay Wang (jay@zijie.wang) + */ + +import { randomLcg, randomUniform, randomInt } from 'd3-random'; +import { comb, getCombinations } from './utils/utils'; +import type { RandomUniform, RandomInt } from 'd3-random'; + +export const add = (a: number, b: number) => { + return a + b; +}; + +type DistanceFunction = 'cosine'; + +interface HNSWConfig { + /** Distance function. */ + distanceFunction?: DistanceFunction; + + /** The max number of neighbors for each node. A reasonable range of m is from + * 5 to 48. Smaller m generally produces better results for lower recalls + * and/or lower dimensional data, while bigger m is better for high recall + * and/or high dimensional data. */ + m?: number; + + /** The number of neighbors to consider in construction's greedy search. */ + efConstruction?: number; + + /** The number of neighbors to keep for each node at the first level. */ + mMax0?: number; + + /** Normalizer parameter controlling number of overlaps across layers. */ + ml?: number; + + /** Optional random seed. */ + seed?: number; +} + +/** + * A node in the HNSW graph. + */ +class Node { + /** The unique key of an element. */ + key: T; + + /** The embedding value of the element. */ + value: number[]; + + constructor(key: T, value: number[]) { + this.key = key; + this.value = value; + } +} + +/** + * One graph layer in the HNSW index + */ +class GraphLayer { + /** The graph maps a key to its neighbor and distances */ + graph: Map>; + + /** + * Initialize a new graph layer. 
+ * @param key The first key to insert into the graph layer. + */ + constructor(key: T) { + this.graph = new Map>(); + this.graph.set(key, new Map()); + } +} + +/** + * HNSW (Hierarchical Navigable Small World) class. + */ +export class HNSW { + distanceFunction: DistanceFunction; + + /** The max number of neighbors for each node. */ + m: number; + + /** The number of neighbors to consider in construction's greedy search. */ + efConstruction: number; + + /** The number of neighbors to keep for each node at the first level. */ + mMax0: number; + + /** Normalizer parameter controlling number of overlaps across layers. */ + ml: number; + + /** Seeded random number generator */ + rng: () => number; + + nodes: Map>; + graphLayers: GraphLayer[]; + + /** + * Constructs a new instance of the class. + * @param config - The configuration object. + * @param config.distanceFunction - Distance function. Default: 'cosine' + * @param config.m - The max number of neighbors for each node. A reasonable + * range of m is from 5 to 48. Smaller m generally produces better results for + * lower recalls and/or lower dimensional data, while bigger m is better for + * high recall and/or high dimensional data. Default: 16 + * @param config.efConstruction - The number of neighbors to consider in + * construction's greedy search. Default: 100 + * @param config.mMax0 - The maximum number of connections that a node can + * have in the zero layer. Default 2 * m. + * @param config.ml - Normalizer parameter. Default 1 / ln(m) + * @param config.seed - Optional random seed. 
/**
 * Compute n choose k
 *
 * The product is built incrementally as C(n, 1), C(n, 2), ..., so every
 * intermediate value is itself a binomial coefficient.
 *
 * @param n n to choose from
 * @param k to choose k
 * @returns The binomial coefficient; 0 when `k < 0` or `k > n`
 * @throws RangeError if the number of factors, min(k, n - k), is not an integer
 */
export const comb = (n: number, k: number): number => {
  // There is no way to pick a negative number of items, or more than n items.
  if (k < 0 || k > n) {
    return 0;
  }

  // C(n, k) === C(n, n - k), so iterate over the smaller of the two.
  const minK = Math.min(k, n - k);
  if (!Number.isInteger(minK)) {
    throw new RangeError(
      `comb() expects an integer number of factors, got n=${n}, k=${k}`
    );
  }

  let result = 1;
  for (let i = 1; i <= minK; i++) {
    result = (result * (n + 1 - i)) / i;
  }
  return result;
};

/**
 * Return all n-length combinations from an array
 *
 * Combinations are emitted in the order of the element indexes, and the
 * input array is not modified.
 *
 * @param array Input array
 * @param n Length of combinations
 * @returns An array of n-length combinations; empty when `n` is negative,
 *  not an integer, or larger than `array.length`
 */
export const getCombinations = <T>(array: readonly T[], n: number): T[][] => {
  const result: T[][] = [];

  // No combination can exist for these sizes, so skip the search entirely.
  if (!Number.isInteger(n) || n < 0 || n > array.length) {
    return result;
  }

  const backtrack = (first: number, current: T[]): void => {
    if (current.length === n) {
      result.push(current);
      return;
    }

    // Stop once too few elements remain to complete the combination.
    const lastStart = array.length - (n - current.length);
    for (let i = first; i <= lastStart; i++) {
      backtrack(i + 1, [...current, array[i]]);
    }
  };

  backtrack(0, []);
  return result;
};
// Pulls in Vitest's type augmentation so the `test` key below is accepted by
// Vite's `defineConfig`.
/// <reference types="vitest" />

import path from 'path';
import { defineConfig } from 'vite';
import typescript from '@rollup/plugin-typescript';

/** Resolve a path relative to the directory that contains this config file. */
const resolvePath = (str: string) => path.resolve(__dirname, str);

export default defineConfig({
  // Vitest options: browser mode is configured for Chrome but switched off.
  test: {
    browser: {
      enabled: false,
      name: 'chrome'
    }
  },
  build: {
    // Library build: one output file per format, named `index.<format>.js`.
    lib: {
      entry: resolvePath('src/index.ts'),
      name: 'mememo',
      fileName: format => `index.${format}.js`
    },
    sourcemap: true,
    rollupOptions: {
      plugins: [
        // Run the TypeScript plugin with declaration output enabled.
        typescript({
          target: 'esnext',
          rootDir: resolvePath('./src'),
          declaration: true
        })
      ]
    }
  }
});