diff --git a/.c8rc.json b/.c8rc.json new file mode 100644 index 0000000..f6d291c --- /dev/null +++ b/.c8rc.json @@ -0,0 +1,5 @@ +{ + "check-coverage": true, + "lines": 95, + "reporter": ["json-summary"] +} diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..e8a2221 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,5 @@ +node_modules +dist +coverage +**/*.d.ts +tests \ No newline at end of file diff --git a/.eslintrc.cjs b/.eslintrc.cjs new file mode 100644 index 0000000..1e7a2a1 --- /dev/null +++ b/.eslintrc.cjs @@ -0,0 +1,43 @@ +module.exports = { + parser: '@typescript-eslint/parser', + extends: [ + 'eslint:recommended', + 'plugin:@typescript-eslint/recommended', + 'plugin:@typescript-eslint/recommended-requiring-type-checking', + 'prettier' + ], + parserOptions: { + ecmaVersion: 'latest', + sourceType: 'module', + project: ['./typescript.eslintrc.json'], + extraFileExtensions: ['.cjs'] + }, + plugins: ['@typescript-eslint', 'prettier'], + env: { + es6: true, + browser: true + }, + ignorePatterns: ['node_modules'], + rules: { + indent: ['error', 2, { SwitchCase: 1 }], + 'linebreak-style': ['error', 'unix'], + quotes: ['error', 'single'], + 'prefer-const': ['error'], + semi: ['error', 'always'], + 'max-len': [ + 'warn', + { + code: 80 + } + ], + 'prettier/prettier': 2, + '@typescript-eslint/no-floating-promises': 'off', + '@typescript-eslint/no-unsafe-return': 'off', + '@typescript-eslint/ban-ts-comment': 'off', + '@typescript-eslint/restrict-template-expressions': 'off', + '@typescript-eslint/no-non-null-assertion': 'off', + '@typescript-eslint/no-empty-function': 'off', + '@typescript-eslint/no-unused-vars': ['warn'], + 'no-self-assign': 'off' + } +}; diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f7b0270 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.ipynb linguist-detectable=false +*.yml linguist-language=typescript \ No newline at end of file diff --git a/.github/workflows/build.yml 
b/.github/workflows/build.yml new file mode 100644 index 0000000..72a4ec9 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,32 @@ +name: build + +on: + # Triggers the workflow on push or pull request events but only for the main branch + push: + branches: [ main ] + pull_request: + branches: [ main ] + #: Run the test every week + # schedule: + # - cron: "0 12 * * 1" + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + matrix: + node-version: [16] + os: [ubuntu-latest] + # os: [ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v3 + - name: Use Node.js + uses: actions/setup-node@v3 + with: + node-version: ${{ matrix.node-version }} + - name: Install dependencies + run: npm install + - name: Test + run: npm run test:run \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2f419d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +coverage +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? + +pnpm-lock.yaml + +test-generation +*.npz +gh-pages +examples/models/benchmark + +notebooks \ No newline at end of file diff --git a/.npmignore b/.npmignore new file mode 100644 index 0000000..a8b33da --- /dev/null +++ b/.npmignore @@ -0,0 +1,30 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist-ssr +coverage +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? 
+.github + +pnpm-lock.yaml +examples +test-generation +gh-pages \ No newline at end of file diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..b56605b --- /dev/null +++ b/.prettierignore @@ -0,0 +1,4 @@ +node_modules +**/node_modules +**/lib +**/package.json \ No newline at end of file diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..1f77a44 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,6 @@ +{ + "singleQuote": true, + "trailingComma": "none", + "svelteSortOrder": "options-scripts-styles-markup", + "arrowParens": "avoid" +} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..72f1d35 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023, Jay Wang and Polo Chau + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa01a4c --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Mememo \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..37f314e --- /dev/null +++ b/package.json @@ -0,0 +1,67 @@ +{ + "name": "mememo", + "version": "0.0.1", + "description": "On-device vector database", + "main": "./dist/index.umd.js", + "module": "./dist/index.es.js", + "type": "module", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "require": "./dist/index.umd.js", + "import": "./dist/index.es.js", + "types": "./dist/index.d.ts" + } + }, + "repository": { + "type": "git", + "url": "git+https://github.com/xiaohk/mememo" + }, + "keywords": [ + "machine-learning", + "machine", + "learning", + "ai", + "matrix", + "matrix multiplication", + "linear algebra", + "math" + ], + "author": "Jay Wang", + "license": "MIT", + "bugs": { + "url": "https://github.com/xiaohk/mememo/issues" + }, + "homepage": "https://github.com/xiaohk/mememo#readme", + "scripts": { + "test": "vitest", + "test:ui": "vitest --ui", + "test:run": "vitest run", + "coverage": "vitest run --coverage && c8 report && pnpm run coverage:badge", + "coverage:badge": "pnpx make-coverage-badge --output-path ./imgs/coverage-badge.svg", + "build": "pnpm run clean && vite build", + "build:doc": "pnpm typedoc ./src/index.ts --excludeExternals --externalPattern 'node_modules' --name 'Mememo' --out './gh-pages/doc/' '$SRC_DIR'", + "clean": "rimraf ./dist", + "publish": "pnpm publish --access=public" + }, + "devDependencies": { + "@rollup/plugin-typescript": "^11.1.6", + "@types/d3-random": "^3.0.3", + "@typescript-eslint/eslint-plugin": "^6.19.1", + "@typescript-eslint/parser": "^6.19.1", + "@vitest/browser": "^1.2.1", + "@vitest/coverage-v8": "^1.2.1", + "c8": "^9.1.0", + "d3-random": "^3.0.1", + "eslint": "^8.56.0", + "eslint-config-prettier": "^9.1.0", + "eslint-plugin-prettier": "^5.1.3", + 
/**
 * Add two numbers.
 * @param a First addend
 * @param b Second addend
 * @returns The sum `a + b`
 */
export const add = (a: number, b: number): number => a + b;

/** Names of the distance functions that ship with the library. */
type BuiltInDistanceFunction = 'cosine' | 'cosine-normalized';

/**
 * Either the name of a built-in distance function or a custom callback.
 * - 'cosine': Cosine distance
 * - 'cosine-normalized': Cosine distance between two normalized vectors
 */
type DistanceFunction =
  | BuiltInDistanceFunction
  | ((a: number[], b: number[]) => number);

/**
 * Dot product of two equally sized vectors.
 * @throws RangeError if the vectors differ in length. Without this check a
 *  longer `a` produces NaN and a shorter `a` silently ignores the tail of `b`.
 */
const dotProduct = (a: number[], b: number[]): number => {
  if (a.length !== b.length) {
    throw new RangeError(
      `Vector length mismatch: ${a.length} vs ${b.length}`
    );
  }
  return a.reduce((sum, value, index) => sum + value * b[index], 0);
};

/** Euclidean (L2) norm of a vector. */
const magnitude = (vector: number[]): number =>
  Math.sqrt(vector.reduce((sum, value) => sum + value ** 2, 0));

// Built-in distance functions
const DISTANCE_FUNCTIONS: Record<
  BuiltInDistanceFunction,
  (a: number[], b: number[]) => number
> = {
  /**
   * Cosine distance: 1 - (a . b) / (|a| * |b|).
   * Evaluates to NaN when either vector has zero magnitude (0 / 0).
   */
  cosine: (a: number[], b: number[]) =>
    1 - dotProduct(a, b) / (magnitude(a) * magnitude(b)),

  /**
   * Cosine distance that skips the normalization step: 1 - (a . b).
   * Only equals the true cosine distance when both inputs are unit vectors.
   */
  'cosine-normalized': (a: number[], b: number[]) => 1 - dotProduct(a, b)
};
interface HNSWConfig {
  /** Distance function. */
  distanceFunction?: DistanceFunction;

  /** The max number of neighbors for each node. A reasonable range of m is from
   * 5 to 48. Smaller m generally produces better results for lower recalls
   * and/or lower dimensional data, while bigger m is better for high recall
   * and/or high dimensional data. */
  m?: number;

  /** The number of neighbors to consider in construction's greedy search. */
  efConstruction?: number;

  /** The number of neighbors to keep for each node at the first level. */
  mMax0?: number;

  /** Normalizer parameter controlling number of overlaps across layers. */
  ml?: number;

  /** Optional random seed. */
  seed?: number;
}

/**
 * A node in the HNSW graph.
 */
class Node<T> {
  /** The unique key of an element. */
  key: T;

  /** The embedding value of the element. */
  value: number[];

  constructor(key: T, value: number[]) {
    this.key = key;
    this.value = value;
  }
}

/**
 * One graph layer in the HNSW index
 */
class GraphLayer<T> {
  /** Maps a key to its neighbors, and each neighbor key to its distance. */
  graph: Map<T, Map<T, number>>;

  /**
   * Initialize a new graph layer.
   * @param key The first key to insert into the graph layer. It starts with
   *  an empty neighbor map.
   */
  constructor(key: T) {
    this.graph = new Map<T, Map<T, number>>();
    this.graph.set(key, new Map<T, number>());
  }
}

/**
 * HNSW (Hierarchical Navigable Small World) class.
 *
 * @typeParam T Type of the element keys. Defaults to `string` when it is not
 *  given explicitly.
 */
export class HNSW<T = string> {
  distanceFunction: (a: number[], b: number[]) => number;

  /** The max number of neighbors for each node. */
  m: number;

  /** The number of neighbors to consider in construction's greedy search. */
  efConstruction: number;

  /** The number of neighbors to keep for each node at the first level. */
  mMax0: number;

  /** Normalizer parameter controlling number of overlaps across layers. */
  ml: number;

  /** Seeded random number generator */
  rng: () => number;

  /** A collection of all the nodes, indexed by key */
  nodes: Map<T, Node<T>>;

  /** A list of all layers */
  graphLayers: GraphLayer<T>[];

  /** Current entry point of the graph */
  entryPoint: T | null = null;

  /**
   * Constructs a new instance of the class.
   * @param config - The configuration object. May be omitted entirely.
   * @param config.distanceFunction - Distance function. Default: 'cosine'
   * @param config.m - The max number of neighbors for each node. A reasonable
   * range of m is from 5 to 48. Smaller m generally produces better results for
   * lower recalls and/or lower dimensional data, while bigger m is better for
   * high recall and/or high dimensional data. Default: 16
   * @param config.efConstruction - The number of neighbors to consider in
   * construction's greedy search. Default: 100
   * @param config.mMax0 - The maximum number of connections that a node can
   * have in the zero layer. Default 2 * m.
   * @param config.ml - Normalizer parameter. Default 1 / ln(m)
   * @param config.seed - Optional random seed. `0` is a valid seed.
   */
  constructor({
    distanceFunction,
    m,
    efConstruction,
    mMax0,
    ml,
    seed
  }: HNSWConfig = {}) {
    // Initialize HNSW parameters. The three neighbor counts fall back on any
    // falsy input, so an accidental 0 is replaced by the default as before.
    this.m = m || 16;
    this.efConstruction = efConstruction || 100;
    this.mMax0 = mMax0 || this.m * 2;

    // `??` rather than `||`: an explicit ml of 0 must not be replaced by the
    // default.
    this.ml = ml ?? 1 / Math.log(this.m);

    // `??` rather than a truthiness test: a seed of 0 must still produce a
    // seeded generator. Without a seed, a random one is drawn.
    this.rng = randomLcg(seed ?? randomUniform()());

    // Set the distance function: custom callbacks are used as-is, names (or
    // nothing) are looked up among the built-ins.
    if (typeof distanceFunction === 'function') {
      this.distanceFunction = distanceFunction;
    } else {
      this.distanceFunction = DISTANCE_FUNCTIONS[distanceFunction ?? 'cosine'];
    }

    // Data structures
    this.graphLayers = [];
    this.nodes = new Map<T, Node<T>>();
  }

  /**
   * Insert a new element to the index.
   *
   * Inserting a key that is already present is currently a no-op. Linking the
   * new node to neighbors in existing layers is not implemented yet; only the
   * node registry, new top layers and the entry point are updated.
   *
   * @param key Key of the new element.
   * @param value The embedding of the new element to insert.
   */
  insert(key: T, value: number[]): void {
    // If the key already exists, update the node
    if (this.nodes.has(key)) {
      // TODO: Update the node
      return;
    }

    // Register the node so that the duplicate check above can detect it on a
    // later call.
    this.nodes.set(key, new Node<T>(key, value));

    // Randomly determine the max level of this node
    const level = this._getRandomLevel();

    if (this.entryPoint !== null) {
      // TODO (1): Search closest point
      // Top layer => all layers above the new node's highest layer
    }

    // If the level is beyond current layers, extend the layers
    for (let l = this.graphLayers.length; l < level + 1; l++) {
      this.graphLayers.push(new GraphLayer<T>(key));

      // Set entry point as the last added node
      this.entryPoint = key;
    }
  }

  /**
   * Generate a random level for a node using an exponentially decaying
   * probability distribution: floor(-ln(u) * ml) for a uniform draw u.
   * @returns A non-negative level for the default `ml`
   */
  _getRandomLevel(): number {
    // A draw of exactly 0 would give -ln(0) = Infinity and make insert() grow
    // layers forever, so such draws are discarded.
    let u = this.rng();
    while (u === 0) {
      u = this.rng();
    }
    return Math.floor(-Math.log(u) * this.ml);
  }
}

/**
 * Compute n choose k
 * @param n n to choose from
 * @param k to choose k
 * @returns The binomial coefficient, or 0 when k is negative or larger than n
 */
export const comb = (n: number, k: number): number => {
  // There is no way to choose a negative number of items, or more items than
  // are available. Without this guard the array allocation below throws.
  if (k < 0 || k > n) {
    return 0;
  }

  // C(n, k) === C(n, n - k); iterate over the smaller of the two
  const minK = Math.min(k, n - k);
  return Array.from(new Array<number>(minK), (_, i) => i + 1).reduce(
    (a, b) => (a * (n + 1 - b)) / b,
    1
  );
};

/**
 * Return all n-length combinations from an array
 * @param array Input array
 * @param n Length of combinations
 * @returns An array of n-length combinations, in the order of the input
 */
export const getCombinations = <T>(array: T[], n: number): T[][] => {
  const result: T[][] = [];

  // Extend `current` with every element at or after index `first`
  const backtrack = (first: number, current: T[]): void => {
    if (current.length === n) {
      result.push(current);
      return;
    }
    for (let i = first; i < array.length; i++) {
      backtrack(i + 1, [...current, array[i]]);
    }
  };

  backtrack(0, []);
  return result;
};
/** Shape of the embedding fixture loaded from the notebooks directory. */
interface EmbeddingData {
  embeddings: number[][];
  reportNumbers: number[];
}
const embeddingData = embeddingDataJSON as EmbeddingData;

test('add()', () => {
  expect(add(10, 1)).toBe(11);
});

test('constructor', () => {
  const hnsw = new HNSW<string>({
    distanceFunction: 'cosine',
    efConstruction: 100,
    m: 16
  });

  // Explicit options are stored, the omitted ones are derived from m
  expect(hnsw.m).toBe(16);
  expect(hnsw.efConstruction).toBe(100);
  expect(hnsw.mMax0).toBe(32);
  expect(hnsw.ml).toBeCloseTo(1 / Math.log(16), 10);
});

test('insert()', () => {
  const hnsw = new HNSW<string>({
    distanceFunction: 'cosine'
  });

  const embedding = embeddingData.embeddings[0];
  hnsw.insert('name', embedding);

  // The first inserted element creates at least one layer and becomes the
  // entry point of the graph
  expect(hnsw.graphLayers.length).toBeGreaterThan(0);
  expect(hnsw.entryPoint).toBe('name');
});

test('_getRandomLevel()', () => {
  const hnsw = new HNSW<string>({
    distanceFunction: 'cosine',
    seed: 20240101
  });

  const levels: number[] = [];
  for (let i = 0; i < 10000; i++) {
    levels.push(hnsw._getRandomLevel());
  }

  // Count the different levels
  const levelCounter = new Map<number, number>();
  for (const level of levels) {
    levelCounter.set(level, (levelCounter.get(level) ?? 0) + 1);
  }

  // A level that was never drawn counts as 0. Each comparison goes through a
  // matcher; a bare `expect(a > b)` would not assert anything.
  const countAt = (level: number) => levelCounter.get(level) ?? 0;

  expect(countAt(0)).toBeGreaterThan(9000);
  expect(countAt(1)).toBeGreaterThan(400);
  expect(countAt(1)).toBeLessThan(700);
  expect(countAt(2)).toBeLessThan(50);
  expect(countAt(3)).toBeLessThan(10);
});

test('distance function (cosine)', () => {
  const hnsw = new HNSW<string>({
    distanceFunction: 'cosine'
  });

  const a = [0.44819598, 0.26875241, 0.02164449, 0.33802939, 0.2482019];
  const b = [0.99448402, 0.29269615, 0.98586198, 0.57482737, 0.12994758];

  expect(hnsw.distanceFunction(a, b)).closeTo(0.2554613725418178, 1e-6);
});

test('distance function (cosine-normalized)', () => {
  const hnsw = new HNSW<string>({
    distanceFunction: 'cosine-normalized'
  });

  const a = [0.3448653, 0.4612705, 0.79191367, 0.057099, 0.19470466];
  const b = [0.39233533, 0.37618326, 0.12894695, 0.50411272, 0.65863662];

  expect(hnsw.distanceFunction(a, b)).closeTo(0.43203611706139833, 1e-6);
});

test('distance function (custom)', () => {
  // Manhattan (L1) distance as an example of a user-supplied callback
  const l1Distance = (a: number[], b: number[]) => {
    return a.reduce((sum, value, index) => sum + Math.abs(value - b[index]), 0);
  };

  const hnsw = new HNSW<string>({
    distanceFunction: l1Distance
  });

  const a = [0.44819598, 0.26875241, 0.02164449, 0.33802939, 0.2482019];
  const b = [0.99448402, 0.29269615, 0.98586198, 0.57482737, 0.12994758];

  expect(hnsw.distanceFunction(a, b)).closeTo(1.8895015711439895, 1e-6);
});
/// <reference types="vitest" />
// The directive above pulls in Vitest's typings so that the `test` key below
// is accepted by Vite's `defineConfig`.

import path from 'path';
import { defineConfig } from 'vite';
import typescript from '@rollup/plugin-typescript';

/** Resolve a path relative to the directory that contains this config file. */
const resolvePath = (str: string) => path.resolve(__dirname, str);

export default defineConfig({
  // Vitest options: browser mode is configured for Chrome but switched off
  test: {
    browser: {
      enabled: false,
      name: 'chrome'
    }
  },
  build: {
    // Library mode: one entry point, output files named index.<format>.js
    lib: {
      entry: resolvePath('src/index.ts'),
      name: 'mememo',
      fileName: format => `index.${format}.js`
    },
    sourcemap: true,
    rollupOptions: {
      plugins: [
        // Run the TypeScript plugin with declaration output enabled
        typescript({
          target: 'esnext',
          rootDir: resolvePath('./src'),
          declaration: true
        })
      ]
    }
  }
});