-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'bootstrap' into matrix_operations
- Loading branch information
Showing
28 changed files
with
498 additions
and
7,257 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# Target Use case | ||
|
||
## Use case 1: Global ranking for a keyword | ||
|
||
### Input | ||
a keyword (e.g., "sports") and a weight array | ||
|
||
```JSON | ||
[ | ||
{ | ||
"contentId": "content-1...", | ||
"weight": 10 | ||
}, | ||
|
||
{ | ||
"contentId": "content-2...", | ||
"weight": 0 | ||
}, | ||
|
||
{ | ||
"contentId": "content-3...", | ||
"weight": 50 | ||
} | ||
|
||
] | ||
``` | ||
|
||
### Output | ||
a ranking and scores | ||
``` | ||
["content-3", "content-1"] | ||
[score(3), score(1)] | ||
``` | ||
|
||
## Content Objects - we have several objects with these properties | ||
```JSON | ||
[ | ||
{ | ||
"contentId": "content-1...", | ||
"attributes": { | ||
"sports": 1000, | ||
"soccer": 50, | ||
"cleats": 500, | ||
"baseball": 0 | ||
} | ||
}, | ||
{ | ||
"contentId": "content-2...", | ||
"attributes": { | ||
"sports": 10, | ||
"soccer": 0, | ||
"cleats": 0, | ||
"baseball": 10, | ||
"baseball cap": 200 | ||
} | ||
} | ||
] | ||
``` | ||
|
||
|
||
## User 1 | ||
User 1 is more about soccer | ||
|
||
## User 2 | ||
User 2 is more about baseball | ||
|
||
## Ranking for User 1 | ||
[content-1, content-2] | ||
|
||
## Ranking for User 2 | ||
This is interesting because the content-based filtering is overridden by the staking. | ||
[content-1, content-2] | ||
|
||
|
||
### Ranking issues | ||
|
||
1. If we average the embeddings for each content first and then find the score, we cannot tell what the score means and cannot ensure weighting is done on the target words. | ||
2. If we score each keyword first against the user embedding, then we can apply weights on each keyword score. That will give us better ranking on keywords. | ||
|
||
|
||
# requirements | ||
|
||
1. the eigen.js integration (muktadir) | ||
2. co-occurrence based ranking (ayush and muratcan) | ||
3. user preference embedding (ayush and muktadir) | ||
4. The whole co-occurrence matrix is calculated in the browser based on each user's history. So, different users will actually have different co-occurrence matrices. | ||
|
||
So, one user may have this: | ||
|
||
["epl", "la liga", "sports", "soccer", "ice cream", "soccer-ball"] | ||
|
||
But another may have this: | ||
|
||
["ice cream", "cloths", "perfume"] | ||
|
||
As a content owner, I am looking for users interested in buying a soccer ball. | ||
|
||
Some users love to watch soccer, but they don't have "soccer ball" in their representation vector. However, with a co-occurrence representation, it might be possible to relate "soccer ball" to "soccer" and so still match those users: | ||
|
||
|
||
["epl", "la liga", "sports", "soccer", "ice cream", "soccer-ball"] | ||
|
||
user: [1, 0, 1, 1, 1, 0] | ||
keyword-OH: [0, 0, 0, 0, 0, 1] | ||
keyword-co: [0.1, 1, 1, 1, 0, 1] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import { ContainerModule, interfaces } from 'inversify'; | ||
import { IndexedDB } from 'crunchDB/implementations/business/IndexedDB'; | ||
import { | ||
IIndexedDB, | ||
IIndexedDBType, | ||
} from 'crunchDB/interfaces/business/IIndexedDB'; | ||
/**
 * Inversify container module for crunchDB.
 *
 * Registers `IndexedDB` as the implementation behind the `IIndexedDBType`
 * identifier, in singleton scope.
 */
export const crunchDBModule = new ContainerModule((bind: interfaces.Bind) => {
  const indexedDbBinding = bind<IIndexedDB>(IIndexedDBType).to(IndexedDB);
  indexedDbBinding.inSingletonScope();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import { inject, injectable } from 'inversify';
import { ICrunchDB } from 'crunchDB/interfaces/ICrunchDB';
import {
  IIndexedDB,
  IIndexedDBType,
} from 'crunchDB/interfaces/business/IIndexedDB';
|
||
/**
 * Entry point of crunchDB that delegates storage work to the injected
 * `IIndexedDB` service.
 */
@injectable()
export class CrunchDB implements ICrunchDB {
  private readonly dbService: IIndexedDB;

  /**
   * @param dbService storage service resolved through the `IIndexedDBType` identifier
   */
  constructor(@inject(IIndexedDBType) dbService: IIndexedDB) {
    this.dbService = dbService;
  }

  /**
   * Initializes the underlying database and logs the outcome.
   *
   * The returned promise settles only after `init()` has finished, so callers
   * can safely `await` it before using the database. A failed initialization
   * is logged to the console and is not rethrown.
   */
  async initializeDatabase(): Promise<void> {
    // Awaiting the ResultAsync is what keeps this method from resolving early.
    await this.dbService
      .init()
      .map(() => {
        console.log('Database initialized successfully.');
      })
      .mapErr(e => {
        console.error(`Failed to initialize database: ${e.message}`);
      });
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
import { injectable } from 'inversify'; | ||
import { ResultAsync } from 'neverthrow'; | ||
import { IDBConfig } from 'crunchDB/objects/business/IDBConfig'; | ||
import { VolatileObject } from 'crunchDB/objects/business/SimpleObject'; | ||
import { IIndexedDB } from 'crunchDB/interfaces/business/IIndexedDB'; | ||
|
||
/**
 * `ResultAsync`-based wrapper around a browser IndexedDB database.
 *
 * `init()` must succeed before any other method is used; until then every
 * operation fails with "DB is not initialized".
 */
@injectable()
export class IndexedDB implements IIndexedDB {
  private db: IDBDatabase | null = null;

  constructor(private readonly config: IDBConfig) {}

  /**
   * Opens the database named in the config at the configured version.
   *
   * During an upgrade, every configured store that does not exist yet is
   * created together with its indices; existing stores are left untouched.
   *
   * @returns ok once the connection is open, otherwise an Error
   */
  init(): ResultAsync<void, Error> {
    return ResultAsync.fromPromise(
      new Promise<void>((resolve, reject) => {
        const request = indexedDB.open(this.config.dbName, this.config.version);

        request.onupgradeneeded = () => {
          const db = request.result;
          for (const storeConfig of this.config.stores) {
            if (db.objectStoreNames.contains(storeConfig.name)) {
              continue;
            }
            const store = db.createObjectStore(storeConfig.name, {
              keyPath: storeConfig.keyPath,
              autoIncrement: storeConfig.autoIncrement ?? false,
            });
            storeConfig.indices?.forEach(index => {
              store.createIndex(index.name, index.keyPath, index.options);
            });
          }
        };

        request.onsuccess = () => {
          this.db = request.result;
          resolve();
        };
        request.onerror = () => reject(new Error('Failed to open database'));
      }),
      e => new Error(`Database initialization failed: ${e}`)
    );
  }

  /**
   * Adds an object to the given store in a read-write transaction.
   *
   * @param storeName name of the object store
   * @param VolatileObject the object to add
   * @returns ok once the transaction has completed
   */
  addObject(
    storeName: string,
    VolatileObject: VolatileObject
  ): ResultAsync<void, Error> {
    return this.transaction(storeName, 'readwrite', store =>
      store.add(VolatileObject)
    ).map(() => undefined);
  }

  /**
   * Looks up a single object by key in a read-only transaction.
   *
   * NOTE(review): `IDBObjectStore.get` yields `undefined` when no record
   * matches, which the declared return type does not reflect — confirm that
   * callers handle a missing record.
   *
   * @param storeName name of the object store
   * @param id key of the record to fetch
   */
  getObject(
    storeName: string,
    id: number
  ): ResultAsync<VolatileObject, Error> {
    return this.transaction(storeName, 'readonly', store => store.get(id));
  }

  /**
   * Reads every record of the given store by walking a cursor.
   *
   * @param storeName name of the object store
   */
  getAllObjects(
    storeName: string
  ): ResultAsync<VolatileObject[], Error> {
    return this.collectWithCursor(
      storeName,
      'Request failed',
      'Error fetching all VolatileObjects'
    );
  }

  /**
   * Reads every record of the given store by walking a cursor. Same behavior
   * as `getAllObjects`; only the error messages differ.
   *
   * @param storeName name of the object store
   */
  getAllItems(storeName: string): ResultAsync<VolatileObject[], Error> {
    return this.collectWithCursor(
      storeName,
      'Cursor operation failed',
      'Error fetching all items with cursor'
    );
  }

  /**
   * Shared cursor walk behind `getAllObjects` and `getAllItems`.
   *
   * @param storeName name of the object store
   * @param requestFailureMessage message used when the cursor request errors
   * @param failurePrefix prefix of the Error produced for any rejection
   */
  private collectWithCursor(
    storeName: string,
    requestFailureMessage: string,
    failurePrefix: string
  ): ResultAsync<VolatileObject[], Error> {
    return ResultAsync.fromPromise(
      new Promise<VolatileObject[]>((resolve, reject) => {
        if (!this.db) {
          reject(new Error('DB is not initialized'));
          return;
        }
        const transaction = this.db.transaction(storeName, 'readonly');
        const request = transaction.objectStore(storeName).openCursor();
        const collected: VolatileObject[] = [];

        request.onsuccess = () => {
          const cursor = request.result;
          if (cursor) {
            collected.push(cursor.value);
            cursor.continue();
          } else {
            // A null cursor means every entry has been visited.
            resolve(collected);
          }
        };

        transaction.onerror = () => reject(new Error('Transaction failed'));
        // Without this, an abort that raises no error event would leave the
        // promise pending forever.
        transaction.onabort = () => reject(new Error('Transaction aborted'));
        request.onerror = () => reject(new Error(requestFailureMessage));
      }),
      e => new Error(`${failurePrefix}: ${e}`)
    );
  }

  /**
   * Runs a single request inside its own transaction and resolves with the
   * request's result once the transaction has completed.
   *
   * @param storeName name of the object store
   * @param mode transaction mode
   * @param action builds the request against the opened store
   */
  private transaction<T>(
    storeName: string,
    mode: IDBTransactionMode,
    action: (store: IDBObjectStore) => IDBRequest<T>
  ): ResultAsync<T, Error> {
    return ResultAsync.fromPromise(
      new Promise<T>((resolve, reject) => {
        if (!this.db) {
          reject(new Error('DB is not initialized'));
          return;
        }
        const transaction = this.db.transaction(storeName, mode);
        const request = action(transaction.objectStore(storeName));

        transaction.oncomplete = () => resolve(request.result);
        transaction.onerror = () => reject(new Error('Transaction failed'));
        // Without this, an abort that raises no error event would leave the
        // promise pending forever.
        transaction.onabort = () => reject(new Error('Transaction aborted'));
        request.onerror = () => reject(new Error('Request operation failed'));
      }),
      e => new Error(`Transaction error: ${e}`)
    );
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import { BaseStemmer } from "@nlpjs/core"; | ||
import { StemmerEn, StopwordsEn } from "@nlpjs/lang-en"; | ||
import { IStemmerService } from "crunchDB/interfaces"; | ||
import { ELanguageCode, NLPSupportedLanguages, NLPError, WordRoot } from "crunchDB/objects"; | ||
import { ResultAsync, errAsync, okAsync } from "neverthrow"; | ||
|
||
|
||
/**
 * Turns raw text into `WordRoot` tokens, stemming it when the language is
 * listed in `NLPSupportedLanguages`.
 */
export class StemmerService implements IStemmerService {
  /**
   * Tokenizes `text` synchronously.
   *
   * For languages outside `NLPSupportedLanguages` the text is only split on
   * whitespace; empty tokens (from empty input or repeated separators) are
   * dropped so no empty `WordRoot` is produced.
   *
   * @param language language of the text
   * @param text text to tokenize
   * @returns the word roots found in the text
   * @throws NLPError when the stemmer fails
   */
  public tokenizeSync(language: ELanguageCode, text: string): WordRoot[] {
    if (!NLPSupportedLanguages.includes(language)) {
      const rawTokens = text.split(/\s+/).filter(token => token.length > 0);
      return this.toWordRoots(rawTokens);
    }
    try {
      // Normalizes by default; `false` means "don't keep stopwords".
      const words = this.getStemmer(language).tokenizeAndStem(text, false);
      return this.toWordRoots(words);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new NLPError(message, error);
    }
  }

  /**
   * `ResultAsync` flavor of `tokenizeSync`: failures are returned as an
   * `NLPError` instead of being thrown.
   *
   * @param language language of the text
   * @param text text to tokenize
   */
  public tokenize(
    language: ELanguageCode,
    text: string,
  ): ResultAsync<WordRoot[], NLPError> {
    try {
      return okAsync(this.tokenizeSync(language, text));
    } catch (error) {
      // tokenizeSync wraps stemmer failures in NLPError; anything else is
      // wrapped here so the error channel keeps its declared type.
      if (error instanceof NLPError) {
        return errAsync(error);
      }
      const message = error instanceof Error ? error.message : String(error);
      return errAsync(new NLPError(message, error));
    }
  }

  /** Converts plain string tokens into `WordRoot` values. */
  private toWordRoots(tokens: string[]): WordRoot[] {
    return tokens.map((token) => WordRoot(token));
  }

  /**
   * Picks the stemmer for a language.
   *
   * @param language language to stem
   * @returns the English stemmer for every language (it is the default)
   */
  private getStemmer(language: ELanguageCode): BaseStemmer {
    switch (language) {
      case ELanguageCode.English:
      default:
        return this.getStemmerEn();
    }
  }

  /** Builds a fresh English stemmer with English stopwords attached. */
  private getStemmerEn(): StemmerEn {
    const stemmer = new StemmerEn();
    stemmer.stopwords = new StopwordsEn();
    return stemmer;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
export * from './StemmerService'; | ||
export * from './Vocabulary'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/**
 * Public contract of the crunchDB entry point.
 */
export interface ICrunchDB {
  /**
   * Initializes the underlying database.
   *
   * @returns a promise that settles when initialization has been attempted
   */
  initializeDatabase(): Promise<void>;
}
Oops, something went wrong.