-
Notifications
You must be signed in to change notification settings - Fork 355
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
23 changed files
with
449 additions
and
748 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Persistence defaults, re-exported from './types'.
export {
  DEFAULT_PERSIST_PATH,
  DEFAULT_PERSIST_DIR,
  DEFAULT_PERSIST_FNAME,
} from './types';

// Namespace and collection-suffix defaults, re-exported from './kv-document-store'.
export {
  DEFAULT_METADATA_COLLECTION_SUFFIX,
  DEFAULT_COLLECTION_DATA_SUFFIX,
  DEFAULT_NAMESPACE,
  DEFAULT_REF_DOC_COLLECTION_SUFFIX,
} from './kv-document-store';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import { BaseDocumentStore } from './types'; | ||
import { BaseNode } from '../../schema'; | ||
import type { BaseKVStore } from '../kvstore'; | ||
import { DEFAULT_BATCH_SIZE } from '../../global'; | ||
import { path } from '@llamaindex/env'; | ||
import { type DocJson, jsonToDoc } from './utils'; | ||
|
||
/** Default namespace prefix for the document store. */
export const DEFAULT_NAMESPACE = 'docstore';

/**
 * Suffix of the nodes collection, which holds each node's content together
 * with node-specific metadata such as excluded metadata and relationships.
 */
export const DEFAULT_COLLECTION_DATA_SUFFIX = '/data';

/**
 * Suffix of the collection that maps each document to the list of node IDs
 * belonging to it, including the document's metadata.
 */
export const DEFAULT_REF_DOC_COLLECTION_SUFFIX = '/ref_doc_info';

/**
 * Suffix of the collection that links each node to its document, including
 * the node's document hash and reference document ID.
 */
export const DEFAULT_METADATA_COLLECTION_SUFFIX = '/metadata';
|
||
/**
 * Document store backed by a key-value store.
 *
 * Three collection names are derived in the constructor by joining the
 * namespace with the node, ref-doc and metadata suffixes.
 *
 * NOTE(review): only `docs` and `addDocuments` are provided here, and
 * `addDocuments` and `#prepareKVPair` are still empty stubs. The remaining
 * abstract members of `BaseDocumentStore` must be implemented before this
 * class can be instantiated.
 */
export class KVDocumentStore extends BaseDocumentStore {
  // `DocJson` is used without a type argument everywhere in this class so the
  // field, the constructor parameter and the helper agree on one type.
  private kvStore: BaseKVStore<DocJson>;
  #namespace: string;
  #nodeCollectionSuffix: string;
  #refDocCollectionSuffix: string;
  #metadataCollectionSuffix: string;
  #nodeCollection: string;
  #refDocCollection: string;
  #metadataCollection: string;
  #batchSize: number;

  /**
   * @param kvStore Key-value store that holds the serialized documents.
   * @param namespace Prefix joined with each suffix to form a collection name.
   * @param batchSize Batch size used by `addDocuments` when none is passed.
   * @param nodeCollectionSuffix Suffix of the node data collection.
   * @param refDocCollectionSuffix Suffix of the ref-doc info collection.
   * @param metadataCollectionSuffix Suffix of the metadata collection.
   */
  constructor(
    kvStore: BaseKVStore<DocJson>,
    namespace: string = DEFAULT_NAMESPACE,
    batchSize: number = DEFAULT_BATCH_SIZE,
    nodeCollectionSuffix: string = DEFAULT_COLLECTION_DATA_SUFFIX,
    refDocCollectionSuffix: string = DEFAULT_REF_DOC_COLLECTION_SUFFIX,
    metadataCollectionSuffix: string = DEFAULT_METADATA_COLLECTION_SUFFIX,
  ) {
    super();
    this.kvStore = kvStore;
    this.#namespace = namespace;
    this.#nodeCollectionSuffix = nodeCollectionSuffix;
    this.#refDocCollectionSuffix = refDocCollectionSuffix;
    this.#metadataCollectionSuffix = metadataCollectionSuffix;
    this.#nodeCollection = path.join(
      this.#namespace,
      this.#nodeCollectionSuffix,
    );
    this.#refDocCollection = path.join(
      this.#namespace,
      this.#refDocCollectionSuffix,
    );
    this.#metadataCollection = path.join(
      this.#namespace,
      this.#metadataCollectionSuffix,
    );
    this.#batchSize = batchSize;
  }

  /**
   * Reads every entry of the node collection from the key-value store and
   * converts each one to a node with `jsonToDoc` and this store's serializer.
   *
   * @returns A map from store key to the reconstructed node.
   */
  get docs(): Promise<Map<string, BaseNode>> {
    return this.kvStore.getAll(this.#nodeCollection).then((jsonDict) => {
      const docs = new Map<string, BaseNode>();
      for (const [key, json] of Object.entries(jsonDict)) {
        docs.set(key, jsonToDoc(json, this.serializer));
      }
      return docs;
    });
  }

  // TODO: not implemented yet; the body is intentionally left empty.
  #prepareKVPair(
    key: string,
    val: DocJson,
    collection: string | undefined,
  ) {

  }

  /**
   * Stub: resolves the effective batch size but does not store anything yet.
   *
   * @param docs Nodes to add.
   * @param allowUpdate Currently unused by this stub.
   * @param batchSize Optional override; a missing or falsy value falls back to
   *   the batch size given to the constructor.
   * @param storeText Currently unused by this stub.
   */
  async addDocuments(
    docs: BaseNode[],
    allowUpdate: boolean,
    batchSize?: number,
    storeText?: boolean,
  ) {
    batchSize = batchSize || this.#batchSize;

  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import { path } from '@llamaindex/env'; | ||
import { BaseNode } from '../../schema'; | ||
import { jsonSerializer, type Serializer } from './utils'; | ||
|
||
/** Default file name of a persisted document store. */
export const DEFAULT_PERSIST_FNAME = 'docstore.json';

/** Default directory for persisted data. */
export const DEFAULT_PERSIST_DIR = './storage';

/** Default persist path: the default file name joined onto the default directory. */
export const DEFAULT_PERSIST_PATH = path.join(
  DEFAULT_PERSIST_DIR,
  DEFAULT_PERSIST_FNAME,
);
|
||
/**
 * Information kept for a reference document: the IDs of its nodes plus
 * arbitrary extra information.
 *
 * Exported so that subclasses of the document store declared in other modules
 * can name this type in their method signatures. `ExtraInfo` defaults to
 * `Record<string, unknown>`.
 */
export type RefDocInfo<
  ExtraInfo extends Record<string, unknown> = Record<string, unknown>,
> = {
  nodeIds: string[];
  extraInfo: ExtraInfo;
};
|
||
/**
 * Abstract contract for document stores. Every operation is asynchronous and
 * is supplied by concrete subclasses; only the serializer has a default.
 */
export abstract class BaseDocumentStore {
  /** Serializer for node data; initialised to `jsonSerializer`. */
  serializer: Serializer<
    Record<string, unknown>,
    // we don't care about what's the target type of the serialization, so we use any here
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    any
  > = jsonSerializer;

  /** Persist the store; `persistPath` presumably names the destination (confirm in implementations). */
  abstract persist(persistPath: string): Promise<void>;

  /** Resolves to a map of nodes keyed by string. */
  abstract get docs(): Promise<Map<string, BaseNode>>;

  /** Add the given nodes; the meaning of the flags is defined by implementations. */
  abstract addDocuments(docs: BaseNode[], allowUpdate: boolean, batchSize: number, storeText: boolean): Promise<void>;

  /** Resolves to the node for `docId`, or `undefined`. */
  abstract getDocument(docId: string, raiseError: boolean): Promise<BaseNode | undefined>;

  /** Delete the document identified by `docId`. */
  abstract deleteDocument(docId: string, raiseError: boolean): Promise<void>;

  /** Resolves to a boolean for `docId`; presumably whether it is stored (confirm in implementations). */
  abstract documentExists(docId: string): Promise<boolean>;

  /** Associate `docHash` with `docId`. */
  abstract setDocumentHash(docId: string, docHash: string): Promise<void>;

  /** Resolves to the hash for `docId`, or `undefined`. */
  abstract getDocumentHash(docId: string): Promise<string | undefined>;

  /** Resolves to a string-to-string map of document hashes. */
  abstract getAllDocumentHashes(): Promise<Map<string, string>>;

  /** Resolves to a map of ref-doc info entries (values may be `undefined`). */
  abstract getAllRefDocInfo(): Promise<Map<string, RefDocInfo<Record<string, unknown>> | undefined>>;

  /** Resolves to the ref-doc info for `refDocId`, or `undefined`. */
  abstract getRefDocInfo(refDocId: string): Promise<RefDocInfo<Record<string, unknown>> | undefined>;

  /** Delete the reference document identified by `refDocId`. */
  abstract deleteRefDoc(refDocId: string, raiseError: boolean): Promise<void>;

  /** Resolves to the nodes for `nodeIds`. */
  abstract getNodes(nodeIds: string[], raiseError: boolean): Promise<BaseNode[]>;

  /** Resolves to the node for `nodeId`. */
  abstract getNode(nodeId: string, raiseError: boolean): Promise<BaseNode>;

  /** Takes a number-keyed record of node IDs and resolves to a number-keyed record of nodes. */
  abstract getNodeDict(nodeIdDict: Record<number, string>): Promise<Record<number, BaseNode>>;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import type { BaseNode } from "../../schema"; | ||
import { Document, ObjectType, TextNode } from "../../schema"; | ||
import { ImageDocument, MetadataMode } from '../../schema'; | ||
import type { SerializableValue } from '../kvstore'; | ||
|
||
const TYPE_KEY = "__type__"; | ||
const DATA_KEY = "__data__"; | ||
|
||
/**
 * Two-way converter between an in-memory `Data` value and the `Persistence`
 * form that is written to storage.
 */
export interface Serializer<Data, Persistence> {
  /** Convert an in-memory value to its persisted form. */
  toPersistence(data: Data): Persistence;

  /** Convert a persisted value back to its in-memory form. */
  fromPersistence(data: Persistence): Data;
}
|
||
/**
 * Serializer that persists a record as a JSON string (`JSON.stringify`) and
 * restores it with `JSON.parse`. The parsed value is not validated.
 */
export const jsonSerializer: Serializer<Record<string, unknown>, string> = {
  toPersistence: (record) => JSON.stringify(record),
  fromPersistence: (text) => JSON.parse(text),
};
|
||
/**
 * Pass-through serializer: both directions return the very same record they
 * were given, with no copying or conversion.
 */
export const noneSerializer: Serializer<
  Record<string, unknown>,
  Record<string, unknown>
> = {
  toPersistence: (record) => record,
  fromPersistence: (record) => record,
};
|
||
/**
 * Persisted envelope of a node: its object type plus the serialized payload.
 *
 * Exported because other modules import it, and generic because the payload
 * shape depends on the serializer in use (for example a string for a JSON
 * serializer). `Data` defaults to `Record<string, unknown>`, so existing uses
 * of bare `DocJson` keep their meaning.
 */
export type DocJson<Data = Record<string, unknown>> = {
  [TYPE_KEY]: ObjectType;
  [DATA_KEY]: Data; // from BaseNode, todo: add zod type check here
};
|
||
/**
 * Type guard for the persisted document envelope.
 *
 * @param docJson Value read from storage.
 * @returns `true` when the value is a non-null object that has both the type
 *   key and the data key; the values under those keys are not inspected.
 */
export function isValidDocJson(
  docJson: SerializableValue,
): docJson is DocJson {
  // Anything that is not a real object cannot carry the two keys.
  if (typeof docJson !== "object" || docJson === null) {
    return false;
  }
  return TYPE_KEY in docJson && DATA_KEY in docJson;
}
|
||
/**
 * Wraps a node in the persisted envelope.
 *
 * @param doc Node to persist.
 * @param serializer Serializer whose `toPersistence` produces the payload.
 * @returns An object holding the serialized payload under the data key and
 *   the node's `type` under the type key.
 */
export function docToJson(
  doc: BaseNode,
  serializer: Serializer<BaseNode, Record<string, unknown>>,
): DocJson {
  const payload = serializer.toPersistence(doc);
  const envelope: DocJson = {
    [DATA_KEY]: payload,
    [TYPE_KEY]: doc.type,
  };
  return envelope;
}
|
||
/**
 * Rebuilds a node from its persisted envelope.
 *
 * The payload is first passed through `serializer.fromPersistence`; the
 * envelope's type then selects which node class is constructed and which
 * fields of the payload are handed to it.
 *
 * @param docDict Persisted envelope (type key plus data key).
 * @param serializer Object providing `fromPersistence` for the payload.
 * @returns A `Document`, `TextNode` or `ImageDocument`.
 * @throws Error when the envelope's type is none of the three handled types.
 */
export function jsonToDoc(
  docDict: DocJson,
  serializer: Pick<Serializer<Record<string, unknown>, unknown>, 'fromPersistence'>,
): BaseNode {
  const kind = docDict[TYPE_KEY];

  // fixme: add zod type check here
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const fields: Record<string, any> = serializer.fromPersistence(docDict[DATA_KEY]);

  switch (kind) {
    case ObjectType.DOCUMENT: {
      const { text, id_, embedding, hash, metadata } = fields;
      return new Document({ text, id_, embedding, hash, metadata });
    }
    case ObjectType.TEXT: {
      const { text, id_, hash, metadata, relationships } = fields;
      return new TextNode({ text, id_, hash, metadata, relationships });
    }
    case ObjectType.IMAGE_DOCUMENT: {
      const { image, id_, embedding, hash, metadata } = fields;
      return new ImageDocument({ image, id_, embedding, hash, metadata });
    }
    default:
      throw new Error(`Unknown doc type: ${kind}`);
  }
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// Public surface of the kvstore package: the value type and base stores from
// './types', plus the simple store implementation.
export type { SerializableValue } from './types';
export { BaseKVStore, BaseFileSystemKVStore } from './types';
export { SimpleKVStore } from './simple-kv-store';
Oops, something went wrong.