-
Notifications
You must be signed in to change notification settings - Fork 40
/
2-updatePinecone.js
65 lines (65 loc) · 2.38 KB
/
2-updatePinecone.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// 1. Import required modules
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
// 2. Export updatePinecone function
/**
 * Builds one vector record per chunk, pairing each chunk with the embedding
 * found at the same position in `embeddingsArrays`.
 *
 * @param {string} txtPath - Source path of the document; used as the id prefix
 *   and copied into every record's metadata.
 * @param {Array<{pageContent: string, metadata: object}>} chunks - Split chunks.
 * @param {Array<number[]>} embeddingsArrays - Embeddings, index-aligned with `chunks`.
 * @returns {Array<{id: string, values: number[], metadata: object}>} Vector records.
 */
function buildVectors(txtPath, chunks, embeddingsArrays) {
  return chunks.map((chunk, idx) => ({
    id: `${txtPath}_${idx}`,
    values: embeddingsArrays[idx],
    metadata: {
      ...chunk.metadata,
      // `loc` is stored as a JSON string instead of a nested object
      // (presumably because the index only accepts flat metadata — confirm).
      loc: JSON.stringify(chunk.metadata.loc),
      pageContent: chunk.pageContent,
      txtPath,
    },
  }));
}

/**
 * Upserts `vectors` into `index` sequentially, at most `batchSize` per request.
 *
 * @param {{upsert: Function}} index - Object returned by `client.Index(...)`.
 * @param {Array<object>} vectors - Vector records to send.
 * @param {number} batchSize - Maximum number of vectors per upsert call.
 * @returns {Promise<void>}
 */
async function upsertInBatches(index, vectors, batchSize) {
  for (let start = 0; start < vectors.length; start += batchSize) {
    await index.upsert({
      upsertRequest: {
        vectors: vectors.slice(start, start + batchSize),
      },
    });
  }
}

/**
 * Splits each document into chunks, embeds the chunks with OpenAI, and upserts
 * the resulting vectors into the given Pinecone index.
 *
 * Vector ids have the form `<doc.metadata.source>_<chunkIndex>`.
 *
 * @param {{Index: Function}} client - Pinecone client; `client.Index(indexName)`
 *   must return an object exposing `upsert`.
 * @param {string} indexName - Name of the index to write to.
 * @param {Iterable<{pageContent: string, metadata: {source: string}}>} docs -
 *   Documents to process.
 * @param {{chunkSize?: number, batchSize?: number}} [options] - Optional tuning:
 *   `chunkSize` is passed to the text splitter (default 1000); `batchSize` is
 *   the maximum number of vectors per upsert request (default 100).
 * @returns {Promise<void>}
 * @throws {RangeError} If `batchSize` is not a positive integer.
 */
export const updatePinecone = async (
  client,
  indexName,
  docs,
  { chunkSize = 1000, batchSize = 100 } = {}
) => {
  // A non-positive batch size would make the batching loop never advance.
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(
      `batchSize must be a positive integer, got ${batchSize}`
    );
  }

  console.log("Retrieving Pinecone index...");
  const index = client.Index(indexName);
  console.log(`Pinecone index retrieved: ${indexName}`);

  // The splitter does not depend on the document, so one instance is reused.
  const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize });
  // The embeddings client is also shared, but it is created only when the
  // first document is reached so that a run with no documents never builds it.
  let embeddings = null;

  for (const doc of docs) {
    const txtPath = doc.metadata.source;
    console.log(`Processing document: ${txtPath}`);

    console.log("Splitting text into chunks...");
    const chunks = await textSplitter.createDocuments([doc.pageContent]);
    console.log(`Text split into ${chunks.length} chunks`);

    console.log(
      `Calling OpenAI's Embedding endpoint documents with ${chunks.length} text chunks ...`
    );
    if (embeddings === null) {
      embeddings = new OpenAIEmbeddings();
    }
    // Newlines are flattened to spaces in the text sent for embedding only;
    // the stored `pageContent` keeps the original text.
    const embeddingsArrays = await embeddings.embedDocuments(
      chunks.map((chunk) => chunk.pageContent.replace(/\n/g, " "))
    );
    console.log("Finished embedding documents");

    console.log(
      `Creating ${chunks.length} vectors array with id, values, and metadata...`
    );
    const vectors = buildVectors(txtPath, chunks, embeddingsArrays);
    await upsertInBatches(index, vectors, batchSize);

    console.log(`Pinecone index updated with ${chunks.length} vectors`);
  }
};