-
Notifications
You must be signed in to change notification settings - Fork 1
/
embedding-csv.py
72 lines (60 loc) · 2.54 KB
/
embedding-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import getpass
import glob
from dotenv import load_dotenv
from tqdm import tqdm
import pickle
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
# Load the .env file, then read the three required settings from the
# environment. A missing entry raises KeyError right here, before any model
# or data is touched.
load_dotenv(verbose=True)
document_dir, vectorstore_dir, embeddings_model = (
    os.environ[setting_name]
    for setting_name in ('DOCUMENT_DIR', 'VECTOR_DB_DIR', 'MODEL_EMBEDDINGS')
)
### WORKAROUND for "trust_remote_code=True is required" error in HuggingFaceEmbeddings()
# Load the embeddings model once through transformers with
# trust_remote_code=True before HuggingFaceEmbeddings() is constructed
# further down in this script.
# NOTE(review): presumably this works because the first load accepts/caches the
# model repository's custom code so the later load no longer errors -- confirm.
# SECURITY: trust_remote_code=True allows Python code shipped with the model
# repository to run on this machine; only point MODEL_EMBEDDINGS at a model
# you trust.
# NOTE(review): `model` is not referenced again in the visible code, so the
# loaded weights stay in memory for the rest of the run.
from transformers import AutoModel
model = AutoModel.from_pretrained(embeddings_model, trust_remote_code=True)
##
## Read the source CSV files and build a list of documents, one per CSV row
##
def _read_csv_rows_as_documents(glob_pattern: str, max_doc_count: int = -1) -> list:
    """Turn every row of every CSV file matched by *glob_pattern* into a Document.

    Each row is flattened to a single ``"column: value, column: value"``
    string that becomes the document's ``page_content``.

    Args:
        glob_pattern: Recursive glob pattern selecting the CSV files.
        max_doc_count: Maximum number of documents to produce; a negative
            value (the default) means "no limit".

    Returns:
        The list of generated Document objects (empty if nothing matched).
    """
    documents = []
    for data_file in tqdm(glob.glob(glob_pattern, recursive=True)):
        df = pd.read_csv(data_file)
        for _, row in df.iterrows():
            # Stop as soon as the optional cap is reached.
            if max_doc_count >= 0 and len(documents) >= max_doc_count:
                return documents
            row_text = ', '.join(
                f"{key}: {value}" for key, value in row.to_dict().items()
            )
            documents.append(Document(page_content=row_text))
    return documents


def _add_documents_to_vectorstore(documents) -> None:
    """Embed *documents* and add them to the Chroma store in ``vectorstore_dir``.

    Uses the module-level ``embeddings_model`` and ``vectorstore_dir``
    settings. Embeddings are computed on the CPU and normalized.
    """
    hf_bge_embeddings = HuggingFaceEmbeddings(
        model_name=embeddings_model,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    vectorstore = Chroma(
        persist_directory=vectorstore_dir,
        embedding_function=hf_bge_embeddings,
    )
    print('*** Begin to add documents into vectorDB')
    # One document per call so the progress bar advances per document.
    for doc in tqdm(documents):
        vectorstore.add_documents([doc])
    print('*** Complete to add documents into vectorDB')


def generate_vectorstore_docs_from_data_source(
        glob_pattern: str,
        max_doc_count: int = -1):
    """Read CSV files, store one embedded document per row, and return the documents.

    Args:
        glob_pattern: Recursive glob pattern selecting the CSV files to read.
        max_doc_count: Maximum number of documents to generate; a negative
            value (the default) means "no limit".

    Returns:
        The list of Document objects that were added to the vector store.
        Returning it lets the caller cache the documents; previously the
        function returned nothing, so the caller cached ``None``.
    """
    print('*** Begin to split documents')
    documents = _read_csv_rows_as_documents(glob_pattern, max_doc_count)
    print('*** Complete to split documents')

    _add_documents_to_vectorstore(documents)
    return documents


# Generate documents from data source. Read the documents from pickle file if exists.
pickle_file = './doc_obj.pickle'
if not os.path.exists(pickle_file):
    print('*** Reading Original Data and generating document(s) and Converting documents into embeddings and creating a vector store(s)')
    docs = generate_vectorstore_docs_from_data_source(f'{document_dir}/**/*.csv')
    with open(pickle_file, 'wb') as f:
        pickle.dump(docs, file=f)
else:
    print(f'*** Reading documents and Converting documents into embeddings and creating a vector store(s) from a pickled file ({pickle_file})')
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only run this against a cache file this script wrote itself.
    with open(pickle_file, 'rb') as f:
        docs = pickle.load(file=f)
    if docs:
        # Actually build the vector store from the cached documents, as the
        # message above announces.
        # NOTE(review): add_documents is called without explicit ids, so
        # running this against an already populated VECTOR_DB_DIR presumably
        # appends the same documents again -- confirm, and clear the
        # directory first when rebuilding.
        _add_documents_to_vectorstore(docs)
    else:
        # Cache files written before this function returned its documents
        # contain None; there is nothing to embed in that case.
        print(f'*** No documents found in {pickle_file}; delete it to rebuild from the source data')