Here's my code to do this:
import os, time
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Define the parameters of the embedding model
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'mps'}
encode_kwargs = {'normalize_embeddings': False}
hf_embeddings = HuggingFaceEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
for i in range( len(os.listdir('SOURCE_DOCUMENTS')) + 1):
# Load the document
loader = DirectoryLoader('SOURCE_DOCUMENTS_MD/', glob=f"./{i+1}.md")
docs = loader.load()
# Save to disk
db = Chroma.from_documents(docs, hf_embeddings, persist_directory=f"./DB/{i+1}")
time.sleep(1)
But I find that it creates garbled up vector DB pretty quickly, like after the fifth document.