I am trying to embed a folder of doc/pdf files, then querying the vector database. I am intentionally avoiding using langchain hub to process the query/prompt, to minimise data leak.
I encountered the following error at "results = loaded_vectorstore.query ...". I must have misunderstood how chroma databases work. Could anyone help point me in the right direction? Thank you so much!
AttributeError: 'Chroma' object has no attribute 'query'
My full code:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import glob
import os
os.environ["OPENAI_API_KEY"] = "XXX"
#LOAD DOCUMENTS
folder_path = 'XXX'
file_extensions = ['*.docx', '*.pdf']
documents = []
texts=[]
total_word_count = 0
for file_extension in file_extensions:
file_pattern = os.path.join(folder_path, file_extension)
# print(file_pattern)
for file_path in glob.glob(file_pattern):
if file_path.endswith('.docx'):
print('Loading: ',file_path)
loader = Docx2txtLoader(file_path)
document = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=10)
text = text_splitter.split_documents(document)
texts.extend(text)
elif file_path.endswith('.pdf'):
print('Loading: ',file_path)
loader = PyPDFLoader(file_path)
document = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=10)
text = text_splitter.split_documents(document)
texts.extend(text)
vectorstore = Chroma.from_documents(texts,OpenAIEmbeddings(),persist_directory="db3/")
vectorstore.persist()
#LOAD MODEL
PERSISTENT_PATH="db3/"
# Load the vectorstore back
loaded_vectorstore = Chroma(persist_directory=PERSISTENT_PATH,embedding_function=OpenAIEmbeddings())
# Vectorize the query
query = "what is an apple"
embedding_function=OpenAIEmbeddings()
from openai import OpenAI
client = OpenAI()
query_vector = client.embeddings.create(
input=query,
model="text-embedding-3-small"
)
results = loaded_vectorstore.query(
query_vector,
n_results=2
)
print(results)