Store FAISS vector store in some database like SQlite3

45 views Asked by At

This is the code I am using to load my FAISS vector store from my folder "faiss index" (which contains two files) and print it as a pandas DataFrame. This is my directory:

faiss index index.faiss index.pkl

Now this is the output I get after printing out my FAISS vector_store.docstore._dict. What I want is to store it in a database like SQLite3.

                            chunk_id                                         document  page                                            content

0 6ead4019-13c7-41e6-8c5e-bd7458087e71 Little Book of Plagiarism.pdf 1 This booklet is based upon “The Little Book of... 1 68f52aa8-eea3-4c52-825f-cc4fa81735ac Little Book of Plagiarism.pdf 2 CONTENTS \n \nWhat is Plagiarism? .............. 2 7ecb569f-ecdc-408a-9014-fbf129e9a966 Little Book of Plagiarism.pdf 2 UK Academic Traditions ......................... 3 00b27c8e-4d77-47e3-98a1-d2c261d2cb44 Little Book of Plagiarism.pdf 2 Collusion ................................ ..... 4 f186b715-bb47-4dd5-9fca-469742b72d95 Little Book of Plagiarism.pdf 2 Making Notes ................................... .. ... ... ... ... 81 1df7ba99-0386-4354-9362-1b5aad11ed6f Policy on drugs and tobacco control at HEIs.pdf 10 10 | P a g e \n of compliance with this provi... 82 ea0b9d6f-d1ce-46e6-ae74-66123fc602b6 Policy on drugs and tobacco control at HEIs.pdf 11 11 | P a g e \n \nAnnex - A \nUNDERTAKING ... 83 de5f6e45-3fc2-41af-947e-a7ff73cd00d8 Policy on drugs and tobacco control at HEIs.pdf 11 any time and to take any measure to ensure ... 84 5a9168e9-348a-4624-a489-8e18fde3437f Policy on drugs and tobacco control at HEIs.pdf 12 12 | P a g e \n ANNEX -B \n \nUNDERTAKING FOR... 85 5ae17d45-f523-4c02-97cd-57d4a6d96247 Policy on drugs and tobacco control at HEIs.pdf 12 policie s. Further, I have read and am aware o...

[86 rows x 4 columns]

Python app.py code


import os
import sqlite3

import pandas as pd
from IPython.display import display
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- One-time index construction (uncomment to rebuild the FAISS index) ---
# loader = PyPDFLoader("Little Book of Plagiarism.pdf")
# pages = loader.load_and_split()
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=800,
#     chunk_overlap=100,
#     length_function=len,
#     is_separator_regex=False,
# )
# chunked_docs = text_splitter.split_documents(pages)
# vector_store = FAISS.from_documents(chunked_docs, embedder)

# SECURITY: never hard-code API tokens in source — the token previously
# committed here must be revoked. Supply it via the environment instead:
#   export HUGGINGFACEHUB_API_TOKEN=hf_...
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    raise RuntimeError(
        "Set the HUGGINGFACEHUB_API_TOKEN environment variable before running."
    )

# Embedding model + persisted FAISS index. The "faiss index" directory is
# expected to contain index.faiss and index.pkl, written by save_local().
embedder = HuggingFaceEmbeddings()
vector_store = FAISS.load_local(
    "faiss index", embedder, allow_dangerous_deserialization=True
)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

print("length", len(vector_store.docstore._dict))

# Raw chunk texts held by the docstore (handy if embeddings must be recomputed).
page_contents = [doc.page_content for doc in vector_store.docstore._dict.values()]

# Retrieval-augmented conversational QA chain over the FAISS index.
repo_id = "google/flan-t5-large"
qa = ConversationalRetrievalChain.from_llm(
    HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0, "max_length": 512}),
    vector_store.as_retriever(),
    memory=memory,
    chain_type="stuff",
)
chat_history = []

def show_vstore(store):
  """Render the vector store's docstore as a DataFrame in notebook output."""
  display(store_to_df(store))
#convert vector store into df to convenient access
#convert vector store into df to convenient access
def store_to_df(store):
  """Flatten a FAISS vector store's docstore into a pandas DataFrame.

  Parameters
  ----------
  store : FAISS
      A LangChain FAISS vector store. Only ``store.docstore._dict`` — a
      mapping of chunk id -> Document — is accessed.

  Returns
  -------
  pd.DataFrame
      One row per chunk with columns: chunk_id, document (file name only),
      page (1-based), content. The columns are present even when the
      docstore is empty.
  """
  data_rows = []
  for chunk_id, doc in store.docstore._dict.items():
    data_rows.append({
        "chunk_id": chunk_id,
        # os.path.basename handles platform separators; the previous
        # split('/')[-1] returned the full path for Windows-style paths.
        "document": os.path.basename(doc.metadata["source"]),
        # PyPDFLoader pages are 0-based; show them 1-based for humans.
        "page": doc.metadata["page"] + 1,
        "content": doc.page_content,
    })
  # Explicit columns keep the schema stable for an empty store.
  return pd.DataFrame(data_rows, columns=["chunk_id", "document", "page", "content"])

# Flatten the docstore and persist it to SQLite. DataFrame.to_sql creates (or
# replaces) a "chunks" table — the simplest way to mirror the FAISS docstore
# in a relational database, which is what this script set out to do.
df = store_to_df(vector_store)
print(df)
conn = sqlite3.connect("vector_store.db")
try:
    df.to_sql("chunks", conn, if_exists="replace", index=False)
finally:
    conn.close()



I have looked through both the LangChain and FAISS documentation but could not find a way to store my FAISS vector store in a database like SQLite. I want to store my FAISS vectors in a database like SQLite3.

0

There are 0 answers