I am creating a RAG agent where I can query a PDF file. My query goes to a vector database that searches the uploaded document and returns the relevant chunks, which are then passed along with the query to an LLM. I want to create a tool that, when a relevant chunk contains a table, parses that table into a pandas DataFrame and answers the original query using this structured table. I have already handled converting a PDF table into a pandas DataFrame using the camelot library; I just can't seem to wire it into my RAG project.
import os
# used to load the PDFs
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
# used to parse PDF tables
import camelot
# used to create the retrieval tool
from langchain.agents import tool
# used to create the memory
from langchain.memory import ConversationBufferMemory
# used to create the prompt template
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent, create_openai_functions_agent
from langchain.schema import SystemMessage
from langchain.prompts import MessagesPlaceholder
# used to create the agent executor
from langchain.agents import AgentExecutor
from langchain_openai import ChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Optional, Type, List, Union
from langchain.prompts import PromptTemplate
from langchain.chains import LLMMathChain, LLMChain
from langchain.callbacks.manager import (AsyncCallbackManagerForToolRun, CallbackManagerForToolRun)
from langchain.memory import ConversationBufferWindowMemory
from langchain.agents import AgentType, initialize_agent, Tool, AgentOutputParser
from langchain.schema import AgentAction, AgentFinish, OutputParserException
from langchain import hub
from langchain.agents import create_openai_tools_agent
# This is the SerpAPI wrapper that will help us route ambiguous queries to the Google search engine
from langchain_community.utilities import SerpAPIWrapper
from langchain.agents import load_tools
openai_api_key = os.getenv("OPENAI_API_KEY")  # assumes the key is exported in the environment
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model="gpt-3.5-turbo")
# Setting up the document loader. PyPDFLoader returns the document page by page, each page wrapped as a LangChain Document.
def load_document(doc_path=""):
    loader = PyPDFLoader(doc_path)
    pages = loader.load_and_split()  # pages, additionally split to fit the default chunk size
    doc = loader.load()              # one Document per page
    return doc, pages
def establish_retriever(doc_path, chunk_size=1000, overlap=50):
    """
    The recursive splitter tries to reach our specified chunk size by recursively splitting
    the document on ["\n\n", "\n", " ", ""]:
    if the first split on "\n\n" is still too large, it moves to the next separator, "\n",
    and tries to split on that; if a chunk is still larger than our chunk size, it moves
    on to the next separator, and so on.
    We take these chunks, convert them into vector embeddings using OpenAI's embeddings,
    store the embeddings in our vector database, and return a retriever over that database.
    """
    doc, pages = load_document(doc_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    texts = text_splitter.split_documents(doc)
    embeddings = OpenAIEmbeddings()  # create the embeddings
    db = FAISS.from_documents(texts, embeddings)  # store the embeddings in our FAISS index
    retriever = db.as_retriever()
    return retriever
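To make the splitting behaviour concrete, here is a tiny standalone check (the sample text is made up):

from langchain.text_splitter import RecursiveCharacterTextSplitter

sample = "First paragraph.\n\nSecond paragraph, which is quite a bit longer than the first one."
splitter = RecursiveCharacterTextSplitter(chunk_size=40, chunk_overlap=0)
for chunk in splitter.split_text(sample):
    print(repr(chunk))
# The splitter first tries "\n\n"; any chunk still over 40 characters is split again on "\n", then " ".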
doc_path = "camelot/test data/example table.pdf" # This is just an example, it can be anything
retriever = establish_retriever(doc_path)
# This is the retrieval tool that searches the document and fetches the chunks most similar to the user query using FAISS.
@tool
def retriever_tool(query: str):
    """Searches and returns relevant documents with page info for user queries."""
    docs = retriever.get_relevant_documents(query)
    return docs
tools = [retriever_tool]
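Worth noting for the table tool below: PyPDFLoader stores each page's (zero-indexed) page number in the Document metadata, and the splitter preserves it, so the retrieved chunks already tell us which pages to hand to camelot:

docs = retriever.get_relevant_documents("some query about the table")
pages = sorted({d.metadata["page"] + 1 for d in docs})  # camelot counts pages from 1
print(",".join(str(p) for p in pages))  # e.g. "2,4" -> a ready-made pages string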
# This is an arithmetic tool which simply performs any calculation the user might want to perform.
@tool
def math_tool(query: str):
    """Takes in an arithmetic equation as the query and solves it step by step."""
    llm_math_chain = LLMMathChain.from_llm(llm, verbose=True)
    response = llm_math_chain.run(query)
    return response
tools.append(math_tool)
def get_tables_from_PDF(pdf_file, pages):
    """
    Args:
        pdf_file (str): The path to the PDF file to be parsed.
        pages (str): A string indicating which pages to extract tables from, in camelot's format:
            - "1": a single page
            - "2,4,5": a comma-separated list of pages
            - "10-20,23-27": ranges of pages
            - "all": every page

    Returns:
        A camelot TableList of the tables parsed from the specified PDF pages.
    """
    tables = camelot.read_pdf(pdf_file, pages=pages)
    return tables
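Each entry in the returned TableList exposes the parsed table as a pandas DataFrame via .df, plus a .parsing_report with an accuracy score that helps decide whether the extraction can be trusted:

tables = get_tables_from_PDF(doc_path, pages="1")
if tables.n > 0:
    print(tables[0].parsing_report)  # accuracy, whitespace, order, page
    print(tables[0].df.head())       # the table as a pandas DataFrame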
parse_table_tool = Tool(
    name='table_parser',  # OpenAI function names cannot contain spaces
    func=get_tables_from_PDF,
    description='Useful when the relevant document contains a table that is needed to answer the query.'
)
tools.append(parse_table_tool)
As you can see, I have already created a tool for it, but get_tables_from_PDF takes the pages as input, and getting those pages to it is what I can't seem to achieve.
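What I think I need is something along these lines: bind the PDF path with functools.partial so the tool only has to take the page string, and render the parsed DataFrames to markdown so the LLM can actually read them. This is an untested sketch, not working code (df.to_markdown() also needs the tabulate package installed):

from functools import partial

def parse_tables_for_query(pages: str, pdf_file: str) -> str:
    """Parse tables from the given pages and return them as markdown text for the LLM."""
    tables = camelot.read_pdf(pdf_file, pages=pages)
    if tables.n == 0:
        return f"No tables found on pages {pages}."
    # to_markdown() requires the tabulate package
    return "\n\n".join(t.df.to_markdown() for t in tables)

parse_table_tool = Tool(
    name='table_parser',
    # partial binds the PDF path, so the agent only supplies the page string
    func=partial(parse_tables_for_query, pdf_file=doc_path),
    description=("Useful when the retrieved chunks contain a table. "
                 "Input must be the pages to parse in camelot format, e.g. '1', '2,4' or '10-20'.")
)

The hope is that the agent first calls retriever_tool, reads the page numbers off the returned chunks' metadata, and then calls table_parser with those pages.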
# Vital to retain memory, since that is what enables the back-and-forth conversation
memory_key = "chat_history"
memory = ConversationBufferMemory(memory_key=memory_key, return_messages=True)
# Here we simply wrap the prompts to feed into the model
sys_prompt = "You are a helpful assistant."  # stand-in; the real system prompt is defined elsewhere
system_message = SystemMessage(content=sys_prompt)
prompt = OpenAIFunctionsAgent.create_prompt(
    system_message=system_message,
    extra_prompt_messages=[MessagesPlaceholder(variable_name=memory_key)]
)
# Now we define the AGENT
agent = create_openai_functions_agent(llm=llm, tools=tools, prompt=prompt)
# This is where everything is wrapped together to form our RAG agent. RAG is just one capability, enabled by the retrieval tool; adding more tools gives the agent more abilities.
agent_executor = AgentExecutor(agent=agent, tools=tools, memory=memory, verbose=True)
# This function passes user queries to the agent and returns the inferred response
def infer(query):
    agent_input = {'input': query}
    result = agent_executor.invoke(agent_input)
    return result['output']
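It is then called like this (the question is just an illustration):

print(infer("Summarize the table on page 2 of the document."))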