Retrieve the page from the PDF in a PDF chatbot using LangChain


I have developed a small app based on LangChain and Streamlit where a user can ask questions about uploaded PDF files. The code is as follows:

from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF ")
    
    # upload file
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    
    # extract the text
    if pdf is not None:
      pdf_reader = PdfReader(pdf)
      text = ""
      for page in pdf_reader.pages:
        text += page.extract_text()
        
      # split into chunks
      text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
      )
      chunks = text_splitter.split_text(text)
      
      # create embeddings
      embeddings = OpenAIEmbeddings()
      knowledge_base = FAISS.from_texts(chunks, embeddings)
      
      # show user input
      user_question = st.text_input("Ask a question about your PDF:")
      if user_question:
        docs = knowledge_base.similarity_search(user_question)
        
        llm = OpenAI()
        chain = load_qa_chain(llm)
        with get_openai_callback() as cb:
          response = chain.run(input_documents=docs, question=user_question)
          print(cb)
           
        st.write(response)
    

if __name__ == '__main__':
    main()

Can someone suggest how I can retrieve or render the page of the PDF from which the answer or information was extracted? I came across this but wasn't able to implement it properly.


There are 2 answers

ferdy (accepted answer)

Here is a simple approach.

  • While reading the pdf, also save the content per page and the page number.
    # extract the text
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        text = ""

        page_dict = {}
        for i, page in enumerate(pdf_reader.pages):
            page_content = page.extract_text()
            text += page_content + '\n\n'
            page_dict[page_content] = i+1

Once we get the response, we compare it with the content of each page that we saved earlier. The idea is to find which page has the highest similarity to the response. It can be page 1, page 2, etc.

            # Get the similarity between each page and response.
            # Use spacy model (free). Openai similarity can be expensive
            # but maybe more accurate.
            data = []
            for page_content, page_num in page_dict.items():
                similarity = spacy_sim(response, page_content)
                data.append([similarity, page_num])

Sort the data and get the page with highest similarity.

            # Sort the similarity scores from high to low.
            data = sorted(data, key=lambda x: x[0], reverse=True)
            print(data)

            # Get the top page number.
            top_page_num = data[0][1]

Now generate an image for each page using the pdf2image library. We are going to show the content of the page as an image. Other approaches are possible since we already have the page content, but here I will show the image via Streamlit's image widget.

            # Generate images per page in the pdf.
            images = convert_from_path(pdf.name)
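
Note that convert_from_path requires poppler to be installed and the file to be available on disk under its name, which is why the uploaded PDF should sit next to the script. As a hedged alternative (not part of the original answer), pdf2image also offers convert_from_bytes, which can read the uploaded buffer directly:

            # Alternative sketch (requires: from pdf2image import convert_from_bytes):
            # read the uploaded file's bytes directly, so the PDF does not
            # need to be on disk next to the script.
            images = convert_from_bytes(pdf.getvalue())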

Now that we have a list of images, get the index that corresponds to the page that we want to show.

            # Show the page image with the highest similarity.
            st.image(images[top_page_num-1])

Here is the code to get the similarity score between page content and response.

def spacy_sim(str1, str2):
    """model en_core_web_lg should be better"""
    nlp = spacy.load("en_core_web_md")
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)
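
Note that spacy.load runs on every call; caching the loaded model would speed up repeated comparisons. A quick sanity check of the function (the strings below are made-up examples, and the model must be downloaded once with python -m spacy download en_core_web_md):

# Example usage; the strings are arbitrary illustrations.
score = spacy_sim("The invoice total is $120.", "Total amount due: $120.")
print(score)  # a float; higher means the texts are more similar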

Sample output

(screenshot of the app: the answer followed by the image of the best-matching page)

You can download a sample pdf from my google drive.

Full code

from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from pdf2image import convert_from_path  # requires poppler installed
import spacy


SECRET = 'abc'  # replace with your OpenAI API key


def spacy_sim(str1, str2):
    nlp = spacy.load("en_core_web_md")
    doc_1 = nlp(str1)
    doc_2 = nlp(str2)
    return doc_1.similarity(doc_2)


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF ")

    # upload file; it should be in the same location as the streamlit script
    # because convert_from_path reads the file by its name.
    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # extract the text
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        text = ""

        page_dict = {}
        for i, page in enumerate(pdf_reader.pages):
            page_content = page.extract_text()
            text += page_content + '\n\n'
            page_dict[page_content] = i+1

        # split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # create embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if user_question:
            docs = knowledge_base.similarity_search(user_question)

            llm = OpenAI(openai_api_key=SECRET)
            chain = load_qa_chain(llm)
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs,
                                     question=user_question)
                print(f'billing details: {cb}')

            # Get the similarity between each page and response.
            # Use spacy model (free). Openai similarity can be expensive
            # but maybe more accurate.
            data = []
            for page_content, page_num in page_dict.items():
                similarity = spacy_sim(response, page_content)
                data.append([similarity, page_num])

            # Sort the similarity score from high to low.
            data = sorted(data, key=lambda x: x[0], reverse=True)
            print(data)

            # Get the top page number.
            top_page_num = data[0][1]

            st.write(f"Answer: {response}")

            # Generate images per page from the pdf.
            images = convert_from_path(pdf.name)

            # Show the page image with the highest similarity.
            st.image(images[top_page_num-1])


if __name__ == '__main__':
    main()

Alternative: use similarity from the OpenAI API.

import numpy as np
import openai

openai.api_key = SECRET

def openai_sim(str1, str2):
    # Call the API
    response = openai.Embedding.create(
        input=[str1, str2],
        model="text-embedding-ada-002"
    )

    # Extract the embeddings
    embedding1 = response['data'][0]['embedding']
    embedding2 = response['data'][1]['embedding']

    # Calculate cosine similarity
    similarity_score = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    return similarity_score
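
The snippet above uses the pre-1.0 openai package interface. A minimal sketch of the same computation against the openai>=1.0 client (my adaptation, not part of the original answer):

from openai import OpenAI as OpenAIClient
import numpy as np

def openai_sim_v1(str1, str2):
    # Sketch for openai>=1.0; assumes SECRET holds a valid API key.
    client = OpenAIClient(api_key=SECRET)
    response = client.embeddings.create(
        input=[str1, str2],
        model="text-embedding-ada-002"
    )
    embedding1 = response.data[0].embedding
    embedding2 = response.data[1].embedding
    return np.dot(embedding1, embedding2) / (
        np.linalg.norm(embedding1) * np.linalg.norm(embedding2))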

Alternative: use sentence-transformers for similarity.

from sentence_transformers import SentenceTransformer, util

def transformer_sim(str1, str2):
    """
    install pytorch:
        https://pytorch.org/get-started/locally/

    install sentence-transformers:
        pip install -U sentence-transformers
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    simscore = float(cosine_score[0][0])

    return simscore
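
Because transformer_sim reloads the model on every call, repeated questions in the Streamlit app will be slow. A possible refinement (a sketch of mine, not from the answer) is to cache the model with Streamlit's st.cache_resource:

@st.cache_resource
def get_model():
    # Loaded once per Streamlit session instead of once per call.
    return SentenceTransformer('all-MiniLM-L6-v2')

def transformer_sim_cached(str1, str2):
    model = get_model()
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    return float(util.cos_sim(embeddings1, embeddings2)[0][0])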

Solution 2

This solution uses pymupdf to save the text and an image of each page. The uploaded file can come from anywhere, not necessarily the location of the Streamlit script, because while we save the text of each PDF page we also save its image as data bytes.

It also uses sentence-transformers to measure the similarity of two text strings, which is useful for comparing page content with the response.

Full code

"""Using sentence-transfomer for similarity score."""


from dotenv import load_dotenv
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from sentence_transformers import SentenceTransformer, util
import fitz  # pymupdf


SECRET = 'abc'


def transformer_sim(str1, str2):
    """
    install pytorch:
        https://pytorch.org/get-started/locally/

    install sentence-transformers:
        pip install -U sentence-transformers

    from sentence_transformers import SentenceTransformer, util
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_score = util.cos_sim(embeddings1, embeddings2)
    simscore = float(cosine_score[0][0])

    return simscore


def main():
    load_dotenv()
    st.set_page_config(page_title="Ask your PDF")
    st.header("Ask your PDF ")

    # upload file; it can be from anywhere because we read it as a byte stream.
    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # extract the text
    if pdf is not None:
        text = ""
        images = []
        page_dict = {}

        with fitz.open(stream=pdf.read(), filetype="pdf") as pdf_pages:            
            for i, page in enumerate(pdf_pages):
                page_content = page.get_text()
                text += page_content + '\n\n'
                page_dict[page_content] = i+1

                # images
                pix = page.get_pixmap()
                bytes_data = pix.tobytes("PNG")
                images.append(bytes_data)

        # split into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        # create embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=SECRET)
        knowledge_base = FAISS.from_texts(chunks, embeddings)

        # show user input
        user_question = st.text_input("Ask a question about your PDF:")
        if user_question:
            docs = knowledge_base.similarity_search(user_question)

            llm = OpenAI(openai_api_key=SECRET)
            chain = load_qa_chain(llm)
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs,
                                     question=user_question)
                print(f'billing details: {cb}')

            # Get the similarity between each page and response.
            data = []
            for page_content, page_num in page_dict.items():
                similarity = transformer_sim(response, page_content)
                data.append([similarity, page_num, page_content])

            # Sort the similarity scores from high to low.
            data = sorted(data, key=lambda x: x[0], reverse=True)

            # Get the top page number.
            top_page_num = data[0][1]
            top_sim_score = data[0][0]

            st.write(f"Answer: {response}")
            st.markdown(f'**The top similarity score of {top_sim_score} suggests the response is from page {top_page_num}**')

            # Show the page image with the highest similarity.
            st.image(images[top_page_num-1])


if __name__ == '__main__':
    main()
Yilmaz

In LangChain, chains have a callbacks argument. From the Chain base class:

class Chain(BaseModel, ABC):
    """Base interface that all chains should implement."""

    memory: BaseMemory
    callbacks: Callbacks

    def __call__(
        self,
        inputs: Any,
        return_only_outputs: bool = False,
        callbacks: Callbacks = None,
    ) -> Dict[str, Any]:
        ...

When you run a chain, you can pass the callback like this:

chain = LLMChain(llm=llm, prompt=prompt)
 
chain.run(input="input", callbacks=[yourHandler])

LangChain has an on_retriever_end callback:

async on_retriever_end(documents: Sequence[Document], **kwargs: Any) → None

Run when retriever ends running.

You can define this callback:

def on_retriever_end(self, documents, **kwargs):
    for idx, doc in enumerate(documents):
        source = doc.metadata["source"]
        print(f"From {source}")
        

As far as I understood, when you add this to the callbacks list, the retrieved documents are automatically passed to the callback.

# the retrieved documents will be passed to the on_retriever_end callback
chain = whateverChain(retriever=retriever, memory=memory, callbacks=[on_retriever_end])
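
In practice, LangChain callbacks are usually instances of BaseCallbackHandler subclasses rather than bare functions, so a hedged sketch of a working handler (class and variable names are mine) would be:

from langchain.callbacks.base import BaseCallbackHandler

class SourceHandler(BaseCallbackHandler):
    def on_retriever_end(self, documents, **kwargs):
        # documents are the docs returned by the retriever.
        for doc in documents:
            print(f"From {doc.metadata.get('source')}")

# chain = whateverChain(retriever=retriever, memory=memory,
#                       callbacks=[SourceHandler()])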

If you want to highlight the part of the PDF where the answer comes from, that will be the hard part. Highlighting the relevant sections would be rather challenging because the chunk size we're using is rather large; we'd likely end up highlighting too much content.
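
That said, a rough sketch of highlighting (my illustration, not from the answer) could use PyMuPDF's search_for and add_highlight_annot, searching for a short snippet of the retrieved chunk rather than the whole chunk:

import fitz  # pymupdf

def highlight_snippet(pdf_path, snippet, out_path):
    # Add a highlight annotation over every match of the snippet,
    # then save an annotated copy of the PDF.
    doc = fitz.open(pdf_path)
    for page in doc:
        for rect in page.search_for(snippet):
            page.add_highlight_annot(rect)
    doc.save(out_path)
    doc.close()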