I'm trying to chunk a document while indexing it, but the PromptHelper chunking doesn't appear to be happening: the entire large text body is passed to the OpenAI model, which throws a token-limit error.
The error message:
openai.error.InvalidRequestError: This model's maximum context length is 8192 tokens, however you requested 53623 tokens (53623 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.
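A quick way to sanity-check that it's the raw document (and not the query) blowing the limit is to count its tokens directly. This is just a sketch; the cl100k_base encoding and the file path are placeholders I picked, not something from my pipeline:

import tiktoken

# rough token count of the raw source text (encoding and path are placeholders)
enc = tiktoken.get_encoding("cl100k_base")
with open("data/persona/document.txt") as f:
    text = f.read()
print(len(enc.encode(text)))  # comes out on the order of the 53623 tokens in the error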
My code:
from langchain.document_loaders import TextLoader, DirectoryLoader, JSONLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from llama_index import (
    GPTVectorStoreIndex,
    download_loader,
    LLMPredictor,
    PromptHelper,
)
import os
os.environ['OPENAI_API_KEY'] = ''
def createLocalIndex(persona):
    print("called createLocalIndex...")
    dataDirectory = "data/" + persona
    indexJson = persona + "_index.json"

    # define prompt helper
    # set maximum input size
    max_input_size = 4096
    # set number of output tokens
    num_output = 256
    # set maximum chunk overlap
    max_chunk_overlap = 0.1
    # set chunk token size
    chunk_size_limit = 100
    prompt_helper = PromptHelper(
        max_input_size, num_output, max_chunk_overlap, chunk_size_limit=chunk_size_limit
    )
    print("set prompt helper...")

    # Define LLM properties
    # Only required when building the index
    llm_predictor = LLMPredictor(
        llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=num_output)
    )
    print("set llm_predictor...")

    SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
    loader = SimpleDirectoryReader(dataDirectory)
    documents = loader.load_data()
    print("document loader created...")

    index = GPTVectorStoreIndex(
        documents,
        llm_predictor=llm_predictor,
        prompt_helper=prompt_helper,
        verbose=True,
    )
    print("index created...")

    # Save the index to a local file
    index.save_to_disk(indexJson)
    print("saved new index: " + indexJson)
    return index
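For reference, this is roughly how I call the function afterwards. The persona name and query text are placeholders, and the query call is a sketch since the querying API changed between llama_index versions:

index = createLocalIndex("persona1")  # placeholder persona name
# older llama_index releases query the index directly; newer ones use
# index.as_query_engine().query(...) instead
response = index.query("Summarize the documents.")
print(response)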
For context, this is the code I'm basing mine on and the results I'm expecting to get (yes, it's a little outdated and still uses gpt_index, but I believe I've made the proper name corrections...): https://twitter.com/danshipper/status/1620464950450724870/photo/1 https://twitter.com/danshipper/status/1620464966410051594/photo/1
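For what it's worth, this is the kind of chunking I'm expecting to happen under the hood, sketched here with LangChain's TokenTextSplitter rather than PromptHelper (the chunk_size/chunk_overlap values are just illustrative, mirroring my chunk_size_limit of 100):

from langchain.text_splitter import TokenTextSplitter

# what I expect ~100-token chunking to look like (values are illustrative)
splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10)
chunks = splitter.split_text(text)  # `text` being the raw document contents
print(len(chunks), "chunks; first chunk:", chunks[0][:80])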