I am working on a RAG system using LlamaIndex. I am trying to adopt the small-to-big chunking strategy for the retrieval stage. I have numerous articles as input, along with some metadata about them. Here is the list of metadata items (a sketch of how they are attached to the Documents follows the list):
- title
- date
- url
- keywords
- entities
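For context, a minimal sketch of how such an article can be loaded as a LlamaIndex Document carrying this metadata (the values below are placeholders, not my real data):

from llama_index import Document

# one Document per article; text and metadata values are illustrative
docs = [
    Document(
        text="Full article text goes here ...",
        metadata={
            "title": "Example article",
            "date": "2024-01-01",
            "url": "https://example.com/article",
            "keywords": "rag, llamaindex, chunking",
            "entities": "LlamaIndex, Mistral",
        },
    ),
]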
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
from llama_index.prompts.prompts import SimpleInputPrompt
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
import torch
from transformers import BitsAndBytesConfig
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)
from llama_index.llms import HuggingFaceLLM

llm = HuggingFaceLLM(
    context_window=3900,
    max_new_tokens=1024,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    model_kwargs={"quantization_config": nf4_config},
    # model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
)
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
)
from llama_index.node_parser import SimpleNodeParser
node_parser = SimpleNodeParser.from_defaults(chunk_size=1256)
base_nodes = node_parser.get_nodes_from_documents(docs)
# set node ids to be a constant
for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"
from qdrant_client import QdrantClient
from llama_index.vector_stores import QdrantVectorStore
from llama_index import StorageContext

# creates a persistent index on disk
client = QdrantClient(path="./qdrant_data")
# create our vector store with hybrid indexing enabled
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
    "qdrant", client=client, enable_hybrid=True, batch_size=10
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
from llama_index import ServiceContext, VectorStoreIndex

service_context = ServiceContext.from_defaults(
    chunk_size=1256,
    chunk_overlap=0,
    llm=llm,
    embed_model=embed_model,
)
base_index = VectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
    service_context=service_context,
)
base_retriever = base_index.as_retriever(similarity_top_k=3)
from llama_index.schema import IndexNode
sub_chunk_sizes = [856, 920, 1024]
sub_node_parsers = [
    SimpleNodeParser.from_defaults(chunk_size=c, chunk_overlap=c // 2)
    for c in sub_chunk_sizes
]
all_nodes = []
for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)
    # also add the original node to the node list
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)
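For context, the plan is to wire all_nodes into a recursive retriever so that matching a small sub-chunk resolves back to its larger parent chunk, roughly along these lines (a simplified sketch following the LlamaIndex recursive-retriever pattern, not my exact code):

from llama_index.retrievers import RecursiveRetriever

all_nodes_dict = {n.node_id: n for n in all_nodes}

vector_index_chunk = VectorStoreIndex(all_nodes, service_context=service_context)
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)

# when a sub-chunk is retrieved, follow its index_id back to the parent chunk
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)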
When I try to use really small chunk sizes, I get the following error because of my metadata:
ValueError: Metadata length (1143) is longer than chunk size (856). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
I want to use the small-to-big chunking strategy to enhance the retrieval performance of the model, but I do not know how to deal with this situation. I keep increasing the sub-chunk sizes, but that defeats the purpose of the strategy.
I also have summaries of all the documents, but for the same reason I had to remove them from my metadata.
Any suggestions would be appreciated.
When LlamaIndex embeds a document it includes the metadata, and when it sends a chunk to the LLM it also includes the metadata. The node parser knows that, so it takes the length of the metadata into account when splitting. In this use case you're doing small-to-big retrieval, so you don't need the metadata embedded or sent to the LLM, and you can exclude it from both using two properties of Nodes:
excluded_llm_metadata_keys and excluded_embed_metadata_keys (see the docs). In practice this looks like (modifying your code):
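(A sketch, assuming the metadata keys listed at the top of your question; substitute whichever keys you actually want to exclude.)

metadata_keys = ["title", "date", "url", "keywords", "entities"]

# keep the metadata out of both the embedding text and the LLM prompt, so the
# node parsers no longer reserve room for it inside each chunk
for doc in docs:
    doc.excluded_llm_metadata_keys = metadata_keys
    doc.excluded_embed_metadata_keys = metadata_keys

base_nodes = node_parser.get_nodes_from_documents(docs)
for base_node in base_nodes:
    # carry the same exclusions on the parsed nodes before sub-chunking them
    base_node.excluded_llm_metadata_keys = metadata_keys
    base_node.excluded_embed_metadata_keys = metadata_keys

After this, the sub-chunking loop runs unchanged even with small chunk sizes. The metadata is still stored on the nodes, so it remains available for filtering or display; it just isn't counted against the chunk size, embedded, or sent to the model.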