I want to get the hidden states of a Mixtral model that I have loaded when it is run on an input. I want to do the same thing as in the code example shown below, but I don't know how to load the GGUF version of Mixtral other than with the llama-cpp-python library.
So this is what I want to do:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Embed a batch of sentences with a causal LM: run the prompts through the
# model and keep, for each sentence, the final-layer hidden state of its last
# real (non-padding) token.
model_id = "intfloat/e5-mistral-7b-instruct"
t = AutoTokenizer.from_pretrained(model_id)
t.pad_token = t.eos_token
# The "sum of the attention mask minus one" index used below only points at the
# last real token when padding is appended on the right. Some tokenizers pad on
# the left by default, so pin the side explicitly.
t.padding_side = "right"
m = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
m.eval()

texts = [
    "this is a test",
    "this is another test case with a different length",
]
prompt_template = "This sentence: {text} means in one word:"
texts = [prompt_template.format(text=x) for x in texts]

# device_map="auto" may place the model on an accelerator; the inputs have to
# live on the same device as the embedding layer.
t_input = t(texts, padding=True, return_tensors="pt").to(m.device)

with torch.no_grad():
    # hidden_states is a tuple with one entry per layer; [-1] is the last layer.
    last_hidden_state = m(**t_input, output_hidden_states=True, return_dict=True).hidden_states[-1]

# Number of real tokens per row, minus one, is the position of the last real
# token (valid because of the right padding enforced above).
idx_of_the_last_non_padding_token = (
    t_input.attention_mask.bool().sum(1) - 1
).to(last_hidden_state.device)
batch_indices = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
sentence_embeddings = last_hidden_state[batch_indices, idx_of_the_last_non_padding_token]

print(idx_of_the_last_non_padding_token)
print(sentence_embeddings.shape)
This is how I load the model:
from llama_cpp import Llama
# Load the quantized Mixtral GGUF checkpoint through llama-cpp-python and run a
# single completion on a short prompt.
model_path = "01_models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# NOTE(review): output_hidden_states is a Hugging Face transformers argument;
# it is not a documented Llama() parameter and presumably has no effect here.
# Confirm against the installed llama-cpp-python version.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=31,
    output_hidden_states=True,
)

# echo=True asks for the prompt to be included in the returned text.
output = llm("Hi", max_tokens=4096, stop=["<|endoftext|>", "</s>"], echo=True)
So I tried to get the hidden states like this:
# Get final-layer token representations from the GGUF model.
#
# The Llama constructor only loads the model: it does not run a forward pass,
# does not take tokenized tensors, and the resulting object has no
# `.hidden_states` attribute. llama-cpp-python exposes the model's output
# vectors through its embedding mode instead.
text = "some text"

# No Hugging Face tokenizer is needed: llama.cpp tokenizes `text` itself with
# the vocabulary stored inside the GGUF file.
embedder = Llama(model_path=model_path, n_ctx=4096, n_threads=31, embedding=True)

# NOTE(review): the shape returned by embed() depends on the installed
# llama-cpp-python version and the model's pooling type -- one vector per token
# (n_tokens, n_embd) when no pooling is applied, or a single pooled vector
# (n_embd,). Confirm with the installed version.
states = torch.tensor(embedder.embed(text))

# Mirror the transformers example: keep the vector of the last token when
# per-token output is available, otherwise use the pooled vector as is.
sentence_embedding = states[-1] if states.dim() == 2 else states
print(sentence_embedding.shape)
I hope someone can help.