After installing the required libraries:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
And then some boilerplate to load the Mistral model:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
base_model = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
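As an optional sanity check (my addition, not part of the original recipe), you can confirm that 4-bit loading actually replaced the linear layers with bitsandbytes modules:
print(type(model.model.layers[0].self_attn.q_proj))
# expect <class 'bitsandbytes.nn.modules.Linear4bit'> if load_in_4bit=True took effect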
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token  # inspect the special-token flags
We can access the embedding layer (token IDs -> dense vectors) via:
print(type(model.model.embed_tokens))
model.model.embed_tokens
[out]:
<class 'torch.nn.modules.sparse.Embedding'>
Embedding(32000, 4096)
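In other words, the layer is a plain lookup table; its weight matrix has one 4096-dim row per vocabulary id:
print(model.model.embed_tokens.weight.shape)  # torch.Size([32000, 4096])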
But when I try to feed it some strings, the input is not of the type it expects, e.g.
model.model.embed_tokens(tokenizer("Hello world"))
[out]:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-30-66f4114cc3e1> in <cell line: 3>()
1 import numpy as np
2
----> 3 model.model.embed_tokens(tokenizer("Hello world"))
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2231 # remove once script supports set_grad_enabled
2232 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2234
2235
TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not BatchEncoding
or
model.model.embed_tokens(tokenizer("Hello world").input_ids)
[out]:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-31-95e95b326f0d> in <cell line: 3>()
1 import numpy as np
2
----> 3 model.model.embed_tokens(tokenizer("Hello world").input_ids)
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2231 # remove once script supports set_grad_enabled
2232 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2234
2235
TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list
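Inspecting the tokenizer output confirms the types behind both tracebacks: tokenizer(...) returns a BatchEncoding (a dict-like container), and its input_ids field is a plain Python list:
enc = tokenizer("Hello world")
print(type(enc))            # <class 'transformers.tokenization_utils_base.BatchEncoding'>
print(type(enc.input_ids))  # <class 'list'>, hence the second TypeError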
And this seems to be the type it expects:
model.model.embed_tokens(torch.tensor(tokenizer("Hello world").input_ids))
[out]:
tensor([[-4.0588e-03, 1.6499e-04, -4.6997e-03, ..., -1.8597e-04,
-9.9945e-04, 4.0531e-05],
[-1.9684e-03, 1.6098e-03, -4.2343e-04, ..., -2.7924e-03,
1.1673e-03, -1.0529e-03],
[-2.3346e-03, 2.0752e-03, -1.4114e-03, ..., 8.4305e-04,
-1.0376e-03, -2.0294e-03],
[-1.5640e-03, 9.3460e-04, 1.8692e-04, ..., 1.1749e-03,
3.3760e-04, 3.3379e-05]], device='cuda:0', dtype=torch.bfloat16,
grad_fn=<EmbeddingBackward0>)
Is there a way to specify the return type when calling tokenizer(), instead of manually casting the tokenizer's output into a torch tensor?
Try the return_tensors='pt' argument, e.g.
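something along these lines (a sketch reusing the model and tokenizer from the question; the .to(model.device) move is my addition, needed because the embedding layer sits on GPU under device_map="auto"):
inputs = tokenizer("Hello world", return_tensors='pt')
print(type(inputs.input_ids))  # <class 'torch.Tensor'>, no manual casting needed
embeddings = model.model.embed_tokens(inputs.input_ids.to(model.device))
print(embeddings.shape)        # torch.Size([1, sequence_length, 4096])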