I tried to use DeepSpeed to run tensor parallelism on StarCoder, since I have multiple small GPUs and none of them can hold the whole model on its own. Here is my script:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch
import deepspeed

# Rank and world size are injected into the environment by the deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

cache_dir = '/llm-benchmark/starcoder-cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir

checkpoint = "bigcode/starcoder"
device = "cuda"  # for GPU usage, or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)

# Load the model without moving it to a device; deepspeed shards and places it
model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir=cache_dir)
ds_engine = deepspeed.init_inference(model, tensor_parallel={'enabled': True, 'tp_size': world_size})
model = ds_engine.module

print('before tokenizing')
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
print('before generation')
outputs = model.generate(inputs)
print('after generation')
print(tokenizer.decode(outputs[0]))
print('full result')
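For completeness, the script is run with the deepspeed launcher, which spawns one process per GPU and sets LOCAL_RANK and WORLD_SIZE, along the lines of (adjust --num_gpus to the number of available GPUs):

deepspeed --num_gpus 4 generate.py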
When I ran the above code, the model appeared to be split across the GPUs successfully. However, I got the following error:
Traceback (most recent call last):
  File "/root/code/starcoder/generate.py", line 29, in <module>
    outputs = model.generate(inputs)
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/generation/utils.py", line 1437, in generate
    return self.greedy_search(
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/generation/utils.py", line 2248, in greedy_search
    outputs = self(
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 808, in forward
    transformer_outputs = self.transformer(
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 673, in forward
    outputs = block(
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 316, in forward
    attn_outputs = self.attn(
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/code/starcoder/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 230, in forward
    query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
  File "/root/code/starcoder/lib/python3.10/site-packages/torch/_tensor.py", line 803, in split
    return torch._VF.split_with_sizes(self, split_size, dim)
RuntimeError: split_with_sizes expects split_sizes to sum exactly to 1600 (input tensor's size at dimension 2), but got split_sizes=[1536, 256]
(An identical traceback was printed a second time, presumably by another rank.)
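For what it's worth, the sizes in the error look self-consistent with a sharding mismatch. Assuming StarCoder's config values (hidden size 6144, multi-query attention with a KV head dimension of 128, so the fused c_attn projection is 6144 + 2*128 = 6400 columns wide), a quick check of the arithmetic:

# Rough sanity check of the numbers in the traceback (StarCoder values assumed
# from its config; tp_size = 4 inferred from 6144 / 4 = 1536 in the error).
embed_dim, kv_dim, tp_size = 6144, 128, 4

fused_width = embed_dim + 2 * kv_dim              # 6400: full fused QKV width
per_rank_width = fused_width // tp_size           # 1600: columns each rank receives
split_sizes = [embed_dim // tp_size, 2 * kv_dim]  # [1536, 256]: what split() expects
print(per_rank_width, sum(split_sizes))           # 1600 1792 -> the mismatch above

That is, the fused QKV weight seems to be sliced evenly across ranks (6400 / 4 = 1600 columns each), while the split in GPTBigCodeAttention.forward expects a sharded query width of 1536 plus the full, unsharded key/value width of 256, i.e. 1792 columns in total.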
At first I suspected that the tokenizer was not aligned with the model, but the numbers above point to the tensor-parallel split of the attention weights instead. Why is this happening?