I'm new to fine-tuning LLMs, so please excuse any silly questions. I've trained BERT on a custom dataset using 4 A100 GPUs without any PEFT techniques; I was able to use a decent batch size and never ran into CUDA out-of-memory errors. Now I want to try the same task on Falcon-7B using PEFT. After applying LoRA, the number of trainable parameters is much lower than for BERT, and yet I can't train without hitting CUDA out of memory. Why is this happening? If I can fit all 108,311,810 trainable parameters of BERT, why can't I fit Falcon-7B with PEFT, which has only 4,727,680 trainable parameters?
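To show the numbers I'm comparing, here is a rough sketch of how one could tally raw parameter memory (param_memory_gb is just a helper I made up for illustration; it only counts parameter storage and ignores activations, gradients, and optimizer state, which I suspect may be the real issue):

import torch

def param_memory_gb(model, trainable_only=False):
    # Sum raw parameter storage (numel * bytes per element).
    total_bytes = sum(
        p.numel() * p.element_size()
        for p in model.parameters()
        if p.requires_grad or not trainable_only
    )
    return total_bytes / 1024**3

# After get_peft_model(...), something like:
# print(param_memory_gb(model))                       # frozen base weights + adapters
# print(param_memory_gb(model, trainable_only=True))  # LoRA adapters only
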
Here is a snippet of what I'm doing:
from src import dataset, config
import pandas as pd
from utils import helper
# Data loading, tokenizing, and model building
import torch
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from transformers import logging as hf_logging
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training
)
train_df = pd.read_feather(config.TRAINING_FILE)
# Testing on sample
train_df = train_df.head(10000)
tokenizer = config.TOKENIZER  # For context, this is an AutoTokenizer from Hugging Face
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Generate dataset
train_dataset = dataset.SepsisDataset(train_df['note_text'].values, train_df['label'].values, tokenizer=tokenizer)
# Load model
model = AutoModelForSequenceClassification.from_pretrained(config.BASE_MODEL_PATH, num_labels=config.NUM_LABELS) #BASE_MODEL_PATH= "tiiuae/falcon-7b"
# load latest checkpoint if resume training is True
# Train using PEFT
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))
print('Resized token embeddings!')
# Prepare model for int8 training
model = prepare_model_for_int8_training(model)
# LoRA Config
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.01,
    bias='none',
)
# Change to model to a peft model
model = get_peft_model(model, peft_config)
# Let's print the trainable parameters (the method prints on its own and returns None)
model.print_trainable_parameters()
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
# Define training arguments
training_args = TrainingArguments(
    output_dir=config.MODEL_OUTPUT_DIR,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=config.EPOCHS,
    per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
    learning_rate=config.LEARNING_RATE,
    optim='paged_adamw_8bit',
    weight_decay=0.02,
    logging_steps=1,
    fp16=True,
    logging_first_step=True,
    logging_strategy='epoch',
    log_level='info',
    lr_scheduler_type='cosine_with_restarts'  # constant, cosine_with_restarts
)
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    compute_metrics=helper.compute_metrics
)
# Train the model
trainer.train()
When I use a per-device batch size of 1, training works, but my dataset is very large, so training that way is impractically slow.
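One workaround I'm considering is gradient accumulation, to get a larger effective batch while keeping the per-device batch size at 1. A sketch of the changed arguments (the accumulation step count is a hypothetical value, not something in my config today):

# Effective batch = 1 * num_gpus * gradient_accumulation_steps
training_args = TrainingArguments(
    output_dir=config.MODEL_OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,   # hypothetical value
    gradient_checkpointing=True,      # trade extra compute for less activation memory
    fp16=True,
    num_train_epochs=config.EPOCHS,
    learning_rate=config.LEARNING_RATE,
    optim='paged_adamw_8bit',
)

Would something like this be the right way to handle the batch-size limitation, or am I missing something more fundamental about where the memory is actually going?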