I'm trying to fine-tune phi-1.5 (code below) and I run into an OutOfMemoryError: CUDA out of memory whenever I hit the trainer.train() line below. I have tried this on multiple computers with different GPUs and different GPU memory capacities. If I change per_device_train_batch_size and per_device_eval_batch_size, I can increase or decrease the amount of memory the program tries to allocate, but for some reason, whenever the error is thrown, I always have about half of the required memory free (i.e. if I increase per_device_train_batch_size, the memory requirement goes up to ~2GiB and free memory is ~1GiB; if I decrease it, the memory requirement goes down to ~40MiB and free memory is ~20MiB). What can I do to solve this?
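For context, this is roughly how I check free memory and where I set the batch-size knobs between runs; the batch-size values below are placeholders, not my actual settings:

import torch
from transformers import TrainingArguments

# Print free vs. total GPU memory (in GiB) right before training.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"free: {free_bytes / 2**30:.2f} GiB, total: {total_bytes / 2**30:.2f} GiB")

# The two arguments I vary between runs (1 is just an example value).
args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)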
import glob
import os
import random
from pandas import DataFrame
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
# Model and Tokenizer Initialization
MODEL_NAME = "HUGGINGFACE_MODEL_NAME"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# File Processing Functions
def chunk_lines(lines, chunk_size):
    """Yield overlapping sliding windows of chunk_size lines, joined into strings."""
    for i in range(len(lines) - chunk_size + 1):
        yield "".join(lines[i : i + chunk_size])

def process_files(pattern, chunk_size):
    """Process all files matching the pattern and create chunks."""
    chunks = []
    for filepath in glob.glob(pattern):
        with open(filepath, "r", encoding="utf-8") as file:
            lines = file.readlines()
        chunks.extend(chunk_lines(lines, chunk_size))
    return chunks
# Data Preparation
file_pattern = "*.txt"
chunk_size = 25
chunks = process_files(file_pattern, chunk_size)
random.shuffle(chunks)
split_percent = 0.9
split_index = int(len(chunks) * split_percent)
train_df = DataFrame(chunks[:split_index], columns=["data"])
val_df = DataFrame(chunks[split_index:], columns=["data"])
# Custom Dataset Class
class CustomData(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]["data"]
        return tokenizer.encode_plus(
            text,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
            padding="max_length",
        )
train = CustomData(train_df)
val = CustomData(val_df)
# Training Configuration
training_args = TrainingArguments(
    report_to=None,
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=False,
    save_total_limit=4,
    logging_steps=25,
    save_steps=25,
    output_dir="./outputs",
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)
# Ensure Output Directory Exists
os.makedirs(training_args.output_dir, exist_ok=True)
os.environ["WANDB_DISABLED"] = "true"
# Training
model.config.use_cache = False
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    dataset_text_field="data",
    tokenizer=tokenizer,
    max_seq_length=2048,
)
trainer.train()
I am aware this is a common question, but I have already tried the commonly suggested fixes (no history kept on the GPU, no unneeded tensors, no OOM exception handler, no random number generation on the GPU, and data parallelism isn't an option for me).
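To be concrete about the "no unneeded tensors" part, the kind of cleanup I mean looks like this (a minimal illustrative sketch, not my actual training loop):

import gc
import torch

# Allocate a scratch tensor, drop the only reference to it, then release
# PyTorch's cached blocks back to the CUDA driver.
scratch = torch.zeros(1024, 1024, device="cuda")
del scratch
gc.collect()
torch.cuda.empty_cache()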