CUDA OutOfMemoryError, but free memory is always about half of the required memory in the error message


I'm trying to fine-tune phi-1.5 (code below) and I run into an OutOfMemoryError: CUDA out of memory whenever the trainer.train() line below is reached. I have tried this on multiple computers with different GPUs and different amounts of GPU memory. If I change per_device_train_batch_size and per_device_eval_batch_size, I can increase or decrease the amount of memory the program tries to allocate, but for some reason, whenever the error is thrown, the reported free memory is always about half of the memory needed (for example, if I increase per_device_batch_size, the required memory goes up to ~2GiB and free memory is ~1GiB; if I decrease it, the required memory goes down to ~40MiB and free memory is ~20MiB). What can I do to solve this?
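
For reference, this is roughly how I double-check the numbers from the error message right before calling trainer.train() below (a minimal sketch using only the standard torch.cuda memory APIs; nothing in it is specific to my script):

import torch

def print_gpu_memory(tag=""):
    # Free/total device memory as reported by the CUDA driver.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    # Memory currently held by PyTorch's caching allocator.
    allocated = torch.cuda.memory_allocated()
    reserved = torch.cuda.memory_reserved()
    print(
        f"[{tag}] free={free_bytes / 2**20:.0f}MiB "
        f"total={total_bytes / 2**20:.0f}MiB "
        f"allocated={allocated / 2**20:.0f}MiB "
        f"reserved={reserved / 2**20:.0f}MiB"
    )

print_gpu_memory("before trainer.train()")
# torch.cuda.memory_summary() prints a more detailed allocator breakdown if needed.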

import glob
import os
import random
from pandas import DataFrame
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Model and Tokenizer Initialization
MODEL_NAME = "HUGGINGFACE_MODEL_NAME"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# File Processing Functions
def chunk_lines(lines, chunk_size):
    """Yield successive n-sized chunks from a list of lines."""
    for i in range(len(lines) - chunk_size + 1):
        yield "".join(lines[i : i + chunk_size])

def process_files(pattern, chunk_size):
    """Process all files matching the pattern and create chunks."""
    chunks = []
    for filepath in glob.glob(pattern):
        with open(filepath, "r", encoding="utf-8") as file:
            lines = file.readlines()
            chunks.extend(chunk_lines(lines, chunk_size))
    return chunks

# Data Preparation
file_pattern = "*.txt"
chunk_size = 25
chunks = process_files(file_pattern, chunk_size)
random.shuffle(chunks)
split_percent = 0.9
split_index = int(len(chunks) * split_percent)
train_df = DataFrame(chunks[:split_index], columns=["data"])
val_df = DataFrame(chunks[split_index:], columns=["data"])

# Custom Dataset Class
class CustomData(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]["data"]
        return tokenizer.encode_plus(
            text,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
            padding="max_length",
        )

train = CustomData(train_df)
val = CustomData(val_df)

# Training Configuration
training_args = TrainingArguments(
    report_to=None,
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=False,
    save_total_limit=4,
    logging_steps=25,
    save_steps=25,
    output_dir="./outputs",
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

# Ensure Output Directory Exists
os.makedirs(training_args.output_dir, exist_ok=True)
os.environ["WANDB_DISABLED"] = "true"

# Training
model.config.use_cache = False
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    dataset_text_field="data",
    tokenizer=tokenizer,
    max_seq_length=2048,
)
trainer.train()

I am aware this is a common question, but I have already tried the usual suggested fixes (not retaining history, no unneeded tensors, no OOM exception handler, no random tensor generation), and data parallelism is not an option for me.
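
For what it's worth, "no unneeded tensors" above means I explicitly drop references and clear the caching allocator between attempts, roughly like this (a sketch only; trainer and model are the objects from the script above, and gc.collect / torch.cuda.empty_cache are standard calls):

import gc
import torch

# Drop whatever is left over from the previous attempt...
del trainer, model
gc.collect()                  # free the Python objects so their CUDA tensors can be released
torch.cuda.empty_cache()      # hand cached blocks back to the CUDA driver
# ...and confirm the allocator is actually empty again.
print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved())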

There are 0 answers