DeepSpeed multi-GPU finetuning does not work

Currently, I am trying to fine-tune the Korean Llama model (13B) on a private dataset using DeepSpeed, Flash Attention 2, and the TRL SFTTrainer. I am using 2 x A100 80GB GPUs for the fine-tuning, but the training never actually starts. I can't figure out what the problem is, and googling hasn't turned up a solution. Please let me know what the problem is and how to solve it.
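
For reference, this is the kind of quick environment check I can run on the node (I can post the exact output if it helps; the pip package names in the list below are my assumption about how the libraries are installed):

import importlib.metadata as md
import torch

# Environment snapshot: visible GPUs, bf16 support, and key library versions.
print("cuda devices:", torch.cuda.device_count())        # expecting 2
print("device 0:", torch.cuda.get_device_name(0))        # expecting A100 80GB
print("bf16 supported:", torch.cuda.is_bf16_supported())
for pkg in ["torch", "transformers", "trl", "accelerate", "deepspeed", "flash-attn"]:
    try:
        print(pkg, md.version(pkg))
    except md.PackageNotFoundError:
        print(pkg, "not installed")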

As I mentioned above, I'm stuck in that situation. I used accelerate launch to utilize multiple GPUs together with the DeepSpeed configs provided by the TRL example code. I tried both deepspeed_zero2.yaml and deepspeed_zero3.yaml, but neither of them worked.
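
For completeness, the deepspeed_zero3.yaml I pass to --config_file is the stock file from the TRL repository's accelerate_configs folder; as far as I remember it looks roughly like this (I override the process count with --num_processes 2 on the command line anyway, and the zero2 variant differs mainly in zero_stage and the ZeRO-3 specific flags):

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
use_cpu: false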

The launch command is as follows:

accelerate launch --config_file=accelerate_configs/deepspeed_zero3.yaml --num_processes 2 finetuning/finetune_SFT.py \
    --model_path beomi/llama-2-koen-13b \
    --data_path Cartinoe5930/KoRAE_filtered_12k \
    --output_dir finetuning/result/llama2/ \
    --wandb_project KoRAE_llama2 \
    --wandb_run_name KoRAE_llama2 \
    --hf_hub_path HUB_PATH_TO_UPLOAD_MODEL \
    --hf_token MY_HF_ACCESS_TOKEN

The code of finetune_SFT.py is as follows:

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling
from accelerate import Accelerator
from datasets import load_dataset
from tqdm import tqdm
import random

from trl import SFTTrainer

from utils.prompter import Prompter

import argparse

def args_parse():
    parser = argparse.ArgumentParser()

    parser.add_argument("--hf_token", type=str, help="Required to upload models to hub.")
    parser.add_argument("--model_path", type=str, default="beomi/llama-2-koen-13b")
    parser.add_argument("--data_path", type=str, default="Cartinoe5930/KoRAE_filtered_12k")
    parser.add_argument("--num_proc", type=int)

    parser.add_argument("--seq_length", type=int, default=4096)
    parser.add_argument("--num_epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--micro_batch_size", type=int, default=2)
    parser.add_argument("--val_set_size", type=float, default=0)
    parser.add_argument("--logging_steps", type=int, default=10)
    parser.add_argument("--save_strategy", type=str, default="epoch", help="You can choose the strategy of saving model.")
    parser.add_argument("--gradient_checkpointing", type=bool, default=True)
    parser.add_argument("--group_by_length", type=bool, default=False)
    parser.add_argument("--packing", type=bool, default=False)

    parser.add_argument("--learning_rate", type=float, default=3e-4)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--warmup_ratio", type=float, default=0.03)
    parser.add_argument("--weight_decay", type=float, default=0)
    
    parser.add_argument("--wandb_project", type=str)
    parser.add_argument("--wandb_run_name", type=str)

    parser.add_argument(
        "--output_dir",
        type=str,
        required=True
    )
    parser.add_argument(
        "--hf_hub_path",
        type=str,
        required=True,
        help="The hub path to upload the model"
    )

    return parser.parse_args()

def process_dataset(example):
    prompter = Prompter("KoRAE_template")

    result_data = []
    for i in range(len(example["instruction"])):
        full_prompt = prompter.generate_prompt(
            example["instruction"][i],
            example["prompt"][i],
            example["input"][i],
            example["output"][i])
        result_data.append(full_prompt)

    return result_data

def create_datasets(args):
    dataset = load_dataset(
        args.data_path,
        split="train",
        num_proc=args.num_proc if args.num_proc else None,
    )

    if args.val_set_size > 0:
        train_val = dataset.train_test_split(test_size=args.val_set_size, seed=42)

        train_data = train_val["train"]
        val_data = train_val["test"]
    else:
        train_data = dataset
        val_data = None

    return train_data, val_data


if __name__ == "__main__":
    args = args_parse()

    # e.g. 32 // 2 // 2 = 8 gradient accumulation steps with the defaults and 2 GPUs
    gradient_accumulation_steps = args.batch_size // args.micro_batch_size // args.num_proc

    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        # device_map={"": Accelerator().process_index},
        torch_dtype=torch.bfloat16,
        use_auth_token=args.hf_token,
        use_flash_attention_2=True
    )
    model.config.use_cache = False
    model.enable_input_require_grads()

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_path,
        use_auth_token=args.hf_token,
    )

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Check if parameter passed or if set within environ
    use_wandb = len(args.wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    # Only overwrite environ if wandb param passed
    if len(args.wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = args.wandb_project

    train_dataset, eval_dataset = create_datasets(args)
    
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_epochs,
        per_device_train_batch_size=args.micro_batch_size,
        per_device_eval_batch_size=args.micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=args.gradient_checkpointing,
        learning_rate=args.learning_rate,
        logging_steps=args.logging_steps,
        save_strategy=args.save_strategy,
        save_steps=args.save_steps if args.save_strategy == "steps" else 500,
        evaluation_strategy="epoch" if eval_dataset else "no",
        group_by_length=args.group_by_length,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_ratio=args.warmup_ratio,
        bf16=True,
        save_total_limit=2,
        remove_unused_columns=False,
        report_to="wandb" if use_wandb else "none",
        run_name=args.wandb_run_name if use_wandb else None,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        formatting_func=process_dataset,
        data_collator=data_collator,
        packing=args.packing,
        max_seq_length=args.seq_length,
        tokenizer=tokenizer,
        args=training_args
    )

    trainer.train()
    trainer.save_model(args.output_dir)

    model.push_to_hub(
        args.hf_hub_path,
        use_temp_dir=True,
        use_auth_token=args.hf_token,
    )
    tokenizer.push_to_hub(
        args.hf_hub_path,
        use_temp_dir=True,
        use_auth_token=args.hf_token,
    )

The log of the above code is as follows. There are no specific errors or anything obviously strange, but the run just stays in this state: the training progress bar and any further logs never appear, so I can't tell whether training is actually running.

[2023-11-08 05:02:56,499] torch.distributed.run: [WARNING] 
[2023-11-08 05:02:56,499] torch.distributed.run: [WARNING] *****************************************
[2023-11-08 05:02:56,499] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[2023-11-08 05:02:56,499] torch.distributed.run: [WARNING] *****************************************
[2023-11-08 05:03:00,158] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-11-08 05:03:00,422] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|████████████████| 10/10 [00:07<00:00,  1.30it/s]
/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py:374: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py:671: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
[2023-11-08 05:04:45,333] [INFO] [comm.py:637:init_distributed] cdb=None
[2023-11-08 05:04:45,333] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Loading checkpoint shards: 100%|████████████████| 10/10 [00:10<00:00,  1.09s/it]
/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py:374: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py:671: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
  warnings.warn(
[2023-11-08 05:04:49,096] [INFO] [comm.py:637:init_distributed] cdb=None
Parameter Offload: Total persistent parameters: 414720 in 81 params
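
In case it helps with diagnosis, I can rerun with more verbose distributed logging and post that output too. NCCL_DEBUG and TORCH_DISTRIBUTED_DEBUG are standard NCCL/PyTorch environment variables; the rest is the same launch command as above with the script arguments elided:

NCCL_DEBUG=INFO TORCH_DISTRIBUTED_DEBUG=DETAIL \
accelerate launch --config_file=accelerate_configs/deepspeed_zero3.yaml --num_processes 2 finetuning/finetune_SFT.py ...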

Please let me know how I can resolve this problem!
