Finetuning LLaMA for token classification errors

398 views Asked by At

For Roberta, I implemented a version of token classification that outputs the one hot encodings. You can find that here. However, it doesn't work with LLaMA because of optimisations done elsewhere in the code.

This works

class RobertaForMultiLabelTokenClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            target: torch.LongTensor = labels.view(logits.size())
            loss = loss_fct(logits, target.float())

        logits = torch.sigmoid(logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

But this doesn't

class LlamaForTokenClassification(LlamaPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    classifier_dropout = 0.1

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.llama = LlamaModel(config)

        self.dropout = nn.Dropout(self.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def forward(
            self,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.llama(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            target: torch.LongTensor = labels.view(logits.size())
            loss = loss_fct(logits, target.float())

        logits = torch.sigmoid(logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

When I run it using the default Trainer, I get an error about CUDA. I'm running a 3090 and finetuning for CasualLM works fine.

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [52,0,0], thread: [30,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
  File "/mnt/e/Comparison-QoC/code/LlamaForTokenClassification.py", line 152, in forward
    outputs = self.llama(
  File "/home/coen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 708, in forward
    layer_outputs = decoder_layer(
  File "/home/coen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 424, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/home/coen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 321, in forward
    query_states = self.q_proj(hidden_states)
  File "/home/coen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/coen/.local/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 248, in forward
    out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
  File "/home/coen/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 579, in matmul_4bit
    return MatMul4Bit.apply(A, B, out, bias, quant_state)
  File "/home/coen/.local/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/coen/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 516, in forward
    output = torch.nn.functional.linear(A, F.dequantize_4bit(B, state).to(A.dtype).t(), bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

The issue shows a mismatch in size, but I don't see where the issue occurs.

Using the SFTTrainer, I get a NotImplemented exception.

Can anyone tell me what I might be doing wrong?

1

There are 1 answers

0
Coen Hacking On

I managed to solve the problem:

class ClassificationHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        # print(features.dtype, features)  # Check the dtype of features
        # print(self.dense.weight.dtype)  # Check the dtype of the weight matrix in self.dense
        # print(self.dense.bias.dtype)  # Check the dtype of the bias vector in self.dense

        features = features.float()  # Convert features to torch.float32
        x = self.dense(features)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class AutoModelForMultiTokenClassification(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, **kwargs):
        super().__init__()
        self.num_labels = num_labels

        # Load pretrained model
        self.pretrained_model = AutoModel.from_pretrained(
            pretrained_model_name,
            load_in_8bit=True,
            torch_dtype=torch.float16,
        )

        # Add classification head
        self.classification_head = ClassificationHead(self.pretrained_model.config, num_labels)

        # Freeze the weights of pretrained_model
        for param in self.pretrained_model.parameters():
            param.requires_grad = False

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            labels=None,  # Add labels argument
            return_dict: Optional[bool] = None,
            **kwargs,
    ):
        base_model_output = self.pretrained_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            **kwargs,
        )

        last_hidden_state = base_model_output[0]
        token_logits = self.classification_head(last_hidden_state.to(torch.float16))

        loss = None
        if labels is not None:
            labels = labels.to(token_logits.device)
            loss_fct = nn.BCEWithLogitsLoss()
            target: torch.LongTensor = labels.view(token_logits.size())
            loss = loss_fct(token_logits, target.float())

        if not return_dict:
            output = (token_logits,) + base_model_output[2:]
            return ((loss,) + (token_logits,)) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=token_logits,
            hidden_states=base_model_output.hidden_states,
            attentions=base_model_output.attentions,
        )

For my problem I also adjusted the DataCollatorForTokenClassification:

from typing import Union, Optional

from transformers import PreTrainedTokenizerBase
from transformers.data.data_collator import DataCollatorMixin
from transformers.utils import PaddingStrategy
from dataclasses import dataclass


@dataclass
class DataCollatorForTokenClassification(DataCollatorMixin):
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = 'max_length'
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    num_labels: int = 4
    return_tensors: str = "pt"

    def torch_call(self, features):
        import torch

        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]

        batch = self.tokenizer.pad(
            no_labels_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        if labels is None:
            return batch

        sequence_length = batch["input_ids"].shape[1]
        padding_side = self.tokenizer.padding_side

        def to_list(tensor_or_iterable):
            if isinstance(tensor_or_iterable, torch.Tensor):
                return tensor_or_iterable.tolist()
            return list(tensor_or_iterable)

        if padding_side == "right":
            batch[label_name] = [
                to_list(label) + [[self.label_pad_token_id] * self.num_labels] * (sequence_length - len(label)) for
                label in labels
            ]
        else:
            batch[label_name] = [
                [[self.label_pad_token_id] * self.num_labels] * (sequence_length - len(label)) + to_list(label) for
                label in labels
            ]

        # print(type(batch[label_name]), batch[label_name])

        batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
        return batch