Problems with training a model with PyTorch

I am trying to fine-tune a T5 model with PyTorch Lightning, and training fails with the following error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

trainer.fit(model) starts the first epoch, but the error is raised during the backward pass of the very first training batch.
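From what I understand, this error is raised whenever backward() is called on a tensor that has no grad_fn, i.e. one that is not attached to the autograd graph. A minimal standalone snippet (not my actual code) that reproduces the same message:

import torch

# A tensor built from a plain Python number is a leaf with requires_grad=False,
# so it has no grad_fn, and calling backward() on it raises:
# "element 0 of tensors does not require grad and does not have a grad_fn"
loss = torch.tensor(1.23)
loss.backward()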

Here is the full error message:

Traceback (most recent call last):
  File "/content/drive/My Drive/TS_T5-main/scripts/train.py", line 44, in <module>
    run_training(args_dict, dataset)
  File "/content/drive/My Drive/TS_T5-main/source/train.py", line 33, in run_training
    train(args_dict)
  File "/content/drive/My Drive/TS_T5-main/source/model.py", line 398, in train
    trainer.fit(model)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
    self._call_and_handle_interrupt(
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run
    results = self._run_stage()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage
    return self._run_train()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train
    self.fit_loop.run()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1552, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/content/drive/My Drive/TS_T5-main/source/model.py", line 213, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/optimizer.py", line 168, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step
    return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 153, in optimizer_step
    return optimizer.step(closure=closure, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
    return wrapped(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py", line 385, in wrapper
    out = func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py", line 457, in step
    loss = closure()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 138, in _wrap_closure
    closure_result = closure()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 141, in closure
    self._backward_fn(step_output.closure_loss)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 304, in backward_fn
    self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1706, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/strategy.py", line 191, in backward
    self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, optimizer_idx, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 80, in backward
    model.backward(closure_loss, optimizer, optimizer_idx, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/module.py", line 1418, in backward
    loss.backward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
Epoch 0:   0%|          | 0/16832 [00:19<?, ?it/s]

And here is the code from model.py:

from functools import lru_cache
from pathlib import Path


from easse.sari import corpus_sari
from torch.nn import functional as F
from source.helper import log_stdout, tokenize, yield_sentence_pair, yield_lines, load_preprocessor, read_lines, \
    count_line
import argparse
import os
import logging
import random
import nltk
from source.resources import NEWSELA_DATASET, get_data_filepath, WIKILARGE_DATASET, TURKCORPUS_DATASET, \
    WIKILARGE_WIKIAUTO_DATASET

nltk.download('punkt')

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.trainer import seed_everything
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    get_linear_schedule_with_warmup, AutoConfig, AutoModel
)

torch.set_grad_enabled(True)
print("START_____________________________")

class T5FineTuner(pl.LightningModule):
    def __init__(self, model_name, learning_rate, adam_epsilon, custom_loss, weight_decay, dataset,
                 train_batch_size, valid_batch_size, train_sample_size, valid_sample_size, max_seq_length,
                 n_gpu, gradient_accumulation_steps, num_train_epochs, warmup_steps, nb_sanity_val_steps,
                 *args, **kwargs):
        super(T5FineTuner, self).__init__()
        self.save_hyperparameters()
        self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name)
        self.tokenizer = T5TokenizerFast.from_pretrained(self.hparams.model_name)
        self.model = self.model.to(self.device)
        self.preprocessor = load_preprocessor()

    def is_logger(self):
        return self.trainer.global_rank <= 0

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )
        return outputs

    def generate(self, sentence):
        sentence = self.preprocessor.encode_sentence(sentence)
        text = "simplify: " + sentence

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.hparams.max_seq_length,
            padding='max_length',
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            do_sample=False,
            max_length=self.hparams.max_seq_length,
            num_beams=8,
            early_stopping=True,
            num_return_sequences=1
        )
        pred_sent = self.tokenizer.decode(beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        return pred_sent

    def training_step(self, batch, batch_idx):
        labels = batch["target_ids"]
        # Hugging Face's loss functions ignore the label ID -100 during loss calculation,
        # so all padding token IDs in the labels are converted to -100 here.
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

        self.opt.zero_grad()
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask'],
        )

        if self.hparams.custom_loss:
            print("______________EnteredIf!______________")
            loss = outputs.loss
            complexity_score = torch.tensor(random.randint(0, 100) * 0.01, requires_grad=True, device=self.device)
            complexity_score.requires_grad = True
            # complexity_score = self._custom_step(outputs['logits'])
            # loss = loss * complexity_score
            lambda_ = 0.7
            # loss = lambda_ * loss + (1-lambda_)*complexity_score
            # loss = torch.sqrt(loss + lambda_ * complexity_score)
            print("Before custom loss calculation - loss shape:", loss.shape, "complexity_score shape:", complexity_score.shape)
            loss = loss + complexity_score + lambda_ * (complexity_score - loss)
            print("After custom loss calculation - loss shape:", loss.shape)
            
            print(complexity_score)
            self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)
            # print(loss)
            loss.requires_grad = True
            return loss
        else:
            print("______________Entered Else!______________")
            loss = outputs.loss
            self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)
            loss.requires_grad = True
            return loss

        # loss = outputs.loss
        # logs = {"train_loss": loss}
        # self.logger.experiment.add_scalars('loss', logs, global_step=self.global_step)
        # return {"loss": loss, "log": logs}

    def validation_step(self, batch, batch_idx):
        loss = self.sari_validation_step(batch)
        # loss = self._step(batch)
        print("Val_loss", loss)
        logs = {"val_loss": loss}
        # self.logger.experiment.add_scalars('loss', logs, global_step=self.global_step)
        # return {"val_loss": torch.tensor(loss)}
        self.log('val_loss', loss, batch_size=self.hparams.valid_batch_size)
        t = torch.tensor(loss, dtype=float, requires_grad=True)
        print(t)
        return t

    def sari_validation_step(self, batch):
        def generate(sentence):
            sentence = self.preprocessor.encode_sentence(sentence)
            text = "simplify: " + sentence
            # print("Simplifying: ", text)

            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=self.hparams.max_seq_length,
                padding='max_length',
                return_tensors="pt"
            )

            input_ids = encoding["input_ids"].to(self.device)
            attention_masks = encoding["attention_mask"].to(self.device)

            beam_outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_masks,
                do_sample=False,
                max_length=self.hparams.max_seq_length,
                num_beams=8,
                early_stopping=True,
                num_return_sequences=1
            ).to(self.device)
            # final_outputs = []
            # for beam_output in beam_outputs:
            sent = self.tokenizer.decode(beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
            # if sent.lower() != sentence.lower() and sent not in final_outputs:
                # final_outputs.append(sent)
            
            return sent
            # return final_outputs[0]

        pred_sents = []
        for source in batch["source"]:
            pred_sent = generate(source)
            pred_sents.append(pred_sent)

        score = corpus_sari(batch["source"], pred_sents, batch["targets"])
        print("Sari score: ", score)

        return 1 - score / 100

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # optimizer = SAM(optimizer_grouped_parameters, base_optimizer, lr=self.hparams.learning_rate, momentum=0.9)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None, optimizer_closure=None,
                       on_tpu=None, using_native_amp=None, using_lbfgs=None):
        optimizer.step(closure=optimizer_closure)

        optimizer.zero_grad()
        self.lr_scheduler.step()


    def train_dataloader(self):
        train_dataset = TrainDataset(dataset=self.hparams.dataset,
                                     tokenizer=self.tokenizer,
                                     max_len=self.hparams.max_seq_length,
                                     sample_size=self.hparams.train_sample_size)

        dataloader = DataLoader(train_dataset,
                                batch_size=self.hparams.train_batch_size,
                                drop_last=True,
                                shuffle=True,
                                pin_memory=True,
                                num_workers=4)
        t_total = ((len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                   // self.hparams.gradient_accumulation_steps
                   * float(self.hparams.num_train_epochs)
                   )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = ValDataset(dataset=self.hparams.dataset,
                                 tokenizer=self.tokenizer,
                                 max_len=self.hparams.max_seq_length,
                                 sample_size=self.hparams.valid_sample_size)
        return DataLoader(val_dataset,
                          batch_size=self.hparams.valid_batch_size,
                          num_workers=2)


logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics

            # Log and save results to file
            output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))


class TrainDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len=256, sample_size=1):
        self.sample_size = sample_size
        # print("init TrainDataset ...")
        preprocessor = load_preprocessor()
        self.source_filepath = preprocessor.get_preprocessed_filepath(dataset, 'train', 'complex')
        self.target_filepath = preprocessor.get_preprocessed_filepath(dataset, 'train', 'simple')

        self.max_len = max_len
        self.tokenizer = tokenizer

        self._load_data()

    def _load_data(self):
        self.inputs = read_lines(self.source_filepath)
        self.targets = read_lines(self.target_filepath)

    def __len__(self):
        return int(len(self.inputs) * self.sample_size)

    def __getitem__(self, index):
        source = "simplify: " + self.inputs[index]
        target = self.targets[index]

        tokenized_inputs = self.tokenizer(
            [source],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors="pt"
        )
        tokenized_targets = self.tokenizer(
            [target],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors="pt"
        )
        source_ids = tokenized_inputs["input_ids"].squeeze()
        target_ids = tokenized_targets["input_ids"].squeeze()

        src_mask = tokenized_inputs["attention_mask"].squeeze()  # might need to squeeze
        target_mask = tokenized_targets["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,
                'sources': self.inputs[index], 'targets': [self.targets[index]]}


class ValDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len=256, sample_size=1):
        self.sample_size = sample_size
        self.source_filepath = get_data_filepath(dataset, 'valid', 'complex')
        if dataset == NEWSELA_DATASET:
            self.target_filepaths = [get_data_filepath(dataset, 'valid', 'simple')]

        else:  # TURKCORPUS_DATASET as default
            self.target_filepaths = [get_data_filepath(TURKCORPUS_DATASET, 'valid', 'simple.turk', i) for i in range(8)]

        self.max_len = max_len
        self.tokenizer = tokenizer

        self._build()

    def __len__(self):
        return int(len(self.inputs) * self.sample_size)

    def __getitem__(self, index):
        return {"source": self.inputs[index], "targets": self.targets[index]}

    def _build(self):
        self.inputs = []
        self.targets = []

        for source in yield_lines(self.source_filepath):
            self.inputs.append(source)

        self.targets = [[] for _ in range(count_line(self.target_filepaths[0]))]
        for filepath in self.target_filepaths:
            for idx, line in enumerate(yield_lines(filepath)):
                self.targets[idx].append(line)


def train(train_args):
    args = argparse.Namespace(**train_args)
    seed_everything(args.seed, workers=True)

    print(train_args)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=args.output_dir,
        filename="checkpoint-{epoch}",
        monitor="val_loss",
        verbose=True,
        mode="min",
        save_top_k=5
    )

    train_params = dict(
        accumulate_grad_batches=args.gradient_accumulation_steps,
        #gpus=args.n_gpu,
        max_epochs=args.num_train_epochs,
        # early_stop_callback=False,
        precision=16 if args.fp_16 else 32,
        amp_level=args.opt_level,
        amp_backend='apex',
        # gradient_clip_val=args.max_grad_norm,
        # checkpoint_callback=checkpoint_callback,
        callbacks=[LoggingCallback(), checkpoint_callback],
        # logger=TensorBoardLogger(f'{args.output_dir}/logs'),
        num_sanity_val_steps=args.nb_sanity_val_steps,  # skip sanity check to save time for debugging purpose
        # plugins='ddp_sharded',
        # progress_bar_refresh_rate=1,

    )

    print("Initialize model")
    model = T5FineTuner(**train_args)

    trainer = pl.Trainer(**train_params, accelerator="auto")
    print(" Training model")
    trainer.fit(model)

    print("training finished")

    # print("Saving model")
    # model.model.save_pretrained(args.output_dir)

    # print("Saved model")

Just in case, here is also the code from train.py:

from pathlib import Path;import sys;sys.path.append(str(Path(__file__).resolve().parent.parent)) # fix path
import time
from source.helper import log_params, log_stdout
from source.model import train
from source.evaluate import evaluate_on_TurkCorpus
from source.resources import get_experiment_dir, WIKILARGE_DATASET, get_last_experiment_dir, EXP_DIR
from source.preprocessor import Preprocessor
from optparse import OptionParser


def run_training(args_dict, dataset=WIKILARGE_DATASET):
    parser = OptionParser()
    parser.add_option("-r", "--resume",
                      action="store_true", dest="resume", default=False,
                      help="Resume from the previous training.")
    (options, args) = parser.parse_args()

    if options.resume:
        last_training_dir = get_last_experiment_dir()
        # last_training_dir = EXP_DIR / 'exp_1619204109284287'
        print("Resume from previous training: ", last_training_dir)
        args_dict['output_dir'] = last_training_dir
        args_dict['model_name_or_path'] = last_training_dir  # / 'checkpointepoch=2.ckpt'
    else:
        args_dict['output_dir'] = get_experiment_dir(create_dir=True)
        log_params(args_dict["output_dir"] / "params.json", args_dict)
    # args_dict['logging_dir'] = args_dict['output_dir'] / 'logs'

    preprocessor = Preprocessor(args_dict['features_kwargs'])
    preprocessor.preprocess_dataset(dataset)
    args_dict["dataset"] = dataset
    with log_stdout(args_dict['output_dir'] / "logs.txt"):
        train(args_dict)
        
def run_train_tuning(trial, args_dict, dataset=WIKILARGE_DATASET):
    dir_name = f'{int(time.time() * 1000000)}'
    exp_dir_path = EXP_DIR / f'tuning_experiments/exp_{dir_name}'
    exp_dir_path.mkdir(parents=True, exist_ok=True)

    args_dict['output_dir'] = exp_dir_path
    log_params(args_dict["output_dir"] / "params.json", args_dict)

    preprocessor = Preprocessor(args_dict['features_kwargs'])
    preprocessor.preprocess_dataset(dataset)
    args_dict["dataset"] = dataset
    with log_stdout(args_dict['output_dir'] / "logs.txt"):
        run_training(args_dict, dataset)  # Pass args_dict and dataset explicitly
    return evaluate_on_TurkCorpus(args_dict['features_kwargs'], 'valid', exp_dir_path)

Any help would be much appreciated, as I've been struggling with this for days.

So far I have tried to remove places where tensors get rewrapped or detached from the computation graph, but I might have missed something I don't see.
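For reference, these are the kinds of patterns I have been looking for (a toy sketch, not my real code); as far as I understand, each of them silently drops the connection to the autograd graph:

import torch

model = torch.nn.Linear(4, 1)
x = torch.randn(2, 4)
graph_loss = model(x).sum()                  # attached to the graph, has a grad_fn

rewrapped = torch.tensor(graph_loss.item())  # plain value, no grad_fn
detached = graph_loss.detach()               # explicitly cut from the graph
detached.requires_grad = True                # new leaf; gradients no longer reach the model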
