ThroughputMonitor in PytorchLightning

15 views Asked by At

I have this PyTorch Lightning code. It is distributed training code that runs on 3 nodes, with a fixed batch size and some metric logging.

class Resnet18Model(pl.LightningModule):
    """ResNet-18 classifier that tracks multiclass accuracy for training and validation."""

    def __init__(self, num_classes=10):
        """Build the ResNet-18 model and one accuracy metric per stage.

        Args:
            num_classes: Number of classes passed to both the model and the
                accuracy metrics.
        """
        super().__init__()
        self.model = resnet18(weights=None, num_classes=num_classes)
        # Separate metric objects for training and validation, so each one
        # accumulates only the batches of its own stage.
        self.train_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)
        self.valid_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        """Return the wrapped model's output for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one batch and log step accuracy.

        Args:
            batch: An ``(images, labels)`` pair.
            batch_idx: Index of the batch (unused).

        Returns:
            The cross-entropy loss between the model output and ``labels``.
        """
        images, labels = batch
        out = self(images)
        loss = F.cross_entropy(out, labels)
        # The metric must be updated with this batch before it is logged;
        # otherwise ``train_acc`` is logged without ever having seen any data.
        self.train_acc(out, labels)
        self.log('train_acc', self.train_acc, on_step=True, on_epoch=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Update the validation accuracy with one batch and log it per epoch.

        Args:
            batch: An ``(images, labels)`` pair.
            batch_idx: Index of the batch (unused).
        """
        images, labels = batch
        out = self(images)
        self.valid_acc(out, labels)
        self.log('valid_acc', self.valid_acc, on_step=False, on_epoch=True)

    ....

def main():
    """Configure logging, profiling and callbacks, then fit ``Resnet18Model``.

    Reads ``args.batch_size`` and ``args.epochs``, which are not defined in
    this function and must be available from the enclosing scope.
    """
    model = Resnet18Model()

    # Watches the "valid_acc" value logged by the model's validation_step.
    early_stopping = EarlyStopping(monitor="valid_acc", mode="max", stopping_threshold=0.80, patience=3)

    # Use every visible GPU when CUDA is available; otherwise fall back to
    # one device per CPU core.
    gpus_available = torch.cuda.is_available()
    if gpus_available:
        devices = torch.cuda.device_count()
        accelerator = "gpu"
    else:
        devices = multiprocessing.cpu_count()
        accelerator = "cpu"

    # Shared run name for the TensorBoard logger and the profiler output.
    exp_name = f"resnet18_cifar10_lightning_{accelerator}_{args.batch_size}"

    logger = TensorBoardLogger("logs/tb_logs", name=exp_name)

    profiler = PyTorchProfiler(
        dirpath="logs/profiler_logs",
        name=exp_name,
        # emit_nvtx=True,
        export_to_chrome=True,
    )

    trainer = Trainer(
        max_epochs=args.epochs,
        accelerator=accelerator,
        devices=devices,
        strategy="ddp",
        logger=logger,
        enable_progress_bar=True,
        profiler=profiler,
        num_nodes=3,
        log_every_n_steps=10,
        callbacks=[DeviceStatsMonitor(cpu_stats=True), early_stopping]
    )

    # Must sit at the same indentation as the rest of the body; a 3-space
    # indent here raises an IndentationError.
    trainer.fit(model)

I want to add ThroughputMonitor so that its metrics are logged to TensorBoard, but I don't know how. I read the docs and tried many variations, but none of them worked.

Best

0

There are 0 answers