I want to use PyTorch DistributedDataParallel for adversarial training. The loss function is TRADES. The code runs fine in DataParallel mode, but in DistributedDataParallel mode I get the error below. When I change the loss to AT, it runs successfully. Why does it fail with the TRADES loss? The error, the training loop, and the two loss functions are shown below:

-- Process 1 terminated with the following error:

Traceback (most recent call last):
  File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/data/zsd/defense/my_adv_training/muti_gpu_2.py", line 170, in main_worker
    train_loss = train(train_loader, model, optimizer, epoch, local_rank, args)
  File "/data/zsd/defense/my_adv_training/muti_gpu_2.py", line 208, in train
    loss = trades(model,x, y,optimizer, args.epsilon,args.step_size,args.num_steps,beta=6.0)
  File "/data/zsd/defense/my_adv_training/loss_functions.py", line 137, in trades
    loss_kl.backward()
  File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [640]] is at version 4; expected version 3 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).  

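The hint at the end of the traceback suggests turning on autograd anomaly detection to locate the op that invalidated the saved tensor. A minimal sketch of how that could be enabled before the training loop (debugging aid only, not part of any fix):

import torch

# Debugging aid: makes the backward error report which forward op produced the
# tensor that was later modified in place. It slows training noticeably, so
# enable it only while tracking this down.
torch.autograd.set_detect_anomaly(True)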

for i, (x, y) in enumerate(train_loader):
    # move the batch to this process's GPU
    x, y = x.cuda(local_rank, non_blocking=True), y.cuda(local_rank, non_blocking=True)

    loss = trades(model, x, y, optimizer, args.epsilon, args.step_size, args.num_steps, beta=6.0)
    torch.distributed.barrier()

    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    optimizer.step()
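For context, the traceback shows this loop running inside main_worker, launched with torch.multiprocessing.spawn. A minimal sketch of the DistributedDataParallel setup such a loop assumes; the backend, rendezvous address/port, and the stand-in model are assumptions, since muti_gpu_2.py is not shown in full:

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def main_worker(local_rank, world_size):
    # Assumed rendezvous settings; the question's script does not show them.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('nccl', rank=local_rank, world_size=world_size)
    torch.cuda.set_device(local_rank)

    # Stand-in model with a 640-channel BatchNorm, chosen only because the
    # [640]-sized tensor in the error is consistent with a BatchNorm buffer;
    # the real model is not shown in the question.
    model = nn.Sequential(nn.Conv2d(3, 640, 3), nn.BatchNorm2d(640), nn.ReLU()).cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])  # the second answer adds broadcast_buffers=False here

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(main_worker, args=(world_size,), nprocs=world_size)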

def trades(model, x, y, optimizer, epsilon, step_size, num_steps, beta=6.0):
    model.eval()
    criterion_kl = nn.KLDivLoss(reduction='sum')
    x_adv = x.detach() + 0.001 * torch.randn_like(x).detach()
    nat_output = model(x)
    for _ in range(num_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1),
                                   F.softmax(nat_output, dim=1))
        loss_kl.backward()   # line 137 of loss_functions.py, where the RuntimeError is raised
        eta = step_size * x_adv.grad.sign()
        x_adv = x_adv.detach() + eta
        x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    model.train()
    x_adv = Variable(x_adv, requires_grad=False)
    optimizer.zero_grad()
    # calculate robust loss
    logits = model(x)
    loss_natural = nn.CrossEntropyLoss()(logits, y)
    loss_robust = (1.0 / x.size(0)) * criterion_kl(F.log_softmax(model(x_adv), dim=1),
                                                   F.softmax(logits, dim=1))
    loss = loss_natural + beta * loss_robust
    return loss
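The tensor in the RuntimeError has shape [640]; one plausible candidate is a registered buffer such as a BatchNorm running statistic (the second answer below points in the same direction). A quick diagnostic sketch, assuming model is the network used above, to see which buffers have that size:

# Diagnostic only: list registered buffers with 640 elements, to check whether
# the tensor named in the RuntimeError could be a BatchNorm running statistic.
# _version is the internal counter that the autograd version check compares.
for name, buf in model.named_buffers():
    if buf.numel() == 640:
        print(name, tuple(buf.shape), buf._version)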

def AT(model, x, y, epsilon, step_size, num_steps):
    model.eval()
    x_adv = x.detach() + torch.from_numpy(
        np.random.uniform(-epsilon, epsilon, x.shape)).float().cuda()
    x_adv = torch.clamp(x_adv, 0.0, 1.0)
    for k in range(num_steps):
        x_adv.requires_grad_()
        output = model(x_adv)
        model.zero_grad()
        with torch.enable_grad():
            loss = nn.CrossEntropyLoss()(output, y)
        loss.backward()
        eta = step_size * x_adv.grad.sign()
        x_adv = x_adv.detach() + eta
        x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    x_adv = Variable(x_adv, requires_grad=False)

    model.train()
    logits_adv = model(x_adv)
    loss = nn.CrossEntropyLoss()(logits_adv, y)
    return loss

There are 2 answers

shudong (BEST ANSWER)

I changed the trades code as follows and the error went away, but I don't know why this works.

def trades(model, x, y, optimizer, epsilon, step_size, num_steps, beta=6.0):
    model.eval()
    criterion_kl = nn.KLDivLoss(reduction='sum')
    x_adv = x.detach() + 0.001 * torch.randn_like(x).detach()
    nat_output = model(x)
    for _ in range(num_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1),
                                   F.softmax(nat_output, dim=1))
        # changed: take the gradient w.r.t. x_adv directly instead of calling loss_kl.backward()
        grad = torch.autograd.grad(loss_kl, [x_adv])[0]
        x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
        x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    model.train()
    x_adv = Variable(x_adv, requires_grad=False)
    optimizer.zero_grad()
    # calculate robust loss
    logits = model(x)
    loss_natural = nn.CrossEntropyLoss()(logits, y)
    loss_robust = (1.0 / x.size(0)) * criterion_kl(F.log_softmax(model(x_adv), dim=1),
                                                   F.softmax(logits, dim=1))
    loss = loss_natural + beta * loss_robust
    return loss
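It's not obvious from the question alone why this resolves the DDP error, but one concrete difference between the two variants can be checked directly: torch.autograd.grad(loss_kl, [x_adv]) only returns the gradient with respect to x_adv and does not accumulate gradients into the model's parameters, while loss_kl.backward() does. A small standalone sketch (toy linear model, not the question's network, which may be related to how DDP reacts during the attack loop):

import torch
import torch.nn as nn
import torch.nn.functional as F

net = nn.Linear(4, 3)
x_adv = torch.randn(2, 4, requires_grad=True)
target = F.softmax(torch.randn(2, 3), dim=1)

# autograd.grad: gradient only for the listed inputs, parameter .grad untouched
loss = nn.KLDivLoss(reduction='sum')(F.log_softmax(net(x_adv), dim=1), target)
g = torch.autograd.grad(loss, [x_adv])[0]
print(net.weight.grad)        # None

# backward(): gradients are also accumulated into the parameters
loss = nn.KLDivLoss(reduction='sum')(F.log_softmax(net(x_adv), dim=1), target)
loss.backward()
print(net.weight.grad.shape)  # torch.Size([3, 4])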
Yifan Jiang

My problem was solved after adding broadcast_buffers=False to torch.nn.parallel.DistributedDataParallel, following https://github.com/ashkamath/mdetr/issues/16#issuecomment-878388469.
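For reference, a minimal sketch of where that flag goes; model and local_rank are the placeholders from the setup sketched after the training loop above, and only the wrapping line changes:

from torch.nn.parallel import DistributedDataParallel as DDP

# broadcast_buffers=False stops DDP from re-broadcasting registered buffers
# (e.g. BatchNorm running statistics) from rank 0 at every forward pass,
# which is plausibly the in-place write the error message points at.
model = DDP(model, device_ids=[local_rank], broadcast_buffers=False)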