I want to use pytorch DistributedDataParallel for adversarial training. The loss function is trades.The code can run in DataParallel mode. But in DistributedDataParallel mode, I got this error. When I change the loss to AT, it can run successfully. Why can't run with trades loss? The two loss functions are as follows:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/data/zsd/defense/my_adv_training/muti_gpu_2.py", line 170, in main_worker
train_loss = train(train_loader, model, optimizer, epoch, local_rank, args)
File "/data/zsd/defense/my_adv_training/muti_gpu_2.py", line 208, in train
loss = trades(model,x, y,optimizer, args.epsilon,args.step_size,args.num_steps,beta=6.0)
File "/data/zsd/defense/my_adv_training/loss_functions.py", line 137, in trades
loss_kl.backward()
File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/lthpc/.conda/envs/bba/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [640]] is at version 4; expected version 3 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
.
for i, (x, y) in enumerate(train_loader):
# measure data loading time
x,y = x.cuda(local_rank, non_blocking=True), y.cuda(local_rank, non_blocking=True)
loss = trades(model,x, y,optimizer, args.epsilon,args.step_size,args.num_steps,beta=6.0)
torch.distributed.barrier()
optimizer.zero_grad()
loss.backward(retain_graph=True)
optimizer.step()
def trades():
model.eval()
criterion_kl = nn.KLDivLoss(reduction='sum')
x_adv = x.detach() + 0.001 * torch.randn_like(x).detach()
nat_output = model(x)
for _ in range(num_steps):
x_adv.requires_grad_()
with torch.enable_grad():
loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1),
F.softmax(nat_output, dim=1))
loss_kl.backward()
eta = step_size * x_adv.grad.sign()
x_adv = x_adv.detach() + eta
x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
x_adv = torch.clamp(x_adv, 0.0, 1.0)
model.train()
x_adv = Variable(x_adv, requires_grad=False)
optimizer.zero_grad()
# calculate robust loss
logits = model(x)
loss_natural = nn.CrossEntropyLoss()(logits, y)
loss_robust = (1.0 / x.size(0)) * criterion_kl(F.log_softmax(model(x_adv), dim=1),
F.softmax(logits, dim=1))
loss = loss_natural + beta * loss_robust
return loss
def AT():
model.eval()
x_adv = x.detach() + torch.from_numpy(np.random.uniform(-epsilon,
epsilon, x.shape)).float().cuda()
x_adv = torch.clamp(x_adv, 0.0, 1.0)
for k in range(num_steps):
x_adv.requires_grad_()
output = model(x_adv)
model.zero_grad()
with torch.enable_grad():
loss = nn.CrossEntropyLoss()(output, y)
loss.backward()
eta = step_size * x_adv.grad.sign()
x_adv = x_adv.detach() + eta
x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
x_adv = torch.clamp(x_adv, 0.0, 1.0)
x_adv = Variable(x_adv, requires_grad=False)
model.train()
logits_adv = model(x_adv)
loss = nn.CrossEntropyLoss()(logits_adv, y)
return loss
I changed the code of trades and solved this error. But I don't know why this works.