How do I debug the torch.distributed ProcessGroup?

131 views Asked by At

I want to analyze the PyTorch distributed backend interface, but I don't know how to debug it.

I tried VS Code's Python debugger, and also attaching GDB for a mixed Python/C++ debugging session, but the spawned subprocesses can't be debugged. Is there a better way to debug this from an IDE? Using an IDE would be much easier.

import os
import time
import numpy as np
import torch
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.distributed._tensor import DTensor, DeviceMesh, Shard, Replicate, distribute_tensor,zeros

def run(rank, size):
    """Build a small sparse tensor, all-reduce it across ranks, and print
    the before/after values on rank 0.

    Args:
        rank: This process's rank within the process group.
        size: World size (unused here, but part of the worker signature).
    """
    a = torch.tensor([[0, 2.], [3, 0]])
    # BUG FIX: Tensor.to_sparse() is NOT in-place — it returns a new sparse
    # COO tensor. The original code discarded the result, so `a` stayed
    # dense and the to_dense() calls below were no-ops.
    a = a.to_sparse()
    if rank == 0:
        print(a.to_dense())
    dist.barrier()  # sync all ranks before the collective
    # Gloo supports all_reduce on sparse COO tensors (sum across ranks).
    dist.all_reduce(a)
    dist.barrier()  # sync so the final print happens after every rank reduced
    if rank == 0:
        print(a.to_dense())

def init_process(rank_id, size, fn, backend='gloo'):
    """Join the distributed process group, then hand control to *fn*.

    Args:
        rank_id: Rank of this worker within the group.
        size: Total number of workers (world size).
        fn: Callable invoked as ``fn(rank_id, size)`` once the group is up.
        backend: Name of the distributed backend (default ``'gloo'``).
    """
    # Every rank rendezvous at the same local TCP endpoint.
    os.environ.update(MASTER_ADDR='127.0.0.1', MASTER_PORT='12347')
    dist.init_process_group(backend, rank=rank_id, world_size=size)
    fn(rank_id, size)

if __name__ == "__main__":
    # World size: number of worker processes to launch. Bump to >1 to
    # exercise a real multi-process all_reduce.
    size = 1
    # NOTE: removed the unused `big_tensor` assignment from the original —
    # it was never referenced anywhere.
    processes = []
    # "spawn" starts each worker in a fresh interpreter (required for CUDA
    # tensors and the safest choice for torch.distributed).
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)

    # Block until every worker has finished.
    for p in processes:
        p.join()

0

There are 0 answers