Error in exporting detectron2 trained model to onnx or torch script model

Question

Error in exporting detectron2 trained model to onnx or torch script model

264 views Asked by rezahs At 30 December 2024 at 16:42

I'm trying to export a model trained in detectron2 using export_model.py. My model's configuration and the way I'm building it are as follows:

def setup_cfg():
    cfg = get_cfg()
    cfg_file = model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.merge_from_file(cfg_file)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    return cfg

Which then I build:

cfg = setup_cfg()
model_ = build_model(cfg) 
DetectionCheckpointer(model_).resume_or_load("./model/model_final.pth", resume=False)
model_.eval()

Where model_final.pth is my trained weight. Below is the truncated output of this cell which I believe means so far so good:

GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
      (res2): Sequential(
        (0): BottleneckBlock(
          (shortcut): Conv2d(
            64, 256, kernel_size=(1`

Then I used two following functions from export_model.py with some minor modifications only to run on notebook.

def export_scripting(torch_model, format, output):
    assert TORCH_VERSION >= (1, 8)
    fields = {
        "proposal_boxes": Boxes,
        "objectness_logits": Tensor,
        "pred_boxes": Boxes,
        "scores": Tensor,
        "pred_classes": Tensor,
        "pred_masks": Tensor,
        "pred_keypoints": torch.Tensor,
        "pred_keypoint_heatmaps": torch.Tensor,
    }
    assert format == "torchscript", "Scripting only supports torchscript format."

    class ScriptableAdapterBase(nn.Module):
        # Use this adapter to workaround https://github.com/pytorch/pytorch/issues/46944
        # by not retuning instances but dicts. Otherwise the exported model is not deployable
        def __init__(self):
            super().__init__()
            self.model = torch_model
            self.eval()

    if isinstance(torch_model, GeneralizedRCNN):

        class ScriptableAdapter(ScriptableAdapterBase):
            def forward(self, inputs: Tuple[Dict[str, torch.Tensor]]) -> List[Dict[str, Tensor]]:
                instances = self.model.inference(inputs, do_postprocess=False)
                return [i.get_fields() for i in instances]

    else:

        class ScriptableAdapter(ScriptableAdapterBase):
            def forward(self, inputs: Tuple[Dict[str, torch.Tensor]]) -> List[Dict[str, Tensor]]:
                instances = self.model(inputs)
                return [i.get_fields() for i in instances]

    ts_model = scripting_with_instances(ScriptableAdapter(), fields)
    with PathManager.open(os.path.join(output, "model.ts"), "wb") as f:
       torch.jit.save(ts_model, f)
    dump_torchscript_IR(ts_model, output)
    # TODO inference in Python now missing postprocessing glue code
    return None

def export_tracing(torch_model, inputs, output, format):
    logger = setup_logger()
    assert TORCH_VERSION >= (1, 8)
    image = inputs[0]["image"]
    inputs = [{"image": image}]  # remove other unused keys

    if isinstance(torch_model, GeneralizedRCNN):

        def inference(model, inputs):
            # use do_postprocess=False so it returns ROI mask
            inst = model.inference(inputs, do_postprocess=False)[0]
            return [{"instances": inst}]

    else:
        inference = None  # assume that we just call the model directly

    traceable_model = TracingAdapter(torch_model, inputs, inference)

    if format == "torchscript":
        ts_model = torch.jit.trace(traceable_model, (image,))
        with PathManager.open(os.path.join(output, "model.ts"), "wb") as f:
            torch.jit.save(ts_model, f)
        dump_torchscript_IR(ts_model, output)
    elif format == "onnx":
        with PathManager.open(os.path.join(output, "model.onnx"), "wb") as f:
            torch.onnx.export(traceable_model, (image,), f, opset_version=STABLE_ONNX_OPSET_VERSION)
    logger.info("Inputs schema: " + str(traceable_model.inputs_schema))
    logger.info("Outputs schema: " + str(traceable_model.outputs_schema))

    if format != "torchscript":
        return None
    if not isinstance(torch_model, (GeneralizedRCNN, RetinaNet)):
        return None

    def eval_wrapper(inputs):
        """
        The exported model does not contain the final resize step, which is typically
        unused in deployment but needed for evaluation. We add it manually here.
        """
        input = inputs[0]
        instances = traceable_model.outputs_schema(ts_model(input["image"]))[0]["instances"]
        postprocessed = detector_postprocess(instances, input["height"], input["width"])
        return [{"instances": postprocessed}]

    return eval_wrapper

I then used export_tracing(model_, inputs, "./", "onnx") for the second function. inputs is the output of inputs = get_sample_inputs("./sample_img.jpg"). The output of this cell is given below:

RuntimeError: 
object has no attribute nms:
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/torchvision/ops/boxes.py", line 41
        _log_api_usage_once(nms)
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
           ~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
'nms' is being compiled since it was called from '_batched_nms_coordinate_trick'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/torchvision/ops/boxes.py", line 94
    offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
    boxes_for_nms = boxes + offsets[:, None]
    keep = nms(boxes_for_nms, scores, iou_threshold)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    return keep

Also before this Runtime Error I'm getting warnings like below:

TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert t.shape[:-2] == tensors[0].shape[:-2], t.shape
python3.10/site-packages/detectron2/structures/boxes.py:151): TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if tensor.numel() == 0:

For first function I used export_scripting(model_, "torchscript", "./") and the truncated Runtime Error is as follows:

RuntimeError: 
object has no attribute nms:
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/torchvision/ops/boxes.py", line 41
        _log_api_usage_once(nms)
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
           ~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
'nms' is being compiled since it was called from '_batched_nms_vanilla'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/torchvision/ops/boxes.py", line 109
    for class_id in torch.unique(idxs):
        curr_indices = torch.where(idxs == class_id)[0]
        curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        keep_mask[curr_indices[curr_keep_indices]] = True
    keep_indices = torch.where(keep_mask)[0]
'_batched_nms_vanilla' is being compiled since it was called from 'batched_nms'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/torchvision/ops/boxes.py", line 73
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    if boxes.numel() > (4000 if boxes.device.type == "cpu" else 20000) and not torchvision._is_tracing():
        return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    else:
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
'batched_nms' is being compiled since it was called from 'batched_nms'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/detectron2/layers/nms.py", line 20
    # just call it directly.
    # Fp16 does not have enough range for batched NMS, so adding float().
    return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
'batched_nms' is being compiled since it was called from 'find_top_rpn_proposals'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/detectron2/modeling/proposal_generator/proposal_utils.py", line 121
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
'find_top_rpn_proposals' is being compiled since it was called from 'RPN.predict_proposals'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 503
        with torch.no_grad():
            pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
            return find_top_rpn_proposals(
                   ~~~~~~~~~~~~~~~~~~~~~~~
                pred_proposals,
                ~~~~~~~~~~~~~~~
                pred_objectness_logits,
                ~~~~~~~~~~~~~~~~~~~~~~~
                image_sizes,
                ~~~~~~~~~~~~
                self.nms_thresh,
                ~~~~~~~~~~~~~~~~
                self.pre_nms_topk[self.training],
                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                self.post_nms_topk[self.training],
                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                self.min_box_size,
                ~~~~~~~~~~~~~~~~~~
                self.training,
                ~~~~~~~~~~~~~ <--- HERE
            )
'RPN.predict_proposals' is being compiled since it was called from 'RPN.forward'
  File "/home/ora-rnd/.local/lib/python3.10/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 477
        else:
            losses = {}
        proposals = self.predict_proposals(
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        )
        return proposals, losses

Could you please help me to find what I'm doing wrong and in general what is the proper way to convert a detectron2 trained model to vanilla pytorch model or export it to onnx? Information of my working environment:

sys.platform                     linux
Python                           3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
numpy                            1.24.1
detectron2                       0.6 @/home/xxx/.local/lib/python3.10/site-packages/detectron2
Compiler                         GCC 11.4
CUDA compiler                    not available
DETECTRON2_ENV_MODULE            <not set>
PyTorch                          2.1.0+cu121 @/home/xxx/.local/lib/python3.10/site-packages/torch
PyTorch debug build              False
torch._C._GLIBCXX_USE_CXX11_ABI  False
GPU available                    Yes
GPU 0                            NVIDIA GeForce RTX 3050 Laptop GPU (arch=8.6)
Driver version                   535.113.01
CUDA_HOME                        None - invalid!
Pillow                           9.0.1
torchvision                      0.16.0+cu118 @/home/xxx/.local/lib/python3.10/site-packages/torchvision
torchvision arch flags           /home/xxx/.local/lib/python3.10/site-packages/torchvision/_C.so
fvcore                           0.1.5.post20221221
iopath                           0.1.9
cv2                              4.8.1
-------------------------------  ---------------------------------------------------------------------------
PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  - CuDNN 8.9.2
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=8.9.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.1.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF,

Original Q&A

TechQA.

Error in exporting detectron2 trained model to onnx or torch script model

There are 0 answers

Related Questions in PYTORCH

Related Questions in ONNX

Related Questions in DETECTRON

Related Questions in TORCHSCRIPT

Related Questions in COCO

Popular Questions

Popular Tags

Trending Questions