Why are some nn.Linear layers not quantized by PyTorch?


I'm quantizing the Swin transformer (static PTQ) using the following function:

def static_quantize(m, data_loader):
    backend = 'qnnpack'
    torch.backends.quantized.engine = backend
    m.eval()

    # Attach the default qconfig for the backend and insert observers.
    m.qconfig = torch.quantization.get_default_qconfig(backend)
    torch.quantization.prepare(m, inplace=True)

    # Calibrate the observers on up to 100 batches.
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            if i >= 100:
                break
            result = m(return_loss=False, **data)

    # Replace the observed modules with their quantized counterparts.
    torch.quantization.convert(m, inplace=True)

    return m

Most modules, including linear layers, do get quantized. However, some linear layers inside a SwinBlock are skipped, as you can see here:

(3): SwinBlockSequence(
  (blocks): ModuleList(
    (0): SwinBlock(
      (quant): Quantize(scale=tensor([0.3938]), zero_point=tensor([122]), dtype=torch.quint8)
      (dequant): DeQuantize()
      (norm1): QuantizedLayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): ShiftWindowMSA(
        (w_msa): WindowMSA(
          (quant): Quantize(scale=tensor([0.0294]), zero_point=tensor([155]), dtype=torch.quint8)
          (dequant): DeQuantize()
          (qkv): QuantizedLinear(in_features=768, out_features=2304, scale=0.039033032953739166, zero_point=133, qscheme=torch.per_tensor_affine)
          (attn_drop): Dropout(p=0, inplace=False)
          (proj): QuantizedLinear(in_features=768, out_features=768, scale=0.0369536317884922, zero_point=110, qscheme=torch.per_tensor_affine)
          (proj_drop): Dropout(p=0, inplace=False)
          (softmax): Softmax(dim=-1)
        )
        (drop): DropPath()
      )
      (norm2): QuantizedLayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ffn): FFN( // <------- HERE (children not quantized)
        (activate): GELU()
        (layers): Sequential(
          (0): Sequential(
            (0): Linear(in_features=768, out_features=3072, bias=True)
            (1): GELU()
            (2): Dropout(p=0, inplace=False)
          )
          (1): Linear(in_features=3072, out_features=768, bias=True)
          (2): Dropout(p=0, inplace=False)
        )
        (dropout_layer): DropPath()
      )
    )

I am referring to the FFN submodule, where nothing is quantized. However, it contains linear layers, which ought to pose no problems for quantization.
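
For reference, here is how the layers that remain in float after convert() can be listed (a small diagnostic sketch; m is the converted model):

for name, mod in m.named_modules():
    if isinstance(mod, torch.nn.Linear):
        print('not quantized:', name, type(mod))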

Here's how FFN is added to the module:

        _ffn_cfgs = {
            'embed_dims': embed_dims,
            'feedforward_channels': int(embed_dims * ffn_ratio),
            'num_fcs': 2,
            'ffn_drop': 0,
            'dropout_layer': dict(type='DropPath', drop_prob=drop_path),
            'act_cfg': dict(type='GELU'),
            **ffn_cfgs
        }
        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
        self.ffn = FFN(**_ffn_cfgs)

Here's the source code for FFN:


@FEEDFORWARD_NETWORK.register_module()
class FFN(BaseModule):
    """Implements feed-forward networks (FFNs) with identity connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`. Defaults: 256.
        feedforward_channels (int): The hidden dimension of FFNs.
            Defaults: 1024.
        num_fcs (int, optional): The number of fully-connected layers in
            FFNs. Default: 2.
        act_cfg (dict, optional): The activation config for FFNs.
            Default: dict(type='ReLU')
        ffn_drop (float, optional): Probability of an element to be
            zeroed in FFN. Default 0.0.
        add_identity (bool, optional): Whether to add the
            identity connection. Default: `True`.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='FFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 **kwargs):
        super().__init__(init_cfg)
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        self.activate = build_activation_layer(act_cfg)

        layers = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                Sequential(
                    Linear(in_channels, feedforward_channels), self.activate,
                    nn.Dropout(ffn_drop)))
            in_channels = feedforward_channels
        layers.append(Linear(feedforward_channels, embed_dims))
        layers.append(nn.Dropout(ffn_drop))
        self.layers = Sequential(*layers)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else torch.nn.Identity()
        self.add_identity = add_identity

    @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
    def forward(self, x, identity=None):
        """Forward function for `FFN`.

        The function would add x to the output tensor if residue is None.
        """
        out = self.layers(x)
        if not self.add_identity:
            return self.dropout_layer(out)
        if identity is None:
            identity = x
        return identity + self.dropout_layer(out)

1 Answer

Answered by corazza (accepted):

The problem is very silly: Linear in this model is not torch.nn.Linear but mmcv's thin wrapper subclass of it, and the wrapper is not supported by quantization. Eager-mode static quantization swaps modules in convert() by looking up each module's exact class in its float-to-quantized module mapping, so a subclass of nn.Linear falls through and is silently left in float.

class Linear(torch.nn.Linear):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
            out_shape = [x.shape[0], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)
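
To see the exact-type lookup in action, here is a minimal toy example (my own sketch, not the Swin code; it assumes the eager-mode API of PyTorch ~1.8 and the qnnpack backend used above):

import torch
import torch.nn as nn

class WrapperLinear(nn.Linear):  # stand-in for mmcv's Linear wrapper
    pass

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()
        self.plain = nn.Linear(8, 8)        # exact nn.Linear: gets swapped
        self.wrapped = WrapperLinear(8, 8)  # subclass: not in the mapping
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        return self.dequant(self.wrapped(self.plain(self.quant(x))))

torch.backends.quantized.engine = 'qnnpack'  # or 'fbgemm' on x86
m = Toy().eval()
m.qconfig = torch.quantization.get_default_qconfig('qnnpack')
torch.quantization.prepare(m, inplace=True)
with torch.no_grad():
    m(torch.randn(4, 8))  # one calibration pass is enough for the demo
torch.quantization.convert(m, inplace=True)
print(m)  # 'plain' becomes QuantizedLinear, 'wrapped' keeps its float class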

By the looks of it, the wrapper's special handling only kicks in on PyTorch 1.5 and earlier (empty-tensor forward is supported natively from 1.6 on). Since I'm using PyTorch 1.8.1, this can be easily remedied by modifying the FFN class to use nn.Linear directly.
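
If editing mmcv's FFN is not an option, an alternative sketch is to swap the wrapper instances for equivalent nn.Linear modules before calling static_quantize (this assumes the wrapper is importable as mmcv.cnn.bricks.wrappers.Linear; the path may differ between mmcv versions):

import torch.nn as nn
from mmcv.cnn.bricks.wrappers import Linear as MMCVLinear  # path may vary by mmcv version

def replace_wrapper_linears(module):
    # Recursively replace mmcv's Linear wrapper with a plain nn.Linear that
    # shares the same parameters, so convert() recognizes it by exact type.
    for name, child in module.named_children():
        if type(child) is MMCVLinear:
            plain = nn.Linear(child.in_features, child.out_features,
                              bias=child.bias is not None)
            plain.weight = child.weight
            plain.bias = child.bias
            setattr(module, name, plain)
        else:
            replace_wrapper_linears(child)
    return module

# usage, before quantizing:
# model = replace_wrapper_linears(model)
# model = static_quantize(model, data_loader)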