Source code for vformer.models.classification.visformer
import einops
import torch.nn as nn
from ...attention import VanillaSelfAttention
from ...encoder.embedding.pos_embedding import PosEmbedding
from ...utils import ATTENTION_REGISTRY, MODEL_REGISTRY
class VisformerConvBlock(nn.Module):
"""
Convolution Block for Vision-Friendly transformers
https://arxiv.org/abs/2104.12533
Parameters
    ----------
in_channels: int
Number of input channels
group: int
Number of groups for convolution, default is 8
activation: torch.nn.Module
Activation function between layers, default is nn.GELU
p_dropout: float
Dropout rate, default is 0.0
"""
def __init__(self, in_channels, group=8, activation=nn.GELU, p_dropout=0.0):
super().__init__()
self.norm1 = nn.BatchNorm2d(in_channels)
self.conv1 = nn.Conv2d(in_channels, in_channels * 2, kernel_size=1, bias=False)
self.act1 = activation()
self.conv2 = nn.Conv2d(
in_channels * 2,
in_channels * 2,
kernel_size=3,
padding=1,
groups=group,
bias=False,
)
self.act2 = activation()
self.conv3 = nn.Conv2d(in_channels * 2, in_channels, kernel_size=1, bias=False)
self.drop = nn.Dropout(p_dropout)
    def forward(self, x):
"""
Parameters
----------
x: torch.Tensor
Input tensor
Returns
        -------
torch.Tensor
Returns tensor of same size as input
"""
        xt = x
        x = self.norm1(x)  # normalize the main branch; xt keeps the raw residual
        x = self.conv1(x)
x = self.act1(x)
x = self.drop(x)
x = self.conv2(x)
x = self.act2(x)
x = self.conv3(x)
x = self.drop(x)
x = x + xt
return x
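# Hedged usage sketch (not part of the original source): a quick shape check for
# VisformerConvBlock. conv1/conv3 are pointwise (1x1) and conv2 is a 3x3 grouped
# convolution with padding=1, so the residual addition is valid and the output
# keeps the input shape. All tensor sizes below are illustrative assumptions.
def _example_conv_block():
    import torch

    block = VisformerConvBlock(in_channels=16, group=8)
    x = torch.randn(2, 16, 32, 32)  # (batch, channels, height, width)
    out = block(x)
    assert out.shape == x.shape  # residual block preserves the shape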
@ATTENTION_REGISTRY.register()
class VisformerAttentionBlock(nn.Module):
"""
Attention Block for Vision-Friendly transformers
https://arxiv.org/abs/2104.12533
Parameters
----------
in_channels: int
Number of input channels
num_heads: int
Number of heads for attention, default is 8
activation: torch.nn.Module
Activation function between layers, default is nn.GELU
p_dropout: float
Dropout rate, default is 0.0
"""
def __init__(self, in_channels, num_heads=8, activation=nn.GELU, p_dropout=0.0):
super().__init__()
self.conv1 = nn.Conv2d(in_channels, in_channels * 4, kernel_size=1, bias=False)
self.act1 = activation()
self.conv2 = nn.Conv2d(in_channels * 4, in_channels, kernel_size=1, bias=False)
self.attn = VanillaSelfAttention(
in_channels, num_heads=num_heads, head_dim=in_channels // num_heads
)
self.norm1 = nn.BatchNorm2d(in_channels)
self.norm2 = nn.BatchNorm2d(in_channels)
self.drop = nn.Dropout(p_dropout)
    def forward(self, x):
"""
Parameters
----------
x: torch.Tensor
Input tensor
Returns
        -------
torch.Tensor
Returns tensor of same size as input
"""
        _, _, H, W = x.shape  # only the spatial size is needed to fold tokens back
xt = einops.rearrange(x, "b c h w -> b (h w) c")
x = self.norm1(x)
x = einops.rearrange(x, "b c h w -> b (h w) c")
x = self.attn(x)
x = xt + x
x = einops.rearrange(x, "b (h w) c -> b c h w", h=H, w=W)
xt = x
x = self.norm2(x)
x = self.conv1(x)
x = self.act1(x)
x = self.conv2(x)
x = self.drop(x)
x = xt + x
return x
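# Hedged usage sketch (not part of the original source): VisformerAttentionBlock
# flattens the (H, W) grid into a token sequence for VanillaSelfAttention and
# folds it back afterwards, so the spatial shape is preserved. Since head_dim is
# in_channels // num_heads, in_channels should be divisible by num_heads. The
# sizes below are illustrative assumptions.
def _example_attention_block():
    import torch

    block = VisformerAttentionBlock(in_channels=64, num_heads=8)
    x = torch.randn(2, 64, 14, 14)
    out = block(x)
    assert out.shape == x.shape  # both the attention and MLP branches are residual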
@MODEL_REGISTRY.register()
class Visformer(nn.Module):
"""
    A builder to construct a Vision-Friendly transformer model, as in the paper `Visformer: The Vision-friendly Transformer <https://arxiv.org/abs/2104.12533>`_
Parameters
----------
    img_size: int or tuple
Size of the input image
n_classes: int
Number of classes in the dataset
    depth: tuple[int]
        Number of blocks in each stage; an entry of 0 skips that stage
    config: tuple[int]
        Choice of convolution block (0) or attention block (1) for each stage
    channel_config: tuple[int]
        Number of channels for each stage, starting with the input channels
num_heads: int
Number of heads for attention block, default is 8
conv_group: int
Number of groups for convolution block, default is 8
p_dropout_conv: float
Dropout rate for convolution block, default is 0.0
p_dropout_attn: float
Dropout rate for attention block, default is 0.0
activation: nn.Module
Activation function between layers, default is nn.GELU
pos_embedding: bool
Whether to use positional embedding, default is True
"""
def __init__(
self,
img_size,
n_classes,
depth: tuple,
config: tuple,
channel_config: tuple,
num_heads=8,
conv_group=8,
p_dropout_conv=0.0,
p_dropout_attn=0.0,
activation=nn.GELU,
pos_embedding=True,
):
super().__init__()
        q = 0  # running index into channel_config
        assert (
            len(channel_config) == len(depth) - depth.count(0) + 2
        ), "Channel config length must equal len(depth) - depth.count(0) + 2"
assert set(config).issubset(
set([0, 1])
), "Config is not correct, should contain only 0 and 1"
self.linear = nn.Linear(channel_config[-1], n_classes)
if isinstance(img_size, int):
img_size = (img_size, img_size)
image_size = list(img_size)
assert image_size[0] // (2 ** (len(depth) + 1)) > 0, "Image size is too small"
assert image_size[1] // (2 ** (len(depth) + 1)) > 0, "Image size is too small"
self.stem = nn.ModuleList(
[
nn.Conv2d(
channel_config[q],
channel_config[q + 1],
kernel_size=7,
padding=3,
stride=2,
bias=False,
),
nn.BatchNorm2d(channel_config[q + 1]),
nn.ReLU(inplace=True),
]
)
q += 1
        emb = 2  # kernel size and stride of the next embedding convolution
image_size = [i // 2 for i in image_size]
        for i in range(len(depth)):
            if depth[i] == 0:
                # A stage of depth 0 is skipped: fold its 2x reduction into the
                # next embedding convolution and left-pad config so that later
                # stages keep their intended block type.
                emb *= 2
                config = tuple([0] + list(config))
                continue
self.stem.extend(
[
nn.Conv2d(
channel_config[q],
channel_config[q + 1],
kernel_size=emb,
stride=emb,
),
nn.BatchNorm2d(channel_config[q + 1]),
nn.ReLU(inplace=True),
]
)
image_size = [k // emb for k in image_size]
emb = 2
q += 1
if pos_embedding:
self.stem.extend(
[PosEmbedding([channel_config[q], image_size[0]], image_size[1])]
)
if config[i] == 0:
self.stem.extend(
[
VisformerConvBlock(
channel_config[q],
group=conv_group,
p_dropout=p_dropout_conv,
activation=activation,
)
for j in range(depth[i])
]
)
elif config[i] == 1:
self.stem.extend(
[
VisformerAttentionBlock(
channel_config[q],
num_heads,
activation,
p_dropout_attn,
)
for j in range(depth[i])
]
)
self.stem.extend([nn.BatchNorm2d(channel_config[-1]), nn.AdaptiveAvgPool2d(1)])
self.softmax = nn.Softmax(dim=1)
    def forward(self, x):
"""
Parameters
----------
x: torch.Tensor
Input tensor
Returns
        -------
torch.Tensor
            Returns tensor of shape `(batch_size, n_classes)` containing softmax scores
"""
        for layer in self.stem:
            x = layer(x)
        x = x.squeeze(2).squeeze(2)  # (B, C, 1, 1) -> (B, C) after global pooling
x = self.linear(x)
x = self.softmax(x)
return x
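# Hedged usage sketch (not part of the original source): building a small
# Visformer by hand. depth=(0, 2, 2, 2) skips the first stage (folding its 2x
# reduction into the next embedding conv) and config=(1, 1, 1) selects the
# attention block for the three remaining stages; channel_config needs
# len(depth) - depth.count(0) + 2 entries. These hyperparameters are
# illustrative, not a published configuration.
def _example_visformer():
    import torch

    model = Visformer(
        img_size=224,
        n_classes=10,
        depth=(0, 2, 2, 2),
        config=(1, 1, 1),
        channel_config=(3, 32, 64, 128, 256),
        num_heads=8,
    )
    scores = model(torch.randn(2, 3, 224, 224))
    assert scores.shape == (2, 10)  # softmax scores over n_classes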
@MODEL_REGISTRY.register()
def Visformer_S(img_size, n_classes, in_channels=3):
"""
    Visformer-S model from the paper: `Visformer: The Vision-friendly Transformer <https://arxiv.org/abs/2104.12533>`_
Parameters
----------
    img_size: int or tuple
Size of the input image
n_classes: int
Number of classes in the dataset
in_channels: int
Number of channels in the input
"""
return Visformer(
img_size,
n_classes,
(0, 7, 4, 4),
(0, 1, 1),
(in_channels, 32, 192, 384, 768),
num_heads=6,
)
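# Hedged usage sketch (not part of the original source): running Visformer_S on
# ImageNet-sized inputs. img_size=224 and n_classes=1000 are the usual choices
# for this model but are illustrative assumptions here.
def _example_visformer_s():
    import torch

    model = Visformer_S(img_size=224, n_classes=1000)
    scores = model(torch.randn(2, 3, 224, 224))
    assert scores.shape == (2, 1000)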
@MODEL_REGISTRY.register()
def VisformerV2_S(img_size, n_classes, in_channels=3):
"""
    VisformerV2-S model from the paper: `Visformer: The Vision-friendly Transformer <https://arxiv.org/abs/2104.12533>`_
Parameters
----------
    img_size: int or tuple
Size of the input image
n_classes: int
Number of classes in the dataset
in_channels: int
Number of channels in the input
"""
return Visformer(
img_size,
n_classes,
(1, 10, 14, 3),
(0, 0, 1, 1),
(in_channels, 32, 64, 128, 256, 512),
num_heads=6,
)
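# Worked check (illustrative, not part of the original source) of the
# channel_config length rule for VisformerV2_S: len(depth) - depth.count(0) + 2
# = 4 - 0 + 2 = 6, matching the six entries of (in_channels, 32, 64, 128, 256, 512):
# one channel count per embedding stage plus the input and stem channels.
def _check_v2s_channel_config():
    depth = (1, 10, 14, 3)
    channel_config_len = 6  # len of (in_channels, 32, 64, 128, 256, 512)
    assert channel_config_len == len(depth) - depth.count(0) + 2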
@MODEL_REGISTRY.register()
def Visformer_Ti(img_size, n_classes, in_channels=3):
"""
    Visformer-Ti model from the paper: `Visformer: The Vision-friendly Transformer <https://arxiv.org/abs/2104.12533>`_
Parameters
----------
    img_size: int or tuple
Size of the input image
n_classes: int
Number of classes in the dataset
in_channels: int
Number of channels in the input
"""
return Visformer(
img_size,
n_classes,
(0, 7, 4, 4),
(0, 1, 1),
(in_channels, 16, 96, 192, 384),
num_heads=6,
)
@MODEL_REGISTRY.register()
def VisformerV2_Ti(img_size, n_classes, in_channels=3):
"""
    VisformerV2-Ti model from the paper: `Visformer: The Vision-friendly Transformer <https://arxiv.org/abs/2104.12533>`_
Parameters
----------
    img_size: int or tuple
Size of the input image
n_classes: int
Number of classes in the dataset
in_channels: int
Number of channels in the input
"""
return Visformer(
img_size,
n_classes,
(1, 4, 6, 2),
(0, 0, 1, 1),
(in_channels, 24, 48, 96, 192, 384),
num_heads=6,
)
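# Hedged sketch (not part of the original source): each factory above is added
# to MODEL_REGISTRY, so models can plausibly be built by name, assuming the
# registry exposes a detectron2-style get() lookup. If it does not, call the
# factory function directly as in the earlier examples.
def _example_registry_lookup():
    import torch

    builder = MODEL_REGISTRY.get("VisformerV2_Ti")  # assumed registry API
    model = builder(img_size=224, n_classes=10)
    out = model(torch.randn(1, 3, 224, 224))
    assert out.shape == (1, 10)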