Source code for vformer.encoder.embedding.convvt

import torch.nn as nn
from einops import rearrange


class ConvEmbedding(nn.Module):
    """
    Projects image patches into an embedding space using a convolutional layer.

    Parameters
    ----------
    patch_size: int, default is 7
        Size of a patch
    in_channels: int, default is 3
        Number of input channels
    embedding_dim: int, default is 64
        Dimension of hidden layer
    stride: int or tuple, default is 4
        Stride of the convolution operation
    padding: int, default is 2
        Padding added to all sides of the input
    """

    def __init__(
        self, patch_size=7, in_channels=3, embedding_dim=64, stride=4, padding=2
    ):
        super().__init__()

        self.patch_size = (patch_size, patch_size)
        self.proj = nn.Conv2d(
            in_channels,
            embedding_dim,
            kernel_size=self.patch_size,
            stride=stride,
            padding=padding,
        )
        self.norm = nn.LayerNorm(embedding_dim)
    def forward(self, x):
        """
        Parameters
        ----------
        x: torch.Tensor
            Input tensor

        Returns
        -------
        torch.Tensor
            Output tensor (embedding) obtained by applying the convolution
            operation on the input tensor
        """
        # Project patches into the embedding space
        x = self.proj(x)
        B, C, H, W = x.shape

        # Flatten the spatial dimensions so LayerNorm normalizes over the
        # channel dimension, then restore the (B, C, H, W) layout
        x = rearrange(x, "b c h w -> b (h w) c")
        x = self.norm(x)
        x = rearrange(x, "b (h w) c -> b c h w", h=H, w=W)

        return x
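

# Usage sketch (illustrative, not part of the original module): assuming a
# single 224x224 RGB input and the default hyperparameters documented above.
# The class is defined in this module (vformer.encoder.embedding.convvt), so
# no extra import is needed here beyond torch for the dummy input.
if __name__ == "__main__":
    import torch

    embed = ConvEmbedding(
        patch_size=7, in_channels=3, embedding_dim=64, stride=4, padding=2
    )
    x = torch.randn(1, 3, 224, 224)  # hypothetical input batch

    out = embed(x)
    # With kernel 7, stride 4, and padding 2, each spatial dimension becomes
    # floor((224 + 2 * 2 - 7) / 4) + 1 = 56
    print(out.shape)  # torch.Size([1, 64, 56, 56])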