# Source code for vformer.encoder.embedding.video_patch_embeddings

# Video Patch Embedding
# Input data dimensions are in the following format: (Batch, Time, Channels, Height, Width)

# As discussed in the paper, this embedding is just like the vanilla embedding: we take non-overlapping image patches and map them into a multi-dimensional embedding.

import torch
import torch.nn as nn
from einops.layers.torch import Rearrange


class LinearVideoEmbedding(nn.Module):
    """
    Embed every frame of a video as a sequence of linearly projected,
    non-overlapping patches.

    Parameters
    -----------
    embedding_dim: int
        Dimension of the resultant embedding
    patch_height: int
        Height of the patch
    patch_width: int
        Width of the patch
    patch_dim: int
        Size of one flattened patch, used as the input feature size of the
        linear projection. The rearrange step flattens each patch into
        `patch_height * patch_width * channels` values, so `patch_dim`
        has to equal that product.

    """

    def __init__(
        self,
        embedding_dim: int,
        patch_height: int,
        patch_width: int,
        patch_dim: int,
    ) -> None:

        super().__init__()
        self.patch_embedding = nn.Sequential(
            # Cut each frame into a grid of (h x w) patches and flatten every
            # patch (together with its channels) into a single vector.
            Rearrange(
                "b t c (h ph) (w pw) -> b t (h w) (ph pw c)",
                ph=patch_height,
                pw=patch_width,
            ),
            # Project every flattened patch to the embedding dimension.
            nn.Linear(patch_dim, embedding_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """

        Parameters
        -----------
        x: torch.Tensor
            Input tensor laid out as `(batch, time, channels, height, width)`;
            `height` and `width` are split into whole multiples of
            `patch_height` and `patch_width` respectively.

        Returns
        ----------
        torch.Tensor
            Returns patch embeddings of size `embedding_dim`, laid out as
            `(batch, time, num_patches, embedding_dim)`

        """

        return self.patch_embedding(x)


#
class TubeletEmbedding(nn.Module):
    """
    Embed a video as a sequence of linearly projected, non-overlapping
    spatio-temporal tubes (tubelets).

    Parameters
    ----------
    embedding_dim: int
        Dimension of the resultant embedding
    tubelet_t: int
        Temporal length of single tube/patch
    tubelet_h: int
        Height of single tube/patch
    tubelet_w: int
        Width of single tube/patch
    in_channels: int
        Number of channels
    """

    def __init__(
        self,
        embedding_dim: int,
        tubelet_t: int,
        tubelet_h: int,
        tubelet_w: int,
        in_channels: int,
    ) -> None:
        super().__init__()
        # Number of values in one flattened tubelet; this is the input feature
        # size of the linear projection below.
        tubelet_dim = in_channels * tubelet_h * tubelet_w * tubelet_t
        self.tubelet_embedding = nn.Sequential(
            # Cut the clip into tubes spanning `tubelet_t` frames and a
            # `tubelet_h` x `tubelet_w` window, then flatten each tube.
            Rearrange(
                "b  (t pt) c (h ph) (w pw) -> b t (h w) (pt ph pw c)",
                pt=tubelet_t,
                ph=tubelet_h,
                pw=tubelet_w,
            ),
            # Project every flattened tubelet to the embedding dimension.
            nn.Linear(tubelet_dim, embedding_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """

        Parameters
        ----------
        x: torch.Tensor
            Input tensor laid out as `(batch, time, channels, height, width)`;
            `time`, `height` and `width` are split into whole multiples of
            `tubelet_t`, `tubelet_h` and `tubelet_w` respectively.

        Returns
        ----------
        torch.Tensor
            Returns tubelet embeddings of size `embedding_dim`, laid out as
            `(batch, time // tubelet_t, num_spatial_patches, embedding_dim)`

        """
        return self.tubelet_embedding(x)