Source code for vformer.encoder.vanilla

import torch.nn as nn
from torchvision.ops import StochasticDepth

from ..attention import VanillaSelfAttention
from ..functional import PreNorm
from ..utils import ENCODER_REGISTRY
from .nn import FeedForward


@ENCODER_REGISTRY.register()
class VanillaEncoder(nn.Module):
    """
    Vanilla transformer encoder: a stack of pre-norm self-attention and
    feed-forward blocks with residual connections.

    Parameters
    ----------
    embedding_dim: int
        Dimension of the embedding
    depth: int
        Number of self-attention layers
    num_heads: int
        Number of the attention heads
    head_dim: int
        Dimension of each head
    mlp_dim: int
        Dimension of the hidden layer in the feed-forward layer
    p_dropout: float
        Dropout probability in the feed-forward layer
    attn_dropout: float
        Dropout probability in the attention layer
    drop_path_rate: float
        Stochastic drop path rate
    drop_path_mode: str
        Mode for StochasticDepth, either "batch" or "row"
    """

    def __init__(
        self,
        embedding_dim,
        depth,
        num_heads,
        head_dim,
        mlp_dim,
        p_dropout=0.0,
        attn_dropout=0.0,
        drop_path_rate=0.0,
        drop_path_mode="batch",
    ):
        super().__init__()

        self.encoder = nn.ModuleList([])
        for _ in range(depth):
            self.encoder.append(
                nn.ModuleList(
                    [
                        # Self-attention sub-block, normalized before attention
                        PreNorm(
                            dim=embedding_dim,
                            fn=VanillaSelfAttention(
                                dim=embedding_dim,
                                num_heads=num_heads,
                                head_dim=head_dim,
                                p_dropout=attn_dropout,
                            ),
                        ),
                        # Feed-forward sub-block, normalized before the MLP
                        PreNorm(
                            dim=embedding_dim,
                            fn=FeedForward(
                                dim=embedding_dim,
                                hidden_dim=mlp_dim,
                                p_dropout=p_dropout,
                            ),
                        ),
                    ]
                )
            )

        self.drop_path = (
            StochasticDepth(p=drop_path_rate, mode=drop_path_mode)
            if drop_path_rate > 0.0
            else nn.Identity()
        )
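    # Note: PreNorm (from ..functional) is assumed here to apply layer
    # normalization before calling the wrapped module, i.e. conceptually
    # PreNorm(dim, fn)(x) behaves like fn(nn.LayerNorm(dim)(x)). This is the
    # standard pre-norm transformer arrangement that forward() below relies on.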
    def forward(self, x):
        """
        Parameters
        ----------
        x: torch.Tensor
            Input tensor

        Returns
        -------
        torch.Tensor
            Returns output tensor
        """
        for attn, ff in self.encoder:
            # Residual pre-norm self-attention
            x = attn(x) + x
            # Residual pre-norm feed-forward; stochastic depth is applied
            # only to the branch, so the skip connection is always kept
            x = self.drop_path(ff(x)) + x

        return x
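A minimal usage sketch: the encoder maps a batch of token embeddings of shape
(batch_size, num_tokens, embedding_dim) to an output of the same shape. The
hyperparameter values and tensor shapes below are illustrative assumptions,
not taken from the library's tests.

if __name__ == "__main__":
    import torch

    # Illustrative hyperparameters (assumed, not prescribed by the library)
    encoder = VanillaEncoder(
        embedding_dim=192,
        depth=4,
        num_heads=3,
        head_dim=64,
        mlp_dim=384,
        p_dropout=0.1,
    )

    x = torch.randn(2, 65, 192)  # (batch_size, num_tokens, embedding_dim)
    y = encoder(x)
    assert y.shape == x.shape  # the encoder preserves the input shape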