Source code for vformer.encoder.pyramid

import torch.nn as nn
from torchvision.ops import StochasticDepth

from ..attention import SpatialAttention
from ..common.blocks import DWConv
from ..functional import PreNorm
from ..utils import ENCODER_REGISTRY

class PVTFeedForward(nn.Module):
    """
    Feed-forward block: a linear projection, an optional ReLU, an optional
    depth-wise convolution, then activation -> dropout -> linear -> dropout.

    Parameters
    ----------
    dim: int
        Dimension of the input tensor
    hidden_dim: int, optional
        Dimension of the hidden layer; falls back to ``dim`` when ``None``
    out_dim: int, optional
        Dimension of the output tensor; falls back to ``dim`` when ``None``
    activation: nn.Module
        Activation layer class (instantiated with no arguments), default is nn.GELU
    p_dropout: float
        Dropout probability used by both dropout layers, default is 0.0
    linear: bool
        When True an in-place ReLU follows the first linear layer, otherwise
        an identity is used; default is False
    use_dwconv: bool
        Whether to insert a depth-wise convolution (``DWConv``) after the
        first linear layer, default is False
    **kwargs:
        Forwarded to ``DWConv`` when ``use_dwconv`` is True and ignored
        otherwise. The parameters documented for this purpose are
        ``kernel_size_dwconv``, ``stride_dwconv``, ``padding_dwconv`` and
        ``bias_dwconv``.
    """

    def __init__(
        self,
        dim,
        hidden_dim=None,
        out_dim=None,
        activation=nn.GELU,
        p_dropout=0.0,
        linear=False,
        use_dwconv=False,
        **kwargs
    ):
        super().__init__()

        # Unspecified widths default to the input width.
        if out_dim is None:
            out_dim = dim
        if hidden_dim is None:
            hidden_dim = dim

        self.use_dwconv = use_dwconv
        self.fc1 = nn.Linear(dim, hidden_dim)

        if linear:
            self.relu = nn.ReLU(inplace=True)
        else:
            self.relu = nn.Identity()

        if use_dwconv:
            self.dw_conv = DWConv(dim=hidden_dim, **kwargs)

        # Built in this order so sub-module indices inside ``to_out`` (and
        # therefore state-dict keys) stay 0..3.
        output_layers = [
            activation(),
            nn.Dropout(p=p_dropout),
            nn.Linear(hidden_dim, out_dim),
            nn.Dropout(p=p_dropout),
        ]
        self.to_out = nn.Sequential(*output_layers)

    def forward(self, x, **kwargs):
        """
        Parameters
        ----------
        x: torch.Tensor
            Input tensor
        **kwargs:
            Passed unchanged to the depth-wise convolution when it is enabled
            (documented as ``H`` and ``W``, the height and width of the image
            patch); unused otherwise.

        Returns
        --------
        torch.Tensor
            Returns output tensor
        """
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        if self.use_dwconv:
            hidden = self.dw_conv(hidden, **kwargs)
        return self.to_out(hidden)
@ENCODER_REGISTRY.register()
class PVTEncoder(nn.Module):
    """
    Stack of ``depth`` layers, each made of a pre-normalised
    ``SpatialAttention`` block followed by a pre-normalised
    ``PVTFeedForward`` block. Both blocks are applied residually, with the
    layer's own stochastic-depth module applied to the branch output.

    Parameters
    ----------
    dim: int
        Dimension of the input tensor
    num_heads: int
        Number of attention heads
    mlp_ratio: float
        Ratio of MLP hidden dimension to embedding dimension
    depth: int
        Number of attention layers in the encoder
    qkv_bias: bool
        Whether to add a bias vector to the q,k, and v matrices
    qk_scale: float, optional
        Override default qk scale of head_dim ** -0.5 in Spatial Attention if set
    p_dropout: float
        Dropout probability, used for the attention projection and the
        feed-forward block
    attn_dropout: float
        Dropout probability passed to the attention block as ``attn_drop``
    drop_path: tuple(float)
        Stochastic depth rate for each layer; must provide at least ``depth``
        entries. Entry ``i`` is used for layer ``i``.
    activation: nn.Module
        Activation layer class passed to the feed-forward block
    use_dwconv: bool
        Whether the feed-forward block uses a depth-wise convolution
    sr_ratio: float
        Spatial Reduction ratio
    linear: bool
        Whether to use linear Spatial attention, default is ``False``.
    drop_path_mode: str
        Mode for ``torchvision.ops.StochasticDepth``, must be one of
        ``"batch"`` or ``"row"``
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio,
        depth,
        qkv_bias,
        qk_scale,
        p_dropout,
        attn_dropout,
        drop_path,
        activation,
        use_dwconv,
        sr_ratio,
        linear=False,
        drop_path_mode="batch",
    ):
        super(PVTEncoder, self).__init__()
        self.encoder = nn.ModuleList([])
        # One stochastic-depth module per layer. Previously a single
        # ``self.drop_path`` attribute was overwritten on each iteration, so
        # every layer ended up using the rate of the last layer.
        self.drop_paths = nn.ModuleList([])
        for i in range(depth):
            self.encoder.append(
                nn.ModuleList(
                    [
                        PreNorm(
                            dim=dim,
                            fn=SpatialAttention(
                                dim=dim,
                                num_heads=num_heads,
                                qkv_bias=qkv_bias,
                                qk_scale=qk_scale,
                                attn_drop=attn_dropout,
                                proj_drop=p_dropout,
                                sr_ratio=sr_ratio,
                                linear=linear,
                            ),
                        ),
                        PreNorm(
                            dim=dim,
                            fn=PVTFeedForward(
                                dim=dim,
                                hidden_dim=int(dim * mlp_ratio),
                                activation=activation,
                                p_dropout=p_dropout,
                                linear=linear,
                                use_dwconv=use_dwconv,
                            ),
                        ),
                    ]
                )
            )
            self.drop_paths.append(
                StochasticDepth(p=drop_path[i], mode=drop_path_mode)
                if drop_path[i] > 0.0
                else nn.Identity()
            )

        # Kept for backward compatibility: code that reads ``drop_path``
        # still sees the last layer's module, as before.
        self.drop_path = self.drop_paths[-1] if depth > 0 else nn.Identity()

    def forward(self, x, **kwargs):
        """
        Parameters
        ----------
        x: torch.Tensor
            Input tensor
        **kwargs:
            Passed unchanged to both the attention and the feed-forward
            block of every layer.

        Returns
        --------
        torch.Tensor
            Output of the last layer
        """
        for (prenorm_attn, prenorm_ff), layer_drop_path in zip(
            self.encoder, self.drop_paths
        ):
            x = x + layer_drop_path(prenorm_attn(x, **kwargs))
            x = x + layer_drop_path(prenorm_ff(x, **kwargs))
        return x