Source code for vformer.attention.window

import torch
import torch.nn as nn

from ..utils import ATTENTION_REGISTRY, get_relative_position_bias_index, pair


@ATTENTION_REGISTRY.register()
class WindowAttention(nn.Module):
    """
    Implementation of Window Attention introduced in:
    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/abs/2103.14030>`_

    Parameters
    ----------
    dim : int
        Number of input channels.
    window_size : int or tuple[int]
        The height and width of the window.
    num_heads : int
        Number of attention heads.
    qkv_bias : bool
        If True, adds a learnable bias to query, key and value; default is ``True``.
    qk_scale : float, optional
        Overrides the default qk scale of ``head_dim ** -0.5`` if set.
    attn_dropout : float, optional
        Dropout rate on the attention weights; default is 0.0.
    proj_dropout : float, optional
        Dropout rate on the output projection; default is 0.0.
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_dropout=0.0,
        proj_dropout=0.0,
    ):
        super(WindowAttention, self).__init__()

        self.dim = dim
        self.window_size = pair(window_size)
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim**-0.5
        self.qkv_bias = qkv_bias

        # Learnable relative position bias: one entry per relative offset and head
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(
                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads
            )
        )
        relative_position_index = get_relative_position_bias_index(self.window_size)
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.to_out_1 = nn.Sequential(nn.Softmax(dim=-1), nn.Dropout(attn_dropout))
        self.to_out_2 = nn.Sequential(nn.Linear(dim, dim), nn.Dropout(proj_dropout))

        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.2)
    def forward(self, x, mask=None):
        """
        Parameters
        ----------
        x : torch.Tensor
            Input tensor.
        mask : torch.Tensor, optional
            Attention mask used for shifted window attention. If None, plain window
            attention is used; otherwise the mask is added to the attention scores.
            For a better understanding you may refer to `this GitHub issue
            <https://github.com/microsoft/Swin-Transformer/issues/38>`_.

        Returns
        -------
        torch.Tensor
            Output tensor obtained by applying Window Attention or
            Shifted-Window Attention to the input tensor.
        """
        B_, N, C = x.shape

        # Project to queries, keys and values: (3, B_, num_heads, N, head_dim)
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product attention scores
        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        # Add the learned relative position bias, shared across all windows
        relative_position_bias = (
            self.relative_position_bias_table[self.relative_position_index.view(-1)]
            .view(
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1,
            )
            .permute(2, 0, 1)
            .contiguous()
        )
        attn = attn + relative_position_bias.unsqueeze(0)

        # Apply the shifted-window attention mask, if provided
        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)

        # Softmax + dropout, then aggregate values and project back to dim
        attn = self.to_out_1(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.to_out_2(x)

        return x
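
A minimal usage sketch, not part of the library source: it assumes the input has already been partitioned into windows, so ``x`` has shape ``(num_windows * batch, window_size * window_size, dim)``. The configuration values below (7x7 windows, 96 channels, 3 heads) are illustrative.

    import torch

    from vformer.attention.window import WindowAttention

    # Hypothetical configuration: 7x7 windows, 96 channels, 3 heads
    attn = WindowAttention(dim=96, window_size=7, num_heads=3)

    # Four windows' worth of tokens: (num_windows * batch, window_size**2, dim)
    x = torch.randn(4, 7 * 7, 96)

    out = attn(x)      # plain window attention (mask=None)
    print(out.shape)   # torch.Size([4, 49, 96])

Passing a mask of shape ``(num_windows, N, N)`` instead switches the same module to shifted-window attention, as handled by the ``if mask is not None`` branch above.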