
# modelscope/models/cv/vidt/backbone.py
# Swin Transformer backbone extended with the Reconfigured Attention Module (RAM) of ViDT
# (see https://arxiv.org/pdf/2110.03921.pdf and https://github.com/naver-ai/vidt).

import math
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Mlp(nn.Module):
    """Multilayer perceptron."""

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
      ^   US-  nU) nUR                  S[        R                  S9nUR                  S[        R                  S9nSnXfSS2SS2SS24   U-   -  U-  nXwSS2SS2SS24   U-   -  U-  n[        R                  " U[        R                  U R                  S9n	USU	S-  -  U-  -  n	USS2SS2SS2S4   U	-  n
USS2SS2SS2S4   U	-  n[        R
                  " U
SS2SS2SS2SSS24   R                  5       U
SS2SS2SS2SSS24   R                  5       4S	S
9R                  S5      n
[        R
                  " USS2SS2SS2SSS24   R                  5       USS2SS2SS2SSS24   R                  5       4S	S
9R                  S5      n[        R                  " X4SS
9nU$ )a  Masked Sinusoidal Positional Encoding

Args:
    x: [PATCH] tokens
    mask: the padding mask for [PATCH] tokens
    num_pos_feats: the size of channel dimension
    temperature: the temperature value
    scale: the normalization scale

Returns:
    pos: Sinusoidal positional encodings
r+      )dtypegư>N)r.   devicer      dim   )
cumsumtorchfloat32aranger0   stacksincosflattencat)r   masknum_pos_featstemperaturescalenot_masky_embedx_embedepsdim_tpos_xpos_yposs                r   masked_sin_pos_encodingrJ   *   s   $ "Q&MuHooau}}o5Gooau}}o5G
CBC+c12U:GArs+c12U:GLLemmAHHME!uz*]:;EAq!TM"U*EAq!TM"U*EKK	q!Q1}		!	!	#U1aADqD=%9%=%=%?@wqz 
 KK	q!Q1}		!	!	#U1aADqD=%9%=%=%?@wqz 
 ))UN
*CJr   c                     U R                   u  p#pEU R                  X#U-  XU-  UU5      n U R                  SSSSSS5      R                  5       R                  SXU5      nU$ )z
def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows
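# Illustrative sketch (not part of the library): window_partition and window_reverse
# (defined below) are exact inverses for spatial sizes divisible by the window size.
# The shapes are assumptions for the example only.
#
#   x = torch.randn(2, 8, 8, 96)                      # (B, H, W, C)
#   windows = window_partition(x, window_size=4)      # (2 * 2*2, 4, 4, 96)
#   x_back = window_reverse(windows, 4, H=8, W=8)     # (2, 8, 8, 96)
#   assert torch.equal(x, x_back)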
def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class ReconfiguredAttentionModule(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias -> extended with RAM.
It supports both of shifted and non-shifted window.

!!!!!!!!!!! IMPORTANT !!!!!!!!!!!
The original attention module in Swin is replaced with the reconfigured attention module in Section 3.
All the Args are shared, so only the forward function is modified.
See https://arxiv.org/pdf/2110.03921.pdf
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Args:
    dim (int): Number of input channels.
    window_size (tuple[int]): The height and width of the window.
    num_heads (int): Number of attention heads.
    qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
    attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
    proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                        num_heads))  # (2*Wh-1) * (2*Ww-1), nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer('relative_position_index', relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, det, mask=None, cross_attn=False, cross_attn_mask=None):
        """ Forward function.
RAM module receives [Patch] and [DET] tokens and returns their calibrated ones

Args:
    x: [PATCH] tokens
    det: [DET] tokens
    mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None -> mask for shifted window attention

    "additional inputs for RAM"
    cross_attn: whether to use cross-attention [det x patch] (for selective cross-attention)
    cross_attn_mask: mask for cross-attention

Returns:
    patch_x: the calibrated [PATCH] tokens
    det_x: the calibrated [DET] tokens
        """
        assert self.window_size[0] == self.window_size[1]
        window_size = self.window_size[0]
        local_map_size = window_size * window_size

        # joint qkv projection over [PATCH] (+ optional cross [PATCH]) and [DET] tokens
        if not cross_attn:
            B, H, W, C = x.shape
            N = H * W
            x = x.view(B, N, C)
            orig_x = torch.cat([x, det], dim=1)
            full_qkv = self.qkv(orig_x)
            patch_qkv, det_qkv = full_qkv[:, :N, :], full_qkv[:, N:, :]
        else:
            # for cross-attention, [DET] tokens also attend to the unshifted [PATCH] tokens
            B, H, W, C = x[0].shape
            N = H * W
            _, ori_H, ori_W, _ = x[1].shape
            ori_N = ori_H * ori_W
            shifted_x = x[0].view(B, N, C)
            cross_x = x[1].view(B, ori_N, C)
            orig_x = torch.cat([shifted_x, cross_x, det], dim=1)
            full_qkv = self.qkv(orig_x)
            patch_qkv, cross_patch_qkv, det_qkv = \
                full_qkv[:, :N, :], full_qkv[:, N:N + ori_N, :], full_qkv[:, N + ori_N:, :]
        patch_qkv = patch_qkv.view(B, H, W, -1)

        # window partitioning for [PATCH] x [PATCH] self-attention
        patch_qkv = window_partition(patch_qkv, window_size)  # nW*B, ws, ws, 3C
        B_ = patch_qkv.shape[0]
        _patch_qkv = patch_qkv.reshape(B_, local_map_size, 3, self.num_heads,
                                       C // self.num_heads)
        _patch_qkv = _patch_qkv.permute(2, 0, 3, 1, 4)
        patch_q, patch_k, patch_v = _patch_qkv[0], _patch_qkv[1], _patch_qkv[2]

        # window attention with relative position bias
        patch_q = patch_q * self.scale
        patch_attn = patch_q @ patch_k.transpose(-2, -1)
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1], -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        patch_attn = patch_attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # mask for the shifted-window attention
            nW = mask.shape[0]
            tmp0 = patch_attn.view(B_ // nW, nW, self.num_heads, local_map_size,
                                   local_map_size)
            tmp1 = mask.unsqueeze(1).unsqueeze(0)
            patch_attn = tmp0 + tmp1
            patch_attn = patch_attn.view(-1, self.num_heads, local_map_size,
                                         local_map_size)

        patch_attn = self.softmax(patch_attn)
        patch_attn = self.attn_drop(patch_attn)
        patch_x = (patch_attn @ patch_v).transpose(1, 2).reshape(
            B_, window_size, window_size, C)

        # [DET] attention: [DET] x [DET], plus [DET] x [PATCH] when cross_attn is set
        det_qkv = det_qkv.reshape(B, -1, 3, self.num_heads, C // self.num_heads)
        det_qkv = det_qkv.permute(2, 0, 3, 1, 4)
        det_q, det_k, det_v = det_qkv[0], det_qkv[1], det_qkv[2]

        if cross_attn:
            cross_patch_qkv = cross_patch_qkv.reshape(B, ori_H, ori_W, 3,
                                                      self.num_heads,
                                                      C // self.num_heads)
            patch_kv = cross_patch_qkv[:, :, :, 1:, :, :].permute(
                3, 0, 4, 1, 2, 5).contiguous()
            patch_kv = patch_kv.view(2, B, self.num_heads, ori_N, -1)
            cross_patch_k, cross_patch_v = patch_kv[0], patch_kv[1]
            det_k = torch.cat([cross_patch_k, det_k], dim=2)
            det_v = torch.cat([cross_patch_v, det_v], dim=2)

        det_q = det_q * self.scale
        det_attn = det_q @ det_k.transpose(-2, -1)
        if cross_attn_mask is not None:
            det_attn = det_attn + cross_attn_mask
        det_attn = self.softmax(det_attn)
        det_attn = self.attn_drop(det_attn)
        det_x = (det_attn @ det_v).transpose(1, 2).reshape(B, -1, C)

        # reverse the window partitioning and project [PATCH, DET] tokens together
        patch_x = window_reverse(patch_x, window_size, H, W)
        x = torch.cat([patch_x.view(B, H * W, C), det_x], dim=1)
        x = self.proj(x)
        x = self.proj_drop(x)

        patch_x = x[:, :H * W, :].view(B, H, W, C)
        det_x = x[:, H * W:, :]

        return patch_x, det_x


class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block.

Args:
    dim (int): Number of input channels.
    num_heads (int): Number of attention heads.
    window_size (int): Window size.
    shift_size (int): Shift size for SW-MSA.
    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
    qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
    drop (float, optional): Dropout rate. Default: 0.0
    attn_drop (float, optional): Attention dropout rate. Default: 0.0
    drop_path (float, optional): Stochastic depth rate. Default: 0.0
    act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
    norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'

        self.norm1 = norm_layer(dim)
        self.attn = ReconfiguredAttentionModule(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix, pos, cross_attn, cross_attn_mask):
        """ Forward function.

Args:
    x: Input feature, tensor size (B, H*W + DET, C), i.e., the bound [PATCH, DET] tokens
    H, W: Spatial resolution of the input feature.
    mask_matrix: Attention mask for cyclic shift.

    "additional inputs'
    pos: (patch_pos, det_pos)
    cross_attn: whether to use cross attn [det x [det + patch]]
    cross_attn_mask: attention mask for cross-attention

Returns:
    x: calibrated & bound [PATCH, DET] tokens
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W + self.det_token_num, 'input feature has wrong size'

        shortcut = x
        x = self.norm1(x)

        # separate the [PATCH] and [DET] tokens
        x, det = x[:, :H * W, :], x[:, H * W:, :]
        x = x.view(B, H, W, C)
        orig_x = x

        # pad the [PATCH] tokens to multiples of the window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # patch_pos is the sinusoidal encoding (cross-attention stages only);
        # det_pos is learnable and projected to the block dimension
        patch_pos, det_pos = pos
        det_pos = self.det_pos_linear(det_pos)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(
                x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        if cross_attn:
            # [DET] tokens additionally cross-attend to the unshifted [PATCH] tokens
            cross_patch = orig_x + patch_pos
            det = det + det_pos
            shifted_x = (shifted_x, cross_patch)
        else:
            det = det + det_pos

        # W-MSA / SW-MSA through the reconfigured attention module
        shifted_x, det = self.attn(
            shifted_x,
            mask=attn_mask,
            det=det,
            cross_attn=cross_attn,
            cross_attn_mask=cross_attn_mask)

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(
                shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        # bind [PATCH] and [DET] tokens again
        x = x.view(B, H * W, C)
        x = torch.cat([x, det], dim=1)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


class PatchMerging(nn.Module):
    """ Patch Merging Layer

Args:
    dim (int): Number of input channels.
    norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm, expand=True):
        super().__init__()
        self.dim = dim

        # if expand is True the channel size is doubled; otherwise it is kept at 256
        # for the last stage (the channel size expected by the neck decoder)
        expand_dim = 2 * dim if expand else 256
        self.reduction = nn.Linear(4 * dim, expand_dim, bias=False)
        self.norm = norm_layer(4 * dim)

        # projection kept for the [DET] tokens (not used in the forward pass below)
        self.expansion = nn.Linear(dim, expand_dim, bias=False)
        self.norm2 = norm_layer(dim)

    def forward(self, x, H, W):
        """ Forward function.

    x: Input feature, tensor size (B, H*W, C), i.e., binded [PATCH, DET] tokens
    H, W: Spatial resolution of the input feature.

Returns:
    x: merged [PATCH, DET] tokens;
    only [PATCH] tokens are reduced in spatial dim, while [DET] tokens is fix-scale
r   Nr+   r-   r   r/   r1   r2   )
rM   r   rN   r   r   r6   r=   repeatr   r   )r   r   rS   rT   rR   r   rU   r   	pad_inputx0x1x2x3s                r   r    PatchMerging.forward  s    ''aED....N0NN.1fqufa<!AquvqL/3FF1 UaZ0QUaZ	a!Q1q5!QU34Aq!$Q$1a q!$Q$1a q!$Q$1a q!$Q$1a IIrr&+FF1b!a%  jjAq!IIqhA&IIaLNN1r   )r3   r   r   r   r   r"   r#   r$   r%   r&   r   r   r   r    r(   r)   r*   s   @r   r   r     s"     (*||D %% %r   r   c                   d   ^  \ rS rSrSrSSSSSSS\R                  SSS4U 4S	 jjrSS
 jrSr	U =r
$ )
BasicLayeri  a  A basic Swin Transformer layer for one stage.

Args:
    dim (int): Number of feature channels
    depth (int): Depths of this stage.
    num_heads (int): Number of attention head.
    window_size (int): Local window size. Default: 7.
    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
    qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
    drop (float, optional): Dropout rate. Default: 0.0
    attn_drop (float, optional): Attention dropout rate. Default: 0.0
    drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
    norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
    use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
r   r   TNr	   Fc                   > [         TU ]  5         X@l        US-  U l        X l        Xl        Xl        [        R                  " [        U5       Vs/ s H=  n[        UUUUS-  S:X  a  SOUS-  UUUUU	[        U
[        5      (       a  X   OU
US9PM?     sn5      U l        Ub  U" XU(       + S9U l        g S U l        g s  snf )Nr+   r   )r3   re   rQ   r   r   rr   rs   r   rm   r   r   )r3   r   r   )r   r   rQ   r   depthr3   use_checkpointr   
ModuleListranger   
isinstancelistblocks
downsample)r   r3   r   re   rQ   r   rr   rs   r   rm   r   r   r   lastr   ir   s                   r   r   BasicLayer.__init__/  s      	&%*
, mm 16e%
 1=1 !#'!"Q!1+2B#!!#i.. $,4=%' 1=%
 " !(DCDO #DO+%
s   AB<c           
         UR                   S   n[        [        R                  " X R                  -  5      5      U R                  -  n[        [        R                  " X0R                  -  5      5      U R                  -  n	[
        R                  " SXS4UR                  S9n
[        SU R                  * 5      [        U R                  * U R                  * 5      [        U R                  * S5      4n[        SU R                  * 5      [        U R                  * U R                  * 5      [        U R                  * S5      4nSnU H  nU H  nXSS2XSS24'   US-  nM     M     [        XR                  5      nUR                  SU R                  U R                  -  5      nUR                  S5      UR                  S5      -
  nUR                  US:g  [        S5      5      R                  US:H  [        S5      5      nU(       Ga  UR                   SS u  nnUU:X  a  UU:X  dG  [        R                   " US   R                  5       X#4S	9R#                  [
        R$                  5      S   n['        XU R(                  5      nUR                  5       nUR                  US:g  [        S5      5      R                  US:H  [        S5      5      nUR                  XrU-  5      R                  S5      R                  S5      n[        R*                  " USU R,                  4SS
9nOSnSnUU4n[/        U R0                  5       H`  u  nnX#sUl        Ul        U(       a  SnUnUnOSnSnSU4nU R6                  (       a  [8        R8                  " UUUUUUS9nMV  U" UUUUUS9nMb     U R:                  b)  U R;                  XU5      nUS-   S-  US-   S-  nnXUUUU4$ XX1X#4$ )a  Forward function.

Args:
    x: Input feature, tensor size (B, H*W, C).
    H, W: Spatial resolution of the input feature.
    det_pos: pos encoding for det token
    input_mask: padding mask for inputs
    cross_attn: whether to use cross attn [det x [det + patch]]
r   r-   )r0   Nr/   r+   g      Yr	   size)valueTF)rI   r   r   )rM   rY   npceilrQ   r6   rg   r0   slicer   rW   rN   r   masked_fillfloatr   interpolatetoboolrJ   r3   r   r   	enumerater   rS   rT   r   
checkpointr   )r   r   rS   rT   r   
input_maskr   rR   r   r   img_maskh_slicesw_slicescnthwmask_windowsr   _H_Wr   r   rI   n_blkblk_cross_attn_cross_attn_mask_posx_downWhWws                                  r   r    BasicLayer.forward^  s    GGAJ ---./$2B2BB---./$2B2BB;;21~ahh?!d.../4+++??*,-2DOO3CT-JL !d.../4+++??*,-2DOO3CT-JL A'*A!$q   (2B2BC#(()-)9)9D<L<L)LN **1-0F0Fq0II	)))q.*/-99D.71neCj:J 	
 %%ab)FB!Ga]]t$**,A6;;=2ejj>!M
 0txxHI )..0O-99/S:PRWX^R_`Os2E#J?  .22q5#)A,yy| ee!T%7%7!8CO I"O '"#DKK0JE3LCE35 "#2 ##' g""))*$46 *$46/ 1B ??&__Q1-F!e\AEa<BFB**q##r   )r   r   r3   r   r   r   rQ   )Fr   r*   s   @r   r   r     sA    , LL  %-#^j$ j$r   r   c                   >   ^  \ rS rSrSr    SU 4S jjrS rSrU =r$ )
PatchEmbedi  a,  Image to Patch Embedding

Args:
    patch_size (int): Patch token size. Default: 4.
    in_chans (int): Number of input image channels. Default: 3.
    embed_dim (int): Number of linear projection output channels. Default: 96.
    norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Forward function."""
        # pad the input so that both sides are divisible by the patch size
        _, _, H, W = x.size()
        if W % self.patch_size[1] != 0:
            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
        if H % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is not None:
            Wh, Ww = x.size(2), x.size(3)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

        return x


class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
      https://arxiv.org/pdf/2103.14030

Args:
    pretrain_img_size (int): Input image size for training the pretrained model,
        used in absolute position embedding. Default 224.
    patch_size (int | tuple(int)): Patch size. Default: 4.
    in_chans (int): Number of input image channels. Default: 3.
    embed_dim (int): Number of linear projection output channels. Default: 96.
    depths (tuple[int]): Depths of each Swin Transformer stage.
    num_heads (tuple[int]): Number of attention head of each stage.
    window_size (int): Window size. Default: 7.
    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
    qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
    qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
    drop_rate (float): Dropout rate.
    attn_drop_rate (float): Attention dropout rate. Default: 0.
    drop_path_rate (float): Stochastic depth rate. Default: 0.2.
    norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
    ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
    patch_norm (bool): If True, add normalization after patch embedding. Default: True.
    out_indices (Sequence[int]): Output from which stages.
    frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
        -1 means not freezing any args.
    use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(1, 2, 3),
                 frozen_stages=-1,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = nn.Parameter(
                torch.zeros(1, embed_dim, patches_resolution[0],
                            patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build stages; every stage (including the last) has a PatchMerging layer,
        # and the last one keeps its channel size fixed (expand=False)
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging,
                last=(i_layer == self.num_layers - 1),
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'det_pos_embed', 'det_token'}

    def finetune_det(self,
                     method,
                     det_token_num=100,
                     pos_dim=256,
                     cross_indices=[3]):
        """ A function to add necessary (learnable) variables to Swin Transformer for object detection

Args:
    method: vidt or vidt_wo_neck
    det_token_num: the number of objects to detect, i.e., the number of object queries
    pos_dim: the channel dimension of positional encodings for [DET] and [PATCH] tokens
    cross_indices: the indices where to use the [DET X PATCH] cross-attention
        there are four possible stages in [0, 1, 2, 3]. 3 indicates Stage 4 in the ViDT paper.
        """

        # which method?
        self.method = method

        # learnable [DET] tokens
        self.det_token_num = det_token_num
        self.det_token = nn.Parameter(
            torch.zeros(1, det_token_num, self.num_features[0]))
        self.det_token = trunc_normal_(self.det_token, std=.02)

        # learnable positional encoding for the [DET] tokens
        self.pos_dim = pos_dim
        det_pos_embed = torch.zeros(1, det_token_num, pos_dim)
        det_pos_embed = trunc_normal_(det_pos_embed, std=.02)
        self.det_pos_embed = torch.nn.Parameter(det_pos_embed)

        # channel dims of the multi-scale outputs fed to the neck
        self.num_channels = [
            self.num_features[i + 1] for i in range(len(self.num_features) - 1)
        ]
        if method == 'vidt':
            self.num_channels.append(self.pos_dim)  # default: 256 (same as pos_dim)
        self.cross_indices = cross_indices
        # divisor to reduce the spatial size of the mask used for cross-attention
        self.mask_divisor = 2**(len(self.layers) - len(self.cross_indices))

        # projection matrices for the [DET] pos encoding in every block
        for layer in self.layers:
            layer.det_token_num = det_token_num
            if layer.downsample is not None:
                layer.downsample.det_token_num = det_token_num
            for block in layer.blocks:
                block.det_token_num = det_token_num
                block.det_pos_linear = nn.Linear(pos_dim, block.dim)

        # the neck-free variant does not downsample at the last stage
        if method == 'vidt_wo_neck':
            self.layers[-1].downsample = None

    def forward(self, x, mask):
        """ Forward function.

Args:
    x: input rgb images
    mask: input padding masks [0: rgb values, 1: padded values]

Returns:
    patch_outs: multi-scale [PATCH] tokens (four scales are used)
        these tokens are the first input of the neck decoder
    det_tgt: final [DET] tokens obtained at the last stage
        these tokens are the second input of the neck decoder
    det_pos: the learnable pos encoding for [DET] tokens.
        these encodings are used to generate reference points in deformable attention
        """
        B, ori_H, ori_W = x.shape[0], x.shape[2], x.shape[3]

        # patch embedding
        x = self.patch_embed(x)
        Wh, Ww = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B Wh*Ww C
        x = self.pos_drop(x)

        # expand the [DET] tokens and their pos encoding for the whole batch
        det_token = self.det_token.expand(B, -1, -1)
        det_pos = self.det_pos_embed

        # reduce the padding mask to the resolution of the cross-attention stage
        mask = F.interpolate(
            mask[None].float(),
            size=(Wh // self.mask_divisor, Ww // self.mask_divisor)).to(torch.bool)[0]

        patch_outs = []
        for stage in range(len(self.layers)):
            layer = self.layers[stage]

            # use [DET] x [PATCH] cross-attention only at the selected stages
            cross_attn = True if stage in self.cross_indices else False

            # bind [PATCH] and [DET] tokens
            x = torch.cat([x, det_token], dim=1)

            x_out, H, W, x, Wh, Ww = layer(
                x, Wh, Ww, det_pos=det_pos, input_mask=mask, cross_attn=cross_attn)

            # separate the [PATCH] and [DET] tokens for the next stage
            x, det_token = x[:, :-self.det_token_num, :], x[:, -self.det_token_num:, :]

            # aggregate the intermediate [PATCH] outputs
            if stage > 0:
                patch_out = x_out[:, :-self.det_token_num, :].view(
                    B, H, W, -1).permute(0, 3, 1, 2)
                patch_outs.append(patch_out)

        # [PATCH] tokens reduced from the last stage output
        patch_outs.append(x.view(B, Wh, Ww, -1).permute(0, 3, 1, 2))

        # final [DET] tokens and their pos encoding for the neck decoder
        det_tgt = x_out[:, -self.det_token_num:, :].permute(0, 2, 1)
        det_pos = det_pos.permute(0, 2, 1)

        features_0, features_1, features_2, features_3 = patch_outs
        return features_0, features_1, features_2, features_3, det_tgt, det_pos

    def train(self, mode=True):
        """Convert the model into training mode while keep layers freezed."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()

    def flops(self):
        # mirrors the flops counter of the original Swin implementation; it assumes the
        # classification-style attributes (patches_resolution, num_classes) are present
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
        flops += self.num_features[-1] * self.patches_resolution[
            0] * self.patches_resolution[1] // (2**self.num_layers)
        flops += self.num_features[-1] * self.num_classes
        return flops