
    9i                      F   S SK Jr  S SKrS SKJs  Jr  S SKJr  S SKJr  SSK	J
r
  / SQr " S S	\R                  5      r " S
 S\R                  5      r " S S\R                  5      r " S S\R                  5      rSS\4S jjrSS\4S jjrSS\4S jjrSS\4S jjrg)    )OrderedDictN)	LayerNorm)nn   )DropPath)vit_base	vit_largevit_large_336vit_hugec                   :    \ rS rSrSrS\R                  4S jrSrg)	QuickGELU   z 
An activation function module.
xc                 :    U[         R                  " SU-  5      -  $ )NgZd;?)torchsigmoidselfr   s     e/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/ofa/vit.pyforwardQuickGELU.forward   s    5==+++     N)	__name__
__module____qualname____firstlineno____doc__r   Tensorr   __static_attributes__r   r   r   r   r      s    , ,r   r   c                      ^  \ rS rSrSr  SS\S\S\R                  4U 4S jjjrS\R                  4S jr	S\R                  4S	 jr
S
rU =r$ )ResidualAttentionBlock   a  
A residual attention block module.

step 1. Calculate the self attention in input with layer normalization.
step 2. Add input to the result of self attention's result as I.
step 3. Calculate the mlp of input I with layer normalization.
step 4. Add I to the result of mlp.
d_modeln_head	attn_maskc                   > [         TU ]  5         [        R                  " X5      U l        [        U5      U l        [        R                  " [        S[        R                  " XS-  5      4S[        5       4S[        R                  " US-  U5      4/5      5      U l        [        U5      U l        X0l        [        U5      U l        g)a  
Args:
    d_model (`int`): The embedding dimensions.
    n_head (`int`): The number of heads in self attention block.
    attn_mask (`Tensor`, **optional**, default to None):
        Attention mask using in self attention.
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
c_fc   geluc_projN)super__init__r   MultiheadAttentionattnr   ln_1
Sequentialr   Linearr   mlpln_2r&   r   	drop_path)r   r$   r%   r&   drop_path_rate	__class__s        r   r-   ResidualAttentionBlock.__init__)   s     	))':	g&	==7aK89%299Wq[':;  g&	"!.1r   r   c                     U R                   b.  U R                   R                  UR                  UR                  S9OSU l         U R	                  XUSU R                   S9S   $ )z
A wrapper of self attention .
N)dtypedeviceF)need_weightsr&   r   )r&   tor:   r;   r/   r   s     r   	attention ResidualAttentionBlock.attentionF   si     ~~) NNAGGAHH=/3 	 yy!%4>>  CCDF 	Fr   c                     XR                  U R                  U R                  U5      5      5      -   nXR                  U R                  U R	                  U5      5      5      -   nU$ N)r5   r>   r0   r3   r4   r   s     r   r   ResidualAttentionBlock.forwardP   sK    t~~diil;<<txx		!566r   )r/   r&   r5   r0   r4   r3   N        )r   r   r   r   r   intr   r   r-   r>   r   r    __classcell__r7   s   @r   r"   r"      s_     ,0 #	222 "LL2 2:F5<< F  r   r"   c                      ^  \ rS rSrSr  SS\S\S\S\R                  S\4
U 4S jjjr	S	\R                  4S
 jr
SrU =r$ )TransformerV   zg
A transformer module using in `VisionTransformer`.

Execute a sequential of `ResidualAttentionBlock`.
widthlayersheadsr&   r6   c                    > [         TU ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        XXE5      PM     sn6 U l        gs  snf )a  
Args:
    width (`int`): The width of input image.
    layers (`int`): The number of `ResidualAttentionBlock` layers.
    heads (int): The number of self attention heads.
    attn_mask (`Tensor`, **optional**, default to None):
        Attention mask using in self attention.
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
N)	r,   r-   rK   rL   r   r1   ranger"   	resblocks)r   rK   rL   rM   r&   r6   _r7   s          r   r-   Transformer.__init__]   sU    & 	
6])
" #5K")
  )
s   Ar   c                 $    U R                  U5      $ rA   )rP   r   s     r   r   Transformer.forwardx   s    ~~a  r   )rL   rP   rK   rC   )r   r   r   r   r   rE   r   r   floatr-   r   r    rF   rG   s   @r   rI   rI   V   sc     #' #  	
 <<  6! ! !r   rI   c                   p   ^  \ rS rSrSr SS\S\S\S\S\S\4U 4S	 jjjrS
\R                  4S jr
SrU =r$ )VisionTransformer|   ao  
Vision transformer module.

step 1. Using conv2d to get the image embedding.
step 2. If the resolution of input image doesn't equal to the initialized one
    do `bilinear` interpolate to get new patch position embedding.
step 3. Add position embedding to image embedding to generate final image representation.
step 4. Do `Transformer` to the image representation.
input_resolution
patch_sizerK   rL   rM   r6   c                 >  > [         TU ]  5         Xl        X l        [        R
                  " SUUUSS9U l        US-  nX0l        [        R                  " U[        R                  " X-  S-  S-   U5      -  5      U l        [        U5      U l        [        X4XVS9U l        g)	a  
Args:
    input_resolution (`int`): The resolution of input image.
    patch_size  (`int`): The resolution of each patch image.
    width (`int`): The dimension of each patch image.
    layers (`int`): The number of `ResidualAttentionBlock` in `Transformer`.
    heads (`int`): The number of heads in self attention block.
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
   F)in_channelsout_channelskernel_sizestridebiasg         r   r6   N)r,   r-   rY   rZ   r   Conv2dconv1rK   	Parameterr   randnpositional_embeddingr   ln_prerI   transformer)	r   rY   rZ   rK   rL   rM   r6   scaler7   s	           r   r-   VisionTransformer.__init__   s    ( 	 0$YY"

 t
$&LL+a/!3U:< 2< %=!&&5Ar   r   c                    UR                   S   nUR                   S   U R                  -  UR                   S   U R                  -  pCU R                  U5      nUR                  UR                   S   UR                   S   S5      nUR	                  SSS5      nX R
                  :w  a  U R                  SS  nU R
                  U R                  -  nUR                  SXfS5      R	                  SSSS5      n[        R                  " XSU4SS9nUR	                  SSSS5      R                  X4-  S5      nXR                  UR                  5      -   nO*XR                  SS  R                  UR                  5      -   nU R                  U5      nUR	                  SSS5      nU R                  U5      nUR	                  SSS5      nUR                   u  pn
UR                  SS5      R                  XX45      nU$ )	Nr   r   rb   r\   bilinear)sizemode)shaperZ   re   reshapepermuterY   rh   Finterpolater=   r:   ri   rj   	transpose)r   r   
resolutionheightrK   old_pe	patch_numnew_pebzseqhiddens              r   r   VisionTransformer.forward   s   WWR[
t69??9#JJqMIIaggaj!''!*IIaA.....qr2F--@I^^Ay$&((/1a(; ]]e_:?F^^Aq!Q/77KFIIagg&&A--ab144QWW==AKKNIIaAQIIaA''KK1%%b&@r   )re   rY   ri   rZ   rh   rj   rK   rD   )r   r   r   r   r   rE   rU   r-   r   r   r   r    rF   rG   s   @r   rW   rW   |   so    " !$%A%A %A 	%A
 %A %A %A %AN  r   rW   r6   c                 "    [        SSSSSU 5      $ )z
An instance of base vision transformer model.

Args:
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
      i   	      rW   rc   s    r   r   r      s     S"c1b.AAr   c                 "    [        SSSSSU 5      $ )z
An instance of large vision transformer model.

Args:
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
r            r   r   rc   s    r   r	   r	           S"dBNCCr   c                 "    [        SSSSSU 5      $ )a   
An instance of large vision transformer model with 336 as input image width .

Args:
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
iP  r   r   r   r   r   rc   s    r   r
   r
      r   r   c                 "    [        SSSSSU 5      $ )z
An instance of huge vision transformer model.

Args:
    drop_path_rate (`float`, **optional**, default to 0.0):
        Drop path rate. See more details about drop path from
        https://arxiv.org/pdf/1605.07648v4.pdf.
r   r   i      r   r   rc   s    r   r   r      r   r   r   )collectionsr   r   torch.nn.functionalr   
functionalrv   fairseq.modulesr   utils.utilsr   __all__Moduler   r"   rI   rW   rU   r   r	   r
   r   r   r   r   <module>r      s   
 $    %  !,		 ,4RYY 4n#!")) #!LO		 Od	BU 	B	De 	D	D% 	D	DU 	Dr   