
    9iu              	          S SK r S SKrS SKJr  S SKJrJr  S SKrS SK	r	S SK
Js  Jr  S SK	Jr  SSKJr   " S S\R                   5      r " S	 S
\R"                  5      r " S S\R"                  5      r " S S\R"                  5      r " S S\R"                  5      r\" SSSSSS9\" SSSSSS9S.rSS jr\S:X  a  \" 5       rgg)    N)OrderedDict)TupleUnion)nn   )ViMc                   D   ^  \ rS rSrS\R
                  4U 4S jjrSrU =r$ )	LayerNorm   xc                    > UR                   n[        TU ]	  UR                  [        R
                  5      5      nUR                  U5      $ N)dtypesuperforwardtypetorchfloat32)selfr   	orig_typeret	__class__s       o/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/cv/vision_middleware/backbone.pyr   LayerNorm.forward   s6    GG	goaffU]]34xx	""     )	__name__
__module____qualname____firstlineno__r   Tensorr   __static_attributes____classcell__r   s   @r   r
   r
      s    # # #r   r
   c                   6    \ rS rSrS\R
                  4S jrSrg)	QuickGELU   r   c                 :    U[         R                  " SU-  5      -  $ )NgZd;?)r   sigmoidr   r   s     r   r   QuickGELU.forward   s    5==+++r   r   N)r   r   r   r    r   r!   r   r"   r   r   r   r&   r&      s    , ,r   r&   c                      ^  \ rS rSr SS\S\S\R                  4U 4S jjjrS\R                  4S jrS\R                  S\	4S	 jr
S
rU =r$ )ResidualAttentionBlock   d_modeln_head	attn_maskc                   > [         TU ]  5         [        R                  " X5      U l        [        U5      U l        [        R                  " [        S[        R                  " XS-  5      4S[        5       4S[        R                  " US-  U5      4/5      5      U l        [        U5      U l        X0l        [        5       U l        [        5       U l        g )Nc_fc   geluc_proj)r   __init__r   MultiheadAttentionattnr
   ln_1
Sequentialr   Linearr&   mlpln_2r1   r   vim_attvim_mlp)r   r/   r0   r1   r   s       r   r7   ResidualAttentionBlock.__init__!   s     	))':	g&	==&"))Gq["AB )+."BIIgk7$CDF GH g&	"uur   r   c                     U R                   b.  U R                   R                  UR                  UR                  S9OS U l         U R	                  XUSU R                   S9S   $ )N)r   deviceF)need_weightsr1   r   )r1   tor   rC   r9   r*   s     r   	attention ResidualAttentionBlock.attention3   so     !% : **''88 + @D 	 yy!%4>>  CCDF 	Fr   	task_namec                     U R                  U5      nXR                  U5      -   nXR                  X25      -   nU R                  U5      nXR	                  U5      -   nXR                  XB5      -   nU$ r   )r:   rF   r?   r>   r=   r@   )r   r   rH   
x_normed_1
x_normed_2s        r   r   ResidualAttentionBlock.forward:   se    YYq\
z**Z33YYq\
$$Z33r   )r9   r1   r:   r>   r=   r?   r@   r   )r   r   r   r    intr   r!   r7   rF   strr   r"   r#   r$   s   @r   r-   r-      s^    
 ,0 "LL $F5<< F	 	# 	 	r   r-   c            	       x   ^  \ rS rSr S
S\S\S\S\R                  4U 4S jjjrS\R                  4S jrS	r	U =r
$ )TransformerF   widthlayersheadsr1   c           
         > [         TU ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        XU5      PM     sn5      U l        g s  snf r   )	r   r7   rR   rS   r   
ModuleListranger-   	resblocks)r   rR   rS   rT   r1   _r   s         r   r7   Transformer.__init__H   sU    
 	
6](
" #5;"(
  (
s   Ar   c                     UR                  5       u  p4n/ n[        U R                  5       H  u  pxU" U40 UD6nUR                  U5        M!     U$ r   )size	enumeraterX   append)	r   r   kwargsLBDfeaturesiblks	            r   r   Transformer.forwardU   sM    &&(a/FAA  AOOA 0 r   )rS   rX   rR   r   )r   r   r   r    rM   r   r!   r7   r   r"   r#   r$   s   @r   rP   rP   F   sO     ,0	  "LL	   r   rP   c                   p   ^  \ rS rSrSr SS\S\S\S\S\S\4U 4S	 jjjrS
\R                  4S jr	Sr
U =r$ )VisionTransformer^   aB  
The Vision Transformer (ViT) model
Args:
    - input_resolution (int): shape of input image
    - patch_width (int): size of patch tokens
    - width (int): feature channels
    - layers (int): number of transformer layers
    - heads (int): number of multi-head attention
    - output_dim (int): output feature channels
input_resolution
patch_sizerR   rS   rT   
output_dimc                 0  > [         TU ]  5         Xl        [        R                  " SUUUSS9U l        US-  n[        R                  " U[        R                  " U5      -  5      U l	        [        R                  " U[        R                  " X-  S-  S-   U5      -  5      U l
        [        U5      U l        X-  U l        [        X4U5      U l        [        U5      U l        [        R                  " U[        R                  " X65      -  5      U l        X`l        g )N   F)in_channelsout_channelskernel_sizestridebiasg         r   )r   r7   rj   r   Conv2dconv1	Parameterr   randnclass_embeddingpositional_embeddingr
   ln_prepatch_per_siderP   transformerln_postprojrl   )	r   rj   rk   rR   rS   rT   rl   scaler   s	           r   r7   VisionTransformer.__init__j   s     	 0YY"
 t!||EEKK4F,FG$&LL+a/!3U:< 2< %=!&.<&ue< 'LLU)G!GH	$r   r   c           	         U R                  U5      nUR                  S5      nUR                  S5      nUR                  UR                  S   UR                  S   S5      nUR	                  SSS5      nU R
                  R                  UR                  5      R                  SSS5      R                  USS5      n[        R                  " XQ/SS9nXR                  R                  UR                  5      -   nU R                  U5      nUR	                  SSS5      nU R                  " U40 UD6nUS   nUR	                  SSS5      nU R                  US S 2SS S 24   5      nU R                  b  XR                  -  n/ nU HB  nUR!                  USS 2S S 2S S 24   R	                  SSS5      R                  USXD5      5        MD     UR!                  U5        U$ )Nr   rt   r   )dim)rv   r\   reshapeshapepermutery   rE   r   repeatr   catrz   r{   r}   r~   r   r^   )	r   r   r_   ra   P	cls_tokenx_per_layeroutputsoutputs	            r   r   VisionTransformer.forward   s   JJqMFF1IFF1IIIaggaj!''!*b1IIaA((++AGG4<<Q2FMMq!	IIyn)),,QWW55KKNIIaA&&q3F3OIIaALL1a7$99 IIA !FNN6!"a(+33Aq4577>wq"a7KM " 	qr   )
ry   rv   rj   r~   r{   rl   r|   rz   r   r}   )i   )r   r   r   r    __doc__rM   r7   r   r!   r   r"   r#   r$   s   @r   rh   rh   ^   sd    	" $'%#&% % % 	%
 % !% %:  r   rh         i      )rj   rk   rR   rS   rT       )vit_b16_224vit_b32_224c                 P    [         U    n[        S0 UD6nUR                  U5        U$ )zebuild a ViT + ViM model
Args:
    arch: name of backbone
    pretrained: weights of pretrained model
r   )
model_dictrh   load_state_dict)arch
pretrained
model_argsmodels       r   build_backboner      s.     D!J+
+E	*%Lr   __main__)r   N)mathoscollectionsr   typingr   r   numpynpr   torch.nn.functionalr   
functionalFvimr   r
   Moduler&   r-   rP   rh   dictr   r   r   r   r   r   r   <module>r      s     	 #       # #,		 ,$RYY $N")) 0G		 GX 	#"C"M#"C"M	

 zE r   