
    9i/                        S SK r S SKrS SKrS SKrS SKJr  S SKJrJr  S SK	r
S SKrS SKJs  Jr  S SKJr  S SKJr  S SKJr   " S S\R&                  5      r " S	 S
\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      rS\4S jr\R6                  R9                  5       (       a  SOSS4S\S\\\R<                  4   4S jjrg)    N)OrderedDict)TupleUnion)nn)tqdm)
TorchModelc                   D   ^  \ rS rSrS\R
                  4U 4S jjrSrU =r$ )	LayerNorm   xc                    > UR                   n[        TU ]	  UR                  [        R
                  5      5      nUR                  U5      $ N)dtypesuperforwardtypetorchfloat32)selfr   	orig_typeret	__class__s       k/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/cv/vop_retrieval/backbone.pyr   LayerNorm.forward   s6    GG	goaffU]]34xx	""     )	__name__
__module____qualname____firstlineno__r   Tensorr   __static_attributes____classcell__r   s   @r   r
   r
      s    # # #r   r
   c                   6    \ rS rSrS\R
                  4S jrSrg)	QuickGELU   r   c                 :    U[         R                  " SU-  5      -  $ )NgZd;?)r   sigmoidr   r   s     r   r   QuickGELU.forward    s    5==+++r   r   N)r   r   r   r    r   r!   r   r"   r   r   r   r&   r&      s    , ,r   r&   c                      ^  \ rS rSr S
S\S\S\R                  4U 4S jjjrS\R                  4S jrS\R                  4S jr	S	r
U =r$ )ResidualAttentionBlock$   d_modeln_head	attn_maskc                 l  > [         TU ]  5         [        R                  " X5      U l        [        U5      U l        [        R                  " [        S[        R                  " XS-  5      4S[        5       4S[        R                  " US-  U5      4/5      5      U l        [        U5      U l        X0l        g )Nc_fc   geluc_proj)r   __init__r   MultiheadAttentionattnr
   ln_1
Sequentialr   Linearr&   mlpln_2r1   )r   r/   r0   r1   r   s       r   r7   ResidualAttentionBlock.__init__&   s     	))':	g&	==&"))Gq["AB )+."BIIgk7$CDF GH g&	"r   r   c                     U R                   b.  U R                   R                  UR                  UR                  S9OS U l         U R	                  XUSU R                   S9S   $ )Nr   deviceF)need_weightsr1   r   )r1   tor   rB   r9   r*   s     r   	attention ResidualAttentionBlock.attention4   so     !% : **''88 + @D 	 yy!%4>>  CCDF 	Fr   c                     XR                  U R                  U5      5      -   nXR                  U R                  U5      5      -   nU$ r   )rE   r:   r=   r>   r*   s     r   r   ResidualAttentionBlock.forward;   s9    tyy|,,1&&r   )r9   r1   r:   r>   r=   r   )r   r   r   r    intr   r!   r7   rE   r   r"   r#   r$   s   @r   r-   r-   $   sW    
 ,0### "LL# #F5<< F  r   r-   c            	       x   ^  \ rS rSr S
S\S\S\S\R                  4U 4S jjjrS\R                  4S jrS	r	U =r
$ )TransformerA   widthlayersheadsr1   c           
         > [         TU ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        XU5      PM     sn6 U l        g s  snf r   )	r   r7   rM   rN   r   r;   ranger-   	resblocks)r   rM   rN   rO   r1   _r   s         r   r7   Transformer.__init__C   sU    
 	
6])
" #5;")
  )
s   Ar   c                 $    U R                  U5      $ r   )rR   r*   s     r   r   Transformer.forwardP   s    ~~a  r   )rN   rR   rM   r   )r   r   r   r    rI   r   r!   r7   r   r"   r#   r$   s   @r   rK   rK   A   sO     ,0	  "LL	 ! ! !r   rK   c                   f   ^  \ rS rSrS\S\S\S\S\S\4U 4S jjrS	\R                  4S
 jrSr	U =r
$ )VisualTransformerT   input_resolution
patch_sizerM   rN   rO   
output_dimc                   > [         TU ]  5         Xl        X`l        [        R
                  " SUUUSS9U l        US-  n[        R                  " U[        R                  " U5      -  5      U l
        [        R                  " U[        R                  " X-  S-  S-   U5      -  5      U l        [        U5      U l        [        X4U5      U l        [        U5      U l        [        R                  " U[        R                  " X65      -  5      U l        g )N   F)in_channelsout_channelskernel_sizestridebias            )r   r7   rZ   r\   r   Conv2dconv1	Parameterr   randnclass_embeddingpositional_embeddingr
   ln_prerK   transformerln_postproj)	r   rZ   r[   rM   rN   rO   r\   scaler   s	           r   r7   VisualTransformer.__init__V   s     0$YY"
 t!||EEKK4F,FG$&LL+a/!3U:< 2< %=!&&ue< 'LLU)G!GH	r   r   c                    U R                  U5      nUR                  UR                  S   UR                  S   S5      nUR                  SSS5      nU R                  R                  UR                  5      n[        R                  " UR                  S   SUR                  S   UR                  UR                  S9nX#-   n[        R                  " X!/SS9nXR                  R                  UR                  5      -   nU R                  U5      nUR                  SSS5      nU R                  U5      nUR                  SSS5      nU R                  US S 2SS S 24   5      nU R                  b  XR                  -  nU$ )Nr   rf   re   rA   dim)rh   reshapeshapepermuterk   rD   r   r   zerosrB   catrl   rm   rn   ro   rp   )r   r   x_1x_2s       r   r   VisualTransformer.forwardm   s8   JJqMIIaggaj!''!*b1IIaA""%%agg.kkGGAJ1772;aggahhHiIIshA&)),,QWW55KKNIIaAQIIaALL1a7$99 IIAr   )	rk   rh   rZ   ro   rm   r\   rl   rp   rn   )r   r   r   r    rI   r7   r   r!   r   r"   r#   r$   s   @r   rX   rX   T   sQ    I I# Ic II%(I69I.  r   rX   c                      ^  \ rS rSrS\S\S\\\\\\4   \4   S\S\S\S\S	\S
\S\4U 4S jjrS rS r	\
S 5       rS rSS jrS rSrU =r$ )CLIP   	embed_dimimage_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc           	      |  > [         TU ]  5         X`l        US-  n[        UUUUUUS9U l        [        UU
U	U R                  5       S9U l        Xpl        [        R                  " Xx5      U l        [        R                  " [        R                  " U R                  U5      5      U l        [!        U5      U l        [        R                  " [        R                  " X5      5      U l        [        R                  " [        R&                  " / 5      [(        R*                  " S5      -  5      U l        U R/                  5         g )N@   )rZ   r[   rM   rN   rO   r\   )rM   rN   rO   r1   g$I$I,@)r   r7   r   rX   visualrK   build_attention_maskrn   r   r   	Embeddingtoken_embeddingri   r   emptyrl   r
   ln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)r   r   r   r   r   r   r   r   r   r   r   vision_headsr   s               r   r7   CLIP.__init__   s    
 	,#r)'-(  " '#%#//1	3 %!||JJ$&LLKK++->?%A!!"34!||KK)5 7<<

29I(IJ""$r   c                 .   [         R                  R                  U R                  R                  SS9  [         R                  R                  U R
                  SS9  U R                  R                  S-  SU R                  R                  -  S-  -  nU R                  R                  S-  nSU R                  R                  -  S-  nU R                  R                   H  n[         R                  R                  UR                  R                  US9  [         R                  R                  UR                  R                  R                  US9  [         R                  R                  UR                  R                  R                  US9  [         R                  R                  UR                  R                  R                  US9  M     U R                   b@  [         R                  R                  U R                   U R                  R                  S-  S9  g g )Ng{Gz?)stdg{Gz?rd   re   )r   initnormal_r   weightrl   rn   rM   rN   rR   r9   in_proj_weightout_projr=   r3   r6   r   )r   proj_stdattn_stdfc_stdblocks        r   r   CLIP.initialize_parameters   s   
,,33>
11t<$$**D0!!(((4/1##))4/d&&,,,t3%%//EGGOOEJJ558ODGGOOEJJ//66HOEGGOOEIINN11vO>GGOOEII,,33OB	 0 +GGOO$$$*:*:*@*@$*F  H ,r   c                     [         R                  " U R                  U R                  5      nUR                  [	        S5      5        UR                  S5        U$ )Nz-infrf   )r   r   r   fill_floattriu_)r   masks     r   r   CLIP.build_attention_mask   s>    {{4..0C0CD

5=!

1r   c                 V    U R                   R                  R                  R                  $ r   )r   rh   r   r   )r   s    r   r   
CLIP.dtype   s    {{  ''---r   c                 V    U R                  UR                  U R                  5      5      $ r   )r   r   r   )r   images     r   encode_imageCLIP.encode_image   s    {{5::djj122r   c                    U R                  U5      R                  U R                  5      nX0R                  R                  U R                  5      -   nUR	                  SSS5      nU R                  U5      nUR	                  SSS5      nU R                  U5      R                  U R                  5      nU(       a  X0R                  -  $ U[        R                  " UR                  S   5      UR                  SS94   U R                  -  nU$ )Nrf   r   re   rt   ru   )r   r   r   rl   ry   rn   r   r   r   arangerx   argmax)r   textreturn_all_tokensr   s       r   encode_textCLIP.encode_text   s      &++DJJ7))..tzz::IIaAQIIaAMM!!!$**-++++ell1771:&kkbk!" #%)%9%9: r   c                    U R                  U5      nU R                  U5      nX3R                  SSS9-  nXDR                  SSS9-  nU R                  R	                  5       nXS-  UR                  5       -  nXT-  UR                  5       -  nXg4$ )Nrt   T)rv   keepdim)r   r   normr   expt)r   r   r   image_featurestext_featuresr   logits_per_imagelogits_per_texts           r   r   CLIP.forward   s    **51((.'*=*=D +> +" "%(:(:D ); )" "&&**,&7-//:KK%58H8H8JJ00r   )	r   r   r   rl   r   r   rn   r   r   )F)r   r   r   r    rI   r   r   r7   r   r   propertyr   r   r   r   r"   r#   r$   s   @r   r   r      s    "%# "% "% %eCc3,>&?&D E"%""%7:"% "%"% 36"% KN"% %(	"% >A	"%HH$ . .3 
1 
1r   r   
state_dictc                   ^ SU ;   nU(       a  U S   R                   S   n[        U R                  5        Vs/ s H5  nUR                  S5      (       d  M  UR	                  S5      (       d  M3  UPM7     sn5      nU S   R                   S   n[        U S   R                   S   S-
  S	-  5      nXV-  nOS
 V^s/ s H!  m[        [        U4S jU  5       5      5      PM#     n	n[        U	5      nU S   R                   S   n[        U S   R                   S   S-
  S	-  5      n
S nU
S-  S-   U S   R                   S   :X  d   eU
S-  nU S   R                   S   nU S   R                   S   nU S   R                   S   nU S   R                   S   nUS-  n[        [        S U  5       5      5      n[        XXBX\UXU5
      nS H  nUU ;   d  M  U U	 M     UR                  U 5        UR                  5       $ s  snf s  snf )Nzvisual.projzvisual.conv1.weightr   zvisual.z.attn.in_proj_weightrt   zvisual.positional_embeddingrf   g      ?)rf   re   r^   r4   c              3      >#    U  H4  nUR                  S T 35      (       d  M  UR                  S5      S   v   M6     g7f)zvisual.layer.re   N
startswithsplit).0kbs     r   	<genexpr>build_model.<locals>.<genexpr>   s8      9-7||l1#$67 $AGGCLOZs   ??zvisual.layer1.0.conv1.weightz$visual.attnpool.positional_embeddingre       r   rl   ztoken_embedding.weightzln_final.weightr   c              3   z   #    U  H1  nUR                  S 5      (       d  M  UR                  S5      S   v   M3     g7f)ztransformer.resblocksr   re   Nr   )r   r   s     r   r   r     s1      6%/||34 AGGCLOZs   ;;)rZ   r   r   )rx   lenkeysr   endswithroundsettupler   load_state_dicteval)r   vitr   r   r   r   	grid_sizer   r   countsoutput_widthr   r   r   r   r   r   modelkeys           `          r   build_modelr      sr   
:
%C
!"78>>qA!(
(!||I& +,::6L+M (
  ''<=CCBG56<<Q?!CcIK	,8 "

 "	  9-79 9: " 	 
 f!"@AGGJ>?EEaH !Q"j2'449E!'= = 	= ='",,-33A6I 67==a@N45;;A>J"#45;;A>)R/ 6%/6 	67
 m"J"7IKE D*3 D 
*%::<[

s   G1G1-G17(G6cudacpuTnamerB   c                   ^^^ SnU n [         R                  R                  X2(       a  TOSS9R                  5       nS nU(       dR  [        U=(       d    WR                  5       5      R                  T5      n[        T5      S:X  a  UR                  5         U$ [         R                  R                  U4S j/ S	9nUR                  R                  S
5       Vs/ s H  nS[        U5      ;   d  M  UPM     snS   mU4S jnWR!                  U5        U" UR"                  5        U" UR$                  5        [        T5      S:X  a  [         R                  R                  S / S	9n	['        U	R                  R)                  S5      R+                  5       5      S   n
U
R-                  5       mU4S jnUR!                  U5        U" UR"                  5        U" UR$                  5        UR                  5         U$ ! [         a=    U(       a  [
        R                  " SU S35        Sn[         R                  " USSS9n GNf = fs  snf )NFr   )map_locationzFile z6 is not a JIT archive. Loading as a state dict insteadT)r   weights_onlyc                  v   > [         R                  " / 5      R                  [         R                  " T 5      5      $ r   )r   r   rD   rB   )rB   s   r   <lambda>load_clip.<locals>.<lambda>6  s!    

2!!%,,v"67r   )example_inputsprim::ConstantDevicert   c                   > [        U S5      (       a  U R                  /O/ n[        U S5      (       a%  UR                  U R                  R                  5        U Hf  nUR	                  S5       HN  nSUR                  5       ;   d  M  [        US   5      R                  S5      (       d  M=  UR                  T5        MP     Mh     g )Ngraphforward1r   valuer   )	hasattrr   appendr   findAllNodesattributeNamesstrr   copyAttributes)modulegraphsr   nodedevice_nodes       r   patch_deviceload_clip.<locals>.patch_device<  s    #*67#;#;&,,6:&&MM&////0E**+;<d1133W9''1z&'99:''4 = r   c                  J    [         R                  " / 5      R                  5       $ r   )r   r   r   r   r   r   r   r   M  s    EJJrN((*r   aten::torf   c                   > [        U S5      (       a  U R                  /O/ n[        U S5      (       a%  UR                  U R                  R                  5        U Hy  nUR	                  S5       Ha  n[        UR                  5       5      nS H?  nXE   R                  5       S   S:X  d  M  XE   R                  5       R                  T5        MA     Mc     M{     g )Nr   r   r   )rf   re   r      )	r   r   r   r   r   listinputsr   r   )r   r   r   r   r  i
float_nodes         r   patch_floatload_clip.<locals>.patch_floatQ  s    '.vw'?'?fll^RFvz**foo334!..z:D!$++-0F#!9>>+G49"INN,;;JG $ ;  r   )r   jitloadr   RuntimeErrorwarningswarnr   r   rD   r   r   tracer   r   reprapplyr   r   r  findNoder  r   )r   rB   r	  
model_pathr   r   device_holdernr   float_holderfloat_inputr  r   r  s    `          @@r   	load_clipr    s    CJ?		sV  ??Ctv 	
 J<%*:*:*<=@@Hv;%KKMIIOO7 $ LM !&&334DEEatAw 	
E 	
K
	5 
KK##$""#
6{eyy*2 ' ?<--66zBIIKLQO %%'

	H 	K E&&'E%%&L{  ?MM
|#YZ CZZU?
?"s   6G5 H?.H?5AH<;H<) hashlibosurllibr  collectionsr   typingr   r   numpyr   r   torch.nn.functionalr   
functionalFr   'modelscope.models.base.base_torch_modelr   r
   r&   r-   rK   rX   r   dictr   r   is_availabler   rB   r  r   r   r   <module>r$     s     	   #        ># #,
 ,Z :!* !&.
 .b_1: _1D2D 2n ((** 2805GC GC-.Gr   