
    9inW                        S SK r S SKJr  S SKJrJrJrJr  S SKrS SK	r
S SKrS SKJr  S SKJs  Jr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJrJrJ r   S SK!J"r"  \"" 5       r#S/r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r( " S S\RR                  5      r) " S S\RJ                  5      r* " S S\RJ                  5      r+ " S S\RJ                  5      r, " S S\RJ                  5      r- " S S\RJ                  5      r.S r/S \RJ                  4S! jr0\Rb                  " \ Rd                  \Rf                  S"9 " S# S\5      5       r4g)$    N)OrderedDict)AnyDictTupleUnion)Models)
TorchModel)MODELS)FullTokenizer)
BertConfig)	BertModel)ModeKeys	ModelFileTasks)
get_loggerCLIPForMultiModalEmbeddingc                   R   ^  \ rS rSrSrSU 4S jjrS\R                  4S jrSr	U =r
$ )
Bottleneck(      c                   > [         TU ]  5         [        R                  " XSSS9U l        [        R
                  " U5      U l        [        R                  " X"SSSS9U l        [        R
                  " U5      U l        US:  a  [        R                  " U5      O[        R                  " 5       U l        [        R                  " X"U R                  -  SSS9U l        [        R
                  " X R                  -  5      U l        [        R                  " SS9U l        S U l        X0l        US:  d  X[&        R                  -  :w  a  [        R(                  " [+        S[        R                  " U5      4S	[        R                  " UX R                  -  SSSS
94S[        R
                  " X R                  -  5      4/5      5      U l        g g )N   F)bias   )paddingr   Tinplacez-10)strider   1)super__init__nnConv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dIdentityavgpool	expansionconv3bn3ReLUrelu
downsampler   r   
Sequentialr   )selfinplanesplanesr   	__class__s       h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/clip/model.pyr"   Bottleneck.__init__+   sY    YYx?
>>&)YYvq!%H
>>&)/5zr||F+r{{}YYv'>N
>>&>>"9:GGD)	A:j.B.B%BB mmdBLL$89! ii"*"(>>"9"#)*',./ "2>>&>>2I#JKM N	ODO C    xc                    UnU R                  U R                  U R                  U5      5      5      nU R                  U R                  U R	                  U5      5      5      nU R                  U5      nU R                  U R                  U5      5      nU R                  b  U R                  U5      nX2-  nU R                  U5      nU$ N)	r1   r'   r%   r)   r(   r,   r/   r.   r2   )r4   r;   identityouts       r8   forwardBottleneck.forwardK   s    iiA/0iiC12ll3hhtzz#'??&q)Hiin
r:   )
r,   r'   r)   r/   r%   r(   r.   r2   r1   r   r   )__name__
__module____qualname____firstlineno__r-   r"   torchTensorr@   __static_attributes____classcell__r7   s   @r8   r   r   (   s$    IO@  r:   r   c            	       H   ^  \ rS rSr S	S\S\S\S\4U 4S jjjrS rSrU =r$ )
AttentionPool2d[   spacial_dim	embed_dim	num_heads
output_dimc                   > [         TU ]  5         [        R                  " [        R
                  " US-  S-   U5      US-  -  5      U l        [        R                  " X"5      U l        [        R                  " X"5      U l	        [        R                  " X"5      U l
        [        R                  " X$=(       d    U5      U l        X0l        g )N   r   g      ?)r!   r"   r#   	ParameterrG   randnpositional_embeddingLineark_projq_projv_projc_projrQ   )r4   rO   rP   rQ   rR   r7   s        r8   r"   AttentionPool2d.__init__]   s    
 	$&LLKKQ*I6CG%I!ii	5ii	5ii	5ii	+BC"r:   c           
         UR                  UR                  S   UR                  S   UR                  S   UR                  S   -  5      R                  SSS5      n[        R                  " UR                  SSS9U/SS9nXR                  S S 2S S S 24   R                  UR                  5      -   n[        R                  " S0 SU_S	U_S
U_SUR                  S   _SU R                  _SU R                  R                  _SU R                  R                  _SU R                  R                  _SS _S[        R                  " U R                  R                   U R                  R                   U R                  R                   /5      _SS _SS _SS_SS_SU R"                  R                  _SU R"                  R                   _SS_SU R$                  _SS_6u  pUS   $ )Nr   r   rT   r   Tdimkeepdimr`   querykeyvalueembed_dim_to_checkrQ   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingneed_weights )reshapeshapepermuterG   catmeanrW   todtypeFmulti_head_attention_forwardrQ   rZ   weightrY   r[   r   r\   rt   )r4   r;   _s      r8   r@   AttentionPool2d.forwardk   s   IIaggaj!''!*ggaj1771:-//6wq!780: 	
 IIqvv!Tv2A6A>))!T1*588AA--        !wwr{	 
 nn  ++,,  ++,,  ++,,     !!4;;#3#3T[[5E5EFH            !KK..! " ++**# $ &*% & ]]' ( ) , tr:   )r\   rY   rQ   rW   rZ   r[   r=   )	rC   rD   rE   rF   intr"   r@   rI   rJ   rK   s   @r8   rM   rM   [   s@     $(	#!##  # !	# # r:   rM   c                   D   ^  \ rS rSrSr  SU 4S jjrSS jrS rSrU =r	$ )	ModifiedResNet   am  
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
c           	      2  > [         TU ]  5         X l        X@l        [        R
                  " SUS-  SSSSS9U l        [        R                  " US-  5      U l        [        R
                  " US-  US-  SSSS9U l	        [        R                  " US-  5      U l
        [        R
                  " US-  USSSS9U l        [        R                  " U5      U l        [        R                  " S5      U l        [        R                  " SS9U l        XPl        U R%                  XQS	   5      U l        U R%                  US-  US   SS
9U l        U R%                  US-  US   SS
9U l        U R%                  US-  US   SS
9U l        US-  n[/        US-  UX25      U l        g )Nr   rT   r   F)kernel_sizer   r   r   )r   r   r   Tr   r   )r   r          )r!   r"   rR   input_resolutionr#   r$   r%   r&   r'   r(   r)   r.   r/   r*   r,   r0   r1   	_inplanes_make_layerlayer1layer2layer3layer4rM   attnpool)r4   layersrR   headsr   widthrP   r7   s          r8   r"   ModifiedResNet.__init__   s|    	$ 0 YYuzqAEK
>>%1*-YYQJ
15J
>>%1*-YYQJ1aeE
>>%(||AGGD)	 &&uQi8&&uqy&)A&F&&uqy&)A&F&&uqy&)A&FBJ	'(8B(>	(-;r:   c                     [        U R                  X5      /nU[         R                  -  U l        [        SU5       H(  nUR	                  [        U R                  U5      5        M*     [
        R                  " U6 $ )Nr   )r   r   r-   rangeappendr#   r3   )r4   r6   blocksr   r   r   s         r8   r   ModifiedResNet._make_layer   s`    T^^V<=*"6"66q&!AMM*T^^V<= " }}f%%r:   c                 ,  ^  U 4S jnUR                  T R                  R                  R                  5      nU" U5      nT R	                  U5      nT R                  U5      nT R                  U5      nT R                  U5      nT R                  U5      nU$ )Nc                   > TR                   TR                  4TR                  TR                  4TR                  TR
                  44 H"  u  pTR                  U" U" U 5      5      5      n M$     TR                  U 5      n U $ r=   )r%   r'   r(   r)   r.   r/   r1   r,   )r;   convbnr4   s      r8   stem$ModifiedResNet.forward.<locals>.stem   sj    "jj$((3djj$((5K"jj$((35IIbak*5 QAHr:   )	typer%   r   r}   r   r   r   r   r   )r4   r;   r   s   `  r8   r@   ModifiedResNet.forward   sx    	 FF4::$$**+GKKNKKNKKNKKNMM!r:   )r   r   r,   r'   r)   r/   r%   r(   r.   r   r   r   r   r   rR   r1   )   @   rB   )
rC   rD   rE   rF   __doc__r"   r   r@   rI   rJ   rK   s   @r8   r   r      s%     #& ;D& r:   r   c                   H   ^  \ rS rSrSrS\R                  4U 4S jjrSrU =r	$ )	LayerNorm   z*Subclass torch's LayerNorm to handle fp16.r;   c                    > UR                   n[        TU ]	  UR                  [        R
                  5      5      nUR                  U5      $ r=   )r}   r!   r@   r   rG   float32)r4   r;   	orig_typeretr7   s       r8   r@   LayerNorm.forward   s6    GG	goaffU]]34xx	""r:   rv   )
rC   rD   rE   rF   r   rG   rH   r@   rI   rJ   rK   s   @r8   r   r      s    4# # #r:   r   c                   6    \ rS rSrS\R
                  4S jrSrg)	QuickGELU   r;   c                 :    U[         R                  " SU-  5      -  $ )NgZd;?)rG   sigmoidr4   r;   s     r8   r@   QuickGELU.forward   s    5==+++r:   rv   N)rC   rD   rE   rF   rG   rH   r@   rI   rv   r:   r8   r   r      s    , ,r:   r   c                      ^  \ rS rSr S
S\S\S\R                  4U 4S jjjrS\R                  4S jrS\R                  4S jr	S	r
U =r$ )ResidualAttentionBlockr   d_modeln_head	attn_maskc                 l  > [         TU ]  5         [        R                  " X5      U l        [        U5      U l        [        R                  " [        S[        R                  " XS-  5      4S[        5       4S[        R                  " US-  U5      4/5      5      U l        [        U5      U l        X0l        g )Nc_fcr   gelur\   )r!   r"   r#   MultiheadAttentionattnr   ln_1r3   r   rX   r   mlpln_2r   )r4   r   r   r   r7   s       r8   r"   ResidualAttentionBlock.__init__   s     	))':	g&	==&"))Gq["AB )+."BIIgk7$CDF GH g&	"r:   r;   c                     U R                   b.  U R                   R                  UR                  UR                  S9OS U l         U R	                  XUSU R                   S9S   $ )Nr}   deviceF)ru   r   r   )r   r|   r}   r   r   r   s     r8   	attention ResidualAttentionBlock.attention   so     !% : **''88 + @D 	 yy!%4>>  CCDF 	Fr:   c                     XR                  U R                  U5      5      -   nXR                  U R                  U5      5      -   nU$ r=   )r   r   r   r   r   s     r8   r@   ResidualAttentionBlock.forward   s9    tyy|,,1&&r:   )r   r   r   r   r   r=   )rC   rD   rE   rF   r   rG   rH   r"   r   r@   rI   rJ   rK   s   @r8   r   r      sW    
 ,0### "LL# #F5<< F  r:   r   c            	       x   ^  \ rS rSr S
S\S\S\S\R                  4U 4S jjjrS\R                  4S jrS	r	U =r
$ )Transformer   r   r   r   r   c           
         > [         TU ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        XU5      PM     sn6 U l        g s  snf r=   )	r!   r"   r   r   r#   r3   r   r   	resblocks)r4   r   r   r   r   r   r7   s         r8   r"   Transformer.__init__   sU    
 	
6])
" #5;")
  )
s   Ar;   c                 $    U R                  U5      $ r=   )r   r   s     r8   r@   Transformer.forward  s    ~~a  r:   )r   r   r   r=   )rC   rD   rE   rF   r   rG   rH   r"   r@   rI   rJ   rK   s   @r8   r   r      sO     ,0	  "LL	 ! ! !r:   r   c                   f   ^  \ rS rSrS\S\S\S\S\S\4U 4S jjrS	\R                  4S
 jrSr	U =r
$ )VisualTransformeri  r   
patch_sizer   r   r   rR   c                   > [         TU ]  5         Xl        X`l        [        R
                  " SUUUSS9U l        US-  n[        R                  " U[        R                  " U5      -  5      U l
        [        R                  " U[        R                  " X-  S-  S-   U5      -  5      U l        [        U5      U l        [        X4U5      U l        [        U5      U l        [        R                  " U[        R                  " X65      -  5      U l        g )Nr   F)in_channelsout_channelsr   r   r         rT   r   )r!   r"   r   rR   r#   r$   r%   rU   rG   rV   class_embeddingrW   r   ln_prer   transformerln_postproj)	r4   r   r   r   r   r   rR   scaler7   s	           r8   r"   VisualTransformer.__init__  s     0$YY"
 t!||EEKK4F,FG$&LL+a/!3U:< 2< %=!&&ue< 'LLU)G!GH	r:   r;   c                    U R                  U5      nUR                  UR                  S   UR                  S   S5      nUR                  SSS5      n[        R
                  " U R                  R                  UR                  5      [        R                  " UR                  S   SUR                  S   UR                  UR                  S9-   U/SS9nXR                  R                  UR                  5      -   nU R                  U5      nUR                  SSS5      nU R                  U5      nUR                  SSS5      nU R                  US S 2SS S 24   5      nU R                  b  XR                  -  nU$ )Nr   r   rg   rT   r   rb   )r%   rw   rx   ry   rG   rz   r   r|   r}   zerosr   rW   r   r   r   r   r   s     r8   r@   VisualTransformer.forward*  sD   JJqMIIaggaj!''!*IIaAII$$''05;;GGAJGGBK''884% %  
 )),,QWW55KKNIIaAQIIaALL1a7$99 IIAr:   )	r   r%   r   r   r   rR   rW   r   r   )rC   rD   rE   rF   r   r"   rG   rH   r@   rI   rJ   rK   s   @r8   r   r     sQ    I I# Ic II%(I69I.  r:   r   c            %          ^  \ rS rSr SS\S\S\\\\\\4   \4   S\S\S\S\S	\S
\S\S\S\S\S\S\S\S\	S\4$U 4S jjjr
S r\S 5       rS rS rS rS rSrU =r$ )CLIPiI  rP   image_resolutionvision_layersvision_widthvision_patch_size
vocab_size!text_attention_probs_dropout_probtext_hidden_acttext_hidden_dropout_probtext_hidden_sizetext_initializer_rangetext_intermediate_sizetext_max_position_embeddingstext_num_attention_headstext_num_hidden_layerstext_type_vocab_size	tokenizervision_head_widthc                 <  > [         TU ]  5         [        U[        [        45      (       a  US-  U-  n[        UUUUUS9U l        OUU-  n[        UUUUUUS9U l        [        UU
UUUUU	UUUUSS9U l	        [        U R                  5      U l        [        R                  " [        R                  " X5      5      U l        [        R                  " [        R"                  " / 5      [$        R&                  " S5      -  5      U l        UU l        U R-                  5         g )Nr   )r   rR   r   r   r   )r   r   r   r   r   rR   g-q=)vocab_size_or_config_json_filehidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_acthidden_dropout_probattention_probs_dropout_probmax_position_embeddingstype_vocab_sizeinitializer_rangelayer_norm_eps$I$I,@)r!   r"   
isinstancetuplelistr   visualr   r   bert_configr   bertr#   rU   rG   emptytext_projectiononesnploglogit_scaler   initialize_parameters)r4   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   vision_headsr7   s                       r8   r"   CLIP.__init__K  s   0 	meT]33'",0AAL($$"!1"$DK (+<<L+!1,"$"$&DK &+5(4 84& 8)J$@04 
 d../	!||KK(4 6<<

29I(IJ"""$r:   c                 ~   [         R                  " [        R                  " / 5      [        R
                  " S5      -  5      U l        [        U R                  [        5      (       Ga
  U R                  R                  GbE  U R                  R                  R                  R                  S-  n[         R                  R                  U R                  R                  R                  R                   US9  [         R                  R                  U R                  R                  R"                  R                   US9  [         R                  R                  U R                  R                  R$                  R                   US9  [         R                  R                  U R                  R                  R                  R                   US9  U R                  R&                  U R                  R(                  U R                  R*                  U R                  R,                  4 HS  nUR/                  5        H<  u  p4UR1                  S5      (       d  M  [         R                  R3                  U5        M>     MU     U R4                  b@  [         R                  R                  U R4                  U R6                  R8                  S-  S9  g g )Nr  r   )stdz
bn3.weight)r#   rU   rG   r  r  r  r  r  r  r   r   r\   in_featuresinitnormal_rZ   r   rY   r[   r   r   r   r   named_parametersendswithzeros_r  r	  r   )r4   r  resnet_blocknameparams        r8   r  CLIP.initialize_parameters  s   <<

29I(IJdkk>22{{##/kk**11==tC 4 4 ; ; B BL 4 4 ; ; B BL 4 4 ; ; B BL 4 4 ; ; B BL KK&&(:(:DKK<N<NKK&&! $0#@#@#BKD}}\22u- $C	! +GGOO$$$*:*:*F*F*L  N ,r:   c                 V    U R                   R                  R                  R                  $ r=   )r  r%   r   r}   r4   s    r8   r}   
CLIP.dtype  s    {{  ''---r:   c                 V    U R                  UR                  U R                  5      5      $ r=   )r  r   r}   )r4   images     r8   encode_imageCLIP.encode_image  s    {{5::djj122r:   c                    U R                   R                  S   nUR                  U5      R                  U R                  5      nU R                  XS9S   R                  U R                  5      nUS S 2SS S 24   U R                  -  $ )Nz[PAD])attention_maskr   )r   vocabner   r}   r
  r  )r4   text	pad_indexr   r;   s        r8   encode_textCLIP.encode_text  s    NN((1	GGI&++DJJ7	II  ,,-//3t

0 	
 AqzD0000r:   c                 0   Uc
  Uc   S5       eUc  U R                  U5      $ Uc  U R                  U5      $ U R                  U5      nU R                  U5      nX3R                  SSS9-  nXDR                  SSS9-  nX4U R                  R	                  5       4$ )Nz#text and image cannot both be None!rg   Tr_   )r-  r%  normr  exp)r4   r$  r+  image_featurestext_featuress        r8   r@   CLIP.forward  s     D$4[6[[4=##D))\$$U++**51((.'*=*=D +> +" "%(:(:D ); )" " d.>.>.B.B.DDDr:   c                    U R                  U5      nU R                  U5      nX3R                  SSS9-  nXDR                  SSS9-  nU R                  R	                  5       nXS-  UR                  5       -  nUR                  5       nXg4$ )Nr   Tr_   )r%  r-  r0  r  r1  t)r4   r$  r+  r2  r3  r  logits_per_imagelogits_per_texts           r8   get_similarityCLIP.get_similarity  s    **51((. (*=*=4 +> +! !%(:(:q$(:(OO &&**,&7-//:KK*,,.  00r:   )r
  r	  r  r  r   r  )r   )rC   rD   rE   rF   r   r   r   floatstrr   r"   r  propertyr}   r%  r-  r@   r9  rI   rJ   rK   s   @r8   r   r   I  s%   0 "$-B%B% 	B%
 U3S##56;<B% B% B% B% ,1B% B% #(B% B% !&B% !$B%  '*!B%" #&#B%$ !$%B%& "'B%( !)B%, -B% B%HN. . .31E"1 1r:   r   c                     U R                  5        Hh  nUR                  R                  5       Ul        UR                  (       d  M5  UR                  R                  R                  5       UR                  l        Mj     g r=   )
parametersdatar;  grad)modelps     r8   convert_models_to_fp32rD    sK    666&&++++-AFFK  r:   rB  c                 ,    S nU R                  U5        g)z+Convert applicable model parameters to fp16c                 P   [        U [        R                  [        R                  [        R                  45      (       as  U R
                  R                  R                  5       U R
                  l        U R                  b3  U R                  R                  R                  5       U R                  l        [        U [        R                  5      (       aS  / S Vs/ s H  o S3PM	     snQSPSPSP H2  n[        X5      nUc  M  UR                  R                  5       Ul        M4     [        U [        5      (       a  U R                  [        R                  5        S HD  n[        X5      (       d  M  [        X5      nUc  M%  UR                  R                  5       Ul        MF     g s  snf )N)inqkv_proj_weightrl   rm   rn   )r  r   )r  r#   Conv1dr$   rX   r   r@  halfr   r   getattrr   r|   rG   hasattr)modulesattrtensorr  s        r8   _convert_weights_to_fp161convert_weights.<locals>._convert_weights_to_fp16  sA   fryy"))RYY?@@!'!3!3!8!8!:FMM{{&#);;#3#3#8#8#: fb33442GH2GQ<(2GH"$,.6 !.%"(++"2"2"4FK fi((IIejj!/Dv$$v,# $		 0DI	 0 Is   F#N)apply)rB  rT  s     r8   convert_weightsrW    s    10 
KK()r:   )module_namec                      ^  \ rS rSrU 4S jrS\\\4   S\\\4   4S jrS\\\4   S\\\4   4S jr	\
S 5       rS	rU =r$ )
r   i  c           
      L  > [         TU ]  " USU0UD6  SR                  U5      n[        R	                  SU 35        [
        R                  R                  U5      (       d   eSR                  U5      n[        R	                  SU 35        [
        R                  R                  U5      (       d   e[        USSS9 n[        USSS9 n[        R                  " U5      U l        [        R                  " U5      R                  5        H  u  pXR                  U'   M     S S S 5        S S S 5        U S	[        R                   3n
[        U
S
9U l        [#        S0 U R                  DSU R                   0D6U l        ['        U R$                  5        [(        R                  " U S	[        R*                   3S5      nSU;   a  US   OUn[-        [/        UR                  5       5      5      S   R1                  S5      (       a1  UR                  5        VV	s0 s H  u  pU[3        S5      S  U	_M     nnn	[-        [/        UR                  5       5      5      S   R1                  S5      (       a1  UR                  5        VV	s0 s H  u  pU[3        S5      S  U	_M     nnn	U R$                  R5                  U5        U R$                  R7                  5         [(        R8                  R;                  5       (       a8  SR                  [=        [
        R>                  RA                  SS5      5      5      OSU l!        [(        R8                  R;                  5       (       aq  U R$                  RE                  U RB                  5        [        R	                  SR                  [=        [
        R>                  RA                  SS5      5      5      5        g U R$                  RG                  5         [        R	                  S5        g ! , (       d  f       GN= f! , (       d  f       GN= fs  sn	nf s  sn	nf )N	model_dirz{}/vision_model_config.jsonz!Loading vision model config from z{}/text_model_config.jsonzLoading text model config from rzutf-8)encoding/)
vocab_filer   cpu
state_dictr   rP  zmodule.
clip_modelzclip_model.zcuda:{}
LOCAL_RANKz%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inferencerv   )$r!   r"   formatloggerinfoospathexistsopenjsonload
model_infoitemsr   
VOCAB_FILEr   r   r   rb  rW  rG   TORCH_MODEL_BIN_FILEnextiter
startswithlenload_state_dictevalcudais_availabler   environgetr   r|   r;  )r4   r[  argskwargsvision_model_config_filetext_model_config_filefvftrI  rJ  r_  
checkpointsdr7   s                r8   r"   #CLIPForMultiModalEmbedding.__init__   sL   t>9>v> $A#G#G$ /0H/IJ	Lww~~67777!<!C!CI!N56L5MNOww~~45555(# "%'+S7Cr"iimDO		"++-%&" . D" "{!I$8$8#9:
&*= KKDNNK( ZZk999:;UD
 *Z7 =G 	RXXZ !!$//9946HHJ?JDA!C	NO$a'JB?RXXZ !!$//==8:
C
!C&'(!+
BC''+ #(**"9"9";";  &&s2::>>!, ( AF 	::""$$OOt{{+KK?FFBJJNN<346 7 OO!!#KK<=E DC" "* @ Ds1   =P	AO6 PP=P 6
P	 P
Pinputreturnc                 ,   SSK Jn  UR                  S UR                  S 0nUR	                  S[
        R                  5      nSU;   a  [        US   [        R                  5      (       a  US   R                  U R                  5      nUR                  5       S:X  a$  UR                  S   S:X  a  UR                  S5      n[        R                  R!                  U[
        R"                  :H  5         U R$                  R'                  U5      nXfR)                  SSS	9-  nS S S 5        WX2R                  '   S
U;   a  [        US
   [        R                  5      (       a  US
   R                  U R                  5      nUR                  5       S:X  a$  UR                  S   S:X  a  UR                  S5      n[        R                  R!                  U[
        R"                  :H  5         U R$                  R+                  U5      nXR)                  SSS	9-  nS S S 5        WX2R                  '   U[
        R"                  :X  a8  U R$                  R,                  S-  R/                  5       R1                  5       US'   U$ ! , (       d  f       GN]= f! , (       d  f       N|= f)Nr   )
OutputKeysmodeimg   r   rg   Tr_   r+  r         ?r  )modelscope.outputsr  IMG_EMBEDDINGTEXT_EMBEDDINGrz  r   	INFERENCEr  rG   rH   r|   r   r`   rx   squeezeautogradset_grad_enabledTRAINrb  r%  r0  r-  r  r1  r{   )	r4   r  r  outputr  image_tensorr2  text_tensorr3  s	            r8   r@   "CLIPForMultiModalEmbedding.forward5  s!   1$$d%%t
 yy!3!34 E>juu||DD <??4;;7L!Q&<+=+=a+@A+E+33A6001GH!%!=!=l!K!/2E2ED 3F 3* "* I
 0>F++,U?z%-FF-**4;;7K A%+*;*;A*>!*C)11!4001GH $ ; ;K H -0B0BD 1C 1* !* I 1>F,,-8>>!%)__%@%@'*&+,/CE$$& =! - IH IHs   /.I3!.J3
J
Jinputsc                     U$ r=   rv   )r4   r  s     r8   postprocess&CLIPForMultiModalEmbedding.postprocess[  s    r:   c                 P    SU R                   R                  R                  5       -  $ )Nr  )rb  r  r1  r!  s    r8   temperature&CLIPForMultiModalEmbedding.temperature^  s     T__0044666r:   )rb  r   rm  r   )rC   rD   rE   rF   r"   r   r<  r   r@   r  r=  r  rI   rJ   rK   s   @r8   r   r     sd    3>j$T#s(^ $S#X $L$sCx. T#s(^  7 7r:   )5rg  collectionsr   typingr   r   r   r   rk  numpyr  rG   torch.nnr#   torch.nn.functional
functionalr~   modelscope.metainfor   modelscope.modelsr	   modelscope.models.builderr
   1modelscope.models.multi_modal.clip.bert_tokenizerr   5modelscope.models.multi_modal.clip.configuration_bertr   0modelscope.models.multi_modal.clip.modeling_bertr   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   re  __all__Moduler   rM   r   r   r   r   r   r   r   rD  rW  register_modulemulti_modal_embeddingclipr   rv   r:   r8   <module>r     s3    
 # * *       & ( , K L F @ @ .	'
(0 0f,bii ,^DRYY DN# #,		 ,RYY <!")) !&5		 5pL1299 L1^.*299 *< 33Mb7 b7 Nb7r:   