
    9i4                     n   S SK r S SKJr  S SKrS SKJr  S SKJs  Jr	  S SK
Jr  S SKJr  S SKJr  S SKJr  S SKJrJr  SSKJr  SS	KJrJr  \R4                  " \R6                  \R8                  S
9 " S S\5      5       r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r"g)    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )	load_clip)get_state_dictset_seed)module_namec                   N   ^  \ rS rSrSrS\4U 4S jjrS	S jrS rS	S jr	Sr
U =r$ )
VoP   a  
The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
This model is dynamically initialized with the following parts:
    - clip: the upstream pre-trained backbone model (CLIP in this code)
    - pool_frames: the frames pooling method
    - visual_prompt_learner: visual prompt
    - ImageEncoder: get image encoder
    - TextPromptLearner: text prompt
    - TextEncoder: get text encoder
	model_dirc                   > [         [        U ]  5         [        R                  " US5      n[        R                  " US5      n[        R                  " U[
        R                  5      n[        R                  " U5      R                  U l
        [        US9U l        [        [        U R                  R                  R                   R"                  5      5      U R                  l        [        [        U R                  R                   R"                  5      5      U R                  l        [)        U R                  R*                  U R                  5      U l        [/        U R                  U R                  5      U l        [3        U R                  U R                  5      U l        [7        U R                  U R                  5      U l        [;        U R                  U R                  5      U l        U R?                  [A        U5      5        U RC                  5         S[D        RF                  S'   [I        U R                  RJ                  5        g)z@
Initialize a VoP Model

Args:
    model_dir: model id or path,
zVoP_msrvtt9k.pthzViT-B-32.pt)namefalseTOKENIZERS_PARALLELISMN)&superr   __init__ospjoinr   CONFIGURATIONr   	from_file
hyperparamconfigr
   cliplistrangevisualtransformerlayers
vpt_layers
tpt_layersBaselinePoolingpooling_typepool_framesVisualPromptLearnervisual_prompt_learnerImageEncoderimage_encoderTextPromptLearnertext_prompt_learnerTextEncodertext_encoderload_state_dictr   evalosenvironr   seed)selfr   argskwargs
model_path	clip_archconfig_path	__class__s          h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/cv/vop_retrieval/model.pyr   VoP.__init__!   s    	c4!#XXi);<
HHY6	hhy)*A*AB&&{3>>9-	!%$))""..556"8!%eDII,A,A,H,H&I!J*4;;+C+C+/;;8 &9IIt{{&$")$))T[[A#4TYY#L '		4;;? 	^J78		 07

+,!!"    c                    UR                   S   nUR                  SSU R                  R                  U R                  R                  5      nU R	                  5       nU R                  XT5      nXfR                  SSS9-  nUR                  UU R                  R                  S5      nU R                  SU5      nU(       a  Xg4$ U$ )zl
Get video Features

Args:
    videos: the dim is [1, 12, 3, 224, 224]
    return_all_frames: default False
r      TdimkeepdimN)	shapereshaper   	input_resr*   r,   norm
num_framesr(   )r6   videosreturn_all_frames
batch_size
video_datavisual_promptsvideo_featuresvideo_features_pooleds           r=   get_video_featuresVoP.get_video_featuresF   s     \\!_
^^B4;;+@+@$(KK$9$9;
 335++NG'*=*=D +> +" "'//
040F0FL !% 0 0~ F!88$$r?   c                 l    U R                  5       nU R                  X!5      nX3R                  SSS9-  nU$ )z<
Get Text Features

Args:
    text_data: the dim is [1, 69]
rA   TrC   )r.   r0   rI   )r6   	text_datatext_promptstext_featuress       r=   get_text_featuresVoP.get_text_featuresa   sG     //1)),B%(:(:D ); )" "r?   c                 
   US   R                   S   nUS   nUS   nUR                  SSU R                  R                  U R                  R                  5      nU R	                  5       nU R                  Xe5      nU R                  5       nU R                  X5      n	XR                  SSS9-  n	XwR                  SSS9-  nUR                  UU R                  R                  S5      nU R                  X5      n
U(       a  XU
4$ X4$ )zf
Dynamic Forward Function of VoP

Args:
    data: the input data
    return_all_frames: default False
videor   textrA   rB   TrC   )rF   rG   r   rH   r*   r,   r.   r0   rI   rJ   r(   )r6   datarL   rM   rU   rN   rO   rP   rV   rW   rQ   s              r=   forwardVoP.forwardo   s     ']((+
L	']
''At{{/D/D(,(=(=?
 335++NG//1)),B%(:(:D ); )" "'*=*=D +> +" "'//
040F0FL !% 0 0 O 2GGG33r?   )r   r   r,   r(   r0   r.   r*   )F)__name__
__module____qualname____firstlineno____doc__strr   rR   rX   r^   __static_attributes____classcell__r<   s   @r=   r   r      s)    	### ##J%6 4  4r?   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )r&      z
Redefined Pooling Function
c                 d   > [         [        U ]  5         US:X  a  U R                  U l        g [
        e)Navg)r   r&   r   _avg_poolingpooling_funcNotImplementedError)r6   r'   r   r<   s      r=   r   BaselinePooling.__init__   s,    ot-/5  $ 1 1D%%r?   c                 $    UR                  SS9nU$ )z
Pooling mean of frames

Args:
    text_embeds: the input text embedding which is None here.
    video_embeds: the input video embedding with [1, 12, 512].

Returns:
    video_embeds_pooled: num_vids x embed_dim
r	   rD   )mean)r6   text_embedsvideo_embedsvideo_embeds_pooleds       r=   rm   BaselinePooling._avg_pooling   s     +//A/6""r?   c                 $    U R                  X5      $ Nrn   )r6   rt   ru   s      r=   r^   BaselinePooling.forward   s      ;;r?   rz   )
r`   ra   rb   rc   rd   r   rm   r^   rf   rg   rh   s   @r=   r&   r&      s    &#< <r?   r&   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r)      z
The implementation of visual prompt.
This module is used to define the learnable prompt parameters:
    the number of tokens is 8,
    the prompt dimension is 768,
    and the initialization weight std used is 0.02.
c                 z  > [         [        U ]  5         UR                  nUR                  R
                  R                  R                  S   nUR                  n[        R                  " [        UR                  5      SX4US9n[        R                  R                  USS9  [        R                   " U5      U l        g )Nr   r	   dtype{Gz?std)r   r)   r   vp_token_numr!   ln_postweightrF   r   torchemptylenr$   nninitnormal_	ParameterrO   )r6   
clip_modelr   r   vp_dimr   rO   r<   s          r=   r   VisualPromptLearner.__init__   s    !413**""**1177:  !!"A|5J
D1 ll>:r?   c                     U R                   nU$ ry   rO   )r6   vps     r=   r^   VisualPromptLearner.forward   s      	r?   r   	r`   ra   rb   rc   rd   r   r^   rf   rg   rh   s   @r=   r)   r)      s    
; r?   r)   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r-      z
The implementation of visual prompt.
This module is used to define the learnable prompt parameters:
    the number of tokens is 4,
    the prompt dimension is 512,
    and the initialization weight std used is 0.02.
c                   > [         [        U ]  5         UR                  nUR                  nUS:  a  US:  d   eUR
                  R                  R                  S   nUR                  n[        R                  " [        UR                  5      X4-   UUS9n[        R                  R                  USS9  [        R                   " U5      U l        X0l        X@l        g )Nr   r   r   r   )r   r-   r   tp_prefix_token_numtp_suffix_token_numln_finalr   rF   r   r   r   r   r%   r   r   r   r   rV   )	r6   r   r   r   r   tp_dimr   rV   r<   s	           r=   r   TextPromptLearner.__init__   s    /1$88$88"a',?1,DDD$$++11!4  {{!!"5	
 	$/LL6#6 #6 r?   c                     U R                   S S 2S U R                  2S S 24   U R                   S S 2U R                  S 2S S 24   4$ ry   )rV   r   )r6   s    r=   r^   TextPromptLearner.forward   sM    !!!%>d&>&>%>"AB!!!T%=%=%>"ABD 	Dr?   )rV   r   r   r   rh   s   @r=   r-   r-      s    7(D Dr?   r-   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r+      zm
The implementation of image encoder.
This module is used to obtain the features of each frame of the video.
c                   > [         [        U ]  5         X l        UR                  U l        UR
                  U l        UR                  U l        UR                  R                  U l        UR                  R                  U l	        UR                  R                  U l
        UR                  R                  U l        UR                  R                  U l        UR                  R                  U l        UR                  R                  U l        g ry   )r   r+   r   r   r$   r   rJ   r!   conv1class_embeddingpositional_embeddingln_prer"   r   projr6   r   r   r<   s      r=   r   ImageEncoder.__init__   s    lD*, ++"// ++&&,,
)00@@$.$5$5$J$J! ''..%,,88!((00%%**	r?   c           
         UR                   S   nU R                  U5      nUR                  X2R                   S   S5      nUR                  SSS5      nU R                  R                  UR                  5      n[        R                  " USUR                   S   UR                  UR                  S9nXE-   n[        R                  " XB/SS9nX R                  R                  UR                  5      -   n[        U R                  R                  5       GH)  nX`R                  ;   ai  U R                  R!                  U5      nXSS2SS2SS24   R#                  USS5      n[        R                  " USS2SS2SS24   XSS2SS2SS24   /SS9nUS:X  a  U R%                  U5      nUR                  SSS5      nU R                  R&                  U   " U5      nUR                  SSS5      nUS-   U R                  ;   d  M  [        R                  " USS2SS2SS24   USS2SU R(                  -   S2SS24   /SS9nGM,     U R+                  USS2SSS24   5      nU R,                  b  X R,                  -  nU$ )z
The forward function of image encoder.

Args:
    visual_prompts: the visual prompt, dim is [12, 1, 8, 768]
    x: the input data, dim is [12, 3, 224, 224]

Returns:
    x: the output data, dim is [12, 512]
r   r	   rA      )r   devicerr   N)rF   r   rG   permuter   tor   r   zerosr   catr   r    r"   r#   r$   indexrepeatr   	resblocksr   r   r   )	r6   rO   xrM   x_1x_2i_layeri_promptcur_layer_vps	            r=   r^   ImageEncoder.forward  s$    WWQZ
JJqMIIj''!*b1IIaA""%%agg.kk1772;aggahhHiIIshA&)),,QWW55T--445G//)??009-1a.?@GG1 &IIqBQB{LAqr1H+FAN!|KKN		!Q"A  **73A6A		!Q"A{doo-IIqBQB{AaT5F5F1F1G.J,KL"#% 6" LL1a7$99 IIAr?   )r   r   r   r   r   rJ   r   r   r"   r   r$   r   rh   s   @r=   r+   r+      s    
+$, ,r?   r+   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r/   i3  zn
The implementation of text encoder.
This module is used to obtain the features of each word of the sentence.
c                   > [         [        U ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l	        SU R                  ;   d   eUR                  U l
        UR                  U l        UR                  UR                  -   U l        g )Nr   )r   r/   r   r"   token_embeddingr   r   text_projectionr   r%   r   r   tp_token_numr   s      r=   r   TextEncoder.__init__9  s    k4)+%11)99$.$C$C!"++)99%%
 ++DOO####)#=#= #)#=#= "669S9SSr?   c           
      `   U R                  U5      R                  U R                  5      nUR                  S   nUu  pV[	        U R
                  R                  5       GH  nXpR                  ;   a  U R                  R                  U5      nU R                  S:  aP  XXUS-   2SS2SS24   R                  USS5      n	[        R                  " USS2SS2SS24   XSS2SS2SS24   /SS9nU R                  S:  aP  XhUS-   2SS2SS24   R                  USS5      n
[        R                  " USS2SS2SS24   XSS2SS2SS24   /SS9nUS:X  a'  X0R                  R                  U R                  5      -   nUR                  SSS5      nU R
                  R                   U   " U5      nUR                  SSS5      nUS-   U R                  ;   d  GMv  USS2SS2SS24   nUSS2SU R                  -   SU R                  -
  2SS24   nUSS2SS2SS24   n[        R                  " XU/SS9nUnGM     U R#                  U5      R                  U R                  5      nU[        R$                  " UR                  S   5      UR'                  SS9U R(                  -   4   U R*                  -  nU$ )z
The forward function of text encoder.

Args:
    text_prompts: the text prompt, dim is 2 x [12, 4, 512]
    text: the input data, dim is [1, 69]

Returns:
    x: the output data, dim is [1, 512]
r   r	   NrA   rr   r   )r   typer   rF   r    r"   r#   r%   r   r   expandr   r   r   r   r   r   r   arangeargmaxr   r   )r6   rV   r\   r   rM   prompt_prefixprompt_suffixr   r   cur_layer_tp_prefixcur_layer_tp_suffixtemp_1temp_2temp_3temps                  r=   r^   TextEncoder.forwardH  s      &++DJJ7WWQZ
'3$T--445G//)??009++a/*7:;B< 9<=>9B +CCI6<FBDP ( 		1bqb!8&9QAX;GQPA++a/*7:;B< 9<=>9B +CCI6<FBDP ( 		1crc19':aaiLIA !|1166tzzBB		!Q"A  **73A6A		!Q"A{doo-1bqb!81a$":"::2!55<6 6789 :1bc19yy&&!9qA; 6> MM!!!$**-ell1771:&kkbk!D$5$556 79=9M9MN r?   )
r   r   r   r   r   r   r   r   r%   r"   r   rh   s   @r=   r/   r/   3  s    
T2 2r?   r/   )#r3   os.pathpathr   r   torch.nnr   torch.nn.functional
functionalFmodelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   backboner
   basic_utilsr   r   register_modulevop_retrievalvop_retrieval_modelr   r&   r)   r-   r+   r/    r?   r=   <module>r      s    
      & > , * 6  1 	V%?%?Az4* z4Az4z<j <<* 4D
 DDD: DNG* Gr?   