import os

import torch
import torch.nn as nn

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .blocks import (BboxRegressor, Q2VRankerStage1, Q2VRankerStage2,
                     V2QRankerStage1, V2QRankerStage2)
from .swin_transformer import SwinTransformerV2_1D


@MODELS.register_module(
    Tasks.video_temporal_grounding, module_name=Models.soonet)
class SOONet(TorchModel):
    """
    The implementation of 'Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding
    in Long Videos'. The model is dynamically initialized with the following parts:
        - q2v_stage1: calculate qv_ctx_score.
        - v2q_stage1: calculate vq_ctx_score.
        - q2v_stage2: calculate qv_ctn_score.
        - v2q_stage2: calculate vq_ctn_score.
        - regressor: predict the offset of bounding box for each candidate anchor.
    """
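
    # Coarse-to-fine inference flow: q2v_stage1 first scores every
    # pre-defined anchor using the multi-scale context features from the
    # video encoder; when `enable_stage2` is set, only the union of each
    # query's top `stage2_topk` anchors per scale is re-scored against
    # frame-level content features before the regressor predicts bbox
    # offsets. A runnable sketch of the top-k filtering idiom is appended
    # at the bottom of this file.
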
    def __init__(self, model_dir: str, *args, **kwargs):
        """
        Initialize SOONet Model

        Args:
            model_dir: model id or path
        """
        super().__init__()
        config_path = os.path.join(model_dir, ModelFile.CONFIGURATION)
        self.config = Config.from_file(config_path).hyperparams
        nscales = self.config.nscales
        hidden_dim = self.config.hidden_dim
        snippet_length = self.config.snippet_length
        self.enable_stage2 = self.config.enable_stage2
        self.stage2_topk = self.config.stage2_topk
        self.nscales = nscales

        self.video_encoder = SwinTransformerV2_1D(
            patch_size=snippet_length,
            in_chans=hidden_dim,
            embed_dim=hidden_dim,
            depths=[2] * nscales,
            num_heads=[8] * nscales,
            window_size=[64] * nscales,
            mlp_ratio=2.,
            qkv_bias=True,
            drop_rate=0.,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            norm_layer=nn.LayerNorm,
            patch_norm=True,
            use_checkpoint=False,
            pretrained_window_sizes=[0] * nscales)

        self.q2v_stage1 = Q2VRankerStage1(nscales, hidden_dim)
        self.v2q_stage1 = V2QRankerStage1(nscales, hidden_dim)
        if self.enable_stage2:
            self.q2v_stage2 = Q2VRankerStage2(nscales, hidden_dim,
                                              snippet_length)
            self.v2q_stage2 = V2QRankerStage2(nscales, hidden_dim)
        self.regressor = BboxRegressor(hidden_dim, self.enable_stage2)

        model_path = os.path.join(model_dir,
                                  'SOONet_MAD_VIT-B-32_4Scale_10C.pth')
        state_dict = torch.load(model_path, map_location='cpu')['model']
        self.load_state_dict(state_dict, strict=False)

    def forward(self, **kwargs):
        if self.training:
            return self.forward_train(**kwargs)
        else:
            return self.forward_test(**kwargs)

    def forward_train(self, **kwargs):
        raise NotImplementedError

    def forward_test(self,
                     query_feats=None,
                     video_feats=None,
                     start_ts=None,
                     end_ts=None,
                     scale_boundaries=None,
                     **kwargs):
        """
        Obtain matching scores and bbox bias of the top-k candidate anchors, with
        pre-extracted query features and video features as input.

        Args:
            query_feats: the pre-extracted text features.
            video_feats: the pre-extracted video features.
            start_ts: the start timestamps of pre-defined multi-scale anchors.
            end_ts: the end timestamps of pre-defined multi-scale anchors.
            scale_boundaries: the begin and end anchor index for each scale in start_ts and end_ts.

        Returns:
            [final_scores, bbox_bias, starts, ends]
        """
r   r   r	   T)dim
descendingN)r[   )r<   permuter=   r8   listranger5   rB   sort
LongTensorsetr9   flattenr(   numpytolisttodeviceappendindex_selectcatr?   rA   sigmoid)rE   query_featsvideo_featsstart_tsend_tsscale_boundariesrG   	sent_feat	ctx_featsqv_ctx_scoreshit_indicesstartsendsfiltered_ctx_featsi_indicesscale_first
scale_lastfiltered_startfiltered_endqv_merge_scoresqv_ctn_scores	ctn_feats	bbox_biasfinal_scoress                             rL   rS   SOONet.forward_testX   s6   *  	&&{':':1a'CD		=&KVF6D!%4<<("ZZ!$!>
7"ZZ$$+<D,<,<+<(< = E E G K K M %2345

 "**[%7%78""7+"))&&y|Q@B /1-!e4
!&!3!3[4a"B$11;z2Aw @n-L)- )0 YYv1-F99Tq)D8<9D5O]I*II+OFDNN9iC	}}UYYA%FGY44rN   )
r4   r8   r5   r=   r?   rA   r9   r>   r@   r<   )NNNNN)__name__
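

# Illustrative sketch (an addition, not part of the original model code):
# the stage-2 branch of `forward_test` sorts each scale's context scores,
# keeps every query's top-k anchor indices, deduplicates them through
# `set`, and gathers the surviving anchor features with
# `torch.index_select`. The helper below replays that idiom on random
# tensors; all shapes, sizes, and names here are demo assumptions rather
# than values used by a real checkpoint.
def _demo_topk_anchor_filtering(num_queries=3, num_anchors=16, topk=4,
                                feat_dim=8):
    # Stand-in for qv_ctx_scores[i]: one score per (query, anchor) pair.
    scores = torch.rand(num_queries, num_anchors)
    _, indices = torch.sort(scores, dim=1, descending=True)
    # Union of every query's top-k anchors, restored to ascending order.
    indices, _ = torch.sort(
        torch.LongTensor(
            list(set(indices[:, :topk].flatten().cpu().numpy().tolist()))))
    # Stand-in for ctx_feats[i]: one feature vector per anchor.
    anchor_feats = torch.rand(num_anchors, feat_dim)
    filtered = torch.index_select(anchor_feats, 0, indices)
    return indices, filtered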