
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Q2VRankerStage1(nn.Module):
    """
    Used to calculate the qv_ctx_score with the query embedding and the
    multi-scale anchor context embeddings as input.
    The qv_ctx_score is used to pre-rank and retain the top-k related anchors.
    """

    def __init__(self, nscales, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.nscales = nscales

    def forward(self, ctx_feats, qfeat):
        # Project the query once, then score it against every anchor scale.
        qfeat = self.fc(qfeat)
        qv_ctx_scores = list()
        for i in range(self.nscales):
            # Cosine similarity between each anchor context embedding and the query.
            score = torch.einsum('bld,bd->bl',
                                 F.normalize(ctx_feats[i], p=2, dim=2),
                                 F.normalize(qfeat, p=2, dim=1))
            qv_ctx_scores.append(score)

        return qv_ctx_scores


class V2QRankerStage1(nn.Module):
    """
    Used to calculate the vq_ctx_score with the anchor context embeddings and
    the query embeddings as input.
    """

    def __init__(self, nscales, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.nscales = nscales

    def forward(self, ctx_feats, qfeat):
        vq_ctx_scores = list()
        for i in range(self.nscales):
            # Same cosine scoring as Q2VRankerStage1, but the projection is
            # applied to the anchor context embeddings instead of the query.
            score = torch.einsum('bld,bd->bl',
                                 F.normalize(self.fc(ctx_feats[i]), p=2, dim=2),
                                 F.normalize(qfeat, p=2, dim=1))
            vq_ctx_scores.append(score)

        return vq_ctx_scores
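
# Shape sketch for the stage-1 rankers (illustrative; the batch size, anchor
# counts and feature width are assumptions, not values from this file):
#   ctx_feats: list of `nscales` tensors, ctx_feats[i] of shape (B, L_i, D),
#              one context embedding per anchor at scale i.
#   qfeat:     (B, D) query embedding.
#   Q2VRankerStage1(ctx_feats, qfeat) returns a list of (B, L_i) scores used
#   to pre-rank anchors; V2QRankerStage1 is the symmetric video-to-query
#   direction with the projection applied on the anchor side.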


class Q2VRankerStage2(nn.Module):
    """
    Used to calculate the qv_ctn_score with the query embedding and the video
    sequence embeddings as input.
    The qv_ctn_score is used to re-rank anchors.
    """

    def __init__(self, nscales, hidden_dim, snippet_length):
        super().__init__()
        self.nscales = nscales
        self.snippet_length = snippet_length
        self.qfc = nn.Linear(hidden_dim, hidden_dim)
        self.encoder = V2VAttention()

    def forward(self, vfeats, qfeat, hit_indices, qv_ctx_scores):
        qfeat = self.qfc(qfeat)
        qv_ctn_scores = list()
        qv_merge_scores = list()
        _, L, D = vfeats.size()

        ctn_feats = list()
        for i in range(self.nscales):
            # Anchors at scale i cover snippet_length * 2**i consecutive frames.
            anchor_length = self.snippet_length * 2**i
            assert L // anchor_length == qv_ctx_scores[i].size(1)
            # Pre-ranking scores of the retained (hit) anchors at this scale.
            qv_ctx_score = torch.index_select(qv_ctx_scores[i], 1, hit_indices[i])
            # Regroup the frame sequence into anchors and keep the hit anchors only.
            ctn_feat = vfeats.view(L // anchor_length, anchor_length, D).detach()
            ctn_feat = torch.index_select(ctn_feat, 0, hit_indices[i])
            # Intra-anchor self-attention over the frames of each retained anchor.
            ctn_feat = self.encoder(
                ctn_feat, torch.ones(ctn_feat.size()[:2], device=ctn_feat.device))
            ctn_feats.append(ctn_feat)

            # Content score: the best frame-query similarity inside each anchor.
            qv_ctn_score = torch.einsum(
                'bkld,bd->bkl',
                F.normalize(ctn_feat.unsqueeze(0), p=2, dim=3),
                F.normalize(qfeat, p=2, dim=1))
            qv_ctn_score, _ = torch.max(qv_ctn_score, dim=2)
            qv_ctn_scores.append(qv_ctn_score)
            qv_merge_scores.append(qv_ctx_score + qv_ctn_score)

        return qv_merge_scores, qv_ctn_scores, ctn_feats


class V2QRankerStage2(nn.Module):
    """
    Used to calculate the vq_ctn_score with the anchor content embeddings and
    the query embeddings as input.
    """

    def __init__(self, nscales, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.nscales = nscales

    def forward(self, ctn_feats, qfeat):
        vq_ctn_scores = list()
        for i in range(self.nscales):
            score = torch.einsum(
                'bkld,bd->bkl',
                F.normalize(self.fc(ctn_feats[i]).unsqueeze(0), p=2, dim=3),
                F.normalize(qfeat, p=2, dim=1))
            # Average the frame-level scores inside each anchor.
            score = torch.mean(score, dim=2)
            vq_ctn_scores.append(score)

        return vq_ctn_scores


class V2VAttention(nn.Module):
    """
    Self-attention encoder for an anchor's frame sequence, used to encode
    intra-anchor knowledge.
    """

    def __init__(self):
        super().__init__()
        # NOTE: the numeric hyper-parameters below are assumed values
        # (maximum sequence length, feature width, heads, dropout).
        self.posemb = PositionEncoding(max_len=500, dim=512, dropout=0.0)
        self.encoder = MultiHeadAttention(dim=512, n_heads=8, dropout=0.1)
        self.dropout = nn.Dropout(0.1)

    def forward(self, video_feats, video_masks):
        # Pairwise validity mask, broadcast over attention heads.
        mask = torch.einsum('bm,bn->bmn', video_masks, video_masks).unsqueeze(1)
        residual = video_feats
        video_feats = video_feats + self.posemb(video_feats)
        out = self.encoder(
            query=video_feats, key=video_feats, value=video_feats, mask=mask)
        out = self.dropout(out + residual) * video_masks.unsqueeze(2).float()
        return out
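
# Shape sketch for V2VAttention (illustrative): video_feats is (N, T, D) for
# N anchors of T frames each, and video_masks is an (N, T) 0/1 mask; the
# einsum('bm,bn->bmn') builds an (N, T, T) pairwise mask that is broadcast
# over heads, and the output keeps the input shape (N, T, D).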


class BboxRegressor(nn.Module):
    """
    Predict the offset of the bounding box for each candidate anchor.
    """

    def __init__(self, hidden_dim, enable_stage2=False):
        super().__init__()
        self.fc_ctx = nn.Linear(hidden_dim, hidden_dim)
        self.fc_q = nn.Linear(hidden_dim, hidden_dim)
        if enable_stage2:
            self.fc_ctn = nn.Linear(hidden_dim, hidden_dim)
            self.attn = SelfAttention(hidden_dim)
            self.predictor = nn.Sequential(
                nn.Linear(2 * hidden_dim, hidden_dim), nn.ReLU(),
                nn.Linear(hidden_dim, 2))
        else:
            self.predictor = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
                nn.Linear(hidden_dim, 2))
        self.enable_stage2 = enable_stage2

    def forward(self, ctx_feats, ctn_feats, qfeat):
        qfeat = self.fc_q(qfeat)
        ctx_feats = torch.cat(ctx_feats, dim=1)
        # Fuse the anchor context features with the query.
        ctx_fuse_feats = F.relu(self.fc_ctx(ctx_feats)) * F.relu(qfeat.unsqueeze(1))
        if self.enable_stage2 and ctn_feats:
            ctn_fuse_feats = list()
            for i in range(len(ctn_feats)):
                # Fuse the intra-anchor (content) features with the query and
                # pool them into one vector per anchor.
                out = F.relu(self.fc_ctn(ctn_feats[i]).unsqueeze(0)) \
                    * F.relu(qfeat.unsqueeze(1).unsqueeze(1))
                out = self.attn(out)
                ctn_fuse_feats.append(out)
            ctn_fuse_feats = torch.cat(ctn_fuse_feats, dim=1)
            fuse_feats = torch.cat([ctx_fuse_feats, ctn_fuse_feats], dim=-1)
        else:
            fuse_feats = ctx_fuse_feats
        out = self.predictor(fuse_feats)
        return out
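
# Shape sketch for BboxRegressor (illustrative): the per-scale context
# features concatenate to (1, K, D) for K candidate anchors, the optional
# stage-2 content features are pooled by SelfAttention to the same (1, K, D),
# and the predictor emits a (1, K, 2) tensor of start/end offsets.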


class SelfAttention(nn.Module):
    """
    Obtain pooled features by self-attentive pooling.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim // 2, 1)

    def forward(self, x):
        # One attention logit per position, normalized over the sequence axis.
        att = self.fc2(self.relu(self.fc1(x))).squeeze(-1)
        att = F.softmax(att, dim=2).unsqueeze(-1)
        out = torch.sum(x * att, dim=2)
        return out


class PositionEncoding(nn.Module):
    """
    An implementation of trainable positional embedding which is added to
    sequence features to inject time/position information.

    Args:
        max_len: The max number of trainable positional embeddings.
        dim: The dimension of the positional embedding.
    """

    def __init__(self, max_len, dim, dropout=0.0):
        super(PositionEncoding, self).__init__()
        self.embed = nn.Embedding(max_len, dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch_size, seq_len = x.shape[:2]
        pos_ids = torch.arange(seq_len, dtype=torch.long, device=x.device)
        pos_ids = pos_ids.unsqueeze(0).repeat(batch_size, 1)
        pos_emb = self.dropout(self.relu(self.embed(pos_ids)))
        return pos_emb


class MultiHeadAttention(nn.Module):
    """
    An implementation of the multi-head attention module, as described in
    'Attention Is All You Need <https://arxiv.org/abs/1706.03762>'.

    Args:
        dim: the dimension of the features of the hidden layers.
        n_heads: the number of attention heads.
    """

    def __init__(self, dim, n_heads, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.to_q = nn.Linear(dim, dim)
        self.to_k = nn.Linear(dim, dim)
        self.to_v = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=-1)

    def transpose_for_scores(self, x):
        # (B, T, dim) -> (B, n_heads, T, head_dim)
        new_x_shape = x.size()[:-1] + (self.n_heads, self.head_dim)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask):
        q = self.to_q(query)
        k = self.to_k(key)
        v = self.to_v(value)

        q_trans = self.transpose_for_scores(q)
        k_trans = self.transpose_for_scores(k)
        v_trans = self.transpose_for_scores(v)

        # Scaled dot-product attention with the invalid positions masked out.
        att = torch.matmul(q_trans, k_trans.transpose(-1, -2))
        att = att / math.sqrt(self.head_dim)
        att = mask_logits(att, mask)
        att = self.softmax(att)
        att = self.dropout(att)

        ctx_v = torch.matmul(att, v_trans)
        ctx_v = ctx_v.permute(0, 2, 1, 3).contiguous()
        new_shape = ctx_v.size()[:-2] + (self.dim, )
        ctx_v = ctx_v.view(*new_shape)
        return ctx_v


def mask_logits(inputs, mask, mask_value=-1e30):
    mask = mask.type(torch.float32)
    return inputs + (1.0 - mask) * mask_value
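

# ---------------------------------------------------------------------------
# Minimal smoke test (illustrative only; not part of the original module).
# It exercises the two blocks whose constructors take explicit sizes, using
# made-up shapes: a 64-d feature space, 3 anchor scales and a single query.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    hidden_dim, nscales = 64, 3

    # Stage-1 pre-ranking: one score per anchor and per scale.
    ranker = Q2VRankerStage1(nscales, hidden_dim)
    ctx_feats = [torch.randn(1, n, hidden_dim) for n in (32, 16, 8)]
    qfeat = torch.randn(1, hidden_dim)
    qv_ctx_scores = ranker(ctx_feats, qfeat)
    # -> [torch.Size([1, 32]), torch.Size([1, 16]), torch.Size([1, 8])]
    print([s.shape for s in qv_ctx_scores])

    # Masked multi-head attention over a short sequence.
    mha = MultiHeadAttention(dim=hidden_dim, n_heads=4, dropout=0.0)
    seq = torch.randn(2, 5, hidden_dim)
    mask = torch.ones(2, 1, 5, 5)
    out = mha(seq, seq, seq, mask)
    print(out.shape)  # torch.Size([2, 5, 64])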