
    9iA                        S r SSKrSSKrSSKJrJrJr  SSKrSSKJ	r	  SSK
J	s  Jr  SSKJs  Jr  SSKJr  SSKJrJr  SSKJr  S$S jrS\S	\S
\S\4S jr " S S\	R6                  5      r " S S\	R6                  5      r " S S\	R6                  5      r " S S\R                  R6                  5      r " S S\	R6                  5      r  " S S\	R6                  5      r! " S S\	R6                  5      r" " S S\	R6                  5      r# " S S\	R6                  5      r$ " S S \	R6                  5      r%\RL                  " \RN                  \RP                  S!9 " S" S#\5      5       r)g)%zThis ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
RDINOHead implementation is adapted from DINO framework.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                    [        U R                  5      S:X  d   eUc,  U R                  5       R                  5       R	                  5       n[
        R                  " XR                  U R                  S9R                  [        U 5      U5      U R                  S5      :  nUc  U R                  nUc  U R                  n[
        R                  " XBUS9nU$ )N   )devicedtype)r   r   )lenshapemaxlongitemtorcharanger   r   expand	unsqueeze	as_tensor)lengthmax_lenr   r   masks        `/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/rdino.pylength_to_maskr      s    v||!!!**,##%**,<<V\\;;A6K<"$*$4$4Q$78D }~??4V<DK    L_instridekernel_sizedilationc                     US:  a9  [         R                  " XU-  -
  U-  S-   5      nXS-
  -  X#-  -   nUS-  US-  /nU$ XUS-
  -  -
  S-
  U-  S-   nX-
  S-  X-
  S-  /nU$ )Nr      )mathceil)r   r   r    r!   n_stepsL_outpaddings          r   get_padding_elemr)   &   s    z))d8%;;vEJKA+&)??!#[A%56 N K!O44q8VCaGLQ&!(;<Nr   c                   T   ^  \ rS rSr      S	U 4S jjrS rS\S\S\4S jrSrU =r	$ )
Conv1d3   c
                    > [         T
U ]  5         X l        X@l        XPl        X`l        Xl        [        R                  " UUU R                  U R                  U R                  SUUS9U l	        g )Nr   )r   r!   r(   groupsbias)
super__init__r    r   r!   r(   padding_modennr+   conv)selfout_channelsr    in_channelsr   r!   r(   r.   r/   r2   	__class__s             r   r1   Conv1d.__init__5   sa     	& (II;;]]	
	r   c                    U R                   S:X  a2  U R                  XR                  U R                  U R                  5      nOnU R                   S:X  a5  U R                  S-
  U R                  -  n[
        R                  " XS45      nO)U R                   S:X  a  O[        SU R                   -   5      eU R                  U5      nU$ )Nsamecausalr   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r(   _manage_paddingr    r!   r   Fpad
ValueErrorr4   )r5   xnum_padwxs       r   forwardConv1d.forwardS   s    <<6!$$Q(8(8$--%)[[2A \\X%''!+t}}<Ga1&A\\W$ C,,    YYq\	r   r    r!   r   c                 z    UR                   S   n[        XTX#5      n[        R                  " XU R                  S9nU$ )N)mode)r   r)   r?   r@   r2   )r5   rB   r    r!   r   r   r(   s          r   r>   Conv1d._manage_paddingh   s8     wwr{"4GEE!4#4#45r   )r4   r!   r    r(   r2   r   )r   r   r;   r   Treflect)
__name__
__module____qualname____firstlineno__r1   rE   intr>   __static_attributes____classcell__r8   s   @r   r+   r+   3   sI     
<*  	
  r   r+   c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )BatchNorm1dv   c                 X   > [         TU ]  5         [        R                  " UUUS9U l        g )N)epsmomentum)r0   r1   r3   rU   norm)r5   
input_sizerX   rY   r8   s       r   r1   BatchNorm1d.__init__x   s)     	NN
	r   c                 $    U R                  U5      $ NrZ   r5   rB   s     r   rE   BatchNorm1d.forward   s    yy|r   r_   )gh㈵>g?rL   rM   rN   rO   r1   rE   rQ   rR   rS   s   @r   rU   rU   v   s    
 	
 r   rU   c                   J   ^  \ rS rSr\R
                  S4U 4S jjrS rSrU =r	$ )	TDNNBlock   r   c                    > [         [        U ]  5         [        UUUUUS9U l        U" 5       U l        [        US9U l        g )N)r7   r6   r    r!   r.   r[   )r0   rd   r1   r+   r4   
activationrU   rZ   )r5   r7   r6   r    r!   rh   r.   r8   s          r   r1   TDNNBlock.__init__   sD     	i')#%#
	 %,<8	r   c                 `    U R                  U R                  U R                  U5      5      5      $ r^   )rZ   rh   r4   r`   s     r   rE   TDNNBlock.forward   s"    yy1677r   )rh   r4   rZ   )
rL   rM   rN   rO   r3   ReLUr1   rE   rQ   rR   rS   s   @r   rd   rd      s     779(8 8r   rd   c                   8   ^  \ rS rSr   SU 4S jjrS rSrU =r$ )Res2NetBlock   c                    > [         [        U ]  5         X-  S:X  d   eX#-  S:X  d   eX-  nX#-  n[        R                  " [        US-
  5       Vs/ s H  n[        UUUUS9PM     sn5      U l        X0l        g s  snf )Nr   r   )r    r!   )	r0   rn   r1   r3   
ModuleListrangerd   blocksscale)
r5   r7   r6   rt   r    r!   
in_channelhidden_channelir8   s
            r   r1   Res2NetBlock.__init__   s     	lD*,"a'''#q((( )
%.mm UQY'%
 (! '!	
 (%
  
%
s   A9c                 >   / n[        [        R                  " XR                  SS95       HY  u  p4US:X  a  UnO:US:X  a  U R                  US-
     " U5      nOU R                  US-
     " UW-   5      nUR                  U5        M[     [        R                  " USS9nU$ )Nr   dimr   )	enumerater   chunkrt   rs   appendcat)r5   rB   yrw   x_iy_is         r   rE   Res2NetBlock.forward   s    Azzq ABFAAvakk!a%(-kk!a%(s3HHSM C IIaQr   )rs   rt   )      r   rb   rS   s   @r   rn   rn      s    
 . r   rn   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )SEBlock   c                    > [         [        U ]  5         [        XSS9U l        [
        R                  R                  SS9U l        [        X#SS9U l	        [
        R                  R                  5       U l        g )Nr   r7   r6   r    T)inplace)r0   r   r1   r+   conv1r   r3   rl   reluconv2Sigmoidsigmoid)r5   r7   se_channelsr6   r8   s       r   r1   SEBlock.__init__   s^    gt%'#1N
HHMM$M/	#AO
xx'')r   c                 h   UR                   S   nUbM  [        X#-  X1R                  S9nUR                  S5      nUR	                  SSS9nX-  R	                  SSS9U-  nOUR                  SSS9nU R                  U R                  U5      5      nU R                  U R                  U5      5      nXa-  $ )NrH   r   r   r   r#   Tr{   keepdim)
r   r   r   r   summeanr   r   r   r   )r5   rB   lengthsLr   totalss          r   rE   SEBlock.forward   s    GGBK!'+qJD>>!$DHHDH1E1d3e;A1d+AIIdjjm$LLA'ur   )r   r   r   r   r^   rb   rS   s   @r   r   r      s    * r   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )AttentiveStatisticsPooling   c                    > [         TU ]  5         SU l        X0l        U(       a  [	        US-  USS5      U l        O[	        XSS5      U l        [        R                  " 5       U l        [        UUSS9U l
        g )Ng-q=r   r   r   )r0   r1   rX   global_contextrd   tdnnr3   Tanhtanhr+   r4   )r5   channelsattention_channelsr   r8   s       r   r1   #AttentiveStatisticsPooling.__init__   sg    ,!(Q,0BAqIDI!(1EDIGGI	*!	r   c                 L   UR                   S   nSU R                  4S jnUc,  [        R                  " UR                   S   UR                  S9n[        X#-  X1R                  S9nUR                  S5      nU R                  (       a  UR                  SSS	9R                  5       nU" XU-  5      u  pxUR                  S5      R                  SSU5      nUR                  S5      R                  SSU5      n[        R                  " XU/SS
9n	OUn	U R                  U R                  U R                  U	5      5      5      n	U	R                  US:H  [        S5      5      n	[         R"                  " U	SS
9n	U" X5      u  px[        R                  " Xx4SS
9n
U
R                  S5      n
U
$ )NrH   r#   c                     X-  R                  U5      n[        R                  " XUR                  U5      -
  R	                  S5      -  R                  U5      R                  U5      5      nXE4$ )Nr#   )r   r   sqrtr   powclamp)rB   mr{   rX   r   stds         r   _compute_statistics?AttentiveStatisticsPooling.forward.<locals>._compute_statistics   s^    E;;s#D**$..--22155::3?EEcJLC9r   r   )r   r   r   Tr   rz   z-inf)r   rX   r   onesr   r   r   r   r   floatrepeatr   r4   r   r   masked_fillr?   softmax)r5   rB   r   r   r   r   r   r   r   attnpooled_statss              r   rE   "AttentiveStatisticsPooling.forward   sn   GGBK*+ 	 ?jjAHH=G gk1XXF~~a   HHDH1779E+Ae|<ID>>!$++Aq!4D--"))!Q2C99as^3DD yy499T?34 	5=9yy1%'0	yy$!4#--a0r   )r4   rX   r   r   r   )   Tr^   rb   rS   s   @r   r   r      s    ) )r   r   c                   j   ^  \ rS rSrSSSS\R
                  R                  S4U 4S jjrSS jrSr	U =r
$ )	SERes2NetBlocki#  r   r   r   c	           	         > [         T	U ]  5         X l        [        UUSSUUS9U l        [        X"X5U5      U l        [        UUSSUUS9U l        [        X$U5      U l	        S U l
        X:w  a  [        UUSS9U l
        g g )Nr   )r    r!   rh   r.   r   )r0   r1   r6   rd   tdnn1rn   res2net_blocktdnn2r   se_blockshortcutr+   )
r5   r7   r6   res2net_scaler   r    r!   rh   r.   r8   s
            r   r1   SERes2NetBlock.__init__%  s     	(!

 *,*7hP!

  <H&"')DM 'r   c                     UnU R                   (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  X5      nX-   $ r^   )r   r   r   r   r   )r5   rB   r   residuals       r   rE   SERes2NetBlock.forwardN  s\    ==}}Q'HJJqMq!JJqMMM!%|r   )r6   r   r   r   r   r   r^   )rL   rM   rN   rO   r   r3   rl   r1   rE   rQ   rR   rS   s   @r   r   r   #  s/     88=='R
 
r   r   c                      ^  \ rS rSrSrSS\R                  R                  / SQ/ SQ/ SQSS	SS
/ SQ4U 4S jjrSS jr	Sr
U =r$ )
ECAPA_TDNNi[  zAn implementation of the speaker embedding model in a paper.
"ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
cpu   )r   r   r   r   i   )   r   r   r   r   )r   r#   r      r   r   r   T)r   r   r   r   r   c                   > [         TU ]  5         [        U5      [        U5      :X  d   e[        U5      [        U5      :X  d   eXPl        [        R
                  " 5       U l        U R                  R                  [        UUS   US   US   UUS   5      5        [        S[        U5      S-
  5       H9  nU R                  R                  [        X]S-
     X]   U	U
Xm   X}   UX   S95        M;     [        US   US   US   US   UUS   S9U l        [        US   UUS9U l        [        US   S-  S9U l        [!        US   S-  USS	9U l        g )
Nr   r   )r   r   r    r!   rh   r.   rH   )r.   )r   r   r#   rg   r   )r0   r1   r   r   r3   rq   rs   r~   rd   rr   r   mfar   asprU   asp_bnr+   fc)r5   r[   r   lin_neuronsrh   r   kernel_sizes	dilationsr   r   r   r   r.   rw   r8   s                 r   r1   ECAPA_TDNN.__init__a  sp     	8}L 11118}I... mmo 	Q!q		 q#h-!+,AKKUOK"/ + ,&\)!9	
 - RLRLbM":
 .RL1)

 "Xb\A-=>  q($
r   c                    UR                  SS5      n/ nU R                   H  n U" XS9nUR                  U5        M     [        R
                  " USS SS9nU R                  U5      nU R                  XS9nU R                  U5      nU R                  U5      nUR                  SS5      R                  S5      nU$ ! [         a    U" U5      n Nf = f)zpReturns the embedding vector.

Arguments
---------
x : torch.Tensor
    Tensor of shape (batch, time, channel).
r   r#   )r   Nrz   )	transposers   	TypeErrorr~   r   r   r   r   r   r   squeeze)r5   rB   r   xllayers        r   rE   ECAPA_TDNN.forward  s     KK1[[E!- IIaL ! IIbf!$HHQK HHQH(KKN GGAJKK1%%a(!  !Hs   B==CC)r   r   rs   r   r   r   r^   )rL   rM   rN   rO   __doc__r   r3   rl   r1   rE   rQ   rR   rS   s   @r   r   r   [  sD     88==+$!F
P r   r   c                   D   ^  \ rS rSr      SU 4S jjrS rS rSrU =r$ )	RDINOHeadi  c	                 Z  > [         TU ]  5         [        US5      nUS:X  a  [        R                  " X5      U l        GO.[        R                  " X5      /n	U(       a%  U	R                  [        R                  " U5      5        U	R                  [        R                  " 5       5        [        US-
  5       Hx  n
U	R                  [        R                  " Xf5      5        U(       a%  U	R                  [        R                  " U5      5        U	R                  [        R                  " 5       5        Mz     U	R                  [        R                  " Xh5      5        [        R                  " U	6 U l        [        R                  " X5      U l        U R                  U R                  5        [        R                  R                  [        R                  " XrSS95      U l        U R                   R"                  R$                  R'                  S5        U(       a  SU R                   R"                  l        g g )Nr   r#   F)r/   )r0   r1   r   r3   Linearmlpr~   rU   GELUrr   
Sequential	add_layerapply_init_weightsutilsweight_norm
last_layerweight_gdatafill_requires_grad)r5   in_dimout_dimuse_bnnorm_last_layernlayers
hidden_dimbottleneck_dimadd_dimlayers_r8   s              r   r1   RDINOHead.__init__  sd    	gq/a<yy8DHii34FbnnZ89MM"'')$7Q;'bii
?@MM".."<=bggi(	 ( MM"))J89}}f-DH7;

4%%&((..IInE:<  %%++A.5:DOO$$2 r   c                 X   [        U[        R                  5      (       a  [        R                  R                  R                  UR                  SS9  [        U[        R                  5      (       a9  UR                  b+  [        R                  R                  UR                  S5        g g g g )Ng{Gz?)r   r   )	
isinstancer3   r   r   inittrunc_normal_weightr/   	constant_)r5   r   s     r   r   RDINOHead._init_weights  sq    a##HHMM''c':!RYY''AFF,>!!!&&!, -?' $r   c                     U R                  U5      nU R                  U5      n[        R                  R	                  USSS9nU R                  U5      nX!4$ )NrH   r#   )r{   p)r   r   r3   
functional	normalizer   )r5   rB   vicr_outs      r   rE   RDINOHead.forward  sN    88A;NN8$MM##A2#3OOA{r   )r   r   r   )FTr   i      i    )	rL   rM   rN   rO   r1   r   rE   rQ   rR   rS   s   @r   r   r     s,    
 !%  # ;D- r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Combinei  c                 B   > [         [        U ]  5         Xl        X l        g r^   )r0   r
  r1   backbonehead)r5   r  r  r8   s      r   r1   Combine.__init__  s    gt%' 	r   c                 J    U R                  U5      nU R                  U5      nU$ r^   r  r  )r5   rB   outputs      r   rE   Combine.forward  s"    MM!1r   r  rb   rS   s   @r   r
  r
    s    
 r   r
  )module_namec                   P   ^  \ rS rSrS\\\4   4U 4S jjrS rS r	SS jr
SrU =r$ )	SpeakerVerification_RDINOi	  model_configc                 v  > [         TU ]  " X/UQ70 UD6  X l        X@l        U R                  S   S:w  a  [	        S5      eSU l        / SQn[        U R
                  US9U l        [        U R                  [        SSS	5      5      U l        US
   nU R                  U5        U R                  R                  5         g )Nchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r  r  r  r  i   )r   r   i   Tpretrained_model)r0   r1   r  other_configrA   feature_dimr   embedding_modelr
  r   ,_SpeakerVerification_RDINO__load_check_pointeval)r5   	model_dirr  argskwargschannels_configpretrained_model_namer8   s          r   r1   "SpeakerVerification_RDINO.__init__  s    B4B6B("Y'4/X  8) 8&t';';'0eT'B D !''9 : 56!!#r   c                     [        UR                  5      S:X  a  UR                  S   S:X  d   S5       eU R                  U5      nU R                  R	                  U5      nU$ )Nr#   r   r   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   +_SpeakerVerification_RDINO__extract_featurer  r  )r5   audiofeature	embeddings       r   rE   !SpeakerVerification_RDINO.forward$  sk    5;;1$** 	^]	^  ((/((11':	r   c                     [         R                  " XR                  S9nX"R                  SSS9-
  nUR	                  S5      nU$ )N)num_mel_binsr   Tr   )Kaldifbankr  r   r   )r5   r)  r*  s      r   __extract_feature+SpeakerVerification_RDINO.__extract_feature-  s@    ++e2B2BCLLQL==##A&r   c                 h   U(       d  [         R                  " S5      n[         R                  " [        R                  R                  U R                  U5      US9nUS   R                  5        VVs0 s H  u  pEUR                  SS5      U_M     nnnU R                  R                  USS9  g s  snnf )Nr   )map_locationteacherzmodule. T)strict)r   r   loadospathjoinr!  itemsreplacer  load_state_dict)r5   r%  r   
state_dictkvstate_dict_teas          r   __load_check_point,SpeakerVerification_RDINO.__load_check_point3  s    \\%(FZZGGLL)>?!

 #9-335
5 IIi$a'5 	 
 	,,^D,I	
s   1B.)r  r  r  r  r^   )rL   rM   rN   rO   r   strr   r1   rE   r(  r  rQ   rR   rS   s   @r   r  r  	  s-    $S#X $.
J 
Jr   r  )NNN)*r   r$   r9  typingr   r   r   r   torch.nnr3   torch.nn.functionalr  r?   torchaudio.compliance.kaldi
compliancekaldir/  modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr	   r   rP   r)   Moduler+   rU   rd   rn   r   r   r   r   r   r
  register_modulespeaker_verificationrdino_tdnn_svr   r   r   <module>rT     sC    	 # #     + + & 0 +&
3 
 
# 
 
@RYY @F")) &8		 84$588?? $Nbii 8: :z5RYY 5pl l^/		 /d
bii 
 	F,@,@B2J
 2JB2Jr   