
    9iJH                        S r SSKrSSKrSSKJrJrJr  SSKrSSKJ	r	  SSK
J	s  Jr  SSKJs  Jr  SSKJr  SSKJrJr  SSKJr  S&S jrS\S	\S
\S\4S jr " S S\	R6                  5      r " S S\	R6                  5      r " S S\	R6                  5      r " S S\R                  R6                  5      r " S S\	R6                  5      r  " S S\	R6                  5      r! " S S\	R6                  5      r" " S S\	R6                  5      r#S r$S'S jr% " S S \	R6                  5      r& " S! S"\R                  R6                  5      r'\RP                  " \RR                  \RT                  S#9 " S$ S%\5      5       r+g)(a*  This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
Self-Distillation Prototypes Network(SDPN) is a self-supervised learning framework in SV.
It comprises a teacher and a student network with identical architecture
but different parameters. Teacher/student network consists of three main modules:
the encoder for extracting speaker embeddings, multi-layer perceptron for
feature transformation, and prototypes for computing soft-distributions between
global and local views. EMA denotes Exponential Moving Average.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                    [        U R                  5      S:X  d   eUc,  U R                  5       R                  5       R	                  5       n[
        R                  " XR                  U R                  S9R                  [        U 5      U5      U R                  S5      :  nUc  U R                  nUc  U R                  n[
        R                  " XBUS9nU$ )N   )devicedtype)r   r   )lenshapemaxlongitemtorcharanger   r   expand	unsqueeze	as_tensor)lengthmax_lenr   r   masks        _/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/sdpn.pylength_to_maskr      s    v||!!!**,##%**,<<V\\;;A6K<"$*$4$4Q$78D }~??4V<DK    L_instridekernel_sizedilationc                     US:  a9  [         R                  " XU-  -
  U-  S-   5      nXS-
  -  X#-  -   nUS-  US-  /nU$ XUS-
  -  -
  S-
  U-  S-   nX-
  S-  X-
  S-  /nU$ Nr      )mathceil)r   r   r    r!   n_stepsL_outpaddings          r   get_padding_elemr*   +   s    z))d8%;;vEJKA+&)??!#[A%56 N K!O44q8VCaGLQ&!(;<Nr   c                   T   ^  \ rS rSr      S	U 4S jjrS rS\S\S\4S jrSrU =r	$ )
Conv1d8   c
                    > [         T
U ]  5         X l        X@l        XPl        X`l        Xl        [        R                  " UUU R                  U R                  U R                  SUUS9U l	        g )Nr   )r   r!   r)   groupsbias)
super__init__r    r   r!   r)   padding_modennr,   conv)selfout_channelsr    in_channelsr   r!   r)   r/   r0   r3   	__class__s             r   r2   Conv1d.__init__:   sa     	& (II;;]]	
	r   c                    U R                   S:X  a2  U R                  XR                  U R                  U R                  5      nOnU R                   S:X  a5  U R                  S-
  U R                  -  n[
        R                  " XS45      nO)U R                   S:X  a  O[        SU R                   -   5      eU R                  U5      nU$ )Nsamecausalr   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r)   _manage_paddingr    r!   r   Fpad
ValueErrorr5   )r6   xnum_padwxs       r   forwardConv1d.forwardX   s    <<6!$$Q(8(8$--%)[[2A \\X%''!+t}}<Ga1&A\\W$ C,,    YYq\	r   r    r!   r   c                 z    UR                   S   n[        XTX#5      n[        R                  " XU R                  S9nU$ )N)mode)r   r*   r@   rA   r3   )r6   rC   r    r!   r   r   r)   s          r   r?   Conv1d._manage_paddingm   s8     wwr{"4GEE!4#4#45r   )r5   r!   r    r)   r3   r   )r   r   r<   r   Treflect)
__name__
__module____qualname____firstlineno__r2   rF   intr?   __static_attributes____classcell__r9   s   @r   r,   r,   8   sI     
<*  	
  r   r,   c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )BatchNorm1d{   c                 X   > [         TU ]  5         [        R                  " UUUS9U l        g )N)epsmomentum)r1   r2   r4   rV   norm)r6   
input_sizerY   rZ   r9   s       r   r2   BatchNorm1d.__init__}   s)     	NN
	r   c                 $    U R                  U5      $ Nr[   r6   rC   s     r   rF   BatchNorm1d.forward   s    yy|r   r`   )gh㈵>g?rM   rN   rO   rP   r2   rF   rR   rS   rT   s   @r   rV   rV   {   s    
 	
 r   rV   c                   J   ^  \ rS rSr\R
                  S4U 4S jjrS rSrU =r	$ )	TDNNBlock   r   c                    > [         [        U ]  5         [        UUUUUS9U l        U" 5       U l        [        US9U l        g )N)r8   r7   r    r!   r/   r\   )r1   re   r2   r,   r5   
activationrV   r[   )r6   r8   r7   r    r!   ri   r/   r9   s          r   r2   TDNNBlock.__init__   sD     	i')#%#
	 %,<8	r   c                 `    U R                  U R                  U R                  U5      5      5      $ r_   )r[   ri   r5   ra   s     r   rF   TDNNBlock.forward   s"    yy1677r   )ri   r5   r[   )
rM   rN   rO   rP   r4   ReLUr2   rF   rR   rS   rT   s   @r   re   re      s     779(8 8r   re   c                   8   ^  \ rS rSr   SU 4S jjrS rSrU =r$ )Res2NetBlock   c                    > [         [        U ]  5         X-  S:X  d   eX#-  S:X  d   eX-  nX#-  n[        R                  " [        US-
  5       Vs/ s H  n[        UUUUS9PM     sn5      U l        X0l        g s  snf )Nr   r   )r    r!   )	r1   ro   r2   r4   
ModuleListrangere   blocksscale)
r6   r8   r7   ru   r    r!   
in_channelhidden_channelir9   s
            r   r2   Res2NetBlock.__init__   s     	lD*,"a'''#q((( )
%.mm UQY'%
 (! '!	
 (%
  
%
s   A9c                 >   / n[        [        R                  " XR                  SS95       HY  u  p4US:X  a  UnO:US:X  a  U R                  US-
     " U5      nOU R                  US-
     " UW-   5      nUR                  U5        M[     [        R                  " USS9nU$ )Nr   dimr   )	enumerater   chunkru   rt   appendcat)r6   rC   yrx   x_iy_is         r   rF   Res2NetBlock.forward   s    Azzq ABFAAvakk!a%(-kk!a%(s3HHSM C IIaQr   )rt   ru   )      r   rc   rT   s   @r   ro   ro      s    
 . r   ro   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )SEBlock   c                    > [         [        U ]  5         [        XSS9U l        [
        R                  R                  SS9U l        [        X#SS9U l	        [
        R                  R                  5       U l        g )Nr   r8   r7   r    T)inplace)r1   r   r2   r,   conv1r   r4   rm   reluconv2Sigmoidsigmoid)r6   r8   se_channelsr7   r9   s       r   r2   SEBlock.__init__   s^    gt%'#1N
HHMM$M/	#AO
xx'')r   c                 h   UR                   S   nUbM  [        X#-  X1R                  S9nUR                  S5      nUR	                  SSS9nX-  R	                  SSS9U-  nOUR                  SSS9nU R                  U R                  U5      5      nU R                  U R                  U5      5      nXa-  $ )NrI   r   r   r   r$   Tr|   keepdim)
r   r   r   r   summeanr   r   r   r   )r6   rC   lengthsLr   totalss          r   rF   SEBlock.forward   s    GGBK!'+qJD>>!$DHHDH1E1d3e;A1d+AIIdjjm$LLA'ur   )r   r   r   r   r_   rc   rT   s   @r   r   r      s    * r   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )AttentiveStatisticsPooling   c                    > [         TU ]  5         SU l        X0l        U(       a  [	        US-  USS5      U l        O[	        XSS5      U l        [        R                  " 5       U l        [        UUSS9U l
        g )Ng-q=r   r   r   )r1   r2   rY   global_contextre   tdnnr4   Tanhtanhr,   r5   )r6   channelsattention_channelsr   r9   s       r   r2   #AttentiveStatisticsPooling.__init__   sg    ,!(Q,0BAqIDI!(1EDIGGI	*!	r   c                 L   UR                   S   nSU R                  4S jnUc,  [        R                  " UR                   S   UR                  S9n[        X#-  X1R                  S9nUR                  S5      nU R                  (       a  UR                  SSS	9R                  5       nU" XU-  5      u  pxUR                  S5      R                  SSU5      nUR                  S5      R                  SSU5      n[        R                  " XU/SS
9n	OUn	U R                  U R                  U R                  U	5      5      5      n	U	R                  US:H  [        S5      5      n	[         R"                  " U	SS
9n	U" X5      u  px[        R                  " Xx4SS
9n
U
R                  S5      n
U
$ )NrI   r$   c                     X-  R                  U5      n[        R                  " XUR                  U5      -
  R	                  S5      -  R                  U5      R                  U5      5      nXE4$ )Nr$   )r   r   sqrtr   powclamp)rC   mr|   rY   r   stds         r   _compute_statistics?AttentiveStatisticsPooling.forward.<locals>._compute_statistics   s^    E;;s#D**$..--22155::3?EEcJLC9r   r   )r   r   r   Tr   r{   z-inf)r   rY   r   onesr   r   r   r   r   floatrepeatr   r5   r   r   masked_fillr@   softmax)r6   rC   r   r   r   r   r   r   r   attnpooled_statss              r   rF   "AttentiveStatisticsPooling.forward   sn   GGBK*+ 	 ?jjAHH=G gk1XXF~~a   HHDH1779E+Ae|<ID>>!$++Aq!4D--"))!Q2C99as^3DD yy499T?34 	5=9yy1%'0	yy$!4#--a0r   )r5   rY   r   r   r   )   Tr_   rc   rT   s   @r   r   r      s    ) )r   r   c                   j   ^  \ rS rSrSSSS\R
                  R                  S4U 4S jjrSS jrSr	U =r
$ )	SERes2NetBlocki(  r   r   r   c	           	         > [         T	U ]  5         X l        [        UUSSUUS9U l        [        X"X5U5      U l        [        UUSSUUS9U l        [        X$U5      U l	        S U l
        X:w  a  [        UUSS9U l
        g g )Nr   )r    r!   ri   r/   r   )r1   r2   r7   re   tdnn1ro   res2net_blocktdnn2r   se_blockshortcutr,   )
r6   r8   r7   res2net_scaler   r    r!   ri   r/   r9   s
            r   r2   SERes2NetBlock.__init__*  s     	(!

 *,*7hP!

  <H&"')DM 'r   c                     UnU R                   (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  X5      nX-   $ r_   )r   r   r   r   r   )r6   rC   r   residuals       r   rF   SERes2NetBlock.forwardS  s\    ==}}Q'HJJqMq!JJqMMM!%|r   )r7   r   r   r   r   r   r_   )rM   rN   rO   rP   r   r4   rm   r2   rF   rR   rS   rT   s   @r   r   r   (  s/     88=='R
 
r   r   c                      ^  \ rS rSrSrSS\R                  R                  / SQ/ SQ/ SQSS	SS
/ SQ4U 4S jjrSS jr	Sr
U =r$ )
ECAPA_TDNNi`  zAn implementation of the speaker embedding model in a paper.
"ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
cpu   )r   r   r   r   i   )   r   r   r   r   )r   r$   r      r   r   r   T)r   r   r   r   r   c                   > [         TU ]  5         [        U5      [        U5      :X  d   e[        U5      [        U5      :X  d   eXPl        [        R
                  " 5       U l        U R                  R                  [        UUS   US   US   UUS   5      5        [        S[        U5      S-
  5       H9  nU R                  R                  [        X]S-
     X]   U	U
Xm   X}   UX   S95        M;     [        US   US   US   US   UUS   S9U l        [        US   UUS9U l        [        US   S-  S9U l        [!        US   S-  USS	9U l        g )
Nr   r   )r   r   r    r!   ri   r/   rI   )r/   )r   r   r$   rh   r   )r1   r2   r   r   r4   rr   rt   r   re   rs   r   mfar   asprV   asp_bnr,   fc)r6   r\   r   lin_neuronsri   r   kernel_sizes	dilationsr   r   r   r   r/   rx   r9   s                 r   r2   ECAPA_TDNN.__init__f  sp     	8}L 11118}I... mmo 	Q!q		 q#h-!+,AKKUOK"/ + ,&\)!9	
 - RLRLbM":
 .RL1)

 "Xb\A-=>  q($
r   c                    UR                  SS5      n/ nU R                   H  n U" XS9nUR                  U5        M     [        R
                  " USS SS9nU R                  U5      nU R                  XS9nU R                  U5      nU R                  U5      nUR                  SS5      R                  S5      nU$ ! [         a    U" U5      n Nf = f)zpReturns the embedding vector.

Arguments
---------
x : torch.Tensor
    Tensor of shape (batch, time, channel).
r   r$   )r   Nr{   )	transposert   	TypeErrorr   r   r   r   r   r   r   squeeze)r6   rC   r   xllayers        r   rF   ECAPA_TDNN.forward  s     KK1[[E!- IIaL ! IIbf!$HHQK HHQH(KKN GGAJKK1%%a(!  !Hs   B==CC)r   r   rt   r   r   r   r_   )rM   rN   rO   rP   __doc__r   r4   rm   r2   rF   rR   rS   rT   s   @r   r   r   `  sD     88==+$!F
P r   r   c                    S nXSU-  -
  :  d  XSU-  -   :  a  [         R                  SSS9  [        R                  " 5          U" X1-
  U-  5      nU" XA-
  U-  5      nU R	                  SU-  S-
  SU-  S-
  5        U R                  5         U R                  U[        R                  " S5      -  5        U R                  U5        U R                  X4S9  U sS S S 5        $ ! , (       d  f       g = f)Nc                 h    S[         R                  " U [         R                  " S5      -  5      -   S-  $ )N      ?       @)r%   erfr   )rC   s    r   norm_cdf(_no_grad_trunc_normal_.<locals>.norm_cdf  s(    TXXa$))B-/00B66r   r$   zimean is more than 2 std from [a, b] in nn.init.trunc_normal_.The distribution of values may be incorrect.)
stacklevelr   r   )minr   )warningswarnr   no_graduniform_erfinv_mul_r%   r   add_clamp_)tensorr   r   abr   l_us           r   _no_grad_trunc_normal_r     s    7 	1s7{1s7{ 2; 	 	
 
 qx3&'ah#%& 	B
AEAI. 	 	C$))B-'(D 	!#+ 
s   BC
C-c                     [        XX#U5      $ r_   )r   )r   r   r   r   r   s        r   trunc_normal_r     s    !&::r   c                   @   ^  \ rS rSr    SU 4S jjrS rS rSrU =r$ )SDPNHeadi  c                   > [         TU ]  5         [        US5      nUS:X  a  [        R                  " X5      U l        GO.[        R                  " X5      /nU(       a%  UR                  [        R                  " U5      5        UR                  [        R                  " 5       5        [        US-
  5       Hx  nUR                  [        R                  " XD5      5        U(       a%  UR                  [        R                  " U5      5        UR                  [        R                  " 5       5        Mz     UR                  [        R                  " XE5      5        [        R                  " U6 U l        U R                  U R                  5        g r#   )r1   r2   r   r4   Linearmlpr   rV   GELUrs   
Sequentialapply_init_weights)	r6   in_dimuse_bnnlayers
hidden_dimbottleneck_dimlayers_r9   s	           r   r2   SDPNHead.__init__  s     	gq/a<yy8DHii34FbnnZ89MM"'')$7Q;'bii
?@MM".."<=bggi(	 (
 MM"))J?@}}f-DH

4%%&r   c                    [        U[        R                  5      (       am  [        UR                  SS9  [        U[        R                  5      (       a9  UR
                  b+  [        R                  R                  UR
                  S5        g g g g )Ng{Gz?)r   r   )
isinstancer4   r   r   weightr0   init	constant_)r6   r   s     r   r  SDPNHead._init_weights  s`    a##!((,!RYY''AFF,>!!!&&!, -?' $r   c                 f    U R                  U5      n[        R                  R                  USSS9nU$ )NrI   r$   )r|   p)r   r4   
functional	normalizera   s     r   rF   SDPNHead.forward  s/    HHQKMM##A2#3r   )r   )Fr   i      )	rM   rN   rO   rP   r2   r  rF   rR   rS   rT   s   @r   r   r     s%       #'0- r   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )Combineri  z)
Combine backbone (ECAPA) and head (MLP)
c                 B   > [         [        U ]  5         Xl        X l        g r_   )r1   r  r2   backbonehead)r6   r  r  r9   s      r   r2   Combiner.__init__#  s    h&( 	r   c                 L    U R                  U5      nU R                  U5      nX4$ r_   r  r  )r6   rC   outputs      r   rF   Combiner.forward(  s%    MM!1yr   r  )	rM   rN   rO   rP   r   r2   rF   rR   rS   rT   s   @r   r  r    s    
 r   r  )module_namec                   T   ^  \ rS rSrSrS\\\4   4U 4S jjrS r	S r
S	S jrSrU =r$ )
SpeakerVerificationSDPNi.  z
Self-Distillation Prototypes Network (SDPN) effectively facilitates
self-supervised speaker representation learning. The specific structure can be
referred to in https://arxiv.org/pdf/2308.02774.
model_configc                 t  > [         TU ]  " X/UQ70 UD6  X l        X@l        U R                  S   S:w  a  [	        S5      eSU l        / SQn[        U R
                  US9U l        [        U R                  [        SS5      5      U l        US	   nU R                  U5        U R                  R                  5         g )
Nchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r&  r&  r&  r&  i   )r   r   Tpretrained_model)r1   r2   r#  other_configrB   feature_dimr   embedding_modelr  r   *_SpeakerVerificationSDPN__load_check_pointeval)r6   	model_dirr#  argskwargschannels_configpretrained_model_namer9   s          r   r2    SpeakerVerificationSDPN.__init__6  s    B4B6B("Y'4/X  8) 8'(<(<(0d(; = !''9 : 56!!#r   c                     [        UR                  5      S:X  a  UR                  S   S:X  d   S5       eU R                  U5      nU R                  R	                  U5      nU$ )Nr$   r   r   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   )_SpeakerVerificationSDPN__extract_featurer+  r  )r6   audiofeature	embeddings       r   rF   SpeakerVerificationSDPN.forwardM  sk    5;;1$** 	^]	^  ((/((11':	r   c                     [         R                  " XR                  S9nX"R                  SSS9-
  nUR	                  S5      nU$ )N)num_mel_binsr   Tr   )Kaldifbankr*  r   r   )r6   r6  r7  s      r   __extract_feature)SpeakerVerificationSDPN.__extract_featureV  s@    ++e2B2BCLLQL==##A&r   c                 h   U(       d  [         R                  " S5      n[         R                  " [        R                  R                  U R                  U5      US9nUS   R                  5        VVs0 s H  u  pEUR                  SS5      U_M     nnnU R                  R                  USS9  g s  snnf )Nr   )map_locationteacherzmodule. T)strict)r   r   loadospathjoinr.  itemsreplacer+  load_state_dict)r6   r2  r   
state_dictkvstate_dict_teas          r   __load_check_point*SpeakerVerificationSDPN.__load_check_point\  s    \\%(FZZGGLL)>?!

 #9-335
5 IIi$a'5 	 
 	,,^D,I	
s   1B.)r+  r*  r#  r)  r_   )rM   rN   rO   rP   r   r   strr   r2   rF   r5  r,  rR   rS   rT   s   @r   r"  r"  .  s2    $S#X $.
J 
Jr   r"  )NNN)g        r   g       r   ),r   r%   rF  typingr   r   r   r   torch.nnr4   torch.nn.functionalr  r@   torchaudio.compliance.kaldi
compliancekaldir<  modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr	   r   rQ   r*   Moduler,   rV   re   ro   r   r   r   r   r   r   r   r  register_modulespeaker_verificationsdpn_svr"   r   r   <module>ra     sL    	 # #     + + & 0 +&
3 
 
# 
 
@RYY @F")) &8		 84$588?? $Nbii 8: :z5RYY 5pl l^!H;
#ryy #Luxx   22O7Jj 7J P7Jr   