
    9i)                     x   S r SSKrSSKrSSKJrJrJr  SSKrSSK	r	SSK
Jr  SSKJs  Jr  SSKJs  Jr  SSKJs  Js  Js  Jr  SSKJr  SSKJrJr  SSKJr  SSKJ r    " S S	\RB                  5      r" " S
 S\RB                  5      r#\RH                  " \RJ                  \RL                  S9 " S S\5      5       r'g)aF  
This TDNN implementation is adapted from https://github.com/wenet-e2e/wespeaker.
TDNN replaces i-vectors for text-independent speaker verification with embeddings
extracted from a feedforward deep neural network. The specific structure can be
referred to in https://www.danielpovey.com/files/2017_interspeech_embeddings.pdf.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )	TdnnLayer   c                 8  > [         [        U ]  5         Xl        X l        X0l        X@l        XPl        [        R                  " U R                  U R                  U R
                  U R                  U R                  S9U l
        [        R                  " USS9U l        g)a  Define the TDNN layer, essentially 1-D convolution

Args:
    in_dim (int): input dimension
    out_dim (int): output channels
    context_size (int): context size, essentially the filter size
    dilation (int, optional):  Defaults to 1.
    padding (int, optional):  Defaults to 0.
)dilationpaddingF)affineN)superr   __init__in_dimout_dimcontext_sizer   r   nnConv1dconv_1dBatchNorm1dbn)selfr   r   r   r   r   	__class__s         b/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/xvector.pyr   TdnnLayer.__init__   su     	i')( yyKKLL]]LL" ..7    c                 v    U R                  U5      n[        R                  " U5      nU R                  U5      nU$ )N)r   Frelur   )r   xouts      r   forwardTdnnLayer.forward5   s.    ll1offSkggcl
r    )r   r   r   r   r   r   r   )   r   __name__
__module____qualname____firstlineno__r   r&   __static_attributes____classcell__r   s   @r   r   r      s    84 r    r   c                   <   ^  \ rS rSr     SU 4S jjrS rSrU =r$ )XVEC<   c                   > [         [        U ]  5         Xl        X0l        X@l        [        XSSS9U l        [        X"SSS9U l        [        X"SSS9U l	        [        X"SSS9U l
        [        X#SSS9U l        US:X  d  US:X  a  SOSU l        [        [        U5      " U R                  S9U l        [         R"                  " U R                  U R                  -  U5      U l        g	)
zn
Implementation of Kaldi style xvec, as described in
X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION
   r(   )r   r         TAPTSDP)r   N)r   r2   r   feat_dim	stats_dim	embed_dimr   frame_1frame_2frame_3frame_4frame_5n_statsgetattrpooling_layerspoolr   Linearseg_1)r   r:   hid_dimr;   r<   pooling_funcr   s         r   r   XVEC.__init__>   s     	dD"$ "" QO AN AN AN Q<(E1\V5KqQRNL9>>#	YYt~~<iH
r    c                    UR                  SSS5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )Nr   r7   r(   )permuter=   r>   r?   r@   rA   rE   rG   )r   r$   r%   statsembed_as        r   r&   XVEC.forwardX   sy    IIaAll1oll3ll3ll3ll3		#**U#r    )r<   r:   r=   r>   r?   r@   rA   rB   rE   rG   r;   )(      i  rQ   TSTPr)   r0   s   @r   r2   r2   <   s$     $I4 r    r2   )module_namec                   L   ^  \ rS rSrS\\\4   4U 4S jjrS rS r	S r
SrU =r$ )SpeakerVerificationTDNNf   model_configc                   > [         TU ]  " X/UQ70 UD6  X l        X@l        SU l        SU l        [        U R                  S   5      U l        [        U R                  5        [        U R                  U R
                  S9U l
        US   nU R                  U5        U R                  R                  U R                  5        U R                  R                  5         g )NP   rQ   device)r:   r<   pretrained_model)r   r   rW   other_configfeature_dimr<   r
   rZ   printr2   embedding_model*_SpeakerVerificationTDNN__load_check_pointtoeval)r   	model_dirrW   argskwargspretrained_model_namer   s         r   r    SpeakerVerificationTDNN.__init__i   s    B4B6B("#D$5$5h$?@dkk#%% A &'9 : 56,!!#r    c                    [        U[        R                  5      (       a  [        R                  " U5      n[        UR                  5      S:X  a  UR                  S5      n[        UR                  5      S:X  d   S5       eU R                  U5      nU R                  UR                  U R                  5      5      nUR                  5       R                  5       $ )Nr(   r   r7   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpylenshape	unsqueeze)_SpeakerVerificationTDNN__extract_featurer_   ra   rZ   detachcpu)r   audiofeature	embeddings       r   r&   SpeakerVerificationTDNN.forward|   s    eRZZ(($$U+Eu{{q OOA&EKK
 	YX	Y  ((/((DKK)@A	!%%''r    c                    / nU Hc  n[         R                  " UR                  S5      U R                  S9nXDR	                  SSS9-
  nUR                  UR                  S5      5        Me     [        R                  " U5      nU$ )Nr   )num_mel_binsT)dimkeepdim)Kaldifbankrp   r]   meanappendrl   cat)r   rt   featuresauru   s        r   __extract_feature)SpeakerVerificationTDNN.__extract_feature   st    BkkQd.>.>@GD AAGOOG--a01	 
 99X&r    c                     U R                   R                  [        R                  " [        R
                  R                  U R                  U5      [        R                  " S5      S9SS9  g )Nrs   )map_locationT)strict)	r_   load_state_dictrl   loadospathjoinrc   rZ   )r   rf   s     r   __load_check_point*SpeakerVerificationTDNN.__load_check_point   sO    ,,JJT^^-BC"\\%02 	 	- 	r    )rZ   r<   r_   r]   rW   r\   )r*   r+   r,   r-   r   strr   r   r&   rq   r`   r.   r/   r0   s   @r   rU   rU   f   s+    $S#X $&( r    rU   )(__doc__mathr   typingr   r   r   numpyrj   rl   torch.nnr   torch.nn.functional
functionalr"   torchaudio.compliance.kaldi
compliancekaldir|   )modelscope.models.audio.sv.pooling_layersmodelsrt   svrD   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr	   modelscope.utils.devicer
   Moduler   r2   register_modulespeaker_verificationtdnn_svrU    r    r   <module>r      s     	 # #      + + B B & 0 + 1 		  F'299 'T 22O2j 2 P2r    