
    9i'                     x   S r SSKrSSKrSSKJrJrJr  SSKrSSK	r	SSK
Jr  SSKJs  Jr  SSKJs  Jr  SSKJs  Js  Js  Jr  SSKJr  SSKJrJr  SSKJr  SSKJ r    " S S	\RB                  5      r" " S
 S\RB                  5      r#\RH                  " \RJ                  \RL                  S9 " S S\5      5       r'g)a  ResNet implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ResNet, or Residual Neural Network, is notable for its optimization ease
and depth-induced accuracy gains. It utilizes skip connections within its residual
blocks to counteract the vanishing gradient problem in deep networks.
Reference: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )
BasicBlock      c           
      2  > [         [        U ]  5         [        R                  " UUSUSSS9U l        [        R                  " U5      U l        [        R                  " X"SSSSS9U l        [        R                  " U5      U l	        [        R                  " 5       U l        US:w  d  XR                  U-  :w  aa  [        R                  " [        R                  " UU R                  U-  SUSS9[        R                  " U R                  U-  5      5      U l        g g )N   r   Fkernel_sizestridepaddingbias)r   r   r   )superr   __init__nnConv2dconv1BatchNorm2dbn1conv2bn2
Sequentialshortcut	expansion)self	in_planesplanesr   	__class__s       a/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/ResNet.pyr   BasicBlock.__init__   s    j$(*YY
 >>&)YY!QUL
>>&)Q;)~~'>>MM		NNV+ !! 
 "$0G!HJDM ?    c                     [         R                  " U R                  U R                  U5      5      5      nU R	                  U R                  U5      5      nX R                  U5      -  n[         R                  " U5      nU$ N)Frelur   r   r   r   r    )r"   xouts      r&   forwardBasicBlock.forward5   sZ    ffTXXdjjm,-hhtzz#'}}QffSk
r(   )r   r   r   r   r    )r   )	__name__
__module____qualname____firstlineno__r!   r   r/   __static_attributes____classcell__r%   s   @r&   r   r      s    IJ0 r(   r   c                   J   ^  \ rS rSr\/ SQSSSSS4U 4S jjrS	 rS
 rSrU =r	$ )ResNet=   )r         r       P      TSTPTc           	        > [         [        U ]  5         X0l        X@l        XPl        [        US-  5      U-  S-  U l        Xpl        [        R                  " SUSSSSS9U l        [        R                  " U5      U l        U R                  XUS   SS9U l        U R                  XS-  US   SS9U l        U R                  XS	-  US   SS9U l        U R                  XS-  US   SS9U l        US
:X  d  US:X  a  SOSU l        [)        [*        U5      " U R                  UR,                  -  S9U l        [        R0                  " U R                  UR,                  -  U R&                  -  U5      U l        U R                  (       a6  [        R4                  " USS9U l        [        R0                  " XU5      U l        g [        R:                  " 5       U l        [        R:                  " 5       U l        g )N   r   r   Fr   r   )r      r;   TAPTSDP)in_dim)affine)r   r9   r   r#   feat_dimembedding_sizeint	stats_dimtwo_emb_layerr   r   r   r   r   _make_layerlayer1layer2layer3layer4n_statsgetattrpooling_layersr!   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)	r"   block
num_blocks
m_channelsrH   rI   pooling_funcrL   r%   s	           r&   r   ResNet.__init__?   s    	fd$&# ,X\*Z7!;*YYzqAEK
>>*-&&z!}Q ' 8&&>:a= ' <&&>:a= ' <&&>:a= ' < )E1\V5KqQRNL9>>EOO35	YYt~~?$,,N-/
NN>%HDM>BDJKKMDMDJr(   c                     U/S/US-
  -  -   n/ nU H8  nUR                  U" U R                  X$5      5        X!R                  -  U l        M:     [        R                  " U6 $ )Nr   )appendr#   r!   r   r   )r"   r\   r$   r]   r   strideslayerss          r&   rM   ResNet._make_layerg   s^    (aSJN33FMM%?@#oo5DN  }}f%%r(   c                    UR                  SSS5      nUR                  S5      n[        R                  " U R	                  U R                  U5      5      5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a:  [        R                  " U5      nU R                  U5      nU R                  U5      nU$ U$ )Nr   rC   r   )permute
unsqueeze_r+   r,   r   r   rN   rO   rP   rQ   rU   rW   rL   rY   rZ   )	r"   r-   r.   out1out2out3statsembed_aembed_bs	            r&   r/   ResNet.forwardo   s    IIaALLOffTXXdjjm,-{{3{{4 {{4 kk$		#**U#&&/C--$CjjoGNNr(   )r   r   rI   rH   r#   rN   rO   rP   rQ   rR   rU   rW   rZ   rY   rK   rL   )
r1   r2   r3   r4   r   r   rM   r/   r5   r6   r7   s   @r&   r9   r9   =   s/     "( #$#&'P& r(   r9   )module_namec                   T   ^  \ rS rSrSrS\\\4   4U 4S jjrS r	S r
S	S jrSrU =r$ )
SpeakerVerificationResNet   zG
Args:
    model_dir: A model dir.
    model_config: The model config.
model_configc                   > [         TU ]  " X/UQ70 UD6  X l        U R                  S   U l        U R                  S   U l        X@l        SU l        [        U R
                  S   5      U l        [        U R                  U R                  S9U l
        US   nU R                  U5        U R                  R                  U R                  5        U R                  R                  5         g )N	embed_dimchannelsr>   device)rI   r^   pretrained_model)r   r   rt   rv   r^   other_configfeature_dimr
   rx   r9   embedding_model,_SpeakerVerificationResNet__load_check_pointtoeval)r"   	model_dirrt   argskwargspretrained_model_namer%   s         r&   r   "SpeakerVerificationResNet.__init__   s    B4B6B(**;7++J7"#D$5$5h$?@%>>doo G !''9 : 56,!!#r(   c                    [        U[        R                  5      (       a  [        R                  " U5      n[        UR                  5      S:X  a  UR                  S5      n[        UR                  5      S:X  d   S5       eU R                  U5      nU R                  UR                  U R                  5      5      nUR                  5       R                  5       $ )Nr   r   rC   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpylenshape	unsqueeze+_SpeakerVerificationResNet__extract_featurer|   r~   rx   detachcpu)r"   audiofeature	embeddings       r&   r/   !SpeakerVerificationResNet.forward   s    eRZZ(($$U+Eu{{q OOA&EKK
 	YX	Y  ((/((DKK)@A	!%%''r(   c                     [         R                  " XR                  S9nX"R                  SSS9-
  nUR	                  S5      nU$ )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr{   meanr   )r"   r   r   s      r&   __extract_feature+SpeakerVerificationResNet.__extract_feature   s@    ++e2B2BCLLQL==##A&r(   c                     U(       d  [         R                  " S5      nU R                  R                  [         R                  " [
        R                  R                  U R                  U5      US9SS9  g )Nr   )map_locationT)strict)	r   rx   r|   load_state_dictloadospathjoinr   )r"   r   rx   s      r&   __load_check_point,SpeakerVerificationResNet.__load_check_point   sW    \\%(F,,JJT^^-BC#% 	 	- 	r(   )rx   rv   r|   r{   r^   rt   rz   r*   )r1   r2   r3   r4   __doc__r   strr   r   r/   r   r}   r5   r6   r7   s   @r&   rr   rr      s0    $S#X $&( r(   rr   )(r   mathr   typingr   r   r   numpyr   r   torch.nnr   torch.nn.functional
functionalr+   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svrT   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr	   modelscope.utils.devicer
   Moduler   r9   register_modulespeaker_verification	resnet_svrr    r(   r&   <module>r      s     	 # #      + + B B & 0 + 1   FCRYY CL 	F,<,<>5
 5>5r(   