
    9i.                        S r SSKrSSKrSSKJrJrJr  SSKrSSK	r	SSK
Jr  SSKJs  Jr  SSKJs  Jr  SSKJs  Js  Js  Jr  SSKJr  SSKJrJr  SSKJr  SSKJ r   SSK!J"r"   " S	 S
\RF                  5      r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r(\RR                  " \ RT                  \RV                  S9 " S S\5      5       r,g)a  
To further improve the short-duration feature extraction capability of ERes2Net,
we expand the channel dimension within each stage. However, this modification also
increases the number of model parameters and computational complexity.
To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures,
ultimately reducing both the model parameters and its computational cost.
    N)AnyDictUnion)Models)MODELS
TorchModel)AFF)Tasks)create_devicec                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )ReLU   c                 0   > [         [        U ]  SSU5        g )Nr      )superr   __init__)selfinplace	__class__s     e/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/ERes2NetV2.pyr   ReLU.__init__   s    dD"1b'2    c                 j    U R                   (       a  SOSnU R                  R                  S-   U-   S-   $ )Nr    z ())r   r   __name__)r   inplace_strs     r   __repr__ReLU.__repr__!   s6    #'<<iR~~&&-  	 r    )F)r   
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r   r   r      s    3   r   r   c                   :   ^  \ rS rSr    SU 4S jjrS rSrU =r$ )BasicBlockERes2NetV2'   c                 4  > [         [        U ]  5         [        [        R
                  " X$S-  -  5      5      nXpl        [        R                  " XU-  SUSS9U l	        [        R                  " Xu-  5      U l        XPl        X`l        / n/ n	[        U R                  5       HN  n
UR                  [        R                  " XwSSSS95        U	R                  [        R                  " U5      5        MP     [        R                   " U5      U l        [        R                   " U	5      U l        ['        SS9U l        [        R                  " Xu-  X R                  -  SSS	9U l        [        R                  " X R                  -  5      U l        [        R.                  " 5       U l        US:w  d  XR                  U-  :w  aa  [        R.                  " [        R                  " UU R                  U-  SUSS9[        R                  " U R                  U-  5      5      U l        g g )
N      P@   Fkernel_sizestridebias   r.   paddingr0   Tr   r.   r0   )r   r(   r   intmathfloorwidthnnConv2dconv1BatchNorm2dbn1nums	expansionrangeappend
ModuleListconvsbnsr   reluconv3bn3
Sequentialshortcut)r   	in_planesplanesr/   	baseWidthscaler@   r9   rD   rE   ir   s              r   r   BasicBlockERes2NetV2.__init__)   s    	"D24DJJvT)9:;<
YYu}!FP
>>%-0	"tyy!ALL		%AquMOJJr~~e,- " ]]5)
==%&	YYM6NN2O
>>&>>"9:Q;)~~'>>MM		NNV+ !! 
 "$0G!HJDM ?r   c                 T   UnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X0R
                  S5      n[        U R                  5       Hp  nUS:X  a  XE   nOWXE   -   nU R                  U   " U5      nU R                  U R                  U   " U5      5      nUS:X  a  UnMX  [        R                  " X64S5      nMr     U R                  U5      nU R                  U5      nU R                  U5      nX2-  nU R                  U5      nU$ Nr,   r   )r<   r>   rF   torchsplitr9   rA   r?   rD   rE   catrG   rH   rJ   r   xresidualoutspxrO   sps          r   forwardBasicBlockERes2NetV2.forwardP   s    jjmhhsmiinkk#zz1-tyy!AAvV#&[Ar"B488A;r?+BAvii	1- " jjohhsm==#iin
r   )r>   rH   rE   r<   rG   rD   r@   r?   rF   rJ   r9   r,         r`   r   r!   r"   r#   r   r\   r$   r%   r&   s   @r   r(   r(   '   s"    
 %JN r   r(   c                   :   ^  \ rS rSr    SU 4S jjrS rSrU =r$ )BasicBlockERes2NetV2AFFm   c                   > [         [        U ]  5         [        [        R
                  " X$S-  -  5      5      nXpl        [        R                  " XU-  SUSS9U l	        [        R                  " Xu-  5      U l        XPl        X`l        / n/ n	/ n
[        U R                  5       HN  nUR                  [        R                  " XwSSSS95        U
R                  [        R                  " U5      5        MP     [        U R                  S-
  5       H  nU	R                  [!        USS95        M     [        R"                  " U5      U l        [        R"                  " U
5      U l        [        R"                  " U	5      U l        [+        S	S
9U l        [        R                  " Xu-  X R                  -  SSS9U l        [        R                  " X R                  -  5      U l        [        R2                  " 5       U l        US:w  d  XR                  U-  :w  aa  [        R2                  " [        R                  " UU R                  U-  SUSS9[        R                  " U R                  U-  5      5      U l        g g )Nr+   r,   Fr-   r1   r2      channelsrTr4   r5   )r   rc   r   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   r	   rC   rD   rE   fuse_modelsr   rF   rG   rH   rI   rJ   )r   rK   rL   r/   rM   rN   r@   r9   rD   rj   rE   rO   jr   s                r   r    BasicBlockERes2NetV2AFF.__init__o   s    	%t57DJJvT)9:;<
YYu}!FP
>>%-0	"tyy!ALL		%AquMOJJr~~e,- " tyy1}%AsEQ78 & ]]5)
==%==5&	YYM6NN2O
>>&>>"9:Q;)~~'>>MM		NNV+ !! 
 "$0G!HJDM ?r   c                 |   UnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X0R
                  S5      n[        U R                  5       H  nUS:X  a  XE   nOU R                  US-
     " WXE   5      nU R                  U   " U5      nU R                  U R                  U   " U5      5      nUS:X  a  UnMl  [        R                  " X64S5      nM     U R                  U5      nU R                  U5      nU R                  U5      nX2-  nU R                  U5      nU$ rR   )r<   r>   rF   rS   rT   r9   rA   r?   rj   rD   rE   rU   rG   rH   rJ   rV   s          r   r\   BasicBlockERes2NetV2AFF.forward   s   jjmhhsmiinkk#zz1-tyy!AAvV%%a!e,R8Ar"B488A;r?+BAvii	1- " jjohhsm==#iin
r   )r>   rH   rE   r<   rG   rD   r@   rj   r?   rF   rJ   r9   r^   ra   r&   s   @r   rc   rc   m   s"    
 *JX r   rc   c                   R   ^  \ rS rSr\\/ SQSSSSSSSS	4U 4S
 jjrS rS rSr	U =r
$ )
ERes2NetV2   )r1   rf      r1   @   P      r_   r`   TSTPFc           	        > [         [        U ]  5         X@l        XPl        X`l        [        US-  5      U-  S-  U l        Xl        Xpl	        Xl
        Xl        [        R                  " SUSSSSS9U l        [        R                  " U5      U l        U R#                  XUS   SS9U l        U R#                  XS-  US   SS9U l        U R#                  X$S	-  US   SS9U l        U R#                  X$S-  US   SS9U l        [        R                  " US	-  U R                  -  US-  U R                  -  SSSSS
9U l        [/        US-  U R                  -  S	S9U l        U
S:X  d  U
S:X  a  SOSU l        [5        [6        U
5      " U R                  U R                  -  S9U l        [        R:                  " U R                  U R                  -  U R2                  -  U5      U l        U R                  (       a6  [        R>                  " USS9U l         [        R:                  " Xf5      U l!        g [        RD                  " 5       U l         [        RD                  " 5       U l!        g )N   r,   r1   F)r.   r/   r3   r0   r   )r/   r`   rf   )r.   r3   r/   r0   rg   TAPTSDP)in_dim)affine)#r   rp   r   rK   feat_dim	embed_dimr6   	stats_dimtwo_emb_layerrM   rN   r@   r:   r;   r<   r=   r>   _make_layerlayer1layer2layer3layer4	layer3_dsr	   fuse34n_statsgetattrpooling_layerspoolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)r   block
block_fuse
num_blocks
m_channelsr}   r~   rM   rN   r@   pooling_funcr   r   s               r   r   ERes2NetV2.__init__   s    	j$(*# "X\*Z7!;*"
"YYzqAEK
>>*-&&z!}Q ' 8&&>:a= ' <&&Q
1a ' A&&Q
1a ' A NT^^+NT^^+ :>DNN#BaH(E1\V5KqQRNL9>>DNN24	YYt~~>M(*
NN9UCDM98DJKKMDMDJr   c                    U/S/US-
  -  -   n/ nU HX  nUR                  U" U R                  UUU R                  U R                  U R                  S95        X R                  -  U l        MZ     [
        R                  " U6 $ )Nr,   )rM   rN   r@   )rB   rK   rM   rN   r@   r:   rI   )r   r   rL   r   r/   strideslayerss          r   r   ERes2NetV2._make_layer   s    (aSJN33FMMNN"nn**"nn./ $nn4DN  }}f%%r   c                 \   UR                  SSS5      nUR                  S5      n[        R                  " U R	                  U R                  U5      5      5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  Xg5      nU R                  U5      n	U R                  U	5      n
U R                  (       a:  [        R                  " U
5      nU R                  U5      nU R!                  U5      nU$ U
$ )Nr   r`   r,   )permute
unsqueeze_FrF   r>   r<   r   r   r   r   r   r   r   r   r   r   r   )r   rW   rY   out1out2out3out4out3_ds
fuse_out34statsembed_aembed_bs               r   r\   ERes2NetV2.forward  s    IIaALLOffTXXdjjm,-{{3{{4 {{4 {{4 ..&[[/
		*%**U#&&/C--$CjjoGNNr   )rM   r>   r<   r~   r@   r}   r   rK   r   r   r   r   r   r   r   rN   r   r   r   r   r   )r   r!   r"   r#   r(   rc   r   r   r\   r$   r%   r&   s   @r   rp   rp      s;     ,3($$8't& r   rp   )module_namec                   T   ^  \ rS rSrSrS\\\4   4U 4S jjrS r	S r
S	S jrSrU =r$ )
SpeakerVerificationERes2NetV2i  a  ERes2NetV2 architecture with local and global feature fusion. ERes2NetV2 is mainly composed
of Bottom-up Dual-stage Feature Fusion (BDFF) and Bottleneck-like Local Feature Fusion (BLFF).
BDFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
The BLFF extracts localization-preserved speaker features and strengthen the local information interaction.
Args:
    model_dir: A model dir.
    model_config: The model config.
model_configc                 J  > [         TU ]  " X/UQ70 UD6  X l        U R                  S   U l        U R                  S   U l        U R                  S   U l        U R                  S   U l        X@l        SU l        [        U R                  S   5      U l
        [        U R                  U R                  U R
                  U R                  S9U l        US   nU R                  U5        U R                  R                  U R                  5        U R                  R                  5         g )	Nr~   rM   rN   r@   rt   device)r~   rM   rN   r@   pretrained_model)r   r   r   r~   rM   rN   r@   other_configfeature_dimr   r   rp   embedding_model0_SpeakerVerificationERes2NetV2__load_check_pointtoeval)r   	model_dirr   argskwargspretrained_model_namer   s         r   r   &SpeakerVerificationERes2NetV2.__init__&  s    B4B6B(**;7**;7&&w/
**;7"#D$5$5h$?@)nnnn**nn	 & !''9 : 56,!!#r   c                    [        U[        R                  5      (       a  [        R                  " U5      n[        UR                  5      S:X  a  UR                  S5      n[        UR                  5      S:X  d   S5       eU R                  U5      nU R                  UR                  U R                  5      5      nUR                  5       R                  5       $ )Nr,   r   r`   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayrS   
from_numpylenshape	unsqueeze/_SpeakerVerificationERes2NetV2__extract_featurer   r   r   detachcpu)r   audiofeature	embeddings       r   r\   %SpeakerVerificationERes2NetV2.forward>  s    eRZZ(($$U+Eu{{q OOA&EKK
 	YX	Y  ((/((DKK)@A	!%%''r   c                     [         R                  " XR                  S9nX"R                  SSS9-
  nUR	                  S5      nU$ )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr   meanr   )r   r   r   s      r   __extract_feature/SpeakerVerificationERes2NetV2.__extract_featureL  s@    ++e2B2BCLLQL==##A&r   c                     U(       d  [         R                  " S5      nU R                  R                  [         R                  " [
        R                  R                  U R                  U5      US9SS9  g )Nr   )map_locationT)strict)	rS   r   r   load_state_dictloadospathjoinr   )r   r   r   s      r   __load_check_point0SpeakerVerificationERes2NetV2.__load_check_pointR  sW    \\%(F,,JJT^^-BC#% 	 	- 	r   )	rM   r   r~   r   r@   r   r   r   rN   )N)r   r!   r"   r#   __doc__r   strr   r   r\   r   r   r$   r%   r&   s   @r   r   r     s0    $S#X $0( r   r   )-r   r7   r   typingr   r   r   numpyr   rS   torch.nnr:   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svr   modelscope.metainfor   modelscope.modelsr   r   !modelscope.models.audio.sv.fusionr	   modelscope.utils.constantr
   modelscope.utils.devicer   Hardtanhr   Moduler(   rc   rp   register_modulespeaker_verificationeres2netv2_svr   r    r   r   <module>r      s     	 # #      + + B B & 0 1 + 1 2;;  C299 CLIbii IX^ ^B 	F,@,@B=J =B=r   