
    9i/                        S r SSKrSSKrSSKJrJrJr  SSKrSSK	r	SSK
Jr  SSKJs  Jr  SSKJs  Jr  SSKJs  Js  Js  Jr  SSKJr  SSKJrJr  SSKJr  SSKJ r   SSK!J"r"   " S	 S
\RF                  5      r$SS jr%SS jr& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r*\RV                  " \ RX                  \RZ                  S9 " S S\5      5       r.g)a  Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
    N)AnyDictUnion)Models)MODELS
TorchModel)AFF)Tasks)create_devicec                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )ReLU   c                 0   > [         [        U ]  SSU5        g )Nr      )superr   __init__)selfinplace	__class__s     c/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/sv/ERes2Net.pyr   ReLU.__init__   s    dD"1b'2    c                 j    U R                   (       a  SOSnU R                  R                  S-   U-   S-   $ )Nr    z ())r   r   __name__)r   inplace_strs     r   __repr__ReLU.__repr__   s6    #'<<iR~~&&-  	 r    )F)r   
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r   r   r      s    3   r   r   c           	      4    [         R                  " U USUSSS9$ )z1x1 convolution without padding   r   Fkernel_sizestridepaddingbiasnnConv2d	in_planes
out_planesr+   s      r   conv1x1r4   $   %    99 r   c           	      4    [         R                  " U USUSSS9$ )z3x3 convolution with padding   r(   Fr)   r.   r1   s      r   conv3x3r8   /   r5   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )BasicBlockERes2Net:      c           
        > [         [        U ]  5         [        [        R
                  " X$S-  -  5      5      n[        XU-  U5      U l        [        R                  " Xe-  5      U l
        XPl        / n/ n[        U R                  5       HB  n	UR                  [        Xf5      5        UR                  [        R                  " U5      5        MD     [        R                  " U5      U l        [        R                  " U5      U l        [%        SS9U l        [        Xe-  X R(                  -  5      U l        [        R                  " X R(                  -  5      U l        [        R.                  " 5       U l        US:w  d  XR(                  U-  :w  a`  [        R.                  " [        R2                  " UU R(                  U-  SUSS9[        R                  " U R(                  U-  5      5      U l        X0l        X`l        XPl        g )N      P@Tr   r(   Fr*   r+   r-   )r   r:   r   intmathfloorr4   conv1r/   BatchNorm2dbn1numsrangeappendr8   
ModuleListconvsbnsr   relu	expansionconv3bn3
Sequentialshortcutr0   r+   widthscale)r   r2   planesr+   	baseWidthrT   rS   rK   rL   ir   s             r   r   BasicBlockERes2Net.__init__=   ss    $02DJJvT)9:;<Yv>
>>%-0	tyy!ALL./JJr~~e,- " ]]5)
==%&	U]F^^,CD
>>&>>"9:Q;)~~'>>MM		NNV+ !! 
 "$0G!HJDM 

r   c                 T   UnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X0R
                  S5      n[        U R                  5       Hp  nUS:X  a  XE   nOWXE   -   nU R                  U   " U5      nU R                  U R                  U   " U5      5      nUS:X  a  UnMX  [        R                  " X64S5      nMr     U R                  U5      nU R                  U5      nU R                  U5      nX2-  nU R                  U5      nU$ Nr(   r   )rD   rF   rM   torchsplitrS   rH   rG   rK   rL   catrO   rP   rR   r   xresidualoutspxrW   sps          r   forwardBasicBlockERes2Net.forward\   s    jjmhhsmiinkk#zz1-tyy!AAvV#&[Ar"B488A;r?+BAvii	1- " jjohhsm==#iin
r   )rF   rP   rL   rD   rO   rK   rG   rM   rT   rR   r+   rS   r(       r<   	r   r!   r"   r#   rN   r   rd   r$   r%   r&   s   @r   r:   r:   :   s    I> r   r:   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )BasicBlockERes2Net_AFFy   r<   c           
        > [         [        U ]  5         [        [        R
                  " X$S-  -  5      5      n[        XU-  U5      U l        [        R                  " Xe-  5      U l
        XPl        / n/ n/ n	[        U R                  5       HB  n
UR                  [        Xf5      5        U	R                  [        R                  " U5      5        MD     [        U R                  S-
  5       H  nUR                  [        US95        M     [        R                   " U5      U l        [        R                   " U	5      U l        [        R                   " U5      U l        [)        SS9U l        [        Xe-  X R,                  -  5      U l        [        R                  " X R,                  -  5      U l        [        R2                  " 5       U l        US:w  d  XR,                  U-  :w  a`  [        R2                  " [        R6                  " UU R,                  U-  SUSS9[        R                  " U R,                  U-  5      5      U l        X0l        X`l        XPl        g )Nr>   r(   channelsTr?   Fr@   )r   rj   r   rA   rB   rC   r4   rD   r/   rE   rF   rG   rH   rI   r8   r	   rJ   rK   rL   fuse_modelsr   rM   rN   rO   rP   rQ   rR   r0   r+   rS   rT   )r   r2   rU   r+   rV   rT   rS   rK   ro   rL   rW   jr   s               r   r   BasicBlockERes2Net_AFF.__init__|   s   $d46DJJvT)9:;<Yv>
>>%-0	tyy!ALL./JJr~~e,- " tyy1}%AsE23 & ]]5)
==%==5&	U]F^^,CD
>>&>>"9:Q;)~~'>>MM		NNV+ !! 
 "$0G!HJDM 

r   c                 |   UnU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X0R
                  S5      n[        U R                  5       H  nUS:X  a  XE   nOU R                  US-
     " WXE   5      nU R                  U   " U5      nU R                  U R                  U   " U5      5      nUS:X  a  UnMl  [        R                  " X64S5      nM     U R                  U5      nU R                  U5      nU R                  U5      nX2-  nU R                  U5      nU$ rZ   )rD   rF   rM   r[   r\   rS   rH   rG   ro   rK   rL   r]   rO   rP   rR   r^   s          r   rd   BasicBlockERes2Net_AFF.forward   s   jjmhhsmiinkk#zz1-tyy!AAvV%%a!e,R8Ar"B488A;r?+BAvii	1- " jjohhsm==#iin
r   )rF   rP   rL   rD   rO   rK   ro   rG   rM   rT   rR   r+   rS   rf   rh   r&   s   @r   rj   rj   y   s    I"H r   rj   c                   L   ^  \ rS rSr\\/ SQSSSSS4U 4S jjrS	 rS
 rSr	U =r
$ )ERes2Net   )r7         r7   rg   P      TSTPFc	           	        > [         [        U ]  5         X@l        XPl        X`l        [        US-  5      U-  S-  U l        Xl        [        R                  " SUSSSSS9U l        [        R                  " U5      U l        U R                  XUS   SS9U l        U R                  XS-  US   SS9U l        U R                  X$S	-  US   SS9U l        U R                  X$S-  US   SS9U l        [        R                  " US-  US	-  SSSSS9U l        [        R                  " US	-  US-  SSSSS
9U l        [        R                  " US-  US-  SSSSS
9U l        [-        US	-  S9U l        [-        US-  S9U l        [-        US-  S9U l        US:X  d  US:X  a  SOSU l        [7        [8        U5      " U R                  UR:                  -  S9U l        [        R>                  " U R                  UR:                  -  U R4                  -  U5      U l         U R                  (       a6  [        RB                  " USS9U l"        [        R>                  " Xf5      U l#        g [        RH                  " 5       U l"        [        RH                  " 5       U l#        g )N   r(   r7   Fr)   r   )r+   r<   rw   )r*   r,   r+   r-      rm   TAPTSDP)in_dim)affine)%r   ru   r   r2   feat_dim	embed_dimrA   	stats_dimtwo_emb_layerr/   r0   rD   rE   rF   _make_layerlayer1layer2layer3layer4layer1_downsamplelayer2_downsamplelayer3_downsampler	   fuse_mode12fuse_mode123fuse_mode1234n_statsgetattrpooling_layersrN   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)
r   block
block_fuse
num_blocks
m_channelsr   r   pooling_funcr   r   s
            r   r   ERes2Net.__init__   sl    	h&(# "X\*Z7!;*YYzqAEK
>>*-&&z!}Q ' 8&&>:a= ' <&&Q
1a ' A&&Q
1a ' A "$NN" "$NN" "$NO" 
Q7a8 *r/:(E1\V5KqQRNL9>>EOO35	YYt~~?$,,N(*
NN9UCDM98DJKKMDMDJr   c                     U/S/US-
  -  -   n/ nU H8  nUR                  U" U R                  X$5      5        X!R                  -  U l        M:     [        R                  " U6 $ )Nr(   )rI   r2   rN   r/   rQ   )r   r   rU   r   r+   strideslayerss          r   r   ERes2Net._make_layer  s^    (aSJN33FMM%?@#oo5DN  }}f%%r   c                    UR                  SSS5      nUR                  S5      n[        R                  " U R	                  U R                  U5      5      5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  XE5      nU R                  U5      nU R                  U5      nU R                  Xx5      n	U R                  U5      n
U R                  U	5      nU R                  X5      nU R!                  U5      nU R#                  U5      nU R$                  (       a:  [        R                  " U5      nU R'                  U5      nU R)                  U5      nU$ U$ )Nr   r<   r(   )permute
unsqueeze_FrM   rF   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r_   ra   out1out2out1_downsample
fuse_out12out3fuse_out12_downsamplefuse_out123out4fuse_out123_downsamplefuse_out1234statsembed_aembed_bs                   r   rd   ERes2Net.forward  s3   IIaALLOffTXXdjjm,-{{3 {{4 006%%d<
{{4  $ 6 6z B''D{{4 !%!7!7!D))$G		,'**U#&&/C--$CjjoGNNr   )rF   rD   r   r   r   r   r   r2   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r!   r"   r#   r:   rj   r   r   rd   r$   r%   r&   s   @r   ru   ru      s3     *2($$B'H& r   ru   )module_namec                   T   ^  \ rS rSrSrS\\\4   4U 4S jjrS r	S r
S	S jrSrU =r$ )
SpeakerVerificationERes2Neti+  aw  Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthen the local information
interaction. GFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
Args:
    model_dir: A model dir.
    model_config: The model config.
model_configc                   > [         TU ]  " X/UQ70 UD6  X l        U R                  S   U l        U R                  S   U l        X@l        SU l        [        U R
                  S   5      U l        [        U R                  U R                  S9U l
        US   nU R                  U5        U R                  R                  U R                  5        U R                  R                  5         g )Nr   rn   ry   device)r   r   pretrained_model)r   r   r   r   r   other_configfeature_dimr   r   ru   embedding_model._SpeakerVerificationERes2Net__load_check_pointtoeval)r   	model_dirr   argskwargspretrained_model_namer   s         r   r   $SpeakerVerificationERes2Net.__init__6  s    B4B6B(**;7++J7"#D$5$5h$?@'nn B !''9 : 56,!!#r   c                    [        U[        R                  5      (       a  [        R                  " U5      n[        UR                  5      S:X  a  UR                  S5      n[        UR                  5      S:X  d   S5       eU R                  U5      nU R                  UR                  U R                  5      5      nUR                  5       R                  5       $ )Nr(   r   r<   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayr[   
from_numpylenshape	unsqueeze-_SpeakerVerificationERes2Net__extract_featurer   r   r   detachcpu)r   audiofeature	embeddings       r   rd   #SpeakerVerificationERes2Net.forwardI  s    eRZZ(($$U+Eu{{q OOA&EKK
 	YX	Y  ((/((DKK)@A	!%%''r   c                     [         R                  " XR                  S9nX"R                  SSS9-
  nUR	                  S5      nU$ )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr   meanr   )r   r   r   s      r   __extract_feature-SpeakerVerificationERes2Net.__extract_featureW  s@    ++e2B2BCLLQL==##A&r   c                     U(       d  [         R                  " S5      nU R                  R                  [         R                  " [
        R                  R                  U R                  U5      US9SS9  g )Nr   )map_locationT)strict)	r[   r   r   load_state_dictloadospathjoinr   )r   r   r   s      r   __load_check_point.SpeakerVerificationERes2Net.__load_check_point]  sW    \\%(F,,JJT^^-BC#% 	 	- 	r   )r   r   r   r   r   r   r   )N)r   r!   r"   r#   __doc__r   strr   r   rd   r   r   r$   r%   r&   s   @r   r   r   +  s0    $S#X $&( r   r   )r(   )/r   rB   r   typingr   r   r   numpyr   r[   torch.nnr/   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svr   modelscope.metainfor   modelscope.modelsr   r   !modelscope.models.audio.sv.fusionr	   modelscope.utils.constantr
   modelscope.utils.devicer   Hardtanhr   r4   r8   Moduler:   rj   ru   register_modulespeaker_verificationeres2net_svr   r    r   r   <module>r     s   
  	 # #      + + B B & 0 1 + 1 2;;  < <~BRYY BJjryy jZ 	F,>,>@7* 7@7r   