
    9i6                        S SK r S SKrS SKJrJr  S SKrS SKJr  S SKJs  J	r
  S SKJr  S SKJrJr  S SKJrJr  S SKJrJr  S SKJr  S SKJr  S	r\R6                  " \R8                  \R:                  S
9 " S S\5      5       rS r " S S\R@                  5      r! " S S\RD                  5      r# " S S5      r$ " S S\R@                  5      r% " S S\R@                  5      r& " S S\R@                  5      r'g)    N)AnyDict)Models)MODELS
TorchModel)MossFormerModuleScaledSinuEmbedding)CumulativeLayerNormGlobalLayerNorm)Tensor)Tasks:0yE>)module_namec                   `   ^  \ rS rSrSrS\4U 4S jjrS\S\\\	4   4S jr
SS jrS	 rS
rU =r$ )
MossFormer   z]Library to support MossFormer speech separation.

Args:
    model_dir (str): the model path.
	model_dirc                 ,  > [         TU ]  " U/UQ70 UD6  [        US   US   S9U l        [	        US   SUS   US   US   S9U l        [        US   US   [        US	   US
   US   US   US   US   US   5      US   US   S9U l        US   U l	        g )Nkernel_sizeout_channels)r   r   in_channels   stridebias)r   r   r   r   r   
num_blocksd_modelattn_dropout
group_sizequery_key_dimexpansion_factorcausalnormnum_spks)r"   r#   )
super__init__EncoderencoderDecoderdecoderMossFormerMaskNetMossFormerMmask_netr#   )selfr   argskwargs	__class__s       m/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/audio/separation/mossformer.pyr%   MossFormer.__init__!   s    4T4V4}-/1 }-}-(#! *=!>"|,fY.?~.|0D/8J1Kx(* J') z*    inputsreturnc           	         U R                  U5      nU R                  U5      n[        R                  " U/U R                  -  5      nX#-  n[        R
                  " [        U R                  5       Vs/ s H%  nU R                  XE   5      R                  S5      PM'     snSS9nUR                  S5      nUR                  S5      nXx:  a  [        R                  " USSSXx-
  45      nU$ US S 2S U2S S 24   nU$ s  snf )Ndimr   r   )r'   r,   torchstackr#   catranger)   	unsqueezesizeFpad)	r-   r4   mix_west_masksep_hi
est_sourcet_origint_ests	            r1   forwardMossFormer.forward7   s    V$=='UGdmm34 YY t}}--A UX&004- 

 ;;q>"zAq!X5E+FGJ  $Ayy!O4Js   2,C:c                 &   U(       d  U R                   nU(       d  [        R                  " S5      nU R                  R	                  [        R
                  " [        R                  R                  US5      US9SS9  U R                  R	                  [        R
                  " [        R                  R                  US5      US9SS9  U R                  R	                  [        R
                  " [        R                  R                  US5      US9SS9  g )Ncpuzencoder.bin)map_locationT)strictzdecoder.binzmasknet.bin)r   r:   devicer'   load_state_dictloadospathjoinr)   r,   )r-   	load_pathrO   s      r1   load_check_pointMossFormer.load_check_pointN   s    I\\%(F$$JJY6VM 	% 	 	$$JJY6VM 	% 	 	%%JJY6VM 	& 	r3   c                 T    [        U R                  U R                  U R                  S9$ )N)r'   r)   masknet)dictr'   r)   r,   )r-   s    r1   as_dictMossFormer.as_dict`   s$    LL$,,O 	Or3   )r)   r'   r,   r#   )NN)__name__
__module____qualname____firstlineno____doc__strr%   r   r   r   rI   rV   r[   __static_attributes____classcell__r0   s   @r1   r   r      s@    +# +,f c3h .$O Or3   r   c                     U S:X  a
  [        XSS9$ U S:X  a
  [        USS9$ U S:X  a  [        R                  " SUSS9$ [        R                  " U5      $ )	z5Just a wrapper to select the normalization type.
    glnT)elementwise_affineclnlnr   r   eps)r   r
   nn	GroupNormBatchNorm1d)r"   r9   shapes      r1   select_normrq   e   sU     u}sdCCu}"34@@t|||As--~~c""r3   c                   h   ^  \ rS rSrSr   S
S\S\S\4U 4S jjjrS\R                  4S jr	S	r
U =r$ )r&   s   a:  Convolutional Encoder Layer.

Args:
    kernel_size: Length of filters.
    in_channels: Number of  input channels.
    out_channels: Number of output channels.

Examples:

>>> x = torch.randn(2, 1000)
>>> encoder = Encoder(kernel_size=4, out_channels=64)
>>> h = encoder(x)
>>> h.shape # torch.Size([2, 64, 499])
r   r   r   c           	      x   > [         [        U ]  5         [        R                  " UUUUS-  SSS9U l        X0l        g )N   r   F)r   r   r   r   groupsr   )r$   r&   r%   rm   Conv1dconv1dr   )r-   r   r   r   r0   s       r1   r%   Encoder.__init__   sA     	gt%'ii#%#!#
 'r3   xc                     U R                   S:X  a  [        R                  " USS9nU R                  U5      n[        R
                  " U5      nU$ )a1  Return the encoded output.

Args:
    x: Input tensor with dimensionality [B, L].

Returns:
    Encoded tensor with dimensionality [B, N, T_out].
    where B = Batchsize
          L = Number of timepoints
          N = Number of filters
          T_out = Number of timepoints at the output of the encoder
r   r8   )r   r:   r>   rx   r@   relur-   rz   s     r1   rI   Encoder.forward   sA     q q)AKKNFF1Ir3   )rx   r   )ru   @   r   )r]   r^   r_   r`   ra   intr%   r:   r   rI   rc   rd   re   s   @r1   r&   r&   s   sL      %&%'$%'!'"' "' '  r3   r&   c                   8   ^  \ rS rSrSrU 4S jrU 4S jrSrU =r$ )r(      af  A decoder layer that consists of ConvTranspose1d.

Args:
    kernel_size: Length of filters.
    in_channels: Number of  input channels.
    out_channels: Number of output channels.

Example
---------
>>> x = torch.randn(2, 100, 1000)
>>> decoder = Decoder(kernel_size=4, in_channels=100, out_channels=1)
>>> h = decoder(x)
>>> h.shape
torch.Size([2, 1003])
c                 .   > [         [        U ]
  " U0 UD6  g N)r$   r(   r%   )r-   r.   r/   r0   s      r1   r%   Decoder.__init__   s    gt%t6v6r3   c                   > UR                  5       S;  a$  [        SR                  U R                  5      5      e[        TU ]  UR                  5       S:X  a  UO[        R                  " US5      5      n[        R                  " U5      R                  5       S:X  a  [        R                  " USS9nU$ [        R                  " U5      nU$ )zReturn the decoded output.

Args:
    x: Input tensor with dimensionality [B, N, L].
    where, B = Batchsize,
           N = number of filters
           L = time points
)ru      z{} accept 3/4D tensor as inputr   r   r8   )	r9   RuntimeErrorformatr]   r$   rI   r:   r>   squeeze)r-   rz   r0   s     r1   rI   Decoder.forward   s     557& ?FF     GOAA5??1a3HI==!Q&aQ'A  a Ar3    )	r]   r^   r_   r`   ra   r%   rI   rc   rd   re   s   @r1   r(   r(      s     7 r3   r(   c                   $    \ rS rSrSrS rS rSrg)IdentityBlock   zThis block is used when we want to have identity transformation within the Dual_path block.

Example
-------
>>> x = torch.randn(10, 100)
>>> IB = IdentityBlock()
>>> xhat = IB(x)
c                     g r   r   )r-   r/   s     r1   _init__IdentityBlock._init__   s    r3   c                     U$ r   r   r}   s     r1   __call__IdentityBlock.__call__   s    r3   r   N)r]   r^   r_   r`   ra   r   r   rc   r   r3   r1   r   r      s    r3   r   c                   ^   ^  \ rS rSrSr      SU 4S jjrS\R                  4S jrSr	U =r
$ )r+      a  This class implements the transformer encoder.

Args:
num_blocks : int
    Number of mossformer blocks to include.
d_model : int
    The dimension of the input embedding.
attn_dropout : float
    Dropout for the self-attention (Optional).
group_size: int
    the chunk size
query_key_dim: int
    the attention vector dimension
expansion_factor: int
    the expansion factor for the linear projection in conv module
causal: bool
    true for causal / false for non causal

Example
-------
>>> import torch
>>> x = torch.rand((8, 60, 512)) #B, S, N
>>> net = MossFormerM(num_blocks=8, d_model=512)
>>> output, _ = net(x)
>>> output.shape
torch.Size([8, 60, 512])
c           
         > [         T	U ]  5         [        UUUUUUUS9U l        SS KnUR
                  R                  R                  USS9U l        g )N)r9   depthr   r   r    r!   r   r   gư>rk   )	r$   r%   r   mossformerMspeechbrainnnetnormalization	LayerNormr"   )
r-   r   r   r   r   r   r    r!   sbr0   s
            r1   r%   MossFormerM.__init__  sX     	+!'-%' 	!GG))33G3F	r3   srcc                 J    U R                  U5      nU R                  U5      nU$ )z
Args:
    src: Tensor shape [B, S, N],
    where, B = Batchsize,
           S = time points
           N = number of filters
    The sequence to the encoder layer (required).
r   r"   )r-   r   outputs      r1   rI   MossFormerM.forward  s'     !!#&6"r3   r   )Ng?      g      @Fr]   r^   r_   r`   ra   r%   r:   r   rI   rc   rd   re   s   @r1   r+   r+      s7    < !""$G*5<<  r3   r+   c                   V   ^  \ rS rSrSr  SU 4S jjrS\R                  4S jrSr	U =r
$ )ComputeAttentioni%  a
  Computation block for dual-path processing.

Args:
att_mdl : torch.nn.module
    Model to process within the chunks.
 out_channels : int
    Dimensionality of attention model.
 norm : str
    Normalization type.
 skip_connection : bool
    Skip connection around the attention module.

Example
---------
    >>> att_block = MossFormerM(num_blocks=8, d_model=512)
    >>> comp_att = ComputeAttention(att_block, 512)
    >>> x = torch.randn(10, 64, 512)
    >>> x = comp_att(x)
    >>> x.shape
    torch.Size([10, 64, 512])
c                 x   > [         [        U ]  5         Xl        X@l        X0l        Ub  [        X2S5      U l        g g )Nr   )r$   r   r%   att_mdlskip_connectionr"   rq   att_norm)r-   r   r   r"   r   r0   s        r1   r%   ComputeAttention.__init__<  s>     	.0. 	'A>DM r3   rz   c                    UR                  SSS5      R                  5       nU R                  U5      nUR                  SSS5      R                  5       nU R                  b  U R	                  U5      nU R
                  (       a  X!-   nUnU$ )zReturns the output tensor.

Args:
    x: Input tensor of dimension [B, S, N].

Returns:
    out: Output tensor of dimension [B, S, N].
    where, B = Batchsize,
       N = number of filters
       S = time points
r   ru   r   )permute
contiguousr   r"   r   r   )r-   rz   att_outouts       r1   rI   ComputeAttention.forwardM  s~     ))Aq!$//1,,w' //!Q*55799 mmG,G kG
r3   )r   r   r"   r   )rj   Tr   re   s   @r1   r   r   %  s*    4 ?"  r3   r   c                   Z   ^  \ rS rSrSr    SU 4S jjrS\R                  4S jrSr	U =r
$ )r*   ik  a   The dual path model which is the basis for dualpathrnn, sepformer, dptnet.

Args:
in_channels : int
    Number of channels at the output of the encoder.
out_channels : int
    Number of channels that would be inputted to the intra and inter blocks.
att_model : torch.nn.module
    Attention model to process the input sequence.
norm : str
    Normalization type.
num_spks : int
    Number of sources (speakers).
skip_connection : bool
    Skip connection around attention module.
use_global_pos_enc : bool
    Global positional encodings.

Example
---------
>>> mossformer_block = MossFormerM(num_blocks=8, d_model=512)
>>> mossformer_masknet = MossFormerMaskNet(64, 64, att_model, num_spks=2)
>>> x = torch.randn(10, 64, 2000)
>>> x = mossformer_masknet(x)
>>> x.shape
torch.Size([2, 10, 64, 2000])
c           	        > [         [        U ]  5         XPl        [	        XAS5      U l        [        R                  " XSSS9U l        Xpl	        U R                  (       a  [        U5      U l        [        R                  " [        UUUUS95      U l        [        R                  " X"U-  SS9U l        [        R                  " X!SSS9U l        [        R$                  " 5       U l        [        R(                  " 5       U l        [        R,                  " [        R                  " X"S5      [        R.                  " 5       5      U l        [        R,                  " [        R                  " X"S5      [        R2                  " 5       5      U l        g )Nr   r   F)r   )r   )r   )r$   r*   r%   r#   rq   r"   rm   rw   conv1d_encoderuse_global_pos_encr	   pos_enccopydeepcopyr   mdl
conv1d_outconv1_decoderPReLUpreluReLU
activation
SequentialTanhr   Sigmoidoutput_gate)	r-   r   r   	att_modelr"   r#   r   r   r0   s	           r1   r%   MossFormerMaskNet.__init__  s    	/1 15	 iiqu6"4"".|<DL== /	 ))1qBYYqu6XXZ
'')mmIIl!4bggiA==IIl!4bjjlDr3   rz   c                    U R                  U5      nU R                  U5      nU R                  (       a;  UnUR                  SS5      nU R	                  U5      nUR                  SS5      nX#-   nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pEnUR                  X@R                  -  SU5      nU R                  U5      U R                  U5      -  nU R                  U5      nUR                  u  pWnUR                  X@R                  Xx5      nU R                  U5      nUR                  SS5      nU$ )a	  Returns the output tensor.

Args:
    x: Input tensor of dimension [B, N, S].

Returns:
    out: Output tensor of dimension [spks, B, N, S]
    where, spks = Number of speakers
       B = Batchsize,
       N = number of filters
       S = the number of time frames
r   r7   r   )r"   r   r   	transposer   r   r   r   rp   viewr#   r   r   r   r   )	r-   rz   baseembb_snLs	            r1   rI   MossFormerMaskNet.forward  s"     IIaL"""DAr"A,,q/C--2&C
AHHQKJJqMOOA''aFF1}}$b!,KKNT--a00q!''aFF1mmQ*OOAKK1r3   )r   r   r   r   r   r"   r#   r   r   r   r   r   )rj   ru   TTr   re   s   @r1   r*   r*   k  s3    B &DP+ + +r3   r*   )(r   rR   typingr   r   r:   torch.nnrm   torch.nn.functional
functionalr@   modelscope.metainfor   modelscope.modelsr   r   3modelscope.models.audio.separation.mossformer_blockr   r	   9modelscope.models.audio.separation.mossformer_conv_moduler
   r   modelscope.models.baser   modelscope.utils.constantr   EPSregister_modulespeech_separation(speech_mossformer_separation_temporal_8kr   rq   Moduler&   ConvTranspose1dr(   r   r+   r   r*   r   r3   r1   <module>r      s     	      & 0+* ) +
 	??AHO HOAHOV#3bii 3l'b   'T ">")) >BCryy CLp		 pr3   