
    9i2                     *   S SK r S SKrS SKJrJrJrJr  S SKrS SK	r
S SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S S	KJr  S S
KJr  \" 5       rS/r\R@                  " \RB                  \RD                  S9 " S S\5      5       r#g)    N)AnyDictListUnion)File)	Pipelines)
OutputKeys)pipeline)
InputModelPipeline)	PIPELINES)Tasks)
get_loggerSegmentationClusteringPipeline)module_namec            
         ^  \ rS rSrSrS\4U 4S jjrS\\\	R                  \4   S\\\4   4S jrS\S\	R                  4S	 jrS
\	R                  S\	R                  4S jrS\S\S\	R                  S
\	R                  S\4
S jrS\\\	R                  \4   S\4S jrS\4S jrS\S\4S jrS\S\S\\	R                  \4   S\	R                  4S jrS rS rSS jrSrU =r$ )r      a(  Segmentation and Clustering Pipeline
use `model` to create a Segmentation and Clustering Pipeline.

Args:
    model (SegmentationClusteringPipeline): A model instance, or a model local dir, or a model id in the model hub.
    kwargs (dict, `optional`):
        Extra kwargs passed into the pipeline's constructor.
Example:
>>> from modelscope.pipelines import pipeline
>>> from modelscope.utils.constant import Tasks
>>> p = pipeline(
>>>    task=Tasks.speaker_diarization, model='damo/speech_campplus_speaker-diarization_common')
>>> print(p(audio))

modelc                    > [         TU ]  " S
SU0UD6  U R                  R                  U l        SSS.nU R                  R                  U5        U R                  S   U l        [        SU R                  S   S9U l        g	)zuuse `model` to create a speaker diarization pipeline for prediction
Args:
    model (str): a valid official model id
r         ?g      ?)seg_dur	seg_shiftsample_ratezspeaker-verificationspeaker_modeltaskr   N )	super__init__r   other_configconfigupdatefsr
   sv_pipeline)selfr   kwargsr!   	__class__s       {/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/pipelines/audio/segmentation_clustering_pipeline.pyr   'SegmentationClusteringPipeline.__init__-   sx    
 	/u//jj--
 	6"++m,#'t{{?/KM    audioreturnc                    U R                   R                  U5        [        R                  S5        U R	                  U5      nU R                  U5        [        R                  S5        U R                  U5      n[        R                  S5        U R                  U5      n[        R                  S5        U R                  U5      n[        R                  S5        U R                  XCXe5      n[        R                  U0$ )a?  extract the speaker embeddings of input audio and do cluster
Args:
    audio (str, np.ndarray, list): If it is represented as a str or a np.ndarray, it
    should be a complete speech signal and requires VAD preprocessing. If the audio
    is represented as a list, it should contain only the effective speech segments
    obtained through VAD preprocessing. The list should be formatted as [[0(s),3.2,
    np.ndarray], [5.3,9.1, np.ndarray], ...]. Each element is a sublist that contains
    the start time, end time, and the numpy array of the speech segment respectively.
zDoing VAD...zDoing segmentation...zExtracting embeddings...zClustering...zPost processing...)r!   r"   loggerinfo
preprocesscheck_audio_listchunkforward
clusteringpostprocessr	   TEXT)r%   r+   paramsvad_segmentssegments
embeddingslabelsoutputs           r(   __call__'SegmentationClusteringPipeline.__call__=   s     	6"N#u-l++,::l+./\\(+
O$,()!!(&M((r*   inputc                     / nU H@  nU R                  US   /SS9nUS   R                  S:X  d  M,  UR                  US   5        MB     [        R                  " U5      nU$ )N   T)
output_embembs)      )r$   shapeappendnpconcatenate)r%   r?   r:   s	save_dicts        r(   r3   &SegmentationClusteringPipeline.forward\   sh    
A((!A$D(AI &&(2!!)F"34  ^^J/
r*   r:   c                 @    U R                   " U40 U R                  D6nU$ )N)r   r!   )r%   r:   r;   s      r(   r4   )SegmentationClusteringPipeline.clusteringe   s    J6$++6r*   r9   r8   r;   c                 ,   [        U5      [        U5      :X  d   eU R                  U5      n/ n[        [        U5      5       H#  nUR                  X   S   X   S   X6   /5        M%     U R	                  U5      n/ n[        UR                  5       S-   5       H*  nXCU:H     R                  S5      nUR                  U5        M,     [        R                  " U5      nS n	[        S[        U5      5       GH
  nU	" XVS-
     S   XV   S   5      (       d  M!  XV   S   XVS-
     S   -   S-  n
SU R                  ;   a  [        U S5      (       d*  [        [        R                  U R                  S   S9U l        [        U
S-
  XVS-
     S   5      n[        U
S-   XV   S   5      nX-
  S:  aE  U R!                  XU5      nXVS-
     S   nXV   S   nU R                  XU   X   /S	S
9u  nnUb  UU-   n
XU   S'   XUS-
     S'   GM     U R#                  U5      nU$ )Nr   rD   c                     XS-   :  a  gg)Ng-C6?TFr   )t1t2s     r(   is_overlappedASegmentationClusteringPipeline.postprocess.<locals>.is_overlappedz   s    I~r*   rA   change_locatorchange_locator_pipeliner   r   T)
output_res)lencorrect_labelsrangerG   merge_sequemaxmeanrH   stackr!   hasattrr
   r   speaker_diarizationrV   min	cut_audiosmooth)r%   r9   r8   r;   r:   distribute_resispk_embsspk_embrS   pshort_utt_stshort_utt_ed
audio_dataspk1spk2_cts                     r(   r5   *SegmentationClusteringPipeline.postprocessi   sJ   8}F+++$$V,s8}%A!!8;q>8;q>69"MN & )).9 vzz|a'(A 1-2215GOOG$ ) 88H%	 q#n-.A^E215~7H7KLL#&q)Nq5,A!,DDI#t{{2"4)BCC7?!&!:!:"&++.>"?8A4 $'q3w1u0Ea0H#IL#&q3w0A!0D#EL#2Q6%)^^L4@&B
-!e4Q7-03 $ < <&$(H'+ != !-2 > ,r 1A'(q!!$+,q1u%a(+ /0 ^4r*   c                    [        U[        5      (       a  UR                  S S9  U$ [        U[        5      (       Ga  [        R
                  " U5      n[        R
                  " [        R                  " U5      SS9u  p[        UR                  5      S:X  a	  US S 2S4   nX0R                  :w  a  [        R                  SU R                   S35        [        R                  R!                  ["        R$                  " U5      R'                  S5      US	[        U R                  5      //S
9u  pUR)                  S5      R+                  5       n[        UR                  5      S:X  d   S5       eUR,                  S;   a  US-  R/                  S5      nOUR/                  S5      n[1        U S5      (       d+  [3        [4        R6                  U R8                  S   SS9U l        U R;                  XR                  SS9S   S   n/ n[        U[        5      (       a  [<        R>                  " U5      nO/[        U[        5      (       a  UnO[A        S[C        U5      -  5      eU He  n[E        US   5      S-  n[E        US   5      S-  n	URG                  XU[E        XR                  -  5      [E        XR                  -  5       /5        Mg     U$ )Nc                     U S   $ )Nr   r   )xs    r(   <lambda>;SegmentationClusteringPipeline.preprocess.<locals>.<lambda>   s    QqTr*   )keyfloat32)dtyperA   r   z+[WARNING]: The sample rate of audio is not z, resample it.rate)effectsrD   %modelscope error: Wrong audio format.)int16int32int64i   vad_pipeline	vad_modelzv2.0.2)r   r   model_revisionT)r#   is_finalvaluezIncorrect vad result. Get %si  )$
isinstancelistsortstrr   readsfioBytesIOrX   rF   r#   r.   r/   
torchaudiosox_effectsapply_effects_tensortorch
from_numpy	unsqueezesqueezenumpyrx   astyper_   r
   r   voice_activity_detectionr!   r   astliteral_eval
ValueErrortypeintrG   )
r%   r+   
file_bytesr#   vad_timer8   vad_time_listtsteds
             r(   r0   )SegmentationClusteringPipeline.preprocess   se   eT""JJ>J*Ls##5)J

: 6iHIE5;;1$adWW}A$''.Y '22GG$$U+55a8$c$''l34 H 6	 a(..05;;1$M&MM$;;55g&..y9ELL+Et^,, (33kk+.'!)D $$gg % ../118:h$$,,X6M$''$M;tH~NOOAQqTT!BQqTT!Bs2<0R''\1BCDF  r*   c                    Sn[        [        U5      5       H  nX   nUS   US   :  d   S5       e[        US   [        R                  5      (       d   S5       e[        US   U R                  -  5      [        US   U R                  -  5      -
  US   R                  S   :X  d   S5       eUS:  a  US   UUS-
     S   :  d   S5       eX$S   US   -
  -  nM     US:  d   S5       eg )	Nr   rD   z$modelscope error: Wrong time stamps.rA   z"modelscope error: Wrong data type.zFmodelscope error: audio data in list is inconsistent with time length.   z<modelscope error: The effective audio duration is too short.)rZ   rX   r   rH   ndarrayr   r#   rF   )r%   r+   	audio_durre   segs        r(   r1   /SegmentationClusteringPipeline.check_audio_list   s2   	s5z"A(Cq6SV#K%KK#c!f jj* * P+OP *s1v'(3A , Q ] ]]  1u1vE""  FEF Q#a&(I # 1}\\\}r*   c                 n   ^  U 4S jn/ n[        U5       H  u  pEUR                  U" U5      5        M     U$ )Nc                 H  > U S   nU S   n[        T
R                  S   T
R                  -  5      n[        T
R                  S   T
R                  -  5      nSn/ n[        SUR                  S   U5       H  n[        Xs-   UR                  S   5      nX::  a    U$ Un[        SX-
  5      nX'U n	U	R                  S   U:  a)  [        R                  " U	SX9R                  S   -
  4S5      n	UR                  UT
R                  -  U-   UT
R                  -  U-   U	/5        M     U$ )Nr   rA   r   r   constant)
r   r!   r#   rZ   rF   ra   r\   rH   padrG   )seg_dataseg_stdata	chunk_lenchunk_shiftlast_chunk_edseg_reschunk_stchunk_ed
chunk_datar%   s             r(   	seg_chunk7SegmentationClusteringPipeline.chunk.<locals>.seg_chunk   s4   a[FA;DDKK	2TWW<=Idkk+6@AKMG!!TZZ]K@x3TZZ]C, N !)q("67!84
##A&2!#
)*I8H8H8K,K(L(2"4J tww&/DGG1Cf1L   A Nr*   )	enumerateextend)r%   r8   r   segsre   rJ   s   `     r(   r2   $SegmentationClusteringPipeline.chunk   s7    	0 l+DAKK	!% , r*   cut_stcut_edc                 0   [        U[        R                  5      (       a/  U[        XR                  -  5      [        X R                  -  5       $ [        U[
        5      (       Ga(  [        [        U5      5       Ht  nUS:X  a  XU   S   :  a  UnOXUS-
     S   :  a  XU   S   :  a  UnU[        U5      S-
  :X  a  X#U   S   :  a  UnMS  MU  X#U   S   :  d  Mb  X#US-      S   ::  d  Mr  UnMv     UWWS-    n/ n[        [        U5      5       Hb  nXt   u  pnUR                  U[        [        X5      U	-
  U R                  -  5      [        [        X*5      U	-
  U R                  -  5       5        Md     [        R                  " U5      nU$ [        S5      e)Nr   rD   r{   )r   rH   r   r   r#   r   rZ   rX   rG   r\   ra   rI   r   )r%   r   r   r+   re   st_ied_i
audio_segscut_datas_sts_edr   s               r(   rb   (SegmentationClusteringPipeline.cut_audio   s    eRZZ((Vgg-.s6GG3C/DEEt$$3u:&6a+ q1ua0VAhqk5I E
Q&a+  , a+A,q/0I  ' tD1H-JH3z?+#-= Dc&/$6#ww' ((+S->-E.2gg-6 )789 , ~~h/HODEEr*   c                     Sn0 n/ nU H$  nXS;  a	  X#U'   US-  nUR                  X5   5        M&     [        R                  " U5      $ )Nr   rD   )rG   rH   array)r%   r;   	labels_idid2id
new_labelsre   s         r(   rY   -SegmentationClusteringPipeline.correct_labels  sQ    	
A~$aQ	eh'	 
 xx
##r*   c                     US   /n[        S[        U5      5       HG  nX   S   US   S   :w  d  X   S   US   S   :  a  UR                  X   5        M:  X   S   US   S'   MI     U$ )Nr   rD   rA   )rZ   rX   rG   )r%   rd   resre   s       r(   r[   *SegmentationClusteringPipeline.merge_seque&  s    a !q#n-.A #s2wqz1^5F6R6$

>,-+.q1B
 / 
r*   c                    [        [        U5      5       H  n[        X   S   S5      X   S'   [        X   S   S5      X   S'   X   S   X   S   -
  U:  d  MD  US:X  a  XS-      S   X   S'   M[  U[        U5      S-
  :X  a  XS-
     S   X   S'   M~  X   S   XS-
     S   -
  XS-      S   X   S   -
  ::  a  XS-
     S   X   S'   M  XS-      S   X   S'   M     U R                  U5      nU$ )Nr   rA   rD   )rZ   rX   roundr[   )r%   r   mindurre   s       r(   rc   %SegmentationClusteringPipeline.smooth0  s   s3xAcfQi+CF1IcfQi+CF1Ivay36!9$v-6 #E
1CF1I#c(Q,& #E
1CF1IVAYUA.#!e*Q-#&)2KK #E
1CF1I #E
1CF1I ! s#
r*   )rV   r!   r#   r$   r   )rD   )__name__
__module____qualname____firstlineno____doc__r   r   r   r   rH   r   r   r   r   r=   r3   r4   r5   r0   r1   r2   floatrb   rY   r[   rc   __static_attributes____classcell__)r'   s   @r(   r   r      s=    Mj M )eCT$9: )"38n)>T bjj RZZ BJJ 1D 1 1JJ146JJ1CG1f+c2::t&; < + +Z]d ]"$ 4 @F Fu Frzz4/0F57ZZFB	$ r*   )$r   r   typingr   r   r   r   r   rH   	soundfiler   r   r   modelscope.fileior   modelscope.metainfor   modelscope.outputsr	   modelscope.pipelinesr
   modelscope.pipelines.baser   r   modelscope.pipelines.builderr   modelscope.utils.constantr   modelscope.utils.loggerr   r.   __all__register_moduler`   segmentation_clusteringr   r   r*   r(   <module>r      s     	 ) )     " ) ) ) : 2 + .	+
, 	9+L+LNeX eNer*   