
    9i5                     z   S SK JrJrJrJrJr  S SKrS SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJrJr  S SKJr  S S	KJrJr  S S
KJrJr  S SKJr  \" 5       r\R:                  " \R<                  \R>                  S9 " S S\5      5       r \R:                  " \R<                  \RB                  S9 " S S\5      5       r"g)    )AnyDictListTupleUnionN)Preprocessors)Preprocessor)PREPROCESSORS)"TextClassificationPreprocessorBase)NLPTokenizerForLSTM#TokenClassificationPreprocessorBase)NLPTokenizer)FieldsModeKeys)get_model_typeparse_label_mapping)
get_logger)module_namec                      ^  \ rS rSrSS jrSSSSS\R                  SSS4	S\S\S\\\	4   S\
S\S	\S
\4U 4S jjjrSrU =r$ )/SpeakerDiarizationDialogueDetectionPreprocessor   Nc                 ~    SU;  a%  U R                   [        R                  :X  a  SOS US'   U R                  " X40 UD6$ )Nreturn_tensorspt)moder   	INFERENCEnlp_tokenizer)self	sequence1	sequence2kwargss       `/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/preprocessors/speaker.py_tokenize_text>SpeakerDiarizationDialogueDetectionPreprocessor._tokenize_text   sD    6),0II9K9K,KDQU  "!!)A&AA    labelfirst_sequencesecond_sequencelabel2idr   
max_lengthuse_fastc
           	        > U
R                  SS5      U
S'   U
R                  SS5      U
S'   Ub  UOU
R                  SS5      U
S'   U
R                  SS 5        S nUb  [        U5      n[        XXS9U l        [
        TU ]  XX4XVU	5        g )N
truncationTpaddingr*   sequence_length   )r+   tokenize_kwargs)getpopr   r   r   super__init__)r   	model_dirr'   r(   r&   r)   r   r*   r+   keep_original_columnsr!   
model_type	__class__s               r"   r5   8SpeakerDiarizationDialogueDetectionPreprocessor.__init__!   s      &zz,=|"JJy,?y*4*@Jfjj!3G( 		 	

$d+
 '	2J)HNO!)>	@r%   r   )N)__name__
__module____qualname____firstlineno__r#   r   r   strr   r   r   intboolr5   __static_attributes____classcell__r9   s   @r"   r   r      s    
B  '+(,+2"&%//#'"&'+@!$@ #&@ c4i(	@
  @ @ !@  @ @r%   r   c                      ^  \ rS rSrSSSSS\R
                  SSSS4
S\S\S\S	\S
\S\4U 4S jjjr	S\
\\\   4   4S jrS rS rS rS rS rSrU =r$ ):SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor;   Ntextr&   FTr6   r'   r)   label_all_tokensr   c           
      *  > [         TU ]  XX4XVU	U
5        S nUb  [        U5      nUR                  SS5      US'   UR                  SS5      US'   Ub  UOUR                  SS5      US'   UR	                  SS 5        US:g  US'   [        UUUUS	9U l        g )
Nr-   Tr.   r*   r/   r0   lstmadd_special_tokens)r6   r8   r+   r1   )r4   r5   r   r2   r3   r   r   )r   r6   r'   r&   r)   rJ   r   r*   r+   r7   return_textr!   r8   r9   s                r"   r5   CSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.__init__@   s     	E)1F$	& 
 '	2J%zz,=|"JJy,?y*4*@Jfjj!3G( 		 	

$d+'1V';#$0!"	$r%   c                    UnU R                   [        R                  :w  a  [        U[        5      (       d   S5       eU R
                  R                  SS5      nU(       a  UR                  S5      nUS:X  d  U R                  (       a  [	        U5      nOQ/ nUR                  [	        US U 5      5        UR                  S5        UR                  [	        X5S-   S  5      5        UnU(       a4  U R                   [        R                  :X  a  U R                  " U40 UD6u  pxOPU R
                  R                  R                  (       a  U R                  " U40 UD6u  pxOU R                  " U40 UD6u  pxSn[!        US   5       H,  u  pXR
                  R                  R"                  :X  d  M*  U	n  O   US:w  a'  [%        U['        US   5      5       H  nSUS   U'   M     U R                   [        R                  :X  aC  UR)                  5        H,  n[*        R,                  " X|   5      R/                  S	5      X|'   M.     Xx4$ UR1                  S
S 5        Xx4$ )NzsInput needs to be lists in training and evaluating,because the length of the words and the labels need to be equal.is_split_into_wordsFz[SEP]   	input_ids
label_maskr   offset_mapping)r   r   r   
isinstancelistr   get_tokenizer_kwargfindis_lstm_modelextendappend_tokenize_text_by_words	tokenizeris_fast"_tokenize_text_with_fast_tokenizer"_tokenize_text_with_slow_tokenizer	enumeratesep_token_idrangelenkeystorchtensor	unsqueezer3   )r   rI   r!   tokensrQ   sep_idx
tmp_tokens	encodingsword_idsidxtoken_idikeys                r"   r#   ISpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text`   s/   99***fd++ S .S S+"00DD!5*kk'*G"} 2 2f
!!$vhw'7"89!!'*!!$vkl';"<=#4990B0B#B"&">">#" #"Ix))11"&"I"I#" #"Ix #'"I"I#" #"I &y'=>MC--77DDD ? b=7C	,(?$@A-2	,'* B 99*** ~~'!&in!=!G!G!J	 ( "" MM*D1""r%   c           	         / n/ n/ n/ n[        U5       H  u  pxU R                  R                  R                  USS9n	[	        U	5      S:X  a!  U R                  R                  R
                  /n	UR                  U	5        UR                  S/[	        U	5      -  5        UR                  S/S/[	        U	5      S-
  -  -   5        UR                  XwS-   4/5        M     UR                  SU R                  R                  S5      5      n
UR                  SUR                  SU R                  R                  S5      5      5      nU R                  R                  S	5      (       a  SOSn[	        U5      US
U-  -
  :  a  US US
U-  -
   nUS US
U-  -
   nUS [        U5       nU
S:X  a  S/U-  U-   S/U[	        U5      -
  U-
  -  -   nUS/U[	        U5      -
  -  -   nU R                  R                  R                  /U-  U-   U R                  R                  R                  /U-  -   U R                  R                  R                  /U[	        U5      -
  S
U-  -
  -  -   nUS/US
-  -  -   S/U[	        U5      -
  S
U-  -
  -  -   nOhS/U-  U-   S/U-  -   nU R                  R                  R                  /U-  U-   U R                  R                  R                  /U-  -   nUS/US
-  -  -   nUUUUS.nUS 4$ )NF)rM   r      Tr.   r*   r/   rM      r   r   )rT   attention_maskrU   rV   )rc   r   r_   encoderf   unk_token_idr\   r2   rY   sumcls_token_idrd   pad_token_id)r   rk   r!   rT   rU   rV   ry   offsettokensubtoken_idsr.   r*   special_tokenrn   s                 r"   r^   RSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_by_words   sr   	
&v.MF--77>>% ? 1L< A% $ 2 2 < < I IJ\*!!1#L(9"9:tfw#l2Ca2G'HHI!!FQJ#7"89 / **Y!//CCINPZZJJ())==lKMN
 "//CC " "'( 	z?Z!m*;;;#$Ej1}3D&DFJ!"CJ]1B$BDI'(8Z9l"=0:=:J7-GHIJ+vhS00/2 2N++55BBCmSV__##--::;mKL##--::;zCPYN?Z]^an]n?nopI ,qc!/# #&'S^!44q=7HH&JJN  =0:=-'(J++55BBCmSV__##--::;mKLI+qc]Q5F.GGN #,$,	
	 $r%   c                    [        U[        5      nU R                  " U4SUS.UD6n/ nUR                  5       n/ n[	        [        U5      5       H  nXh   c  UR                  S5        M  Xh   XhS-
     :X  a2  UR                  S5        U(       d  US   S   US   U   S   4US'   MX  MZ  UR                  S5        U(       a  UR                  Xh   Xh   S-   45        M  UR                  US   U   5        M     U R                  R                  S5      n	U	S	:X  a  US
/[        U5      [        U5      -
  -  -   nXtS'   XTS'   XF4$ )NT)return_offsets_mappingrQ   Frv   rR   r   rV   r.   r*   rx   rU   )rW   rX   r   ro   re   rf   r]   rY   )
r   rk   r!   rQ   rn   rU   ro   rV   rr   r.   s
             r"   ra   ]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_fast_tokenizer   sr   (6&&#' 3 		
 
%%'s8}%A{"!!%(Q/!!%(**8*<Q*?*34D*Ea*H*K*MN2& + !!$'&"))8;a*HI")))4D*Ea*HI & $$88Cl"+vhJ#n"55/7 7N&4"#",,""r%   c           	      n   U R                   [        R                  :X  a  [        U[        5      (       d   S5       eS nU R
                  " U4SS0UD6nU R
                  R                  5       nSU-   n[        X5      (       d  [        SU SU SU S35      e[        X5      " U5      u  pxUR                  S	U R
                  R                  S	5      5      n	UR                  S
U R
                  R                  S
5      5      n
UR                  SU R
                  R                  S5      5      (       a  SOSn[        U5      U
SU-  -
  :  a  US U
SU-  -
   nUS [        U5       nU	S
:X  a5  S/U-  U-   S/U
[        U5      -
  U-
  -  -   nUS/U
[        U5      -
  -  -   nOS/U-  U-   S/U-  -   nXS'   XtS'   XC4$ )NzSlow tokenizer now only support str input in inference mode. If you are training models, please consider using the fast tokenizer.rQ   F"get_label_mask_and_offset_mapping_zNo `z` method defined for tokenizer z>, please use a fast tokenizer instead, or try to implement a `z` methodr.   r*   rM   rv   r   rw   rx   rV   rU   )r   r   r   rW   r@   r   get_tokenizer_classhasattrRuntimeErrorgetattrr2   rY   rf   r|   )r   rk   r!   ro   rn   tokenizer_namemethodrU   rV   r.   r*   r   s               r"   rb   ]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_slow_tokenizer   s   yyH...:fc3J3J 	88	8J &&9(-9179	++??A5Ft$$vh +, -''-hh89 9 &-T%:6%B"
**Y!//CCINPZZ$,,@@NP
#ZZ 22$&' ' -. 	 z?Z!m*;;;#$Ej1}3D&DFJ'(8Z9l"=0:=:J7-GHIJ+vhS00/2 2N  =0:=-'(J&4"#",,""r%   c                 x   / n/ nU R                   R                  R                  U5      nSnU H  nUS S S:g  nU(       a  UR                  S5        OUSS  nUR                  S5        XQUS  R	                  U5      -   nU[        U5      -   n	U(       a  UR                  X45        OUS   S   U	4US'   U	nM     X#4$ )Nr   rw   z##TFrR   )r   r_   tokenizer]   indexrf   )
r   rI   rU   rV   rk   r   r   is_startstartends
             r"   /get_label_mask_and_offset_mapping_BertTokenizerjSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_BertTokenizer  s    
##--66t<Ebq	T)H!!$'ab	!!%(&']0077E#e*$C%%ul3&4R&8&;S%Ar"F  ))r%   c                    / n/ nU R                   R                  R                  U5      nSnSnU H  nUS   S:H  nU(       a*  USS  nUR                  S5        [	        U5      S:X  a  SnM;  OUR                  S5        XQUS  R                  U5      -   n	U	[	        U5      -   n
U(       d  U(       a  UR                  X45        OUS   S   U
4US'   U
nSnM     X#4$ )Nr   F_rv   TrR   )r   r_   r   r]   rf   r   )r   rI   rU   rV   rk   r   last_is_blankr   r   r   r   s              r"   5get_label_mask_and_offset_mapping_XLMRobertaTokenizerpSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_XLMRobertaTokenizer"  s    
##--66t<EaCHab	!!$'u:?$(M # !!%(&']0077E#e*$C%%ul3&4R&8&;S%Ar"F!M# $ ))r%   r;   )r<   r=   r>   r?   r   r   r@   r   rB   r5   r   r   r#   r^   ra   rb   r   r   rC   rD   rE   s   @r"   rG   rG   ;   s     #''-%"&*/%// '+!$$!$$ $  	$
 $($ $ $@,#5d3i#8 ,#\4l#B%#N*,* *r%   rG   )#typingr   r   r   r   r   rh   modelscope.metainfor   modelscope.preprocessorsr	    modelscope.preprocessors.builderr
   =modelscope.preprocessors.nlp.text_classification_preprocessorr   >modelscope.preprocessors.nlp.token_classification_preprocessorr   r   3modelscope.preprocessors.nlp.transformers_tokenizerr   modelscope.utils.constantr   r   modelscope.utils.hubr   r   modelscope.utils.loggerr   loggerregister_moduleaudiosen_cls_tokenizerr   token_cls_tokenizerrG    r%   r"   <module>r      s    1 0  - 1 :'> L 6 D .	 
LLm==? @* @? @F 
LLm??A}*+}*A}*r%   