
    9iY                        S SK Jr  S SKJrJr  S SKrS SKrS SKJ	r	J
r
Jr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJrJr  S
SKJr  \R6                  " \R8                  \R:                  S9 " S S\5      5       rg)    N)AnyDict)MosesDetokenizerMosesPunctNormalizerMosesTokenizer)	apply_bpe)Preprocessors)Preprocessor)PREPROCESSORS)Config)Fields	ModelFile   )	TextClean)module_namec                   Z   ^  \ rS rSrSr S
S\S\4U 4S jjjrS\S\\\	4   4S jr
S	rU =r$ )CanmtTranslationPreprocessor   z3The preprocessor used in text correction task.
    	model_dir
max_lengthc                   > SSK Jn   [        TU ]  " U0 UD6  [        R
                  " [        R                  " U[        R                  5      5      U l
        UR                  [        R                  " US5      5      U l        UR                  [        R                  " US5      5      U l        U R                  R                  5       U l        Ub  US-   OSU l        U R                  S   S   U l        U R                  S   S	   U l        ['        5       U l        U R"                  S
:X  a  [*        U l        O0[/        U R"                  S9U l        [3        U R"                  S9U l        [        R                  " XR                  S   S   S   5      U l        [6        R8                  " [;        U R4                  5      5      U l        g )Nr   )
Dictionaryzdict.src.txtzdict.tgt.txtr      preprocessorsrc_langtgt_langzh)langsrc_bpefile)fairseq.datar   super__init__r   	from_fileospjoinr   CONFIGURATIONcfgload	vocab_src	vocab_tgtpadpadding_valuer   r   r   r   tcjiebatokr   punct_normalizerr   src_bpe_pathr   BPEopenbpe)selfr   r   argskwargsr   	__class__s         n/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/preprocessors/nlp/canmt_translation.pyr#   %CanmtTranslationPreprocessor.__init__   sN   
 	,	
 	$)&)##HHY	 7 78:#)^)LM#)^)LM!^^//1,6,B*q.0<0<+==D DH$8dmm$LD!%4==9DHHHxx/	:6BD==d&7&7!89    inputreturnc           	         U R                   S:X  aQ  U R                  R                  U5      nU R                  R	                  U5      nSR                  [        U5      5      nOUU Vs/ s H  o0R                  R                  U5      PM     nnU Vs/ s H  nU R                  R                  USSS9PM      nnU R                  R                  U5      R                  5       R                  5       nSR                  U Vs/ s H  oUPM     sn5      nU R                  R                  USSS9n[         R"                  " USS9nUR%                  5       S	   n	['        U R(                  U	5      n
[         R*                  " U R,                  /X-
  -  UR.                  S
9n[         R0                  " [         R2                  " X{/5      S	S9n[         R0                  " [         R2                  " X/5      S	S9n[         R0                  " [         R2                  " X/5      S	S9n[         R*                  " U	/5      n	UU	UUS.nU$ s  snf s  snf s  snf )u;  process the raw input data

Args:
    data (str): a sentence
        Example:
            '随着中国经济突飞猛近，建造工业与日俱增'
Returns:
    Dict[str, Any]: the preprocessed data
    Example:
    {'net_input':
        {'src_tokens':tensor([1,2,3,4]),
        'src_lengths': tensor([4])}
    }
r    T)
return_straggressive_dash_splitsF)
append_eosadd_if_not_existr   )shiftsr   )dtype)dim)
src_tokenssrc_lengthsprev_src_tokenssources)r   r.   cleanr0   cutr&   list_punct_normalizer	normalizetokenizer5   process_linestripsplitr*   encode_linetorchrollsizeminr   tensorr-   rF   	unsqueezecat)r6   r=   	input_tokitem	input_bpextextinputsprev_inputslengthsmax_lenpaddingrK   outs                 r:   __call__%CanmtTranslationPreprocessor.__call__:   s    ==D GGMM%(EU+Ii1IHMN++55d;EN " "D !!T$ " H!   HH)))4::<BBD	xxI.IqI./++TE , ;jj2++-"doow/,,  ,,  //%))V,=">AFG+<!=1Eooeii0F&GQO,,y) "*	
 
? O /s   &$H5%H:=H?)r5   r(   r   r-   r1   r2   r   r.   r   r0   r*   r+   )N)__name__
__module____qualname____firstlineno____doc__strintr#   r   r   rh   __static_attributes____classcell__)r9   s   @r:   r   r      sH    
 $(:: : :B3c 3d38n 3 3r<   r   )os.pathpathr%   typingr   r   r/   rV   
sacremosesr   r   r   subword_nmtr   modelscope.metainfor	   modelscope.preprocessors.baser
    modelscope.preprocessors.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   
text_cleanr   register_modulenlpcanmt_translationr    r<   r:   <module>r      sf        M M ! - 6 : * 7 ! 
JJM;;=X< X=Xr<   