
    9il                        S SK r S SKJr  S SKJr  S SKJrJrJrJ	r	J
r
Jr  S SKrSSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJr  SSKJ r J!r!J"r"J#r#  \RH                  \" S5       " S S\5      5       5       r%g)    N)deepcopy)sleep)AnyDictListOptionalTupleUnion   )logging)pipeline_requires_extra   )MarkDownBatchSampler)	benchmark)	HPIConfig)PaddlePredictorOption   )BasePipeline   )DocumentResultLatexResultMarkdownResult)split_original_textssplit_text_recursivetranslate_code_blocktranslate_html_blocktransc            A          ^  \ rS rSrSrS/r     SFS\S\S\S\	S	\
\\\\4   \4      S
\	SS4U 4S jjjrS rS\SS4S jrS\SS4S jrSGS jr                              SHS\\\\   \R,                  \\R,                     4   S\
\	   S\
\	   S\
\	   S\
\	   S\
\	   S\
\	   S\
\	   S\
\	   S\
\\\4      S\
\	   S\
\\\\\4   \4      S\
\   S\
\   S\
\   S \
\   S!\
\   S"\
\   S#\
\   S$\
\   S%\
\   S&\
\   S'\
\   S(\
\   S)\
\   S*\	S+\	S,\	S-\	S.\	S/\	S\4@S0 jjrS1 rS2 r          SIS3\\   S4\S5\S6\S7\S8\S9\S:\S;\S<\S=\4S> jjrS?\S\4S@ jr SA\S\4SB jr!SC\S\4SD jr"SEr#U =r$$ )JPP_DocTranslation_Pipeline&   z
PP_ DocTranslation_Pipeline
zPP-DocTranslationNconfigdevice	pp_optionuse_hpip
hpi_configinitial_predictorreturnc                    > [         TU ]  X#XES9  US   U l        Xl        UR	                  SS5      U l        SU l        SU l        U(       a"  U R                  U5        U R                  U5        [        5       U l        g)a  Initializes the PP_Translation_Pipeline.

Args:
    config (Dict): Configuration dictionary containing various settings.
    device (str, optional): Device to run the predictions on. Defaults to None.
    pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
    use_hpip (bool, optional): Whether to use the high-performance
        inference plugin (HPIP) by default. Defaults to False.
    hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
        The default high-performance inference configuration dictionary.
        Defaults to None.
    initial_predictor (bool, optional): Whether to initialize the predictor. Defaults to True.
)r"   r#   r$   r%   pipeline_nameuse_layout_parserTN)super__init__r)   r!   getr*   layout_parsing_pipelinechat_botinintial_visual_predictorinintial_chat_predictorr   markdown_batch_sampler)selfr!   r"   r#   r$   r%   r&   	__class__s          v/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/paddlex/inference/pipelines/pp_doctranslation/pipeline.pyr,   #PP_DocTranslation_Pipeline.__init__/   s}    . 	 	 	
 $O4!',?!F'+$**62((0&:&<#    c                 T    U R                   b  U R                   R                  5         g g )N)r.   close)r3   s    r5   r9    PP_DocTranslation_Pipeline.closeW   s%    ''3((..0 4r7   c                     UR                  SS5      U l        U R                  (       a:  UR                  S0 5      R                  SSS05      nU R                  U5      U l        g)z
Initializes the visual predictor with the given configuration.

Args:
    config (dict): The configuration dictionary containing the necessary
                        parameters for initializing the predictor.
Returns:
    None
r*   TSubPipelinesLayoutParserpipeline_config_errorz)config error for layout_parsing_pipeline!N)r-   r*   create_pipeliner.   )r3   r!   layout_parsing_configs      r5   r0   4PP_DocTranslation_Pipeline.inintial_visual_predictor[   sb     "(,?!F!!$*JJ~r$B$F$F(*UV%! ,0+?+?@U+VD(r7   c                     SSK Jn  UR                  S0 5      R                  SSS05      nU" U5      U l        SSK Jn  UR                  S0 5      R                  S0 5      R                  S	S
S05      nU" U5      U l        g)z
Initializes the chat predictor with the given configuration.

Args:
    config (dict): The configuration dictionary containing the necessary
                        parameters for initializing the predictor.
Returns:
    None
r   create_chat_bot
SubModulesLLM_Chatchat_bot_config_errorzconfig error for llm chat bot!)create_prompt_engineeringPromptEngneeringTranslate_CommonTextpe_config_errorz%config error for translate_pe_config!N) rD   r-   r/   rH   translate_pe)r3   r!   rD   chat_bot_configrH   translate_pe_configs         r5   r1   2PP_DocTranslation_Pipeline.inintial_chat_predictoro   s     	' **\26::$&FG
 (80 JJ|R(S#R(S&"$KL 	 66IJr7   c                 0    [         R                  " S5        g )NzPP-Translation Pipeline do not support to call `predict()` directly! Please invoke `visual_predict`, `build_vector`, `chat` sequentially to obtain the result.)r   error)r3   argskwargss      r5   predict"PP_DocTranslation_Pipeline.predict   s     m	
 	r7   inputuse_doc_orientation_classifyuse_doc_unwarpinguse_textline_orientationuse_seal_recognitionuse_table_recognitionuse_formula_recognitionuse_chart_recognitionuse_region_detectionlayout_threshold
layout_nmslayout_unclip_ratiolayout_merge_bboxes_modetext_det_limit_side_lentext_det_limit_typetext_det_threshtext_det_box_threshtext_det_unclip_ratiotext_rec_score_threshseal_det_limit_side_lenseal_det_limit_typeseal_det_threshseal_det_box_threshseal_det_unclip_ratioseal_rec_score_thresh#use_wired_table_cells_trans_to_html&use_wireless_table_cells_trans_to_htmluse_table_orientation_classify use_ocr_results_with_table_cellsuse_e2e_wired_table_rec_model use_e2e_wireless_table_rec_modelc               +     #    U R                   S:X  a  [        R                  " S5        SS0v   U R                  c1  [        R                  " S5        U R                  U R                  5        U R                  R                  " U40 SU_SU_SU_S	U_S
U_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_S U_S!U_S"U_S#U_6 H  n!S$U!0n"U"v   M     g7f)%a  
This function takes an input image or a list of images and performs various visual
prediction tasks such as document orientation classification, document unwarping,
general OCR, seal recognition, and table recognition based on the provided flags.

Args:
    input (Union[str, list[str], np.ndarray, list[np.ndarray]]): Input image path, list of image paths,
                                                                numpy array of an image, or list of numpy arrays.
    use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
    use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
    use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
    use_seal_recognition (Optional[bool]): Whether to use seal recognition.
    use_table_recognition (Optional[bool]): Whether to use table recognition.
    use_formula_recognition (Optional[bool]): Whether to use formula recognition.
    use_region_detection (Optional[bool]): Whether to use region detection.
    layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
    layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
    layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
        Defaults to None.
        If it's a single number, then both width and height are used.
        If it's a tuple of two numbers, then they are used separately for width and height respectively.
        If it's None, then no unclipping will be performed.
    layout_merge_bboxes_mode (Optional[str], optional): The mode for merging bounding boxes. Defaults to None.
    text_det_limit_side_len (Optional[int]): Maximum side length for text detection.
    text_det_limit_type (Optional[str]): Type of limit to apply for text detection.
    text_det_thresh (Optional[float]): Threshold for text detection.
    text_det_box_thresh (Optional[float]): Threshold for text detection boxes.
    text_det_unclip_ratio (Optional[float]): Ratio for unclipping text detection boxes.
    text_rec_score_thresh (Optional[float]): Score threshold for text recognition.
    seal_det_limit_side_len (Optional[int]): Maximum side length for seal detection.
    seal_det_limit_type (Optional[str]): Type of limit to apply for seal detection.
    seal_det_thresh (Optional[float]): Threshold for seal detection.
    seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
    seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
    seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
    use_wired_table_cells_trans_to_html (bool): Whether to use wired table cells trans to HTML.
    use_wireless_table_cells_trans_to_html (bool): Whether to use wireless table cells trans to HTML.
    use_table_orientation_classify (bool): Whether to use table orientation classification.
    use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells.
    use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
    use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
    **kwargs (Any): Additional settings to extend functionality.

Returns:
    dict: A dictionary containing the layout parsing result.
Fz1The models for layout parser are not initialized.rR   NzGThe layout parsing pipeline is not initialized, will initialize it now.rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rm   rj   rk   rl   rn   ro   rp   rq   rr   rs   rt   ru   layout_parsing_result)r*   r   rR   r.   warningr0   r!   rU   )#r3   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rT   rw   visual_predict_ress#                                      r5   visual_predict)PP_DocTranslation_Pipeline.visual_predict   s    b !!U*MMMNOPP''/OOY **4;;7%)%A%A%I%I &
)E &
 0 &
 &>	 &

 "6 &
 #8 &
 %< &
 #8 &
 "6 &
 . &
 " &
 !4 &
 &> &
 %< &
 !4 &
  ,! &
" !4# &
$ #8% &
& #8' &
( !4) &
* %<+ &
, !4- &
. ,/ &
0 #81 &
2 #83 &
4 1T5 &
6 4Z7 &
8 ,J9 &
: .N; &
< +H= &
> .N? &
!F ()>" %$K &
s   C0C2c                     / nU R                   R                  U5       HB  nUR                  S   nUR                  S   nUS USS.nUR	                  [        U5      5        MD     U$ )Nr   TT)
input_path
page_indexmarkdown_textspage_continuation_flags)r2   sample	instancesinput_pathsappendr   )r3   rW   markdown_info_listmarkdown_samplemarkdown_contentr~   markdown_infos          r5   load_from_markdown-PP_DocTranslation_Pipeline.load_from_markdown  su    #::AA%HO.88;(44Q7J(""2+7	M %%n]&CD  I "!r7   c           	         / nSn[         R                  " S[        U5       S35        [         R                  " S5        [        U5       GH  u  pgUu  pUS:X  as  UR	                  5       (       a'  UR                  U" UR	                  5       5      5        Sn[         R                  " SUS-    S[        U5       S	35        [        XX45        M  [        U	5      U:  a  US
:X  a  [        U5      [        U	5      -   U:  a
  USU	-   -  nM  UR	                  5       (       aN  [         R                  " SUS-    S[        U5       S	35        UR                  U" UR	                  5       5      5        U	nGM%  [         R                  " SUS-    S[        U5       S	35        UR	                  5       (       a'  UR                  U" UR	                  5       5      5        SnUS
:X  a  UR                  [        XU5      5        GM  US:X  d  US:X  a  [        XX45        GM  [        SU 35      e   UR	                  5       (       a%  UR                  U" UR	                  5       5      5        SR                  U5      $ )au  
Chunks the given markdown blocks into smaller chunks of size `chunk_size` and translates them using the given
translate function.

Args:
    md_blocks (list): A list of tuples representing each block of markdown content. Each tuple consists of a string
  indicating the block type ('text', 'code') and the actual content of the block.
    chunk_size (int): The maximum size of each chunk.
    translate_func (callable): A callable that accepts a string argument and returns the translated version of that string.

Returns:
    str: A string containing all the translated chunks concatenated together with newlines between them.
rL   zSplit the original text into z blockszStarting translation...codezTranslating block r   /z...text

text_with_htmlhtmlzUnknown block type: )r   infolen	enumeratestripr   r   r   r   
ValueErrorjoin)
r3   	md_blocks
chunk_sizetranslate_functranslation_resultschunkidxblock
block_typeblock_contents
             r5   chunk_translate*PP_DocTranslation_Pipeline.chunk_translate%  s    !4S^4DGLM./#I.JC(-%JV#;;=='..~ekkm/LME1#a%#i.9IMN$!~ ]#j0Z65Iu:M 22Z?Vm33E{{}}'9#a%#i.AQQT%UV+22>%++-3PQ)E1#a%#i.9IMN;;=='..~ekkm/LME''..,]W  #33zV7K(%> %';J<%HIIC /F ;;==&&~ekkm'DE{{.//r7   ori_md_info_listtarget_languager   task_descriptionoutput_format	rules_strfew_shot_demo_text_contentfew_shot_demo_key_value_listglossaryllm_request_intervalrN   c           
   +     ^ ^^^^^^^
^#    T R                   c1  [        R                  " S5        T R                  T R                  5        Ub  SSKJn  U" U5      mOT R                   m[        U[        5      (       a'  US   R                  S5      b  T R                  U5      /n[        T
[        5      (       d  [        T
5      m
[        U	[        5      (       d
  U	b   S5       eSnU	bJ  U	R                  5        H6  u  nn[        U[        5      (       a  S	R                  U5      nX S
U S3-  nM8     US:w  a  Tc  UmO
TS-  mTU-  mUUUU
UUU UU4	S jnT R                  R!                  STTTTTTS9n[#        U5      nUU:  a  UU-
  nO[%        SU SU S35      eU HB  nUS   n['        U5      nT R)                  UUU5      n[+        TUS   US   US   US.5      v   MD     g7f)a  
Translate the given original text into the specified target language using the configured translation model.

Args:
    ori_md_info_list (List[Dict]): A list of dictionaries containing information about the original markdown text to be translated.
    target_language (str, optional): The desired target language code. Defaults to "zh".
    chunk_size (int, optional): The maximum number of characters allowed per chunk when splitting long texts. Defaults to 5000.
    task_description (str, optional): A description of the task being performed by the translation model. Defaults to None.
    output_format (str, optional): The desired output format of the translation result. Defaults to None.
    rules_str (str, optional): Rules or guidelines for the translation model to follow. Defaults to None.
    few_shot_demo_text_content (str, optional): Demo text content for the translation model. Defaults to None.
    few_shot_demo_key_value_list (str, optional): Demo text key-value list for the translation model. Defaults to None.
    glossary (Dict, optional): A dictionary containing terms and their corresponding definitions. Defaults to None.
    llm_request_interval (float, optional): The interval in seconds between each request to the LLM. Defaults to 0.0.
    chat_bot_config (Dict, optional): Configuration for the chat bot used in the translation process. Defaults to None.
    **kwargs: Additional keyword arguments passed to the translation model.

Yields:
    MarkdownResult: A dictionary containing the translation result in the target language.
Nz;The LLM chat bot is not initialized,will initialize it now.r   rC   r   r   zglossary must be a dictrL   u   或z: 
c           
        >	 [        T5        T	R                  R                  U T
TTTTTS9nTR                  US9R	                  SS5      nSU;  a  [        S5      eUc  [        S5      eUR                  SS5      R                  5       nU$ )z
Translate the given text using the configured translation model.

Args:
    text (str): The text to be translated.

Returns:
    str: The translated text in the target language.
original_textlanguager   r   r   r   r   )promptcontentrL   z<<END>>zThe translation did not reach the end. This may happen if your chunk_size is too large. Please reduce chunk_size and try again.z#The call to the large model failed.)r   rM   generate_promptgenerate_chat_resultsr-   	Exceptionreplacerstrip)r   r   	translater/   r   r   r   r   r   r3   r   r   s      r5   r   <PP_DocTranslation_Pipeline.translate.<locals>.translate_func  s     &'&&66"(!1+#+E-I 7 F !66f6EII)UWXI	)o    EFF!)))R8??AIr7   r   z:Chunk size should be greater than the base prompt length (z), but got .r   r~   r   )r   r~   r   r   r   )r/   r   rx   r1   r!   rL   rD   
isinstancelistr-   concatenate_markdown_pagesfloatdictitemsr   rM   r   r   r   r   r   r   )r3   r   r   r   r   r   r   r   r   r   r   rN   rT   rD   glossary_strkvr   base_prompt_contentbase_prompt_lengthori_mdoriginal_textsr   target_language_textsr/   s   ` ` ````` `             @r5   r   $PP_DocTranslation_Pipeline.translate^  s4    F == OOM ((5&*&7H}}H '.. #''5A !% ? ?@P QR.66#()=#> (D))X-=X?XX= (1a&&

1A#Rs"- )
 2+3/;,,4,,<,	 	@ #//??$-''A)E @ 
 !!45**#&88JLM_L``klvkwwxy  'F#$45N,^<I$($8$8:~%! ! /"("6"("6/56O/P&;  's   GG'markdown_listc                    SnSn[        U5      S:X  a  [        S5      eU H  nUS   S   nUS   S   nU(       d  U(       d  U(       a  US   OSnUS   (       a  US   S   OSnU(       a  [        R                  " S	U5      OS
n	U(       a  [        R                  " S	U5      OS
n
U	(       d  U
(       d  USUS   -   -  nOX$S   -  nOUSUS   -   -  nUnM     US   S   SSUS.n[	        U5      $ )z
Concatenate Markdown content from multiple pages into a single document.

Args:
    markdown_list (list): A list containing Markdown data for each page.

Returns:
    tuple: A tuple containing the processed Markdown text.
rL   Tr   z$The length of markdown_list is zero.r   r   r   z[\u4e00-\u9fff]F r   r~   Nr}   )r~   r   r   r   )r   r   rematchr   )r3   r   r   -previous_page_last_element_paragraph_end_flagres'page_first_element_paragraph_start_flag$page_last_element_paragraph_end_flaglast_char_of_markdownfirst_char_of_handlerlast_is_chinese_charfirst_is_chinese_charconcatenate_results               r5   r   5PP_DocTranslation_Pipeline.concatenate_markdown_pages  sI    8<5}"CDD C<?)==3 :==V9W:0 <E>Lr(:RT%034D0EC()!,2 & - HH/1FG % - HH/1FG &
 -0E"cC0@,A&AAN"*:&;;N&3/?+@"@@4 :I !R (*<8'3,	
 011r7   	word_listc                 "   [        U5      S:X  a  [        S5      e/ n/ n[        U5       HN  u  pEUS    H#  n[        U5      nXGS'   UR	                  U5        M%     US    H  nUR	                  U5        M     MP     [        UUS   S   US.5      $ )z
Concatenate Word content from multiple pages into a single document.

Args:
    word_list (list): A list containing Word data for each page.

Returns:
    tuple: A tuple containing the processed Word document.
r   z The length of word_list is zero.word_blocksr   imagesr~   )r   r~   r   )r   r   r   r   r   r   )	r3   r   merged_blocksimagepage_idxpage_blocksr   
block_copyimg_objs	            r5   concatenate_word_pages1PP_DocTranslation_Pipeline.concatenate_word_pages-  s     y>Q?@@%.y%9!H$]3%e_
+3<($$Z0 4 'x0W% 1 &: ,'l<8
 	
r7   latex_info_listc                 "   [        U5      S:X  a  [        S5      e/ n/ n[        U5       HN  u  pEUS    H#  n[        U5      nXGS'   UR	                  U5        M%     US    H  nUR	                  U5        M     MP     [        UUUS   S   S.5      $ )z
Concatenate LaTeX content from multiple pages into a single document.

Args:
    latex_info_list (list): A list containing LaTeX data for each page.

Returns:
    tuple: A tuple containing the processed LaTeX document.
r   z&The length of latex_info_list is zero.latex_blocksr   r   r~   )r   r   r~   )r   r   r   r   r   r   )	r3   r   r   merged_imagesr   r   r   r   r   s	            r5   concatenate_latex_pages2PP_DocTranslation_Pipeline.concatenate_latex_pagesM  s     1$EFF%.%?!H$^4%e_
+3<($$Z0 5
 'x0$$W- 1 &@  -'-a0>
 	
r7   )r/   r!   r.   r2   r)   rM   r*   )NNFNF)r'   N)NNNNNNNNNNNNNNNNNNNNNNNNFFTTFT)
zhi  NNNNNNg        N)%__name__
__module____qualname____firstlineno____doc__entitiesr   strr   boolr   r
   r   r   r,   r9   r   r0   r1   rU   r   npndarrayr   r	   intrz   r   r   r   r   tupler   r   r   __static_attributes____classcell__)r4   s   @r5   r   r   &   s    $$H
 +/AE"'&=&= &= )	&=
 &= U4S>9#<=>&=  &= 
&= &=P1  (d t > 8<,037/3042604/39=%)QU2615-1+//3151515-1+//31515497</315.315A@%S$s)RZZbjj1AAB@% '/tn@% $D>	@%
 #+4.@% 'tn@%  (~@% "*$@%  (~@% 'tn@% #5#56@% TN@% &eE53F,L&MN@% #+3-@% "*#@%  &c]!@%" "%#@%$ &e_%@%&  ('@%(  ()@%* "*#+@%, &c]-@%. "%/@%0 &e_1@%2  (3@%4  (5@%6 .27@%8 159@%: )-;@%< +/=@%> (,?@%@ +/A@%D 
E@%D"70x  $ $!*.,0&) $Lt*L L 	L
 L L L %(L '*L L $L L\?2 ?2 ?2B
 
 
@
t 
 
 
r7   r   )&r   copyr   timer   typingr   r   r   r   r	   r
   numpyr   utilsr   
utils.depsr   common.batch_samplerr   utils.benchmarkr   	utils.hpir   utils.pp_optionr   baser   resultr   r   r   r   r   r   r   time_methodsr    r7   r5   <module>r     so    
   : :   2 8 ( " 4  ? ?  !D	
 D	
 " D	
r7   