
from typing import Any, Dict

import torch

from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor


class OfaSummarizationPreprocessor(OfaBasePreprocessor):
    """
    OFA preprocessor for summarization tasks.
    """
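
    # Helpers such as `pre_caption`, `tokenize_text`, `column_map`,
    # `max_src_length`, `max_tgt_length`, `prompt_type`, `bos_item` and
    # `src_dict` are inherited from `OfaBasePreprocessor`; the optional config
    # keys read below (`prompt`, `noise_ratio`, `num_codes`, `num_bins`) fall
    # back to the defaults shown in the code when absent from `cfg.model`.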

    def __init__(self, cfg, model_dir, mode=ModeKeys.INFERENCE, *args,
                 **kwargs):
        """preprocess the data

        Args:
            cfg (modelscope.utils.config.ConfigDict): model config
            model_dir (str): model path
            mode: preprocessor mode (model mode)
        """
N)superr	   __init__)selfcfg	model_dirmodeargskwargs	__class__s         j/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/preprocessors/ofa/summarization.pyr   %OfaSummarizationPreprocessor.__init__   s8     	*		 T	D48	D<B	D    datareturnc                     U R                   [        R                  :X  a  U R                  U5      $ U R	                  U5      $ )N)r   r   TRAIN_build_train_sample_build_infer_sample)r   r   s     r   __call__%OfaSummarizationPreprocessor.__call__   s4    99&++D11++D11r   c                 |  > U R                  U5      nUS   R                  5       n[        TU ]  X0R                  S9nUR                  SS5      R                  SS5      nU R                  USS9US'   U R                  US   S	S
 R                  5       5      n[        R                  " U R                  U/5      US'   U$ )aQ  
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data is in the result.
        step 2. Preprocess the label data:
            - Lowercase the label and run the `pre_caption` string
              preprocessing on it to get the new input label.
            - Tokenize the new input label as `target` for the model input.
            - Add noise to the `target`.
            - Compute `prev_output_tokens` from the noised `target` for the
              model input.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image`, `prompt` and `label`; `image` is the image input
                data, `prompt` is the text input data and `label` is the
                supervised data for training.
        Return:
            A dict object containing source, image, mask, label, target
            tokens and previous output tokens data.
        """
        sample = self._build_infer_sample(data)
        target_str = sample['label'].lower()
        target = super().pre_caption(
            target_str, max_words=self.max_tgt_length)
        target = target.replace('[unk]', 'unk').replace('<unk>', 'unk')
        sample['target'] = self.tokenize_text(target, add_bos=False)
        # corrupt the target and prepend bos to build the decoder input
        noise_target_item = self.add_noise_to_tgt(
            sample['target'][:-1].clone())
        sample['prev_output_tokens'] = torch.cat(
            [self.bos_item, noise_target_item])
        return sample

    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Building inference samples.

        step 1. Preprocess the input text via the `pre_caption` function; see
            the doc of `pre_caption` for more details.
        step 2. Unify the unknown tokens, i.e. `[unk]` -> `unk` and
            `<unk>` -> `unk`.
        step 3. Get the prompt, format it with the input text and take the
            result as the new input.
        step 4. Tokenize the input text and generate the decoder prompt.
        step 5. Determine whether or not to add the label to the sample.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image` and `prompt`; the former is the image input data and
                the latter is the text input data.
        Return:
            A dict object containing text, decoder prompt and label data.
        """
        source = super().pre_caption(
            data[self.column_map['text']], max_words=self.max_src_length)
        source = source.replace('[unk]', 'unk').replace('<unk>', 'unk')
        prompt = self.cfg.model.get(
            'prompt', ' " {} " Summarize the article with a title: ')
        text = prompt.format(source)
        inputs = self.tokenize_text(text)
        if self.prompt_type == 'none':
            decoder_prompt = self.bos_item
        elif self.prompt_type == 'prev_output':
            decoder_prompt = inputs[:-1]
        else:
            raise NotImplementedError
        sample = {
            'source': inputs,
            'decoder_prompt': decoder_prompt,
        }
        # keep the raw summary text as the label when it is provided
        if 'summary' in self.column_map and self.column_map['summary'] in data:
            sample['label'] = data[self.column_map['summary']]
        return sample

    def add_noise_to_tgt(self, target):
        """
        Add noise tokens to the target sentence.

        step 1. Sample from a uniform distribution to randomly select the
            noise indices.
        step 2. Sample random replacement tokens as noise for the selected
            positions in the target.

        Args:
            target: A sequence of tokens.
        Returns:
            A sequence of tokens.
        """
        # positions are corrupted independently with probability `noise_ratio`
        noise_indices = torch.FloatTensor(
            target.size(0)).uniform_() < self.cfg.model.get(
                'noise_ratio', 0.0)
        # replacement ids are drawn from the plain-token range of the
        # vocabulary; the lower bound of 4 (skipping bos/pad/eos/unk) and the
        # `num_codes`/`num_bins` fallbacks follow the usual OFA vocabulary
        # layout and may differ for a particular model config
        target[noise_indices] = torch.randint(
            4,
            len(self.src_dict) - self.cfg.model.get('num_codes', 8192)
            - self.cfg.model.get('num_bins', 1000),
            size=(noise_indices.sum(), ))
        return target
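

if __name__ == '__main__':
    # Illustrative sketch only: it exercises the two text-side transformations
    # above with made-up values, without loading an OFA model or tokenizer.
    # Run it in the installed package context (e.g. `python -m
    # modelscope.preprocessors.ofa.summarization`), since the relative import
    # at the top prevents running this file directly.
    prompt = ' " {} " Summarize the article with a title: '
    source = 'the quick brown fox jumps over the lazy <unk> dog'
    source = source.replace('[unk]', 'unk').replace('<unk>', 'unk')
    print(prompt.format(source))

    # Noise injection as in `add_noise_to_tgt`: pick positions with
    # probability `noise_ratio`, then overwrite them with random ids drawn
    # from an assumed plain-token id range [4, vocab_size).
    noise_ratio, vocab_size = 0.2, 100
    target = torch.arange(4, 14)  # a fake ten-token target sequence
    noise_indices = torch.FloatTensor(target.size(0)).uniform_() < noise_ratio
    target[noise_indices] = torch.randint(
        4, vocab_size, size=(noise_indices.sum(), ))
    print(target)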