
from typing import Any, Dict

import torch

from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor


class OfaTextClassificationPreprocessor(OfaBasePreprocessor):
    """
    OFA preprocessor for text classification tasks.
    """

    def __init__(self, cfg, model_dir, mode=ModeKeys.INFERENCE, *args,
                 **kwargs):
        """Preprocess the data.

        Args:
            cfg (modelscope.utils.config.ConfigDict): model config
            model_dir (str): model path
            mode: preprocessor mode (model mode)
        """
        super(OfaTextClassificationPreprocessor,
              self).__init__(cfg, model_dir, mode, *args, **kwargs)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        if self.mode == ModeKeys.TRAIN:
            return self._build_train_sample(data)
        else:
            return self._build_infer_sample(data)
    def _build_instruction(self, data):
        """
        Build the instruction for the text classification task.

        The `data` dict should contain the keys `text` and `text2`, and the
        final instruction looks like ` can text1 " {} " imply text2 " {} "?`,
        where the first `{}` is filled with the value of `text` and the second
        with the value of `text2`.

        step 1. Preprocess the input texts `text` and `text2` in `data`.
            - Lowercase, strip and truncate them to at most `max_src_length` tokens.
        step 2. Fill the instruction template to build the final instruction.
        step 3. Tokenize the instruction and return the result.
        """
        text1 = ' '.join(
            data['text'].lower().strip().split()[:self.max_src_length])
        text2 = ' '.join(
            data['text2'].lower().strip().split()[:self.max_src_length])
        prompt = ' can text1 " {} " imply text2 " {} "?'
        text = prompt.format(text1, text2)
        instruction_itm = self.tokenize_text(text)
        return instruction_itm
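
    # Illustrative note (not part of the original module): for a hypothetical
    # input pair {'text': 'A man is playing a guitar.',
    # 'text2': 'Someone is making music.'}, the instruction built above is
    # ' can text1 " a man is playing a guitar. " imply text2 " someone is making music. "?'
    # before tokenization.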
    def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build a training sample.

        step 1. Build the instruction for text classification using `_build_instruction`.
        step 2. If the `label` is not text, map it to text using `label2ans`.
        step 3. Tokenize the label data.
        step 4. Concatenate the instruction and label tokens as the target item.
            - pad the instruction tokens of the target item to build `target`.
            - remove the eos token from the target item to build `prev_output_tokens`.
        step 5. Add the constraint mask.

        Args:
            data (`Dict[str, Any]`): Input data, which should contain the keys `text`,
                `text2` and `label`. `text` and `text2` each refer to a text input, and
                the goal of this task is to decide whether or not `text` implies
                `text2`; `label` is the supervision used for training.
        Return:
            A dict object that contains the source text input, the target tokens, the
            previous output tokens and the constraint mask.
        """
        instruction_itm = self._build_instruction(data)
        assert 'label' in data, 'there must be a `label` column in the train phase'
        label = data['label']
        if self.label2ans:
            label = self.label2ans[label]
        label_itm = self.tokenize_text(f' {label}', add_bos=False)
        if self.prompt_type == 'none':
            target_itm = label_itm
        elif self.prompt_type == 'prev_output':
            target_itm = torch.cat([instruction_itm[1:-1], label_itm])
        else:
            raise NotImplementedError
        prev_output_itm = torch.cat([self.bos_item, target_itm[:-1]])
        target_itm[:-len(label_itm)] = self.pad_item
        sample = {
            'source': instruction_itm,
            'target': target_itm,
            'prev_output_tokens': prev_output_itm,
        }
        self.add_constraint_mask(sample)
        return sample
    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build an inference sample.

        step 1. Build the instruction for text classification using `_build_instruction`.
        step 2. Decide whether or not to add a `prefix_token`.
        step 3. Decide whether or not to add the `label` data.

        Args:
            data (`Dict[str, Any]`): Input data, which should contain the keys `text`
                and `text2`. Each of them refers to a text input, and the goal of this
                task is to decide whether or not `text` implies `text2`.
        Return:
            A dict object that contains the source text input, the prefix tokens and
            the label data.
        """
        instruction_itm = self._build_instruction(data)
        if self.prompt_type == 'none':
            prefix_token = []
            decoder_prompt = self.bos_item
        elif self.prompt_type == 'prev_output':
            prefix_token = instruction_itm[:-1]  # remove the eos token
            decoder_prompt = instruction_itm[:-1]
        else:
            raise NotImplementedError
        sample = {
            'source': instruction_itm,
            'prefix_token': prefix_token,
            'decoder_prompt': decoder_prompt,
        }
        if 'label' in data:
            sample['label'] = self.label2ans[data['label']]
        return sample
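

if __name__ == '__main__':
    # Illustrative sketch only; this block is not part of the original
    # ModelScope module. It reproduces two pieces of the preprocessing logic
    # standalone so the data layout can be inspected without an OFA
    # checkpoint. `max_src_length`, the example texts, and the toy token ids
    # `bos`, `eos` and `pad` below are all hypothetical values.

    # 1. The instruction template used by `_build_instruction`.
    max_src_length = 70
    data = {
        'text': 'A man is playing a guitar on stage.',
        'text2': 'A person is performing music.',
    }
    text1 = ' '.join(data['text'].lower().strip().split()[:max_src_length])
    text2 = ' '.join(data['text2'].lower().strip().split()[:max_src_length])
    print(' can text1 " {} " imply text2 " {} "?'.format(text1, text2))

    # 2. How `_build_train_sample` lays out `target` and `prev_output_tokens`
    #    when `prompt_type == 'prev_output'`, using toy token ids.
    bos, eos, pad = 0, 2, 1
    instruction_itm = torch.tensor([bos, 11, 12, 13, eos])  # bos + text + eos
    label_itm = torch.tensor([21, eos])                      # label + eos
    target_itm = torch.cat([instruction_itm[1:-1], label_itm])
    prev_output_itm = torch.cat([torch.tensor([bos]), target_itm[:-1]])
    target_itm[:-len(label_itm)] = pad  # mask the instruction part of target
    print('target:', target_itm.tolist())            # [1, 1, 1, 21, 2]
    print('prev_output:', prev_output_itm.tolist())  # [0, 11, 12, 13, 21]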