from typing import Any, Dict

import torch
from PIL import Image
from torchvision import transforms

from modelscope.preprocessors.image import load_image
from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor


class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor):
    r"""
    OFA preprocessor for visual entailment tasks.
    """

    def __init__(self, cfg, model_dir, mode=ModeKeys.INFERENCE, *args,
                 **kwargs):
        """Preprocess the data.

        Args:
            cfg (`modelscope.utils.config.ConfigDict`): model config
            model_dir (`str`): model path
            mode: preprocessor mode (model mode)
        """
        super(OfaVisualEntailmentPreprocessor,
              self).__init__(cfg, model_dir, mode, *args, **kwargs)
        # Convert to RGB, resize to the square patch size expected by the
        # vision encoder, then normalize with the statistics provided by the
        # base preprocessor.
        self.patch_resize_transform = transforms.Compose([
            lambda image: image.convert('RGB'),
            transforms.Resize(
                (self.patch_image_size, self.patch_image_size),
                interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=self.mean, std=self.std),
        ])
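
    # A minimal sketch of what ``patch_resize_transform`` produces, assuming
    # a preprocessor instance ``p`` and a patch size of 480 (the real value
    # comes from the model config, so 480 is only an illustrative assumption):
    #
    #     from PIL import Image
    #     img = Image.open('example.jpg')        # hypothetical local file
    #     patch = p.patch_resize_transform(img)  # normalized float tensor
    #     print(patch.shape)                     # torch.Size([3, 480, 480])
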
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        if self.mode == ModeKeys.TRAIN:
            return self._build_train_sample(data)
        else:
            return self._build_infer_sample(data)

    def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        r"""
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data is in the result.
        step 2. Preprocess the label data to generate the `target` and
            `prev_output_tokens`.
            - tokenize the label data.
            - calculate the target item:
                1) if `prompt_type` is `none`, use the tokenized label data;
                2) if `prompt_type` is `src`, concatenate the `source` data
                   and the tokenized label data;
                3) if `prompt_type` is `prev_output`, concatenate the `source`
                   data without its eos token and the tokenized label data.
        step 3. Add the constraint mask.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image`, `text` and a label column; `text2` is optional.
        Return:
            A dict object, containing the source text input, patch image,
            patch mask with `Tensor([True])` value, decoder prompt, label,
            target, previous output tokens and constraint mask.
        """
        sample = self._build_infer_sample(data)
        target = ' {}'.format(sample['label'])
        sample['ref_dict'] = {sample['label']: 1.0}
        tgt_item = self.tokenize_text(target, add_bos=False, add_eos=False)
        if self.prompt_type == 'none':
            prev_output_item = torch.cat([self.bos_item, tgt_item])
            target_item = torch.cat([prev_output_item[1:], self.eos_item])
        elif self.prompt_type == 'src':
            prev_output_item = torch.cat([sample['source'], tgt_item])
            target_item = torch.cat([prev_output_item[1:], self.eos_item])
        elif self.prompt_type == 'prev_output':
            prev_output_item = torch.cat([sample['source'][:-1], tgt_item])
            target_item = torch.cat([prev_output_item[1:], self.eos_item])
        else:
            raise NotImplementedError
        # Positions that belong to the prompt are padded out of the loss.
        target_item[:-len(tgt_item) - 1] = self.tokenizer.pad_token_id
        sample['target'] = target_item
        sample['prev_output_tokens'] = prev_output_item
        if self.constraint_trie is not None:
            # For each label position, mark which vocabulary ids the trie
            # allows given the label tokens decoded so far.
            constraint_mask = torch.zeros(
                (len(target_item), len(self.tgt_dict))).bool()
            start_idx = len(target_item) - len(tgt_item) - 1
            for i in range(start_idx, len(target_item)):
                constraint_prefix_token = [
                    self.tgt_dict.bos()
                ] + target_item[start_idx:i].tolist()
                constraint_nodes = self.constraint_trie.get_next_layer(
                    constraint_prefix_token)
                constraint_mask[i][constraint_nodes] = True
            sample['constraint_mask'] = constraint_mask
        return sample
    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        r"""
        Building inference samples.

        step 1. Preprocess the image as the model's image input.
            - get the pillow image input from `data`.
            - apply transforms to the pillow image, such as resize,
              normalize, etc.
        step 2. Build the instruction as the model's source text input.
            - use the text input to build the instruction. So far, two kinds
              of input forms are supported:
                1) only `text` in the input data. This setting solves tasks
                   which judge whether or not the input `text` describes the
                   input image.
                2) both `text` and `text2` in the input data. This setting
                   solves tasks which judge whether or not `text` together
                   with the input image implies `text2`.
            - tokenize the instruction above.
        step 3. Calculate the decoder prompt input.
        step 4. Add the label data when it is present.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image` and `text`; `text2` and `label` are optional.
        Return:
            A dict object, containing the source text input, patch image,
            patch mask with `Tensor([True])` value, decoder prompt and label.
        """
        image = self.get_img_pil(data[self.column_map['image']])
        patch_image = self.patch_resize_transform(image)
        if 'text2' not in data:
            hypothesis = self.pre_caption(data[self.column_map['text']],
                                          self.max_src_length)
            prompt = self.cfg.model.get('prompt',
                                        ' does the image describe " {} "?')
            text = prompt.format(hypothesis)
        else:
            assert 'text' in data, f'text must be in the input {data.keys()}'
            caption = self.pre_caption(data[self.column_map['text']],
                                       self.max_src_length)
            hypothesis = self.pre_caption(data[self.column_map['text2']],
                                          self.max_src_length)
            prompt = self.cfg.model.get(
                'prompt', ' can image and text1 " {} " imply text2 " {} "?')
            text = prompt.format(caption, hypothesis)
        inputs = self.tokenize_text(text)
        if self.prompt_type == 'none':
            prefix_token = []
            decoder_prompt = self.bos_item
        elif self.prompt_type == 'prev_output':
            prefix_token = inputs[:-1]  # drop the eos token
            decoder_prompt = inputs[:-1]
        else:
            raise NotImplementedError
        sample = {
            'source': inputs,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True]),
            'prefix_token': prefix_token,
            'decoder_prompt': decoder_prompt,
        }
        if 'relation' in self.column_map and self.column_map[
                'relation'] in data:
            sample['label'] = data[self.column_map['relation']]
        return sample
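

# A hedged usage sketch (comments only, not executed): the construction
# arguments and data keys below follow the docstrings above, but the config,
# model directory and image path are placeholders, and the data keys assume
# an identity ``column_map``.
#
#     from modelscope.utils.constant import ModeKeys
#     p = OfaVisualEntailmentPreprocessor(cfg, model_dir,
#                                         mode=ModeKeys.INFERENCE)
#     sample = p({
#         'image': 'example.jpg',
#         'text': 'there is a dog',          # premise text
#         'text2': 'an animal is present',   # optional hypothesis
#     })
#     # sample keys: source, patch_image, patch_mask, prefix_token,
#     # decoder_prompt (+ label when a mapped relation column is present)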