from typing import Any, Dict

import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from modelscope.preprocessors.image import load_image
from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor
from .utils import transforms as T


class OfaVisualGroundingPreprocessor(OfaBasePreprocessor):
    """
    OFA preprocessor for visual grounding tasks.
    """

    def __init__(self, cfg, model_dir, mode=ModeKeys.INFERENCE, *args,
                 **kwargs):
        """Preprocess the data.

        Args:
            cfg (modelscope.utils.config.ConfigDict): model config
            model_dir (str): model path
            mode: preprocessor mode (model mode)
        """
        super(OfaVisualGroundingPreprocessor,
              self).__init__(cfg, model_dir, mode, *args, **kwargs)
        self.num_bins = self.cfg.model.get('num_bins', 1000)
        if self.mode == ModeKeys.TRAIN:
            # Training uses box-aware transforms so that the ground-truth
            # region stays aligned with the resized image.
            self.positioning_transform = T.Compose([
                T.RandomResize([self.patch_image_size],
                               max_size=self.patch_image_size),
                T.ToTensor(),
                T.Normalize(
                    mean=self.mean,
                    std=self.std,
                    max_image_size=self.max_image_size)
            ])
        else:
            # Inference transforms only the image; the resize ratios are
            # returned with the sample so that predicted boxes can be mapped
            # back to the original image size.
            self.patch_resize_transform = transforms.Compose([
                lambda image: image.convert('RGB'),
                transforms.Resize(
                    (self.patch_image_size, self.patch_image_size),
                    interpolation=transforms.InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.mean, std=self.std),
            ])

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        if self.mode == ModeKeys.TRAIN:
            return self._build_train_sample(data)
        else:
            return self._build_infer_sample(data)

    def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Building training samples.

        step 1. Preprocess the image input for the model's image input.
            - get the pillow image.
            - calculate the target boxes that locate the exact area in the
              pillow image described by the input text, using the input
              `region_coord`. In the training setting, `region_coord` is
              label data.
            - turn the target image into patch images and apply transforms
              such as resize, normalize, etc.
        step 2. Preprocess the text input for the model's source text input.
            - preprocess the text input with the function `pre_caption`.
            - build the instruction. The default instruction is
              ` which region does the text " {} " describe?`, where `{}`
              refers to the text input.
            - tokenize the instruction as the source text input.
        step 3. Preprocess the patch image boxes for the model's target text
            input.
            - quantize the coordinates of the selected patch images.
            - concatenate the quantization results with blanks.
            - tokenize the result above as the target text input.
        step 4. Get the previous output tokens from the target item without
            the eos token.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image`, `text` and `region_coord`.
        Return:
            A dict object containing the source text input, patch image,
            patch mask with `Tensor([True])` value, target, previous output
            tokens, width scale ratio, height scale ratio and region
            coordinate.
        """
        image = self.get_img_pil(data[self.column_map['image']])
        w, h = image.size
        boxes_target = {
            'boxes': [],
            'labels': [],
            'area': [],
            'size': torch.tensor([h, w])
        }
        x0, y0, x1, y1 = data[self.column_map['region_coord']].strip().split(
            ',')
        region = torch.tensor([float(x0), float(y0), float(x1), float(y1)])
        boxes_target['boxes'] = torch.tensor(
            [[float(x0), float(y0), float(x1), float(y1)]])
        boxes_target['labels'] = np.array([0])
        boxes_target['area'] = torch.tensor(
            [(float(x1) - float(x0)) * (float(y1) - float(y0))])
        # Resize the image together with its box so both stay aligned.
        patch_image, patch_boxes = self.positioning_transform(
            image, boxes_target)
        resize_h, resize_w = patch_boxes['size'][0], patch_boxes['size'][1]
        # Quantize each normalized box coordinate into one of `num_bins`
        # discrete location tokens.
        quant_x0 = '<bin_{}>'.format(
            int((patch_boxes['boxes'][0][0] * (self.num_bins - 1)).round()))
        quant_y0 = '<bin_{}>'.format(
            int((patch_boxes['boxes'][0][1] * (self.num_bins - 1)).round()))
        quant_x1 = '<bin_{}>'.format(
            int((patch_boxes['boxes'][0][2] * (self.num_bins - 1)).round()))
        quant_y1 = '<bin_{}>'.format(
            int((patch_boxes['boxes'][0][3] * (self.num_bins - 1)).round()))
        region_coord = '{} {} {} {}'.format(quant_x0, quant_y0, quant_x1,
                                            quant_y1)
        src_caption = self.pre_caption(data[self.column_map['text']],
                                       self.max_src_length)
        prompt = self.cfg.model.get(
            'prompt', ' which region does the text " {} " describe?')
        text = prompt.format(src_caption)
        src_item = self.tokenize_text(text)
        target_item = self.tokenize_text(region_coord, add_bos=False)
        # Previous output tokens: bos followed by the target without eos.
        prev_output_item = torch.cat([self.bos_item, target_item[:-1]])
        sample = {
            'source': src_item,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True]),
            'target': target_item,
            'prev_output_tokens': prev_output_item,
            'w_resize_ratio': resize_w / w,
            'h_resize_ratio': resize_h / h,
            'region_coord': region
        }
        return sample
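
    # A minimal sketch (illustration only, not from the original file) of the
    # coordinate quantization in `_build_train_sample`: a box coordinate
    # normalized to [0, 1] is snapped to one of `num_bins` discrete location
    # tokens; the names `quantize` and `coord` are hypothetical.
    #
    #     def quantize(coord: float, num_bins: int = 1000) -> str:
    #         return '<bin_{}>'.format(int(round(coord * (num_bins - 1))))
    #
    # e.g. quantize(0.25) == '<bin_250>' with the default 1000 bins, so a box
    # becomes a short sequence like '<bin_12> <bin_40> <bin_250> <bin_301>'
    # that the decoder can emit autoregressively.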
    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Building inference samples.

        step 1. Preprocess the image input for the model's image input.
            - get the pillow image from the data.
            - apply transforms to the pillow image, such as resize,
              normalize, etc.
        step 2. Preprocess the text input for the model's text input.
            - preprocess the text input with the function `pre_caption`.
            - build the instruction. The default instruction is
              ` which region does the text " {} " describe?`, where `{}`
              refers to the text input.
            - tokenize the instruction as the source text input.
        step 3. Optionally add label data, which refers to a region
            coordinate in this task.

        Args:
            data (`Dict[str, Any]`): Input data, should contain the keys
                `image` and `text`.
        Return:
            A dict object containing the source text input, patch image,
            patch mask with `Tensor([True])` value, width scale ratio,
            height scale ratio and label.
        """
        image = self.get_img_pil(data[self.column_map['image']])
        w, h = image.size
        patch_image = self.patch_resize_transform(image)
        w_resize_ratio = torch.tensor(self.patch_image_size / w)
        h_resize_ratio = torch.tensor(self.patch_image_size / h)
        src_caption = self.pre_caption(data[self.column_map['text']],
                                       self.max_src_length)
        prompt = self.cfg.model.get(
            'prompt', ' which region does the text " {} " describe?')
        text = prompt.format(src_caption)
        src_item = self.tokenize_text(text)
        sample = {
            'source': src_item,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True]),
            'w_resize_ratio': w_resize_ratio,
            'h_resize_ratio': h_resize_ratio,
        }
        # Attach the ground-truth region as a label when it is provided.
        if 'region_coord' in self.column_map and self.column_map[
                'region_coord'] in data:
            x0, y0, x1, y1 = data[self.column_map['region_coord']].strip(
            ).split(',')
            sample['label'] = [float(x0), float(y0), float(x1), float(y1)]
        return sample
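

# A minimal usage sketch (illustration only, not from the original file). The
# checkpoint path and input values are hypothetical, and `cfg` is assumed to
# carry the fields the base preprocessor reads (`patch_image_size`, `mean`,
# `std`, `max_src_length`, ...).
#
#     from modelscope.utils.config import Config
#     from modelscope.utils.constant import ModeKeys
#
#     model_dir = '/path/to/ofa_visual-grounding'
#     cfg = Config.from_file(f'{model_dir}/configuration.json')
#     preprocessor = OfaVisualGroundingPreprocessor(
#         cfg, model_dir, mode=ModeKeys.INFERENCE)
#     sample = preprocessor({
#         'image': f'{model_dir}/demo.jpg',
#         'text': 'a yellow taxi on the left',
#     })
#     # sample['source'] holds the tokenized prompt and sample['patch_image']
#     # the normalized image tensor; the resize ratios map predicted boxes
#     # back to the original image size.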