
    9iw                        S SK Jr  S SKrS SKJr  S SKJrJrJ	r	J
r
Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKJrJrJrJr  S S	KJr  S S
K J!r!  S SK"J#r#  S SK$J%r%J&r&J'r'J(r(J)r)  S SK*J+r+  S SK,J-r-  S SK.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5  SSK6J7r7  SSK87  SSK9J:r:  SSK;J<r<  / SQr=\7R|                  " \/R~                  \!R                  S9 " S S\55      5       rA\7R|                  " \/R~                  \!R                  S9 " S S\55      5       rCS rD\7R|                  " \/R~                  \!R                  S9 " S S\55      5       rF\7R|                  " \/R~                  \!R                  S9 " S S \55      5       rH\7R|                  " \/R~                  \!R                  S9 " S! S"\55      5       rJ\7R|                  " \/R~                  \!R                  S9 " S# S$\55      5       rL\7R|                  " \/R~                  \!R                  S9 " S% S&\55      5       rN\7R|                  " \/R~                  \!R                  S9 " S' S(\55      5       rPg))    N)BytesIO)AnyDictListTupleUnion)Image)create_transform
transforms)ImageFolder)Compose	NormalizeResizeToTensor)snapshot_download)Preprocessors)Input)VCenterCropVCompose
VNormalizeVRescale	VToTensor)
load_image)Config)FieldsInvokeModeKeys	ModelFileTasks   )Preprocessor)PREPROCESSORS)*)
collate_fn)OFA_TASK_KEY_MAPPING)$DiffusionImageGenerationPreprocessorOfaPreprocessorMPlugPreprocessorHiTeAPreprocessorMplugOwlPreprocessor)module_namec                   D   ^  \ rS rSrSrU 4S jrS\\\4   4S jr	Sr
U =r$ )r'   '   zPreprocessor the data with the combination of image and text.
Args:
    data: process the value as an image for keys ending with 'FILE'
        or existing in preprocessor_image_keys and pass-through the values of other keys.

c           	        > [         TU ]  " U0 UD6  UR                  SS5      U l        UR                  SS/5      U l        UR                  SS/5      U l        [        UR                  S/ 5      5      U l        UR                  SS5      U l        [        R                  " [        R                  " U R                  [        R                  R                  S	9U R                  (       a   [        R                  " U R                  5      O[        R                  " U R                  5      [        R                   " 5       [        R"                  " U R                  U R
                  5      /5      U l        g )
N
resolutioni   meang      ?std
image_keyscenter_cropTinterpolation)super__init__poppreprocessor_resolutionpreprocessor_meanpreprocessor_stdsetpreprocessor_image_keysr4   r   r   r   InterpolationModeBILINEAR
CenterCrop
RandomCropr   r   transform_input)selfargskwargs	__class__s      d/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/preprocessors/multi_modal.pyr8   -DiffusionImageGenerationPreprocessor.__init__2   s   $)&)'-zz,'D$!'FSE!: &

53% 8'*6::lB+G'H$!::mT:)11,,(::CCE  !!$">">?%/%:%:,,&.!  !7!7!%!6!68
3
 
     returnc                 <   0 nUR                  5        H  u  p4UR                  S5      (       d  X0R                  ;   a@  [        U5      nU R	                  U5      nXbUR                  SS5      R                  5       '   Mj  U(       a  UOSX#R                  5       '   M     U$ )Nz:FILE )itemsendswithr>   r   rC   replacelower)rD   dataresultskeyvalueimageimgs          rH   __call__-DiffusionImageGenerationPreprocessor.__call__F   s    **,JC||G$$/K/K(K"5)**51<?GR0668905u2		$ ' rJ   )r4   r>   r;   r:   r<   rC   )__name__
__module____qualname____firstlineno____doc__r8   r   strr   rX   __static_attributes____classcell__rG   s   @rH   r'   r'   '   s&    (	S#X 	 	rJ   r'   c                      ^  \ rS rSr\R
                  4S\4U 4S jjjrS\\	\
\	   4   S\\\4   4S jrS rS\\\\\\4   4   S\\\4   4S jrS	rU =r$ )
r(   R   	model_dirc                 .  > [         TU ]  " U0 UD6  [        R                  [        [        R
                  [        [        R                  [        [        R                  [        [        R                  [        [        R                  [        [        R                  [         [        R"                  [$        [        R&                  [(        [        R*                  [,        [        R.                  [0        [        R2                  [4        0n[6        R8                  " U5      (       a  UO'[;        U[<        R>                  [<        R@                  0S9n[B        RD                  " [6        RF                  " U[H        RJ                  5      5      U l&        XPRL                  RN                     " U RL                  XS9U l(        [R        U RL                  RN                     U l*        U RP                  RV                  U l+        URY                  SS5      (       a  SU l-        gSU l-        g)dpreprocess the data

Args:
    model_dir (str): model path
    mode: preprocessor mode (model mode)

user_agent)cfgre   mode
no_collateNTF).r7   r8   r    ocr_recognitionOfaOcrRecognitionPreprocessorimage_captioningOfaImageCaptioningPreprocessorvisual_groundingOfaVisualGroundingPreprocessorvisual_question_answering&OfaVisualQuestionAnsweringPreprocessorvisual_entailmentOfaVisualEntailmentPreprocessorimage_classification"OfaImageClassificationPreprocessortext_classification!OfaTextClassificationPreprocessortext_summarizationOfaSummarizationPreprocessortext_to_image_synthesis#OfaTextToImageSynthesisPreprocessorauto_speech_recognitionOfaASRPreprocessorsudokuOfaSudokuPreprocessortext2sqlOfaTextToSqlPreprocessorospexistsr   r   KEYPREPROCESSORr   	from_filejoinr   CONFIGURATIONrj   task
preprocessr&   keys	tokenizergetrl   )rD   re   rk   rE   rF   preprocess_mappingrG   s         rH   r8   OfaPreprocessor.__init__V   sg    	$)&)!!#@""$B""$B++2##%D&&(J%%'H$$&B))+N))+=LL/NN4
 "%I!6!6I<M6::v/B/B"C=E	##HHY	 7 78:,XX]];I:(7	22::lD))"DO#DOrJ   inputrK   c                     [        5       n[        U[        5      (       d  [        U[        5      (       d  U4n[	        U R
                  U5       H	  u  p4XBU'   M     U$ N)dict
isinstancetuplelistzipr   )rD   r   rR   rT   items        rH   _build_dictOfaPreprocessor._build_dict   sL    v%''
5$0G0GIETYY.ICI /rJ   c                 f   SU;   a  U R                   R                  R                  SS 5      S:X  a  [        US   [        5      (       a  [        US   5      nOUS   nUR                  S:w  a  UR                  S5      n[        5       nUR                  USS9  [        R                  " U5      US'   U$ )NrV   typeofaRGBJPEG)format)rj   modelr   r   r_   r   rk   convertr   saver	   open)rD   rR   rV   
img_buffers       rH   #_ofa_input_compatibility_conversion3OfaPreprocessor._ofa_input_compatibility_conversion   s    d?txx~~11&$?5H$w---"4=1WzzU"e, JJJz&J1!JJz2DMrJ   c                 l   [        U[        5      (       a  UnOU R                  U5      nU R                  U5      n[        5       nUR	                  5        H  u  px[        U5      Xg'   M     XeS'   U R                  (       a  U$ [        U/U R                  R                  U R                  R                  S9$ )Nsample)pad_idxeos_idx)r   r   r   r   rN   r_   rl   r%   r   pad_token_ideos_token_id)	rD   r   rE   rF   rR   r   str_datakvs	            rH   rX   OfaPreprocessor.__call__   s    eT""D##E*D&6JJLDAa&HK !#x??Mvh&*nn&A&A&*nn&A&AC CrJ   )rj   r   rl   r   r   )rZ   r[   r\   r]   r   	INFERENCEr_   r8   r   r   r   r   r   r   r   r   rX   r`   ra   rb   s   @rH   r(   r(   R   s     ((&$&$ &$Rud5k'9!: tCH~ CeCS#X$>? C"38nC CrJ   r(   c                 $    U R                  S5      $ )Nr   )r   )rV   s    rH   _convert_to_rgbr      s    ==rJ   c                      ^  \ rS rSr\R
                  4S\4U 4S jjjrS r SS\	\\
\   4   S\S\R                  4S jjrS	\4S
 jrS	\4S jrS\	\\\\\4   4   S\\\4   4S jrSrU =r$ )CLIPPreprocessor   re   c                 L  > [         TU ]  " U0 UD6  [        R                  " U5      (       a  UO'[	        U[
        R                  [
        R                  0S9nX l        SSK	J
n  SU;   a  [        US   U5      (       a  US   U l        O U S[        R                   3nU" US9U l        SU;   a#  [        US   [        5      (       a  US   U l        O5["        R$                  " ['        SR)                  U5      S	S
95      S   U l        U R+                  5       U l        SSS.U l        g)rg   rh   r   )FullTokenizerr   /)
vocab_filer0   z{}/vision_model_config.jsonutf-8encodingimage_resolutionrW   text)rW   r   N)r7   r8   r   r   r   r   r   r   rk   1modelscope.models.multi_modal.clip.bert_tokenizerr   r   r   r   
VOCAB_FILEintr   jsonloadr   r   _build_image_transformimg_preprocess
input_keys)rD   re   rk   rE   rF   r   r   rG   s          rH   r8   CLIPPreprocessor.__init__   s    	$)&)!$I!6!6I<M6::v/B/B"C=E		S& Z{0C0=&? &?#K0DN%;a	(<(<'=>J*jADN6!j1Es&K&K$*<$8D!$(II188C$&%' (:%;D! #99; #(8rJ   c                 |   U R                   [        R                  :X  aL  [        U R                  SSS SSSSS9n[        UR                  S S [        /-   UR                  SS  -   5      nU$ [        [        U R                  U R                  4[        R                  S	9[        [        5       [        SS5      /5      nU$ )
N)g?g      ?Toriginalbicubicg3<4'?gwgM?gy{ ?gB91?gwt.?g	U?)
input_sizescaleis_trainingcolor_jitterauto_augmentr6   r1   r2   r5   )rk   r   TRAINr
   r   r   r   r   r   r	   BICUBICr   r   )rD   	transforms     rH   r   'CLIPPreprocessor._build_image_transform   s    99&(00  !''88	I  	 4 4Sb 9_<M M"+"6"6rs";!< =I   --t/D/DE%*]]4
=>@! I rJ   textscontext_lengthrK   c           	      8   [        U[        5      (       a  U/n/ nU H  nUR                  U R                  R                  S   /U R                  R                  U R                  R                  U5      5      SUS-
   -   U R                  R                  S   /-   5        M     [        R                  " [        U5      U[        R                  S9n[        U5       H;  u  pg[        U5      U::  d   e[        R                  " U5      XVS[        U5      24'   M=     U$ )a  
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
    An input string or a list of input strings to tokenize
context_length : int
    The context length to use; all baseline models use 24 as the context length
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
z[CLS]N   z[SEP])dtype)r   r_   appendr   vocabconvert_tokens_to_idstokenizetorchzeroslenlong	enumeratetensor)rD   r   r   
all_tokensr   resultitokenss           rH   r   CLIPPreprocessor.tokenize   s    eS!!GE
D%%g./..66NN++D133FNQ4FHH >>''0123  S_nEJJO":.IAv;.000&+ll6&:Fls6{l?# / rJ   new_keyc                      XR                   S'   g )NrW   r   rD   r   s     rH   set_input_img_key"CLIPPreprocessor.set_input_img_key  s    !(rJ   c                      XR                   S'   g )Nr   r   r   s     rH   set_input_text_key#CLIPPreprocessor.set_input_text_key  s    ")rJ   r   c           	         0 nU R                   S   nXQ;   Ga7  X   Gb1  X   n[        U[        R                  5      (       a!  U R                  U5      R	                  S5      nO[        U[
        5      (       a  [        U Vs/ s H  n[        U[        R                  5      PM     sn5      (       a6  [        R                  " U Vs/ s H  nU R                  U5      PM     snSS9nOeU Vs/ s H/  n[        U[        R                  5      (       a  M$  [        U5      PM1     snS   n	[        SU	 35      e[        S[        U5       35      eXtS'   U R                   S   n
X;   a  X   b  X   n[        U[        5      (       a  U R                  U5      nO[        U[
        5      (       a  [        U Vs/ s H  n[        U[        5      PM     sn5      (       a  U R                  U5      nO[U Vs/ s H%  n[        U[        5      (       a  M  [        U5      PM'     snS   n	[        SU	 35      e[        S[        U5       35      eXS'   U$ s  snf s  snf s  snf s  snf s  snf )	NrW   r   dimzfimg should be PIL.Image or List[PIL.Image],                             but got a List containing one z4img should be PIL.Image or List[PIL.Image], but got r   z?text should be str or List[str], but got a List containing one z)text should be str or List[str], but got )r   r   r	   r   	unsqueezer   allr   stackr   	TypeErrorr_   r   )rD   r   rE   rF   outputinput_img_keyimage_inputimage_tensorelemunsupported_elem_typeinput_text_key
text_inputtext_tensors                rH   rX   CLIPPreprocessor.__call__  sq   .!e&:&F.K +u{{33#22;?II!LK..$/1$/D #45$/1 2 2#(;;%02%0T ,,T2%02$L 0;-/:t)$< #T
{- -) $;;P:QS   J4P[K\J]^  )5M 0"u'<'H.J *c**"mmJ7J--*E*$
4-*EFF"&--
";K 0:-/9t)$4 #T
z- -) $YZoYpq 
  ?Z@P?QR  )6Nc12-2 F-s*    $I
I*#II1I-I
I)r   r   r   rk   r   )4   )rZ   r[   r\   r]   r   r   r_   r8   r   r   r   r   r   
LongTensorr   r   r   r   r   r   rX   r`   ra   rb   s   @rH   r   r      s     (("9"9 "9H8 (* c49n- !$ .3.>.> D) )*# *>eCS#X$>? >"38n> >rJ   r   c                      ^  \ rS rSr\R
                  S4S\S\S\4U 4S jjjr\	S 5       r
\	S 5       rS	\S
\\R                  \4   4S jrS\\R                  \\\\4   4   S
\\\4   4S jrSrU =r$ )r)   iV     re   rk   tokenizer_max_lengthc                 t   > [         TU ]  " U0 UD6  Xl        X l        X0l        S U l        S U l        0 U l        g r   )r7   r8   re   rk   r  
_tokenizer_patch_resize_transform
_image_maprD   re   rk   r  rE   rF   rG   s         rH   r8   MPlugPreprocessor.__init__Z  s=     	$)&)"	$8!'+$rJ   c                     SSK Jn  U R                  c   UR                  U R                  5      U l        U R                  $ Nr   )BertTokenizertransformersr  r  from_pretrainedre   rD   r  s     rH   r   MPlugPreprocessor.tokenizeri  /    .??"+;;DNNKDOrJ   c           	         U R                   c  SSKJn  SSKJnJn  UR                  [        R                  " U R                  U5      5      nSnSnUR                  " UR                  " UR                  UR                  4[        R                  S9UR                  " 5       UR                   " XVS9/5      U l         U R                   $ )Nr   r   )CONFIG_NAMEMPlugConfigr   r   r5   r1   r2   )r  torchvisionr   #modelscope.models.multi_modal.mplugr  r   from_yaml_filer   r   re   r   r   	image_resr	   r   r   r   )rD   r   r  r   configr1   r2   s          rH   patch_resize_transform(MPlugPreprocessor.patch_resize_transformq      ''/.T //57F 7D6C+5+=+=!!6#3#3V5E5E"F05?##%$$$8	? ,D( +++rJ   pathrK   c                     XR                   ;  a/  [        U R                   5      n[        U5      U4U R                   U'   U R                   U   $ r   r  r   r   rD   r*  indexs      rH   
image_openMPlugPreprocessor.image_open  A    &(E%/%5u$=DOOD!t$$rJ   rR   c                    [         R                  " [        R                  " U R                  [
        R                  5      5      U l        [        U[        R                  [        45      (       a  UnO [        U[        5      (       a  US   nOUS   nSn[        U[        5      (       a  U R                  U5      u  p#UR                  S5      nU R                  U5      nU R                  R                  [         R"                  :X  a  SO#U[        U[        5      (       a  SO	SU;   a  SOS   nU R%                  UR'                  5       SS	U R(                  S
S9nU R*                  [,        R.                  :X  a  [0        R2                  " U/SS9nX$S.$ US   nU R%                  USS	U R(                  S
S9nUUR4                  R7                  5       UR8                  R7                  5       UR4                  R7                  5       UR8                  R7                  5       S.nU R                  R                  [         R:                  :X  a  X6S'   U$ )Nr   rV   r   rM   r!   r   question
max_lengthTptpadding
truncationr4  return_tensorsr   )rV   r3  answer)rV   question_input_idsquestion_attention_maskanswer_input_idsanswer_attention_maskr.  )r   r   r   r   re   r   r   rj   r   r	   r_   r   r/  r   r'  r   r    ro   r   rQ   r  rk   r   r   r   r   	input_idssqueezeattention_maskimage_text_retrieval)rD   rR   rV   r.  r3  r:  r   s          rH   rX   MPlugPreprocessor.__call__  s    ##HHT^^Y%<%<=? dU[[#.//Ee$$GEMEeS!!??51LEe$++E2%*@*@@2:dE22a&,nF*G 	 >>NN 00 " ! 99***KKQ/E"99(^F^^$44# $ %F &.&8&8&@&@&B+3+B+B+J+J+L$*$4$4$<$<$>)/)>)>)F)F)HF xx}} : ::"'wMrJ   )r  r  r  rj   rk   re   r  )rZ   r[   r\   r]   r   r   r_   r   r8   propertyr   r'  r   r	   r/  r   r   r   r   rX   r`   ra   rb   s   @rH   r)   r)   V  s     &//-/ (+    , ,&%s %uU[[#-='> %/ekk5"38n- ./26sCx./ /rJ   r)   c                   t   ^  \ rS rSr\R
                  4S\S\4U 4S jjjrS\\\	4   S\\\	4   4S jr
SrU =r$ )	VLDocPreprocessori  re   rk   c           	        > [         TU ]  " U0 UD6  Xl        X l        [        R
                  " US5      n[        USSS9 n[        R                  " U5      nSSS5        SSK	J
n  [        R
                  " U[        R                  5      n	UR                  U	5      U l        SSKJn
Jn  U" S	S	WS
   S   US
   S   S.S	SS9U l        U
" US   US   U R$                  U R                  US
   S   US
   S   S9U l        g! , (       d  f       N= f)zPreprocess data for the model `VLDocForDocVLEmbedding`.

Args:
    model_dir (str): model path in model hub.
    mode (str): model mode, in ('train', 'eval', 'inference').
zconfig.jsonrr   r   Nr   )VLDocXLMTokenizer)	ProcessorImageProcessorT
image_sizer!   )heightwidthF)do_preprocess	do_resizerL  do_normalize	apply_ocrmax_seq_lengthmax_block_num)rS  rT  img_processorr   rN  rM  )r7   r8   re   rk   r   r   r   r   r   0modelscope.models.multi_modal.vldoc.tokenizationrI  r   TOKENIZER_FOLDERr  r   .modelscope.models.multi_modal.vldoc.processingrJ  rK  img_procproc)rD   re   rk   rE   rF   model_cfg_pathf	model_cfgrI  tokenizer_pathrJ  rK  rG   s               rH   r8   VLDocPreprocessor.__init__  s    	$)&)"	)];.#8A		!I 9 	W)Y-G-GH*::>J\&#L1!4"<03  $%56#O4--nnL)!,\*1-
	# 98s    C==
Dr   rK   c                     / nUS    HD  n[        US5       n[        R                  " U5      nUS   nUR                  U5        SSS5        MF     US   US.nU R                  " S0 UD6n	U	$ ! , (       d  f       Ms  = f)z
Args:
    input: {
        'images': ['img_path1', 'img_path2', ...],
        'ocr_info_paths': ['json_path1', 'json_path2', ...]
    }
Return:
    encodings: Dict[str, Tensor]
ocr_info_pathsrH  formNimages)rc  	ocr_infos )r   r   r   r   rZ  )
rD   r   rE   rF   rd  one_ocr_info_pathr\  ocr_info
proc_input	encodingss
             rH   rX   VLDocPreprocessor.__call__  s     	!&'7!8'-99Q<#F+  * .- "9 !&hiH
II+
+	 .-s   -A,,
A;	)rY  rk   re   rZ  r   )rZ   r[   r\   r]   r   r   r_   r8   r   r   rX   r`   ra   rb   s   @rH   rF  rF    sR     &//)
)
)
 )
Vd38n "38n rJ   rF  c                     ^  \ rS rSr\R
                  S4S\S\S\4U 4S jjjr\	S 5       r
\	S 5       r\	S	 5       rS
\S\\R                  \4   4S jrS\S\S\\   4S jrS\\R                  \\\\4   4   S\\\4   4S jrSrU =r$ )r*   i  r  re   rk   r  c                    > [         TU ]  " U0 UD6  Xl        X l        X0l        S U l        S U l        S U l        0 U l        g r   )	r7   r8   re   rk   r  r  r  _num_frames
_video_mapr  s         rH   r8   HiTeAPreprocessor.__init__	  sE     	$)&)"	$8!'+$rJ   c                     SSK Jn  U R                  c   UR                  U R                  5      U l        U R                  $ r  r  r  s     rH   r   HiTeAPreprocessor.tokenizer  r  rJ   c           	         U R                   c  SSKJn  SSKJnJn  UR                  [        R                  " U R                  U5      5      nSnSnUR                  " UR                  " UR                  UR                  4[        R                  S9UR                  " 5       UR                   " XVS9/5      U l         U R                   $ )Nr   r   r  HiTeAConfigr   r   r5   r!  )r  r"  r   r#  r  rt  r$  r   r   re   r   r   r%  r	   r   r   r   )rD   r   r  rt  r&  r1   r2   s          rH   r'  (HiTeAPreprocessor.patch_resize_transform!  r)  rJ   c                     U R                   cO  SSKJn  SSKJnJn  UR                  [        R                  " U R                  U5      5      nUR                  U l         U R                   $ )Nr   r   rs  )rm  r"  r   r#  r  rt  r$  r   r   re   
num_frames)rD   r   r  rt  r&  s        rH   rw  HiTeAPreprocessor.num_frames5  sR    #.T //57F  &00DrJ   r*  rK   c                     XR                   ;  aN  [        U R                   5      n[        R                  " U[        R                  " S5      S9nX24U R                   U'   U R                   U   $ )Nr   )ctx)rn  r   decordVideoReadercpu)rD   r*  r.  vrs       rH   
video_openHiTeAPreprocessor.video_openA  sS    &(E##Dfjjm<B%'KDOOD!t$$rJ   rw  vlenc                 l   [        X5      n[        R                  " SX#S-   S9R                  [        5      n/ n[        US S 5       H   u  pgUR                  XtUS-      S-
  45        M"     U Vs/ s H  oS   US   -   S-  PM     n	n[        U	5      U:  a  U	S   /U-  n
XS [        U	5      & U
n	U	$ s  snf )Nr   r!   )startstopnumr   )minnplinspaceastyper   r   r   r   )rD   rw  r  acc_samples	intervalsrangesidxintervxframe_indicespadded_frame_indicess              rH   sample_framesHiTeAPreprocessor.sample_framesH  s    *+KK$!O55;VC[ 	$Ys^4KCMM6S1W#5#9:; 5 6<<VA$1+!+V<}
*$1"$5#6#C 8E!4#m"450M =s   /B1rR   c                 |   [         R                  " [        R                  " U R                  [
        R                  5      5      U l        [        U[        R                  [        45      (       a  UnO [        U[        5      (       a  US   nOUS   nSn[        U[        5      (       a  U R                  U5      u  p#U R                  U R                  [!        U5      5      nUR#                  S5        [$        R&                  " UR)                  U5      R+                  5       5      nUR-                  5        Vs/ s H(  nU R/                  [0        R2                  " U5      5      PM*     nn[$        R4                  " USS9nU R                  R6                  [8        R:                  :X  a  SO#U[        U[        5      (       a  SO	SU;   a  SOS   nU R=                  UR?                  5       SS	U R@                  S
S9nU RB                  [D        RF                  :X  a  [$        R4                  " U/SS9nX&S.$ US   nU R=                  USS	U R@                  S
S9nUURH                  RK                  5       URL                  RK                  5       URH                  RK                  5       URL                  RK                  5       S.nU$ s  snf )Nr   videor   rM   r!   r   r3  r4  Tr5  r6  )r  r3  r:  )r  r;  r<  r=  r>  )'r   r   r   r   re   r   r   rj   r   r{  r|  r_   r   r  r  rw  r   seekr   
from_numpy	get_batchasnumpynumpyr'  r	   	fromarrayr   r   r    video_captioningr   rQ   r  rk   r   r   r?  r@  rA  )	rD   rR   r  r.  r  r\  r3  r:  r   s	            rH   rX   HiTeAPreprocessor.__call__Y  sT    ##HHT^^Y%<%<=? dV//566Ee$$GEMEeS!!??51LE**4??CJG

1  !?!G!G!IJ [[]
" ''(:;" 	 
 Eq)%*@*@@2:dE22a&,nF*G 	 >>NN 00 " ! 99***KKQ/E"99(^F^^$44# $ %F &.&8&8&@&@&B+3+B+B+J+J+L$*$4$4$<$<$>)/)>)>)F)F)HF MC
s   6/J9)rm  r  r  rn  rj   rk   re   r  )rZ   r[   r\   r]   r   r   r_   r   r8   rD  r   r'  rw  r   r{  r|  r  r   r  r   r   r   r   rX   r`   ra   rb   s   @rH   r*   r*     s     &//-/ (+     , ,& 	  	 %s %uV-?-?-D'E % 3 49 "3&,,esCx.) *3.238n3 3rJ   r*   c                      ^  \ rS rSr\R
                  4S\S\4U 4S jjjr\S 5       r	\S 5       r
S\S\\R                  \4   4S	 jrS
\S\\   4S jrS\\\\   4   S\4S jrS\\\4   S\\\4   4S jrSrU =r$ )r+   i  re   rk   c                 z   > [         TU ]  " U0 UD6  Xl        X l        S U l        S U l        SS0U l        0 U l        g )N	<|image|>A   )r7   r8   re   rk   r  r  media_tokenr  )rD   re   rk   rE   rF   rG   s        rH   r8   MplugOwlPreprocessor.__init__  sD    
 	$)&)"	'+$',rJ   c                     SSK Jn  U R                  c   UR                  U R                  5      U l        U R                  $ )Nr   )LlamaTokenizer)modelscope.models.nlp.llamar  r  r  re   )rD   r  s     rH   r   MplugOwlPreprocessor.tokenizer  s/    >??",<<T^^LDOrJ   c           	          U R                   c^  SSKJn  SnSnUR                  " UR                  " S[
        R                  S9UR                  " 5       UR                  " X#S9/5      U l         U R                   $ )Nr   r   r   r   )   r  r5   r!  )	r  r"  r   r   r   r	   r   r   r   )rD   r   r1   r2   s       rH   r'  +MplugOwlPreprocessor.patch_resize_transform  so    ''/.6D6C+5+=+=!!*EMMJ##%$$$8? ,D(
 +++rJ   r*  rK   c                     XR                   ;  a/  [        U R                   5      n[        U5      U4U R                   U'   U R                   U   $ r   r,  r-  s      rH   r/  MplugOwlPreprocessor.image_open  r1  rJ   r   c           	      $   [        U R                  R                  5       5       VVs0 s H  u  p#U[        US-   5      * _M     nnnU R                  R	                  5       nU R
                  R                  /nUR                  5        Vs/ s H  owU;  PM	     nn[        U5      (       a  UU R                  USS9S   -   n	U	$ Un	SR                  [        [        R                  [        UR                  5       5      5      5      n
[        R                  " SU
 S3U5      nU Vs/ s H  n[        U5      S:  d  M  UPM     nn[        U5       H0  u  pX;   a  XU   /X^   -  -  n	M  U R                  USS9S   nX-  n	M2     U	$ s  snnf s  snf s  snf )	Nr!   F)add_special_tokensr?  |()r   )r   r  r   r   copyr   bos_token_idr   r   mapreescaper   splitr   )rD   r   r   r   media_tokensmedia_lengthsprompt_chunkr  	condition	enc_chunkpattern
chunk_strsr  r  	chunk_str	tmp_chunks                   rH   tokenize_text"MplugOwlPreprocessor.tokenize_text  s    "$"2"2"7"7"9:
: AE
{N: 	 
 ((--/334 8D7H7H7J
7Jt#7J 	 
 y>>$t>{KLI"  %Ihhs299d<3D3D3F.GHIGAgYa.$7J%/>Z3q6A:!ZJ>"+J"7,y"9!:%0"1 1I !%!e !/ !==H!JI*I #8 =

 ?s   F	F"F9Fmessagesc                    / n/ nUS   nU H  nUS   S:X  a  SnOUS   S:X  a  SnOSn[        US   [        5      (       a  U US    3nUR                  U5        MQ  US    HI  n[        U[        5      (       a  U U 3nOU S	3nUR                  US
   5        UR                  U5        MK     M     SR                  U5      nUS-  nX24$ )Nr  rolesystemrM   userzHuman: zAI: contentr  rV   
z
AI: )r   r_   r   r   )rD   r  r   rV   turnr  r   ts           rH   r   MplugOwlPreprocessor.convert  s    J'DF|x'f' $y/3//Y01T"iA!!S))"&s|"&y1QwZ0LL& ) $ 		% |rJ   c                 V   0 nU R                  U5      u  pE[        U5      S:  aS  / nU HJ  nUR                  U R                  U R	                  U5      S   5      5        [
        R                  " USS9nML     OSnU R                  U5      n[
        R                  " U/5      nUUS.UEnU$ )a  
Args:
    messages: {[
        {'role': 'system', 'content': 'message1'},
        {'role': 'user', 'content': 'message2'},
        {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
    ]}
    The 'role' should be choose from ['system', 'user', 'assistant'].
    The 'content' can be either str or List[Union[str, Dict]]
Return:
    output: Dict[str, Tensor]
r   r   N)pixel_valuesr?  )	r   r   r   r'  r/  r   r   r  r  )	rD   r  forward_paramsr   rc  r   r  rV   r?  s	            rH   rX   MplugOwlPreprocessor.__call__  s     ||H-v;?L##//0Fq0IJL${{<Q?  
  L&&t,	$$i[1	 )"
 
 rJ   )r  r  r  r  rk   re   )rZ   r[   r\   r]   r   r   r_   r8   rD  r   r'  r   r	   r   r/  r   r  r   r   r   rX   r`   ra   rb   s   @rH   r+   r+     s     &//    , ,%s %uU[[#-='> %# $s) BS$t*_ 5 # 4#c3h #&*38n# #rJ   r+   c                   @   ^  \ rS rSrU 4S jrS\\\4   4S jrSr	U =r
$ )+ImageCaptioningClipInterrogatorPreprocessori  c                 &   > [         TU ]  " S0 UD6  g )Nre  )r7   r8   )rD   rF   rG   s     rH   r8   4ImageCaptioningClipInterrogatorPreprocessor.__init__$  s    "6"rJ   rK   c                 j    [        U5      n[        R                  " U5      R                  SSS5      nU$ )Nr   r   r!   )r   r  array	transpose)rD   rR   rV   s      rH   rX   4ImageCaptioningClipInterrogatorPreprocessor.__call__'  s-    4 xx((Aq1rJ   re  )rZ   r[   r\   r]   r8   r   r_   r   rX   r`   ra   rb   s   @rH   r  r    s!    
#S#X  rJ   r  )Qos.pathr*  r   r  ior   typingr   r   r   r   r   r{  r   r  r  r   PILr	   	timm.datar
   r"  r   torchvision.datasetsr   torchvision.transformsr   r   r   r    modelscope.hub.snapshot_downloadr   modelscope.metainfor   modelscope.pipelines.baser   7modelscope.pipelines.cv.cmdssl_video_embedding_pipeliner   r   r   r   r   modelscope.preprocessorsr   modelscope.utils.configr   modelscope.utils.constantr   r   r   r   r    baser"   builderr#   r   ofa.utils.collater%   ofa.utils.constantr&   __all__register_modulemulti_modal'diffusion_image_generation_preprocessorr'   ofa_tasks_preprocessorr(   r   clip_preprocessorr   mplug_tasks_preprocessorr)   vldoc_preprocessorrF  hitea_tasks_preprocessorr*   mplug_owl_preprocessorr+   /image_captioning_clip_interrogator_preprocessorr  re  rJ   rH   <module>r     sX    	  0 0      & " , G G > - +< < / *. .  "  ) 4 
EEG%< %G%P 
M$H$HJPCl PCJPCf  
M$C$CEf| fEfR 
M$J$JLb bLbJ 
M$D$DFC CFCL 
M$J$JLE ELEP 
M$H$HJK< KJK\ 
MMO, OrJ   