
    9iR                     
   S r SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  \" 5       rS/r\R6                  " \R8                  \	R:                  S9 " S S\5      5       rg)z$Generative Multimodal Model Wrapper.    )AnyDictN)
transforms)Models)
TorchModel)MODELS)	RLEGModel)
OutputKeys)	LoadImage)	ModelFileTasks)
get_loggerRLEGForMultiModalEmbedding)module_namec                   b   ^  \ rS rSrSrS
U 4S jjrS rS rS\\	\
4   S\\	\
4   4S jrS	rU =r$ )r      zGenerative multi-modal model for multi-modal embedding.
The model is trained by representation learning with embedding generation.
Inputs could be image or text or both of them.
Outputs could be features of input image or text,
c                 <  > [         TU ]  " X1US.UD6  [        US9U l        [        R
                  " SR                  U[        R                  5      5      nU R                  R                  U5        U R                  R                  5         X l        U R                  S:  a  [        R                  R                  5       (       ac  U R                  R                  SR                  U R                  5      5        [        R!                  SR                  U R                  5      5        OSU l        [        R!                  S5        ["        R$                  " ["        R&                  " S	5      ["        R(                  " 5       ["        R*                  " S
S5      /5      U l        g )N)	model_dir	device_id)r   z{}/{}r   cuda:{}zUse GPU: {}zUse CPU for inference)   r   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)super__init__r	   modeltorchloadformatr   TORCH_MODEL_BIN_FILEload_state_dictevalr   cudais_availabletologgerinfoTComposeResizeToTensor	Normalizeimg_preprocessor)selfr   r   argskwargspretrained_params	__class__s         g/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/rleg/rleg.pyr   #RLEGForMultiModalEmbedding.__init__   s   7;9	G?E	G3
!JJw~~y55(7 8

""#45

">>Q5::#:#:#<#<JJMM)**4>>:;KK,,T^^<=DNKK/0 !		HHZ JJLKK;<>+
 !    c                     Uc  g [         R                  " U5      nU R                  U5      S   nU R                  S:  a*  UR	                  SR                  U R                  5      5      nU$ )N)N.r   r   )r   convert_to_imgr,   r   r$   r   )r-   	input_img
img_tensors      r2   parse_image&RLEGForMultiModalEmbedding.parse_image5   s_    ,,Y7	**95i@
>>Q#y'7'7'GHJr4   c                 P   Ub  [        U5      S:X  a  g [        U[        5      (       a  U R                  R	                  U5      nO[        S[        U5       35      eU R                  S:  a*  UR                  SR                  U R                  5      5      nUR                  SS5      $ )Nr   ztext should be str, but got r      r   )len
isinstancestrr   tokenize	TypeErrortyper   r$   r   view)r-   text_strtext_ids_tensors      r2   
parse_text%RLEGForMultiModalEmbedding.parse_text>   s    s8}1h$$"jj11(;O:4>:JKLL>>Q-001A1A2  !O##Ar**r4   inputreturnc           	         UR                  SUR                  SS 5      5      nUR                  SUR                  SS 5      5      nU R                  U5      nU R                  U5      nU R                  XE5      n[        R
                  UR                  SS 5      [        R                  UR                  SS 5      [        R                  UR                  SS 5      0nU$ )Nimageimgtexttxtimage_featuretext_featurecaption)getr9   rF   r   r
   IMG_EMBEDDINGTEXT_EMBEDDINGCAPTION)r-   rH   image_input
text_inputrK   rM   outoutputs           r2   forward"RLEGForMultiModalEmbedding.forwardJ   s    ii5$)?@YYvuyy'=>
  -z*jj%$$cggot&D%%sww~t'D	4 8

 r4   )r   r,   r   )r   )__name__
__module____qualname____firstlineno____doc__r   r9   rF   r   r?   r   rZ   __static_attributes____classcell__)r1   s   @r2   r   r      s?    ,
+T#s(^ S#X  r4   )r`   typingr   r   r   torchvisionr   r'   modelscope.metainfor   modelscope.models.baser   modelscope.models.builderr   (modelscope.models.multi_modal.rleg.modelr	   modelscope.outputsr
   modelscope.preprocessorsr   modelscope.utils.constantr   r   modelscope.utils.loggerr   r%   __all__register_module generative_multi_modal_embeddingrlegr    r4   r2   <module>rr      so    +   ' & - , > ) . 6 .	'
( 	**E= =E=r4   