
    9i                         S r SSKrSSKrSSKrSSKJs  Jr  SSKJr  SSKJ	r	J
r
   " S S\R                  5      r " S S\R                  5      r " S	 S
\R                  5      rg)z)Generative Multimodal Model Architecture.    N)nn)	gemm_base	tokenizerc                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )ImageEncoder   z,Image Feature Encoder
ViT Style Transformer
c           
      x   > [         TU ]  5         US S u  p#pEn[        R                  " UUUUUS-  USS9U l        g )N   @   F)input_resolution
patch_sizewidthlayersheads
output_dimuse_gc)super__init__r   VisualTransformervisual)selfconfigs	embed_dimimage_resolutionvision_layersvision_widthvision_patch_size	__class__s          h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/rleg/model.pyr   ImageEncoder.__init__   sO    %bqk	m	11-( "$     c                 p    U R                  U5      nUS S 2SS 2S S 24   nUS S 2SS S 24   nU(       a  XT4$ U$ )N   r   r   )r   imagereturn_tokensfeaturestokens	embeddings         r   forwardImageEncoder.forward%   sB    ;;u%!QR(#Q1W%	&3	"BBr!   r$   F)	__name__
__module____qualname____firstlineno____doc__r   r*   __static_attributes____classcell__r   s   @r   r   r      s    C Cr!   r   c                   @   ^  \ rS rSrSrU 4S jrSS jrSS jrSrU =r	$ )	TextEncoder,   z,Text Feature Encoder
BERT style transformer
c                   > [         TU ]  5         USS  u  p#pEn[        R                  " UUUU R	                  U5      S9U l        [        R                  " X45      U l        [        R                  " [        R                  " X$5      5      U l        [        R                  " U5      U l        [        R                  " [        R                  " XAS   5      5      U l        g )N)r   r   r   	attn_maskr   )r   r   r   Transformerbuild_attention_masktransformerr   	Embeddingtoken_embedding	Parametertorchemptypositional_embedding	LayerNormln_finaltext_projection)r   r   context_length
vocab_sizemodel_widthmodel_headsmodel_layersr   s          r   r   TextEncoder.__init__1   s     	[	$00//?	
  "||JD$&LLKK4%6![1!||KKQZ0 2r!   c                 Z    [         R                  " X5      S-  nUR                  S5        U$ )Ng     r#   )rA   onestriu_)r   
seq_lengthmasks      r   r<    TextEncoder.build_attention_maskD   s%    zz*1D8

1r!   c                 n   U R                  U5      nX0R                  -   nUR                  SSS5      nU R                  U5      nUR                  SSS5      nU R	                  U5      nU[
        R                  " UR                  S   5      UR                  SS9S4   U R                  -  nU(       a  XC4$ U$ )Nr#   r      )dim.)
r?   rC   permuter=   rE   rA   arangeshapeargmaxrF   )r   textr&   xr)   s        r   r*   TextEncoder.forwardI   s      &)))IIaAQIIaAMM!ell1771:.kkbk)3/ 0262F2FG	!.	~=I=r!   )rE   rC   rF   r?   r=   Nr,   )
r-   r.   r/   r0   r1   r   r<   r*   r2   r3   r4   s   @r   r6   r6   ,   s    2&

> 
>r!   r6   c                   v   ^  \ rS rSrSrU 4S jrS rS rS rS r	\
R                  " 5       S
S j5       rS	rU =r$ )	RLEGModelV   zGenerative multi-modal model, trained with RLEG method.
It takes image or text or both of them as input, and produce
the corresponding features of inputs.
c                 &  > [         TU ]  5         [        SR                  U5      SSS9 n[        R
                  " UR                  5       5      nS S S 5        [        WR                  5       5      S   nX4   n[        R                  R                  US5      n[        R                  " U5      U l        [        U5      U l        [!        U5      U l        [$        R&                  " [(        R*                  " / 5      5      U l        g ! , (       d  f       N= f)Nz{}/encoder_config.jsonrzutf-8)encodingr   zbpe_vocab_16e6.txt.gz)r   r   openformatjsonloadsreadlistkeysospathjoinr   SimpleTokenizerr   image_encoderr6   text_encoderr   r@   rA   rN   logit_scale)r   	model_dirfmodel_config
model_nameconfig_argsbpe_pathr   s          r   r   RLEGModel.__init__\   s    (//	:C "%&::affh/L" ,++-.q1
".77<<	+BC"228<)+6'4<<

27" "s   %D
Dc                 P    [         R                  " U R                   U/5      S   nU$ )Nr   )r   clip_tokenize)r   text_strtext_tensors      r   tokenizeRLEGModel.tokenizek   s%    --dnnxjI!Lr!   c                 T    U R                  U5      n[        R                  " USSS9nU$ NrT   rU   )prV   )rq   F	normalize)r   r[   features      r   encode_textRLEGModel.encode_texto   s)    ##D)++g3r!   c                 T    U R                  U5      n[        R                  " USSS9nU$ r   )rp   r   r   )r   r%   r   s      r   encode_imageRLEGModel.encode_imaget   s)    $$U+++g3r!   c                 B    UR                  5       R                  5       nU$ r^   )cpunumpy)r   featouts      r   
parse_featRLEGModel.parse_featy   s    hhj 
r!   c                     Su  p4Ub   U R                  U R                  U5      5      nUb   U R                  U R                  U5      5      nUUS.nU$ )zFIt takes image or text as input,
and extracts the features as output.
NN)image_featuretext_feature)r   r   r   )r   r%   r[   img_featurer   r   s         r   r*   RLEGModel.forward}   s^    
 %/!//$*;*;E*BCK??4+;+;D+ABL((
 
r!   )rp   rr   rq   r   r   )r-   r.   r/   r0   r1   r   r~   r   r   r   rA   no_gradr*   r2   r3   r4   s   @r   r`   r`   V   s;    
8

 ]]_ r!   r`   )r1   rl   rg   rA   torch.nn.functionalr   
functionalr   "modelscope.models.multi_modal.gemmr   r   Moduler   r6   r`    r!   r   <module>r      sR    1 	      CC299 C2'>")) '>T5		 5r!   