
    ifF                     2   S SK r S SKJrJrJrJr  S SKrS SKrS SKrS SK	J
r
  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJrJrJr  S\S\S\S\R>                  4S jr  " S S\RB                  RD                  5      r# " S S\5      r$g)    N)AnyDictOptionalTuple)construct_transformer)	ModelArgs)%replace_kv_cache_with_custom_kv_cache)replace_sdpa_with_custom_op)EagerModelBase)Image)Dim)
functional)AutoProcessorCLIPImageProcessorLlavaForConditionalGenerationimagetarget_htarget_wreturnc                 h   [         R                  R                  R                  U 5      nUR                  S   U-  nUR                  S   U-  n[        XE5      n[        UR                  S   U-  5      [        UR                  S   U-  5      4n[         R                  R                  US9" U5      nU$ )a  Read image into a tensor and resize the image so that it fits in
a target_h x target_w canvas.

Args:
    image (Image): An Image object.
    target_h (int): Target height.
    target_w (int): Target width.

Returns:
    torch.Tensor: resized image tensor.
      )size)torchvision
transformsr   pil_to_tensorshapemaxintResize)r   r   r   imgratio_hratio_wratiooutput_sizes           e/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/executorch/examples/models/llava/model.pyprepare_imager'   '   s     
 
 
+
+
9
9%
@CiilX%GiilX%G!Esyy|e+,c#))A,2F.GHK

 
 
'
'[
'
9#
>CJ    c                   6  ^  \ rS rSr   SS\S\S\S\S\4
U 4S jjjrS\	\
\4   4S	 jrS
 rS rS\R                   S\R                   4S jrS\R                   S\R                   4S jrS\R                   S\R                   4S jr SS\R                   S\\R                      S\R                   4S jjrS\R                   S\R                   4S jrS\R                   S\R                   S\R                   S\R                   4S jrS\R                   S\R                   S\R                   S\\\R                   4   4S jrS\R                   S\R                   S\R                   S\R                   4S jrS\R                   S\R                   4S jrSrU =r$ )Llava?   llava_modelimage_processoruse_sdpa_with_kv_cache_opmax_context_lenmax_seq_lenc                   > [         TU ]  5         X0l        Xl        X l        U R                  R
                  R                  U l        U R                  R
                  R                  U l        [        SSU R                  R
                  R                  R                  U R                  R
                  R                  R                  SSSUSUUS9U l        [        U R                  5      U l        U(       a4  [        U R                  5      U l        [!        U R                  5      U l        U R                  R#                  U R%                  5       SSS9  g )NT    r   )use_kv_cachen_layers
vocab_size
hidden_dimmax_batch_sizeffn_dim_multiplierenable_dynamic_shaper.   use_hf_roper/   r0   F)
state_dictstrictassign)super__init__r.   model_r-   configvision_feature_layervision_feature_select_strategyr   text_configr5   intermediate_sizetext_model_argsr   
text_modelr	   r
   load_state_dict$_translate_state_dict_for_text_model)selfr,   r-   r.   r/   r0   	__class__s         r&   r?   Llava.__init__@   s    	)B&!.$(KK$6$6$K$K!KK== 	+  ){{))55@@{{))55GG !%&?+# 
 00D0DE$CDOOTDO9$//JDO''@@B 	( 	
r(   r   c                    ^ U R                   R                  5       nSSSSSSSSS	S
SS.m0 nS[        S[        4U4S jjnUR                  5        H  nU" U5      nX   X%'   M     U$ )Nzlayers.\1.attention.wq.zlayers.\1.attention.wk.zlayers.\1.attention.wv.zlayers.\1.attention.wo.zlayers.\1.attention_norm.zlayers.\1.feed_forward.w1.zlayers.\1.feed_forward.w2.zlayers.\1.feed_forward.w3.zlayers.\1.ffn_norm.znorm.zoutput.)z6model.language_model.layers.([0-9]+).self_attn.q_proj.z6model.language_model.layers.([0-9]+).self_attn.k_proj.z6model.language_model.layers.([0-9]+).self_attn.v_proj.z6model.language_model.layers.([0-9]+).self_attn.o_proj.z5model.language_model.layers.([0-9]+).input_layernorm.z3model.language_model.layers.([0-9]+).mlp.gate_proj.z3model.language_model.layers.([0-9]+).mlp.down_proj.z1model.language_model.layers.([0-9]+).mlp.up_proj.z>model.language_model.layers.([0-9]+).post_attention_layernorm.zmodel.language_model.norm.zlm_head.old_keyr   c                 z   > TR                  5        H%  u  p[        R                  " XU 5      =o0:w  d  M#  Us  $    U $ N)itemsresub)rN   old_patternreplacementnew_keykey_maps       r&   get_new_key?Llava._translate_state_dict_for_text_model.<locals>.get_new_key   s8    ,3MMO(!vvkHHGT"N -< Nr(   )r@   r;   strkeys)rJ   r;   new_state_dictrX   rN   rV   rW   s         @r&   rI   *Llava._translate_state_dict_for_text_modelj   s    [[++-
 HbGaGaGaFbDaDaB_Oe+3#
" 	 	 	 "(G!'*G&0&9N# )
 r(   c                     UR                   U R                     nU R                  S:X  a  US S 2SS 24   nU$ U R                  S:X  a  UnU$ [        SU R                   35      e)Ndefaultr   fullzUnexpected select feature: )hidden_statesrB   rC   
ValueError)rJ   image_outputsselected_image_features      r&   _feature_selectLlava._feature_select   s~    !.!<!<T=V=V!W..);%;AqrE%B" &% 00F:%;"
 &% -d.Q.Q-RS r(   c                 6    U R                   R                  5       $ rP   )r@   	get_modelrJ   s    r&   rh   Llava.get_model   s    {{$$&&r(   tokensc                 L    U R                   R                  R                  U5      $ rP   )r@   language_modelembed_tokens)rJ   rk   s     r&   rn   Llava.embed_tokens   s    {{))66v>>r(   imagesc                    UR                  U R                  R                  S9n[        U5      [        L a  / nU H  nU R                  R                  UR                  U R                  R                  U R                  R                  S9R                  S5      SS9nU R                  U5      R                  UR                  5      nUR                  U5        M     OzU R                  R                  UR                  U R                  R                  U R                  R                  S9SS9nU R                  U5      R                  UR                  5      nU R                  R                  U5      nU$ )Ndtype)devicers   r   T)output_hidden_states)tor@   rs   typelistvision_towerrt   	unsqueezere   appendmulti_modal_projector)rJ   rp   image_featuresr   image_forward_outimage_featureimage_forward_outss          r&   encode_imagesLlava.encode_images   s6   !2!23<4N$(KK$<$<HH#{{119J9J  il)-	 %= %! !% 4 45F G J J5;; W%%m4   "&!9!9		!3!34;;;L;L	M%) ": "
 "112DEHHVN::>Jr(   r!   c                    U R                   R                  S   nU R                   R                  S   nX1R                  S   -
  S-  nX!R                  S   -
  S-  nX1R                  S   -
  S-  * nX!R                  S   -
  S-  * n[        R                  " US:  5        [        R                  " US:  5        [        R                  " US:  5        [        R                  " US:  5        [        R
                  R                  R                  UXFXW45      nXR                   R                  -  n	[        R                  " U	U R                   R                  U R                   R                  5      n
U
R                  S5      $ )Nheightwidthr   r   r   )r-   	crop_sizer   torch_checknnr   padrescale_factorF	normalize
image_mean	image_stdrz   )rJ   r!   r   r   l_padt_padr_padb_padresizedscalednormeds              r&   image_preprocessLlava.image_preprocess   sG   ''11(;''11':IIaL(Q.IIaL(Q.iil*r12iil*r12UaZ UaZ UaZ UaZ  ((%%))5(
8 //>>>  ++  **
 ""r(   token	input_posc                     U R                  U5      R                  S5      nU R                  R                  SSU0U5      $ )z1Input is one token. Return logits for next token.r   Nr   )rn   rz   rG   forward)rJ   r   r   token_embedss       r&   step
Llava.step   s=     ((/99!<&&tk9-E|TTr(   c                 F    U R                  U5      nU R                  U5      $ rP   )r   r   )rJ   rp   preprocessed_imgs      r&   image_embeddingLlava.image_embedding   s%    008!!"233r(   prompt_before_imageprompt_after_imagec                     U R                  U5      nU R                  U5      nU R                  U5      n[        R                  " XTU4SS9nU$ )Nr   )dim)r   rn   r   cat)rJ   r   rp   r   image_embedsembeds_before_imgembeds_after_imgresults           r&   prefill_embeddingLlava.prefill_embedding  sU     ++F3 --.AB,,-?@-=MNTUVr(   c                     U R                  XU5      nUR                  S   U R                  R                  SS[        R
                  " S/5      0U5      4$ )pAvoiding the torch.where() call to find <image> placeholder and insert image embedding. Taking 3 inputs instead.r   Nr   r   )r   r   rG   r   r   tensorrJ   r   rp   r   embedss        r&   prefillLlava.prefill  sU     ''(;EWX||A 7 7;aS 12F!
 
 	
r(   c                 ^    U R                  XU5      nU R                  R                  USSSS9$ )r   Fr   )inputs_embeds	use_cachereturn_dictlogits_to_keep)r   r@   r   r   s        r&   prefill_refLlava.prefill_ref  s>     ''(;EWX{{"" EuUV # 
 	
r(   c                 $    U R                  U5      $ rP   )r   )rJ   rp   s     r&   r   Llava.forward)  s     ##F++r(   )r-   r@   rG   rF   r.   rB   rC   T   r   rP   )__name__
__module____qualname____firstlineno__r   r   boolr   r?   r   rZ   r   rI   re   rh   r   Tensorrn   r   r   r   r   r   r   r   r   r   r   __static_attributes____classcell__)rK   s   @r&   r*   r*   ?   s   
 +/"(
2(
 ,(
 $(	(

 (
 (
 (
T#d38n #J&'?5<< ?ELL ?ELL U\\ 47#ELL 7#U\\ 7#t HLU\\U.6u||.DU	U4ell 4u|| 4
"\\
 
 "LL	

 


"\\
 
 "LL	

 
sELL 	!

"\\
 
 "LL	

 

,, 
, ,r(   r*   c                   D    \ rS rSr SS jrS rS rS rS rS r	S r
S	rg
)
LlavaModeli0  c                    Xl         X0l        X l        [        R                  " SSSS9U l        [        R                  " SSU R
                  R                  R                  R                  S9U l
        U R                  R                  U l        U R                  R                  U l        SU l        [        R                  " [         R"                  " U R                  SS9R$                  5      U l        S	U l        U R                  R*                  nU R(                  U-   nXPR                  l        S
U l        SSU R                  S.SSS./S./U l        S U l        S U l        g )Nzllava-hf/llava-1.5-7b-hfcpu(a272c74b2481d8aff3aa6fc2c4bf891fe57334fb)
device_maprevision)r   
patch_sizez1https://llava-vl.github.io/static/images/view.jpgT)streamzA chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. zllava-1.5-7b-hfuserr   )rw   urltextzAWhat are the things I should be cautious about when I visit here?)rw   r   )rolecontent)r.   r/   r0   r   from_pretrainedmodelr   ry   rA   r   	processor	tokenizerr-   	image_urlr   openrequestsgetrawr   system_promptchat_template
model_nameconversationinputresized_image)rJ   r.   r0   r/   current_templatenew_templates         r&   r?   LlavaModel.__init__1  s6    *C&.&2BB&?


 '66&?zz..55@@

 11#~~==LZZT^^D I M MN
 A>>77)),<< (4$+ $T^^< & c	
 
!r(   c                     [        U R                  U R                  U R                  U R                  U R
                  5      nUR                  [        R                  S9  U$ )Nrr   )	r*   r   r-   r.   r/   r0   rv   r   float32)rJ   r   s     r&   get_eager_modelLlavaModel.get_eager_model_  sP    JJ  **  
 	u}}%r(   c                     U R                   (       a  U R                   $ [        U R                  U R                  R                  S   U R                  R                  S   5      nU4U l         U R                   $ )z4Returns a resized image as input to model.forward().r   r   )r   r'   r   r-   r   )rJ   r   s     r&   get_example_inputsLlavaModel.get_example_inputsj  sh    %%%JJ  **84  **73

 &Z!!!r(   c                    U R                   (       a  U R                   $ U R                  R                  U R                  SSSSS9nUS   U l        [
        R                  " U R                  U R                  R                  R                  :H  5      S   nU R                  SS2SUS   24   U l
        U R                  SS2US   S-   S24   U l        U R                  /U R                  5       QU R                  P7U l         U R                   $ )	z!Returns prompts as well as image.Tpt)add_generation_prompttokenizer   return_tensors	input_idsr   Nr   )r   r   apply_chat_templater   r   r   wherer   rA   image_token_indexr   r   r   )rJ   inputsindexs      r&   get_inputs_for_prefill!LlavaModel.get_inputs_for_prefillv  s    ::::33"& 4 
  ,DNNdjj.?.?.Q.QQRSTU#'>>!ZuQxZ-#@ "&..E"IMO1C"D $$
$$&
 ##


 zzr(   c                 "    U R                  5       $ rP   )_get_image_dynamic_shapesri   s    r&   get_dynamic_shapesLlavaModel.get_dynamic_shapes  s    --//r(   c                     [        SSU R                  R                  S   S-  S9n[        SSU R                  R                  S   S-  S9nSU-  nSU-  nX4S./nU$ )	N_heightr   r   r   minr   _widthr   )r   r   )r   r-   r   )rJ   r  r  r   r   dynamic_shapess         r&   r   $LlavaModel._get_image_dynamic_shapes  ss    1$"6"6"@"@"Ja"O
 X1$*>*>*H*H*QUV*VWWF
$/0r(   c                 h    [         R                  R                  SSU R                  S9nSU0SS04nU$ )N	token_dimr   r  r   r   )r   exportr   r0   )rJ   r   text_model_dynamic_shapess      r&   _get_prompt_dynamic_shapes%LlavaModel._get_prompt_dynamic_shapes  s<    ll{t7G7GH&'X1v$6!((r(   )r   r   r-   r   r   r   r/   r0   r   r   r   r   r   r   r   r   r.   Nr   )r   r   r   r   r?   r   r   r   r  r   r  r    r(   r&   r   r   0  s)    OR,"\	
"00	)r(   r   )%rR   typingr   r   r   r   r   r   r   2executorch.examples.models.llama.llama_transformerr   +executorch.examples.models.llama.model_argsr   Fexecutorch.examples.models.llama.source_transformation.custom_kv_cacher	   ;executorch.examples.models.llama.source_transformation.sdpar
   %executorch.examples.models.model_baser   PILr   torch.exportr   torchvision.transforms.v2r   r   transformersr   r   r   r   r   r'   r   Moduler*   r   r  r(   r&   <module>r     s    
 - -    T A A   5  #   0n,EHHOO n,bo) o)r(   