
    9i/A                        S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJ	r
  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJr  S SKJrJr  S S	KJrJrJ r   S S
K!J"r"  \"" 5       r#S/r$ " S S\%5      r& " S S\5      r'\RP                  " \ RR                  \RT                  S9 " S S\5      5       r+   SS jr,g)    N)	roi_align)Models)
TorchModel)MODELS)FPNTrans)LayoutRobertaModelLayoutRobertaPreTrainedModel)TransformerDecoderTransformerDecoderLayer)ModeKeys	ModelFileTasks)
get_loggerVLDocForDocVLEmbeddingc                       \ rS rSrS rSrg)GeoVLDocModelOutputs    c                 @    Xl         X l        X0l        X@l        XPl        g )Ntext_featurestext_mm_featuresblock_vis_featuresblock_vis_mm_featuresimage_mm_features)selfr   r   r   r   r   s         i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/multi_modal/vldoc/model.py__init__GeoVLDocModelOutputs.__init__"   s"     + 0"4%:"!2    )r   r   r   r   r   N)__name__
__module____qualname____firstlineno__r   __static_attributes__ r   r   r   r       s    3r   r   c                   h   ^  \ rS rSrSU 4S jjrS\4S jr                  SS jrSrU =r	$ )	GeoVLDocModel6   c                 v  > [         TU ]  U5        Xl        X l        [	        U R                  SS 5      (       a?  U R                  R
                  S   S:X  a  [        U5      U l        O![        U5      U l        O[        U5      U l        [        U R                  R                  SS9U l
        [        R                  " SS/5      U l        [        R                  " SU R                  R                  5      U l        [#        U R                  R                  U R                  R$                  U R                  R&                  SS	9n[)        US5      U l        [#        U R                  R                  U R                  R$                  U R                  R&                  SS	9n[)        US5      U l        U R/                  5         g )
Narchitecturesr   r   F)img_size	inner_vit      T)	self_attn)superr   confighard_negtive_samplinggetattrr*   r   text_encoderr   
image_sizevisual_encodernnAdaptiveAvgPool2dpoolLinearhidden_size
vis_linearr   num_attention_headsintermediate_sizer
   cross_modal_textcross_modal_visualinit_weights)r   r1   r2   cross_modal_text_layercross_modal_visual_layer	__class__s        r   r   GeoVLDocModel.__init__8   sZ    %:"4;;66{{((+/CC$6v$>!$6v$>! 26 :D&[[++u>((!Q0	))C)@)@A!8KK##KK++KK))	"
 !33I1 M#:KK##KK++KK))	$ 
 #55M56#8 	r   	ckpt_pathc                     [         R                  " USS9n0 nUR                  5        H  u  pEUR                  SS5      nXSU'   M     U R	                  U5        g )Ncpumap_locationgeo_vl_doc_model. )torchloaditemsreplaceload_state_dict)r   rF   
state_dictstate_dict_newkvs         r   from_pretrainedGeoVLDocModel.from_pretrainedZ   sU    ZZ	>
$$&DA		-r2A !1 ' 	^,r   c                 h   UR                   u  nnUb  UOU R                  R                  nUUS'   U R                  R                  S   S:X  a  U R                  " U4UUUU	U
UUUUS.	UD6nOU R                  " U4UUUU	U
UUUUS.	UD6nUS S u  nnUR                   u  nn[
        R                  " SUUR                  S9R                  US5      R                  UU5      nUUU4   nUR                   u  nnnU R                  U5      n[
        R                  " SUUR                  S9R                  US5      R                  UU5      R                  S5      n[
        R                  " UU4S5      R                  UU-  S	5      R                  US
   R                  S9n US
   R                  [
        R                  :X  a{  [!        US
   R                  [
        R"                  5      U R                  [
        R"                  5      SUS
   R%                  S5      S-  S9n!U!R                  US
   R                  S9n!OA[!        US
   U R                  [
        R"                  5      SUS
   R%                  S5      S-  S9n!U!R'                  S5      R'                  S5      R                  UUS5      n!U R)                  U!5      n!U!UR                  S5      -  n!U R+                  US
   5      R'                  S5      R'                  S5      n"U R)                  U"5      R                  S5      n"[
        R                  " U"U!4S5      n#[
        R,                  " US45      R                  UR                  5      n$[
        R                  " U$U4S5      n%SU-
  S:  n&SU%-
  S:  n'U R/                  UR1                  SS5      U#R1                  SS5      U&U'S9n(U R3                  U#R1                  SS5      UR1                  SS5      U'U&S9n)U(R1                  SS5      n(U)R1                  SS5      n)U)S S 2SS 24   n*[5        UU(U!U*U)S9$ )N	line_bboxr   r   )	bboxattention_masktoken_type_idsposition_ids	head_maskinputs_embedsoutput_attentionsoutput_hidden_statesreturn_dict   )devicer-      feat_ms)dtypeg     @@)spatial_scaler.   )tgtmemorytgt_key_padding_maskmemory_key_padding_maskr   )shaper1   use_return_dictr*   r4   rM   arangerd   reshapeexpandr6   	unsqueezecattorh   float16r   float32sizesqueezer<   r9   onesr?   	transposer@   r   )+r   	input_idsimagerZ   bbox_4p_normalizedr[   first_token_idxesfirst_token_idxes_maskr\   r]   r^   r_   encoder_hidden_statesencoder_attention_maskpast_key_values	use_cacher`   ra   rb   kwargs
batch_sizeseq_lenoutputssequence_outputpooled_output_	num_firstB_batch_dimfeature_bbox	block_num
visual_out
batch_idxsbatch_idx_with_bboxblk_vis_featuresfull_img_featuresvis_inpsglb_feat_attnvis_masknew_attention_masknew_vis_masktext_mm_featvis_mm_featr   s+                                              r   forwardGeoVLDocModel.forwardb   s   * (oo
G & ,0KK,G,G 	 #{;;$$Q'+??'''--)#+"3%9' G '''--)#+"3%9' G *1!& )..9llz##%%,WZ-.&006z90M 	
 K)::;&,,9a((/
\\z/"8"8::A'A;%vj)<YYr] 	
 $ii&wzI-":i#8#>#>"? 	
 i &&%--7(9%((7#&&u}}5(388<vE	 G
  022 +11  3  3  )9%#&&u}}5(388<vE	 G ,33A6>>qAII	3(
  ??+;<+.D.N.N/  !IIy!##*71:ggaj 	 !OO,=>HHK
 99/1ABAF

J?366y7G7GH99m-CDaH
  .0A5H),,))!Q/%%a+!3$0	 - 2 --""1a(",,Q2!-$6	 . 
 $--a3!++Aq1 !,AqrE 2#))/"7)
 	
r   )r1   r?   r@   r2   r9   r4   r<   r6   )FNNNNNNNNNNNNNNNNNN)
r    r!   r"   r#   r   strrV   r   r$   __classcell__rD   s   @r   r'   r'   6   sY     D- - #'#"&'+#!"&*'+ $"&%) %Z
 Z
r   r'   )module_namec                   b   ^  \ rS rSrSrS\4U 4S jjr                  SS jrSrU =r	$ )r      z
Generate multi-modal document embeddings in segment-level and token-level.

Args:
    model_dir:
        the path in model hub, e.g., 'damo/multi-modal_convnext-roberta-base_vldoc-embedding'
	model_dirc           
      n  > [         T	U ]  " USU0UD6  SSKJn  [        R
                  R                  US5      n[        R                  SR                  U5      5        [        R
                  R                  U5      (       d   eUR                  U5      U l        [        U R                  5      U l        [        R
                  R                  U[        R                   5      n[        R
                  R                  U5      (       d   eU R                  R#                  U5        [        R                  SR                  U5      5        SSKJn  [        R
                  R                  U[        R(                  5      nUR#                  U5      U l        [,        R.                  R1                  5       (       a8  SR                  [3        [        R4                  R7                  S	S5      5      5      OS
U l        [,        R.                  R1                  5       (       aq  U R                  R;                  U R8                  5        [        R                  SR                  [3        [        R4                  R7                  S	S5      5      5      5        g U R                  R=                  5         [        R                  S5        g )Nr   r   )LayoutRobertaConfigzconfig.jsonzLoading config file from {}zLoading model from {})VLDocXLMTokenizerzcuda:{}
LOCAL_RANKrH   z%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inference)r0   r   ;modelscope.models.multi_modal.vldoc.modeling_layout_robertar   ospathjoinloggerinfoformatexistsfrom_json_filer1   r'   	doc_modelr   TORCH_MODEL_FILErV   0modelscope.models.multi_modal.vldoc.tokenizationr   TOKENIZER_FOLDER	tokenizerrM   cudais_availableintenvirongetrd   ru   float)
r   r   argsr   r   model_cfg_path
model_pathr   tokenizer_pathrD   s
            r   r   VLDocForDocVLEmbedding.__init__	  s   t>9>v> 	di?188HIww~~n----)88H&t{{3 WW\\)Y-G-GH
ww~~j))))&&z2+22:>? 	Wi1K1KL*::>J #(**"9"9";";  &&s2::>>!, ( AF 	::""$$NNdkk*KK?FFBJJNN<346 7 NN  "KK<=r   c                     U R                   " S0 SU_SU_SU_SU_SU_SU_SU_SU_S	U	_S
U
_SU_SU_SU_SU_SU_SU_SU_SU_UD6n[        UR                  UR                  S9$ )a  
Args:
    - input_ids: :math:`(B, T, E)`, the input tokens, where B is the batch size,
      T is the max token size, E is the embedding dimension.
    - image: :math:`(B, C, H, W)`, normalized images.
    - bbox: :math:`(B, T, 4)`, segment boxes denoted by top-left and bottom-right
      vertexes whose values are normalized to [0, 1000).
    - bbox_4p_normalized: :math:`(B, T, 8)`, word boxes denoted by 4 vertexes, whose
      values are normalized to [0, 1).
    - attention_mask: :math:`(B, T)`, mask for input tokens, where 0 means masked.
    - first_token_idxes: :math:`(B, S)`, indexes of the corresponding first tokens
      of all segments, where S is the max segment size.
    - first_token_idxes_mask: :math:`(B, S)`, mask for segments, where 0 means masked.
Optional:
    - line_rank_id: :math:`(B, T)`, orders of segments.
    - line_rank_inner_id: :math:`(B, T)`, BIE-like tags.

To be more specific, please refer to the class `TextLayoutSerializer` in
  `modelscope/models/multi_modal/vldoc/processing.py`.
r|   r}   rZ   r~   r[   r   r   r\   r]   r^   r_   r   r   r   r   r`   ra   rb   )img_embeddingtext_embeddingr%   )r   dictr   r   )r   r|   r}   rZ   r~   r[   r   r   r\   r]   r^   r_   r   r   r   r   r`   ra   rb   r   vldoc_outputss                        r   r   VLDocForDocVLEmbedding.forward*  s    R     2	
 * 0 $: * &   ( #8 $: ,    0!" "6#$ $'* '99(99
 	
r   )r1   rd   r   r   r   )
r    r!   r"   r#   __doc__r   r   r   r$   r   r   s   @r   r   r      sY    ># >D #'#"&'+#!"&*'+ $"&%) %A
 A
r   c                   ^^^^^^ Tc  [         R                  " USS9m/ n/ n[        TR                  5       5      nUS:X  a  [	        [        U5      5       Ht  nXx   n	S n
U	R                  S5      (       a(  U	R                  SS5      n
[        R                  " U
5      n	U
(       d  MP  UR                  Xx   5        UR                  U
5        Mv     [        XV5       H  u  pTR                  U5      TU
'   M     / m/ m/ m[        TSS 5      mTR                  5       mTb  TTl        SUUUUUU4S jjmSn[        U S	5      (       d'  [!        S
 TR                  5        5       5      (       a  SnT" XS9  [        T5      S:  a9  ["        R%                  SR'                  U R(                  R*                  T5      5        [        T5      S:  a9  ["        R%                  SR'                  U R(                  R*                  T5      5        [        T5      S:  a>  [-        SR'                  U R(                  R*                  SR/                  T5      5      5      eU $ )NrH   rI   robertazroberta.zgeo_vl_doc_model.text_encoder.	_metadatarL   c           	         > Tc  0 OTR                  US S 0 5      nU R                  T	XSTT
T5        U R                  R                  5        H  u  p4Uc  M
  T" XAU-   S-   5        M     g )Nre   T.)r   _load_from_state_dict_modulesrO   )moduleprefixlocal_metadatanamechild
error_msgsrN   metadatamissing_keysrR   unexpected_keyss        r   rN   $init_pretrained_weight.<locals>.load  so    '/X\\3BK6$$Z%1?J	P!??002KD UTMC/0 3r   geo_vl_doc_modelc              3   B   #    U  H  oR                  S 5      v   M     g7f)rK   N)
startswith).0ss     r   	<genexpr>)init_pretrained_weight.<locals>.<genexpr>  s       6J7H!LL,--7Hs   rK   )r   r   z7Weights of {} not initialized from pretrained model: {}z0Weights from pretrained model not used in {}: {}z*Error(s) in loading state_dict for {}:
	{}z
	)rL   )rM   rN   listkeysrangelenr   rP   copydeepcopyappendzippopr3   r   hasattranyr   r   r   rD   r    RuntimeErrorr   )modelpretrained_model_pathrR   	cache_dirinit_backboneold_keysnew_keysstate_dict_keysikeynew_keyold_keystart_prefixr   rN   r   r   r   s     `          @@@@@r   init_pretrained_weightr   n  s    ZZ 5EJ
HH:??,-O	!s?+,A!$CG~~j))++j&FHmmG,w 23( -  3(nnW5
7 4 LOJ z;5H"J'
1 1 L5,--# 6J7A7H6J 3J 3J*$
<1ELL((,8	9 ?aFMMOO$$o7 	8
::AA((&++j*ACD 	D Lr   )NNr   )-r   loggingmathr   resysjsonrM   torch.distributeddistributeddisttorch.nnr7   torchvision.opsr   modelscope.metainfor   modelscope.modelsr   modelscope.models.builderr   2modelscope.models.multi_modal.vldoc.conv_fpn_transr   r   r   r	   5modelscope.models.multi_modal.vldoc.transformer_localr
   r   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   r   __all__objectr   r'   register_moduledocument_vl_embeddingvldocr   r   r%   r   r   <module>r     s       	 	 
      % & ( , G61 @ @ .	#
$36 3,F
0 F
R 33Nk
Z k
 Ok
b Cr   