
    9ic                         S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJ	r	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  \R&                  " \
R(                  \R*                  S9 " S S\5      5       rg)    N)Models)
TorchModel)MODELS)	ModelFileTasks   )SwinTransformer)DeformableTransformer)FPNFusionModule)Detector)module_namec                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )	VidtModel   a  
The implementation of 'ViDT for joint-learning of object detection and instance segmentation'.
This model is dynamically initialized with the following parts:
    - 'backbone': pre-trained backbone model with parameters.
    - 'head': detection and segentation head with fine-tuning.
	model_dirc                   > [         [        U ]  5         [        R                  R                  U[        R                  5      n[        R                  " USSS9n[        SS/S/ SQ/ SQSS	S
9nUR                  SSSS/S9  XPl        U R                  R                  US   SS9  [        UR                  SS9n[!        SSSSSSSSSSS9
n[#        UUSSSSUSSSSSSS9nXl        U R$                  R                  US   SS9  g)zInitialize a Vidt Model.
Args:
  model_dir: model id or path, where model_dir/pytorch_model.pt contains:
            - 'backbone_weights': parameters of backbone.
            - 'head_weights': parameters of head.
cpuT)map_locationweights_only   `   )   r      r   )   r            g?)pretrain_img_size	embed_dimdepths	num_headswindow_sizedrop_path_ratevidti,     r   )methoddet_token_numpos_dimcross_indicesbackbone_weights)strict)fuse_dim   r   i   g?relu   F)
d_modelnheadnum_decoder_layersdim_feedforwarddropout
activationreturn_intermediate_decnum_feature_levelsdec_n_pointstoken_labelr   N)num_classesnum_queriesaux_losswith_box_refineepffwith_vectorprocessor_dct	iou_awarer9   vector_hidden_dimdistilhead_weights)superr   __init__ospathjoinr   TORCH_MODEL_FILEtorchloadr	   finetune_detbackboneload_state_dictr   num_channelsr
   r   head)
selfr   kwargs
model_path
model_dictrN   r>   deform_transformersrQ   	__class__s
            _/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/cv/vidt/model.pyrF   VidtModel.__init__   s>    	i')WW\\)Y-G-GH
ZZU?
 #"Cj$  	c! 	 	N %%)*4 	& 	9 x44sC3  $( 
  !%& 			!!*^"<T!J    c           	      ^    U R                  X5      u  p4pVpxU R                  X4XVXxU5      u  pX4$ )zvDynamic forward function of VidtModel.
Args:
    x: input images (B, 3, H, W)
    mask: input padding masks (B, H, W)
rN   rQ   )rR   xmask
features_0
features_1
features_2
features_3det_tgtdet_posout_pred_logitsout_pred_boxess              rX   forwardVidtModel.forwardX   sF     LP==LH

*.))J4>4;d+L' ..rZ   r\   )
__name__
__module____qualname____firstlineno____doc__strrF   rg   __static_attributes____classcell__)rW   s   @rX   r   r      s!    <K# <K|/ /rZ   r   )rG   rK   modelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.constantr   r   rN   r	   deformable_transformerr
   
fpn_fusionr   rQ   r   register_moduleimage_object_detectionr$   r    rZ   rX   <module>rz      s[    
  & > , 6 % 9 '  44&++NQ/
 Q/ OQ/rZ   