
    9i9A                         S SK r S SKrS SKrS SKJr  S SKJs  Jr   " S S\R                  5      r	 " S S\R                  5      r
SS jrS rSS jr " S	 S
\R                  5      rS rg)    Nc                      ^  \ rS rSrSr         SU 4S jjrS r\R                  R                  S 5       r
SrU =r$ )Detector   zNThis is a combination of "Swin with RAM" and a "Neck-free Deformable Decoder" c                   > [         TU ]  5         X@l        X l        UR                  n[
        R                  " X5      U l        [        XSS5      U l	        XPl
        X`l        Xl        Xl        U R                  (       a4  [        SU
 S3SS9  [        XU R                  R                  S5      U l        Xl        Xl        Xl        UGc  [)        UR*                  5      n/ n[-        U5       Ha  nUR*                  U   nUR/                  [
        R0                  " [
        R2                  " UUSS	9[
        R4                  " S
U5      5      5        Mc     [
        R6                  " U5      U l        U R8                   H[  n[
        R:                  R=                  US   R>                  SS9  [
        R:                  RA                  US   RB                  S5        M]     SU l"        OXpl"        [
        R0                  " [
        R2                  " UR*                  S   USS	9[
        R4                  " S
U5      5      U l#        [
        R0                  " [
        R2                  " XSS	9[
        R4                  " S
U5      5      U l$        Sn[J        RL                  " SU-
  U-  5      * n[N        RP                  " U5      U-  U R                  RB                  l)        [
        R:                  RA                  U R                  RT                  S   R>                  RR                  S5        [
        R:                  RA                  U R                  RT                  S   RB                  RR                  S5        [
        R:                  R=                  U RF                  S   R>                  SS9  [
        R:                  RA                  U RF                  S   RB                  S5        [
        R:                  R=                  U RH                  S   R>                  SS9  [
        R:                  RA                  U RH                  S   RB                  S5        U R                  (       a  [
        R:                  RA                  U R                   RT                  S   R>                  RR                  S5        [
        R:                  RA                  U R                   RT                  S   RB                  RR                  S5        URV                  RX                  S-   nU(       a  [[        U R                  U5      U l        [[        U R                  U5      U l	        [
        R:                  RA                  U R                  S   RT                  S   RB                  RR                  SS S5        U R                  U R                  RV                  l	        O[
        R:                  RA                  U R                  RT                  S   RB                  RR                  SS S5        [
        R6                  " [-        U5       Vs/ s H  nU R                  PM     sn5      U l        [
        R6                  " [-        U5       Vs/ s H  nU R                  PM     sn5      U l	        SU R                  RV                  l	        U R                  (       a  [
        R:                  RA                  U R                   RT                  S   RB                  RR                  SS S5        [
        R6                  " [-        U5       Vs/ s H  nU R                   PM     sn5      U l        U R"                  (       au  [        XSS5      U l.        U(       a  [[        U R\                  U5      U l.        g[
        R6                  " [-        U5       Vs/ s H  nU R\                  PM     sn5      U l.        ggs  snf s  snf s  snf s  snf )a  Initializes the model.
Args:
    backbone: torch module of the backbone to be used. See backbone.py
    transformer: torch module of the transformer architecture. See transformer.py
    num_classes: number of object classes
    num_queries: number of object queries (i.e., det tokens). This is the maximal number of objects
                 DETR can detect in a single image. For COCO, we recommend 100 queries.
    aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
    with_box_refine: iterative bounding box refinement
    epff: None or fusion module available
    iou_aware: True if iou_aware is to be used.
      see the original paper https://arxiv.org/abs/1912.05992
    token_label: True if token_label is to be used.
      see the original paper https://arxiv.org/abs/2104.10858
    distil: whether to use knowledge distillation with token matching
      z Training with vector_hidden_dim .T)flushN   )kernel_size    r   )gaing{Gz?   g       )/super__init__num_queriestransformerd_modelnnLinearclass_embedMLP
bbox_embedaux_losswith_box_refinewith_vectorprocessor_dctprintn_keepvector_embed	iou_awaretoken_labeldistillennum_channelsrangeappend
SequentialConv2d	GroupNorm
ModuleList
input_projinitxavier_uniform_weight	constant_biasfusiontgt_projquery_pos_projmathlogtorchonesdatalayersdecoder
num_layers_get_clones	iou_embed)selfbackboner   num_classesr   r   r   epffr   r   vector_hidden_dimr#   r$   r%   
hidden_dimnum_backbone_outsinput_proj_list_in_channelsproj
prior_prob
bias_valuenum_pred	__class__s                          ^/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/cv/vidt/head.pyr   Detector.__init__   s   B 	&& ((
99Z=ja; !. '*23D2EQG !$J$($6$6$=$=q!BD #&  < #H$9$9 : O,-&33A6&&MM		+zqIR4 . !mmO<DO ''QQ'?!!$q',,2 ( DK K IIh++B/KLLZ(
 !mmIIj!<LLZ(
 
hhJ*<==
%*ZZ%<z%I"
$//004;;@@!D
$//00499>>B 	a 0 7 7a@
$--*//3
 3 3A 6 = =AF
$--a055q9GGd//66r:AAFFJGGd//66r:??DDaH &&11A5 *4+;+;XFD)$//8DDOGGdooa077;@@EEabI"$ 37//D$$/GGdoo44R8==BB12FM!}}+0?;?a!!?; =D mm*//:/Q/:<DO26D$$/GGd//66r:??DDQRH$O ",1(O<Oq""O<!>D >> A>DN!,T^^X!F!#-28_=_T^^_="?  <: = >s   ^8^=__c                 R	   XX4/nU R                  UR                  S5      5      R                  S5      R                  SSS5      nU R	                  UR                  S5      5      R                  S5      R                  SSS5      n/ n	[        U5       H#  u  pU	R                  UR                  SS 5        M%     / nU R                  c9  [        U5       H)  u  pUR                  U R                  U
   " U5      5        M+     OU R                  U5      n/ n[        U5       H  u  pU	R                  UR                  SS 5        [        R                  " US   R                  5       UR                  SS S9R                  [        R                  5      S   nUR                  U5        Ub  M   e   / n/ nU R!                  XXV5      u  nnnn[#        UR                  S   5       H  nUS:X  a  UOUUS-
     n[%        U5      nU R&                  U   " UU   5      nU R(                  U   " UU   5      nUR                  S   S:X  a  UU-  nO&UR                  S   S:X  d   eUS	SS24==   U-  ss'   UR+                  5       nUR                  U5        UR                  U5        M     [        R,                  " U5      n[        R,                  " U5      nSnU R.                  (       a`  / n[#        UR                  S   5       H,  nU R0                  U   " UU   5      nUR                  U5        M.     [        R,                  " U5      nUS   US   S
.nU R.                  (       a  UR3                  SUS   05        U R4                  (       a:  U R                   R6                  R8                  S:  a  U R;                  UUU5      US'   U R<                  (       a  / n[#        UR                  S   5       H*  nUR                  U R>                  U   " UU   5      5        M,     [        R,                  " U5      nUS   US'   U R4                  (       a   [        US   5       H  u  nn UU   U S'   M     U R@                  (       a  SU0US'   U RB                  (       a	  UUUS.US'   US   n!US   n"U!U"4$ )a  The forward step of ViDT

Args:
    The forward expects a NestedTensor, which consists of:
    - features_0: images feature
    - features_1: images feature
    - features_2: images feature
    - features_3: images feature
    - det_tgt: images det logits feature
    - det_pos: images det position feature
    - mask: images mask
Returns:
    A dictionary having the key and value pairs below:
    - "out_pred_logits": the classification logits (including no-object) for all queries.
                    Shape= [batch_size x num_queries x (num_classes + 1)]
    - "out_pred_boxes": The normalized boxes coordinates for all queries, represented as
                   (center_x, center_y, height, width). These values are normalized in [0, 1],
                   relative to the size of each individual image (disregarding possible padding).
                   See PostProcess for information on how to retrieve the unnormalized bounding box.
r   r   r   r   r   N)sizer   .pred_logits
pred_boxespred_vectorsaux_outputs	pred_iousrU   
enc_tokens)patch_tokenbody_det_tokenneck_det_tokendistil_tokensrV   )"r5   	unsqueezesqueezepermuter6   	enumerater)   shaper4   r.   Finterpolatefloattor9   boolr   r(   inverse_sigmoidr   r   sigmoidstackr   r"   updater   r=   r>   _set_aux_lossr#   r@   r$   r%   )#rA   
features_0
features_1
features_2
features_3det_tgtdet_posmaskfeaturesshapeslesrcsrcsmasks_maskoutputs_classesoutputs_coordshsinit_referenceinter_referencesenc_token_class_unflatlvl	referenceoutputs_classtmpoutputs_coordoutputs_vectoroutputs_vectorsoutoutputs_iousoutputs_iouiauxout_pred_logitsout_pred_boxess#                                      rP   forwardDetector.forward   sm   , JC -- 1 1" 56>>rBJJq!%%b!##*72;wwq!Q/? 	  *GBMM#))BC.) + ;;$X.DOOB/45 / ;;x(D GBMM#))BC.)MMT
  "2399;EJJKELL### '  HLGWGWH+DN,.D !%C*-(8HKLJM 9NI'	2I ,,S1"S':M//#&r#w/Cr"a'y  r*a///C!G	)KKMM""=1!!-0! && O4N3 ORXXa[)!%!2!23!73!@&&~6 * #[[9N ),'+

 JJr(:;< ==T--55@@1D!%!3!3M4A4B"DC
 >>LRXXa[)##DNN3$73$@A *++l3K*2C}}'M(:;FAs'21~C$ < !.0F GC;;  $")"$$C  m,\*..    c           	          Uc*  [        US S US S 5       VVs/ s H
  u  pEUUS.PM     snn$ [        US S US S US S 5       VVVs/ s H  u  pEnUUUS.PM     snnn$ s  snnf s  snnnf )Nr   rT   )rU   rV   rW   )zip)rA   r   r   r   abcs          rP   rm   Detector._set_aux_loss)  s     ! mCR0-2DEG F$!  ! FG G !s!3]3B5G!/!46	76'!  ! !6	7 7G
7s   A A&)r   r   r   r%   r4   r.   r#   r@   r   r   r6   r5   r$   r   r"   r   r   )	FFNFN   FFF)__name__
__module____qualname____firstlineno____doc__r   r   r9   jitunusedrm   __static_attributes____classcell__rO   s   @rP   r   r      sO    Y !!Q?fD/L YY7 7r   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r   i=  z4Very simple multi-layer perceptron (also called FFN)c                    > [         TU ]  5         X@l        U/US-
  -  n[        R                  " S [        U/U-   XS/-   5       5       5      U l        g )Nr   c              3   R   #    U  H  u  p[         R                  " X5      v   M     g 7fN)r   r   ).0nks      rP   	<genexpr>MLP.__init__.<locals>.<genexpr>D  s!      $P(NBIIaOO(Ns   %')r   r   r>   r   r-   r   r<   )rA   	input_dimrF   
output_dimr>   hrO   s         rP   r   MLP.__init__@  sU    $LJN+mm $P(+YK!OQ=M(N$P Pr   c                     [        U R                  5       H;  u  p#X R                  S-
  :  a  [        R                  " U" U5      5      OU" U5      nM=     U$ )Nr   )rb   r<   r>   rd   relu)rA   xr   layers       rP   r   MLP.forwardG  sB    !$++.HA$%!(;$;uQx qA /r   )r<   r>   )	r   r   r   r   r   r   r   r   r   r   s   @rP   r   r   =  s    ?P r   r   c                     U R                  SSS9n U R                  US9nSU -
  R                  US9n[        R                  " X#-  5      $ )Nr   r   )minmax)r   )clampr9   r8   )r   epsx1x2s       rP   ri   ri   M  sI    	A1A	
S	B
a%3	B99RWr   c                     U R                  S5      u  pp4USU-  -
  USU-  -
  USU-  -   USU-  -   /n[        R                  " USS9$ )Nr   g      ?dim)unbindr9   rk   )r   x_cy_cwr   r   s         rP   box_cxcywh_to_xyxyr   T  sR    XXb\NCa
a-3q=C#'MS37]LA;;qb!!r   c                    / nU  H  n/ n[        [        US   5      5       H  n[        US   U   R                  5       5      n[	        US   U   R                  5       5      n/ nUS   U   R                  5        H  n	UR                  [	        U	5      5        M     Xa:  d  M  UR                  XgU/5        M     UR                  U5        M     U$ )Nscoreslabelsboxes)r(   r&   rf   cpuintr)   )
post_resultsbbox_thubatch_final_resper_img_resper_img_final_resr   scorelabelbboxits
             rP   get_predictionsr   [  s    O#s;x012A+h/26689EH-a04467ED!'*1-113CG$ 4 !((%)=> 3 	01 $ r   c                   ^   ^  \ rS rSrSrSU 4S jjr\R                  " 5       S 5       rSr	U =r
$ )PostProcessik  zPThis module converts the model's output into the format expected by the coco apic                 .   > [         TU ]  5         Xl        g r   )r   r   r   )rA   r   rO   s     rP   r   PostProcess.__init__n  s    *r   c           	         [        U5      [        U5      :X  d   eUR                  S   S:X  d   eUR                  5       n[        R                  " UR                  UR                  S   S5      SSS9u  pVUnXaR                  S   -  nXaR                  S   -  n	[        U5      n
[        R                  " U
SUR                  S5      R                  SSS5      5      n
UR                  S5      u  p[        R                  " XX/SS9R                  [        R                  5      nXSS2SSS24   -  n
[        XyU
5       VVVs/ s H  u  pnUUUS	.PM     nnnnU$ s  snnnf )
a  Perform the computation

Args:
    out_logits: raw logits outputs of the model
    out_bbox: raw bbox outputs of the model
    target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                  For evaluation, this must be the original image size (before any data augmentation)
                  For visualization, this should be the image size after data augment, but before padding
r   r   r   r   d   r   r   N)r   r   r   )r&   rc   rj   r9   topkviewr   gatherr_   repeatr   rk   rg   float32r   )rA   
out_logitsout_bboxtarget_sizesprobtopk_valuestopk_indexesr   
topk_boxesr   r   img_himg_w	scale_fctslr   resultss                     rP   r   PostProcess.forwards  sm    :#l"3333!!!$)))!!#$)JJIIj&&q)2.%<!!%5%5a%88
 0 0 33"8,UA'11"5<<Q1EG $**1-KKu <$%'')r%--'8 	!T1*-- VU3	5 4gaA 
 4	 	 5 5s   E)r   r   )r   r   r   r   r   r   r9   no_gradr   r   r   r   s   @rP   r   r   k  s%    [+
 ]]_# #r   r   c                     [         R                  " [        U5       Vs/ s H  n[        R                  " U 5      PM     sn5      $ s  snf )zClone a module N times )r   r-   r(   copydeepcopy)moduleNr   s      rP   r?   r?     s2     ==qBA$--/BCCBs    A)gh㈵>)g?)r   r7   r9   torch.nnr   torch.nn.functional
functionalrd   Moduler   r   ri   r   r   r   r?    r   rP   <module>r      sb   
      m7ryy m7`	"))  " ,")) ,^Dr   