
    9in                     x    S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJ	r	  S SK
Jr  SS jrSS jr " S S5      rg)	    N)mpuprint_rank_0)poissonc                 f    Uc  [        U 5      S-
  n[        USS5       H  nX   U:X  d  M  Us  $    g)N   )lenrangelstvalstartis       h/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/mglm/blocklm_utils.pyrindexr      s:    }C15"b!6S=H "     c                 ^    Uc  Sn[        U[        U 5      5       H  nX   U:X  d  M  Us  $    g)Nr   r   )r
   r	   r   s       r   index_in_listr      s4    }5#c(#6S=H $ r   c                       \ rS rSr                    SS jrS r\SS j5       rS r SS jr	 SS jr
 SS jrS	 rS
 r\S 5       rSrg)ConstructBlockStrategy    c           
         UR                   U l         X l        SU l        X0l        [        R
                  " 5       U l        [        R                  " 5       U l        SUs=::  a  S::  d   e   eX@l	        XPl
        SU-
  U-
  U l        U R                  S:  d   eX`l        Xpl        Xl        Xl        [!        SU5       Vs/ s H  n["        R$                  " UU
5      PM     snU l        Xl        Xl        Xl        Xl        UU l        UU l        UU l        UU l        UU l        U(       a  SOSU l        U R                  R=                  U R:                  5      R>                  U l        U(       a  SOSU l         U R                  R=                  U R@                  5      R>                  U l         UU l!        UU l"        [G        S	U R                   S
U R                   SU R                   SU R                   35        [G        SU R                   SU R                   SU R                   35        [G        SU R&                   35        [G        SU R(                   SU R*                   35        g s  snf )Nr                 ?r   g|۽gMASKMASKsMASKz
BERT prob z, gap sent prob z, GPT prob z, infill prob zgeneration min ratio z, block ratio z, gap sent ratio zblock length distribution zblock mask prob z, context mask ratio )$	eod_token	tokenizercountmax_seq_lengthr   get_data_parallel_rankrankget_data_parallel_world_size
world_size	bert_probgap_sentence_probgpt_probinfill_probgpt_min_ratio
bert_ratiogap_sentence_ratior
   r   pmfblock_length_distributionblock_mask_probcontext_mask_ratiocontext_mask_rangeshort_seq_probsingle_span_probblock_position_encodingencoder_decodershuffle_blockssentinel_tokengeneration_maskget_commandIdgap_sentence_maskrandom_position	masked_lmr   )selfargsr   r!   r&   r'   gpt_infill_probr*   r+   r,   average_block_lengthmax_block_lengthr/   r0   r1   r2   r3   r4   r5   r6   r7   	task_maskr<   r=   r   s                            r   __init__ConstructBlockStrategy.__init__"   sf   0 "
,..0	::< i&3&&&&&"!2I(99}}&&&**$"4 1./*
/ KK/0/*
&  /"4"4, 0'>$.,,*3w#~~99   ""$" 	,56!%!;!;"""$$&B 	."((89O9O8PP[\`\i\i[jjxy}  zJ  zJ  yK  L	
 	#D$6$6#7~dooEVVghlhh  hA  B	
 	()G)G(HI	Kt3344I$JaJaIbc	
;*
s   <!I%c                     U R                   R                  U5      nSU;   a  gSU;   a  gSU;   a  gSU;   a  gSU;   a  gSU;   a  gSU;   a  gS	U;   a  gS
U;   a  gSU;   a  gSU;   a  gg)N.T?!;:u   。u   ？u   ！u   ；u   …
F)r   	IdToToken)r>   toks     r   contains_sentence_end,ConstructBlockStrategy.contains_sentence_endl   s    nn&&s+#:#:#:#:#:C<C<C<C<C<3;r   c                 L   U[        U 5      -
  nU[        U 5      -
  S-   n[        [        U 5      5       Vs/ s H  obR                  US-   5      PM     nnUR	                  5         / n[        Xp5       H)  u  pX9-   nX9-   U
-   nUR                  X45        X:S-   -  nM+     U$ s  snf Nr   )sumr	   r
   	randrangesortzipappend)span_lengthstotal_lengthrngoffsetblank_lengthm_placesspansplacespan_lengthr   ends                r   sample_spans#ConstructBlockStrategy.sample_spans   s    #c,&773|,,q005c,6G0HI0H1--A&0HI"%f";ENE.;.CLL%&Ao%F	 #<
  Js   B!c                    UR                  U5        / nSnS/[        R                  " XR                  :H  5      S   R	                  5       -   n[        U5      n/ n[        U5       H  n	U	n
U
S-   [        U5      :  a4  UU
S-      U R                  R                  S5      R                  :X  a  U
S-  n
Xz-
  S-
  nU[        U5      :X  a  US:  a  US-  nUR                  U
S-   U45        U	nM     UR                  S S9  [        U5       GHV  u  nu  pU[        U5      S-
  :X  a  Su  pX_-   [        U5      :  aF  UUU-      U-   U-   U::  a4  XUU-      -  nUS-  nX_-   [        U5      :  a  UUU-      U-   U-   U::  a  M4  US:  a  U R                  X%X_-    UUUS9nUU-  nX_-   [        U5      S-
  :  a  [        XUS  US U U5        M  M  [        XR                   -  5      nSu  pX_-   [        U5      :  a@  UUU-      U-   U::  a1  XUU-      -  nUS-  nX_-   [        U5      :  a  UUU-      U-   U::  a  M1  US:  d  GM7  U R                  X%X_-    UUUS9nUU-  nX_-  nGMY     U$ )	Nr   r   r   ENCc                     U S   $ rR    xs    r   <lambda>@ConstructBlockStrategy.sample_span_in_document.<locals>.<lambda>   s    QqTr   keyr   r   )r[   )shufflenpwherer   tolistr	   reversedr   r9   r:   rW   rU   	enumeraterd   printintr+   )r>   tokensmasked_lengthsrZ   
mask_spans
mask_indexindices
last_index	documentsindexstart_indexlengthr   r[   current_masked_lengthcurrent_countr`   current_masked_totals                     r   sample_span_in_document.ConstructBlockStrategy.sample_span_in_document   sa   N#

$&NN":;A>EEGG[
	g&EKQV,!O2%(,(B(B5(I(L(L2Mq -1FS[(VaZ!kAov67J ' 	>*#,Y#7AC	NQ&&7;4% 03&4 $"%&')>?ANORXY *J>K=L .M M)!Q&M !03&4 $"%&')>?ANORXY !1$ --&*2LM%	 . 'E
 %'J-N0Ca0GG&"=(*5w@ H (+6OO+C'D$7;4% 03&4 $"%&')>?BVW *J>K=L .M M)!Q&M !03&4 $"%&')>?BVW !1$ --&*2LM%	 . 'E
 %'J/JO $8P r   c                 Z   [         R                  " [        U5      [        S9n[        R
                  " U5      nU R                  R                  S5      R                  n	[         R                  " [        U5      [        S9n
U H  u  p[        X5       H  nXU'   M	     SXU& M!     X*-  nXX'4$ )Ndtyper   r   )rr   aranger	   rx   copydeepcopyr   r9   r:   zerosr
   )r>   ry   
loss_masksattention_maskblock_spansrZ   taskposition_idstargetsmask_id	mlm_masksr   rc   idxs                 r   make_masked_data'ConstructBlockStrategy.make_masked_data   s     yyVC8--'..,,V477HHS[4	%JEU(%s )#$IC  &  +

88r   c           	         [        U5      n[        R                  " [        U5      [        S9nU H  u  pSXS-   U
& M     [        R                  " U5      S-
  nU R
                  (       a>  US   U R                  S-
  :  a(  U R                  US   -
  nUR                  SU5      nX-   nU R                  (       d  U R                  (       d  UR                  S S9  OUR                  U5        U R                  (       a'  [        U5       VV	V
s/ s H  u  nu  pXU4PM     nn	nn
OU V	V
s/ s H	  u  pXS4PM     nn	n
/ / / / 4u  pnnU GH  u  pnUS:X  a  SOSU 3nUR                  U R                  R!                  U5      R"                  /5        [$        R&                  " XU
 5      nU R(                  S:  an  US	:X  ah  [+        [        U5      5       HP  n[,        R,                  " 5       U R(                  :  d  M(  U R                  R!                  S
5      R"                  UU'   MR     UR                  U5        UR                  XU
 5        UR                  U R                  R!                  S5      R"                  /5        U R                  (       d+  XU
 nUR                  U5        UR                  US   /5        O$UR                  U R                  /X-
  S-   -  5        U R.                  (       a1  UR                  [        R0                  " SX-
  S-   [        S95        GM  UR                  S/X-
  S-   -  5        GM     UR                  S S9  / / / nnnSu  nnU H  u  pnUS:X  a  U R2                  nOEUS:X  a  U R4                  nO2US:X  a  SOSU 3nU R                  R!                  U5      R"                  nUR                  UUU	-   U-
  45        UR                  UUU	 5        UR                  U/5        UR                  UUU	 5        UR                  X   /5        UU	U-
  S-   -  nU
nM     U[        U5      :  aJ  UR                  UU[        U5      -   U-
  45        UR                  UUS  5        UR                  UUS  5        [7        [9        [         U5      5      nUb  UU:X  d   eU(       a]  U R:                  [        R<                  " U5      R?                  5       ;   a+  [A        SU R                  RC                  U5      5        [D        eU R                  (       aO  UU R                  R!                  S5      R"                  /-   n[        R                  " [        U5      [        S9nUX4$ [        R<                  " UU-   5      nUS	:X  a  U RF                  S:  a  [I        5       nU Hy  u  pU	S:w  a3  [K        XU RL                  -   5      nURO                  [+        U	U5      5        U
S:w  d  MF  [Q        XU RL                  -
  5      nURO                  [+        UU
5      5        M{     URS                  U[        U RF                  U-  5      5      n U  H+  n!U R                  R!                  S
5      R"                  UU!'   M-     [        R<                  " UU-   5      n[        R                  " [        U5      [        S9nSUS U& [        R<                  " UU-   5      n[        R<                  " [        RT                  " U[        S9/U-   5      n"[        RV                  " UU"/SS9nUb  UUX(4$ UUX(U4$ s  sn
n	nf s  sn
n	f )Nr   r   r   r   c                     U S   $ Nr   ri   rj   s    r   rl   8ConstructBlockStrategy.make_block_data.<locals>.<lambda>   s    1Q4r   rn   sopr   bertdBLOCKeop   c                     U S   $ r   ri   rj   s    r   rl   r     s    qtr   rp   
generationgap_sentencer   zFound EOS in targetaxis),r	   rr   onesrx   cumsumr<   r!   rT   r5   r6   rU   rq   r7   rv   rW   r   r9   r:   r   r   r/   r
   randomr4   r   r8   r;   rS   mapr   concatenatert   rw   	DecodeIdsRuntimeErrorr0   setminr1   updatemaxsampler   stack)#r>   ry   r   r   r   rZ   r   text_lengthr   r   rc   position_biasr   target_tokenstarget_position_idstarget_block_position_idsr   	sop_tokenspan_tokenssub_idxtarget_position_idsource_tokenssource_position_idslocal_spanslastcurrent_lengthr   
mask_tokensource_lengthmask_candidates	local_endlocal_startmask_posposblock_position_idss#                                      r   make_block_data&ConstructBlockStrategy.make_block_data   s    &kwws6{#6%JE*+L3' &yy.2L$4t7J7JQ7N$N //,r2BBMMM!];M'7Lt':':0KK$4=k4JL4J0sLU ",4J  LK >II[zuE?[KIQSUWY[]_Q_N,Ew*OE!$SE{I  $.."<"<Y"G"J"J!KL--S(9:K##c)dfn$S%56G}})=)==/3~~/I/I$0&&(b $G,  7   -NN6,-NNDNN66u=@@AB&&%1%<"#**+=>#**,>q,A+BC#**D,?,?+@,/K!O,= >++)00IIaq<> *00!a1HI/  +0 	^,:<b"K*#n*OE|#..'00'*axVtC5\
..44Z@CC0F0MNO  U!34  '+&&|D'?@&&(;'<=edlQ..ND  + #f+#f+!=!DEG  /&&|DE':;C]34% N222T^^r~~0%vx(')A)A&)IJ)**5144- M ]!33?J -;;^^MM$ABFv~$"9"9A"="%%"-JEz$'T5L5L-L$M	'..uUI/FGax&)%t7N7N1N&O'..u[#/FG #. ::#//+=>@ $C"&.."<"<X"F"I"IF3K $nn]W%<=GVC8J)*J~&>>*=,?+@ AL!#-s34+,"- 88\3E$FQOL)w
@@w
-OO{L Js   \5)\<c           	      `   UR                  U5        US   US   pvUS   U R                  R                  S5      R                  :X  d   eU R	                  XbU5      n[        U5      [        U5      :  a  g U R                  (       a  U R                  XgUX5      n	U	$ U R                  UUUUUUS9n	U	$ )Ntext	loss_maskr   rg   r   )	rq   r   r9   r:   r   r	   r=   r   r   )
r>   r   rz   r   rZ   r   ry   r   r   datas
             r   generate_blank_data*ConstructBlockStrategy.generate_blank_dataV  s     	N##F^VK-@
ayDNN66u=@@@@@2263O{c.11>>((^)4;D  '' ( D r   c                    UR                  SU R                  S-
  5      nU R                  S-
  U-  n/ nU R                  R                  S5      R                  nU R                  R                  S5      R                  nU GH  nUS   SS  US   SS  p[        U5       GHc  nU[        U	5      :  a  XpGOUR                  S[        U	5      U-
  5      nUS:  ak  X   U:X  d&  U R                  XS-
     5      (       dH  XS-
     U:X  d=  US-  nUS:  a2  X   U:X  a  M  U R                  XS-
     5      (       d  XS-
     U:X  d  M=  X-   nX:  aX  U R                  XS-
     5      (       d=  XS-
     U:X  d2  US-  nX:  a(  U R                  XS-
     5      (       d  XS-
     U:X  d  M2  X-
  US-  :  a  X-   nU	X U
X p[        R                  " U/U45      n[        R                  " S/U45      nUR                  UUS	.5        GMf     GM     U$ )
Nr   r   rg   eosr   r   r   r   )r   r   )rT   r!   r   r9   r:   r
   r	   rO   rr   r   rW   )r>   samplesrZ   target_length
num_splitsnew_samplescls_ideos_idr   ry   r   r^   
new_tokensnew_loss_masksrandom_start
random_ends                   r   split_samples$ConstructBlockStrategy.split_sampleso  sK   b$*=*=*AB))A--?
++E255++E255F!'!3VK5H5LJ:& CK/17#&==14V}1L$NL&*"0F:!%!;!; &a'7 8": ":!'q(8!9V!C$) '*"0F:!%!;!; &a'7 8": ":!'q(8!9V!C!-!=J$3 66v1n7MNN%1n5?"a
 %3 66v1n7MNN%1n5?!0=A3EE%1%A
17$212<(35 !/  ^^fXz,BC
!#!n0E!F""&!/$ 1 ' < r   c                 
   [         R                  R                  R                  5       nUb  UR                  UR
                  pCOSu  p4[        R                  " U R                  U-  U-   U R                  -  U R                  -   5      nU =R                  S-  sl        / / / / 4u  pgp/ / pzUR                  5       U R                  :  a  U R                  X5      nUR                  5       nXR                  :  nU(       a  SOUR                  5       n/ nXR                  :  Ga   SnU GH  nU(       aD  UR                  [!        S[#        U R$                  5      S-   5      U R$                  S9S   /nUS   nO/ SnnU['        U R(                  [#        US   5      -  5      :  a}  UR                  [!        S[#        U R$                  5      S-   5      U R$                  S9S   nUR+                  U5        UU-  nU['        U R(                  [#        US   5      -  5      :  a  M}  U R,                  (       a  [#        US   5      nO[#        US   5      U-
  [#        U5      -   nU R/                  UUUUSS9nUc  GMN  U R0                  (       a:  Uu  nnnU
R+                  U5        UR+                  U5        UR+                  U5        OKUu  nnnnUR+                  U5        UR+                  U5        UR+                  U5        U	R+                  U5        UR+                  U5        GM     GOXR                  U R2                  -   :  Ga(  S	nU GH  nUS   US
   nn/ nUS   U R4                  R7                  S5      R8                  :X  a  SOSn[!        [#        U5      5       Hu  nU R;                  UU   5      (       a&  UUS-   :  a  UR+                  UUS-   45        US-   nMB  UU   U R4                  R7                  S5      R8                  :X  d  Mp  US-   nMw     U[#        U5      :  a  UR+                  U[#        U5      45        U(       dJ  [         R<                  R?                  5       S:X  a(   [A        U R4                  RC                  USS  5      5        URG                  U5        / SnnU HH  u  nn UR+                  UU 45        UU U-
  -  nU['        U RH                  [#        U5      -  5      :  d  MH    O   U RK                  UUS UUSS9nUu  nnnnnUR+                  U5        UR+                  U5        UR+                  U5        U	R+                  U5        UR+                  U5        GM      GOSnURM                  ['        U RN                  [Q        [S        S U5      5      -  5      [U        [S        S U5      5      S-
  5      n!U GHg  n[Q        U![#        US   5      S-
  5      n"UR+                  [#        US   5      U"-
  S-   5        [W        US   U R4                  R7                  S5      R8                  5      S[#        US   5      S-
  4;  n#U#(       d  UR                  5       U RX                  :  GaF  [#        US   5      U"-
  n$US   US
   nnUS U$ UU$S  nnUU$S  n%[Z        R\                  " UU R^                  U R4                  R7                  S5      R8                  /US S 45      n[Z        R\                  " UU R^                  /U45      n[Z        R\                  " [Z        R`                  " [#        U5      S-   [&        S9U%45      nUR+                  U5        UR+                  U5        UR+                  U5        [Z        Rb                  " [#        U5      [#        U5      -   S-   [&        S9n[#        U5      U[#        U5      S-   S & U Rd                  (       a\  [Z        R\                  " [Z        R`                  " [#        U5      [&        S9[Z        Rb                  " [#        U5      S-   [&        S945      n&O^[Z        R\                  " [Z        R`                  " [#        U5      S-   [&        S9[Z        Rf                  " [#        U5      S-   [&        S945      n&U	R+                  [Z        Rh                  " UU&/SS95        GM  U R/                  UU"/US   USS9u  nnnnUR+                  U5        UR+                  U5        UR+                  U5        U	R+                  U5        Ub  GMZ  [A        UU"U#5        GMj     U R0                  (       ai  [         Rj                  " U
[         Rl                  S9[         Rj                  " U[         Rl                  S9[         Rj                  " U[         Rl                  S9S.$ U Ro                  XgX5      u  pgp[         Rj                  " U[         Rl                  S9[         Rj                  " U[         Rl                  S9[         Rj                  " U[         Rl                  S9[         Rj                  " U	[         Rl                  S9[         Rj                  " U[         Rl                  S9US.$ ! [D         a    [A        USS  5         GNf = f)N)r   r   r   r   r   )weightsr   r   r   sentencer   rg   r   r   gptc                     [        U S   5      $ Nr   r	   rj   s    r   rl   9ConstructBlockStrategy.construct_blocks.<locals>.<lambda>  s    AfIr   c                     [        U S   5      $ r   r   rj   s    r   rl   r     s    #ai.r   r   r   r   r   r   r   )r   targetr   )r   r   r   position_idr   mode)8torchutilsr   get_worker_infoidnum_workersr   Randomr    r%   r#   r2   r   r3   r&   choicesr
   r	   r.   rx   r+   rW   r=   r   r5   r'   r   r9   r:   rO   distributedget_rankrw   r   
IndexErrorrq   r,   r   randintr*   r   r   r   r   r)   rr   r   r8   r   r   r4   r   r   tensorlong	pad_batch)'r>   r   worker_info	worker_idr   rZ   token_batchtarget_batchloss_mask_batchposition_id_batchsource_batchrandsingle_spanr   r   r   rz   masked_countblock_lengthsepr   r   r   r   ry   r   r   sentence_spansr~   r   r   r   rc   max_generation_lengthgeneration_lengthmultiple_docdivisiontarget_masksr   s'                                          r   construct_blocks'ConstructBlockStrategy.construct_blocks  s	   kk&&668"%0^^[5L5L{%)"ImmTZZ+5	A"oo.04		: ;

a
HJBPRTVE?%'l::<$---((6Gzz|222!sszz|.. D!!!"%d&D&D"E"IK$($B$B $ D EFG&N $2!#4L35qLN& OOc&..AA*C C'*{{!!"%d&D&D"E"IK$($B$B (3 (D EF(G '--l;$4 ' OOc&..AA*C C >>fVn-Cv(*679<^9LMC//NC6 0 C#++CG@}j$++M:$++M:'..z:DHA\#**62$++G4'..z:)00>"))#.M "P NNT%;%;;;D!%+F^VK5H
!#"()t~~/I/I02#Q#$ s6{+A11&)<<%A-*11:q1u2EF%&U
dnn&@&@&G&J&JJ%&U
 , F+")):s6{*CD%%*;*;*D*D*F!*K*dnn66vabzBC N+,.\"0JE3&&s|4 C%K/L#s 33c&kA(C C #1 ++' , ) BF>\3""6*##G,&&z2!((6%%c*O "^ D$'KKD&&#6@AB CC0':;a?%A! "$'(=(+F6N(;a(?%A!%%v'*;;a?A,6NNN..u588 : Cv/!3B 
  3::<$2B2B#B"6&>25FFH)/9LJF39)83Df 	G#=M#-hi#8L^^],,2259<<= %Sb)-+ ,F !nn&)=)=(>NPG!##m"4q"8(+-.:<"=J  &&v. ''0#**:6#%99M*S-??!C3$PL<?<NL]!3a!7!8933-/^^XXc-&8DYYs='9A'=SIK.L* .0^^XXc-&81&<CHWWS%7!%;3GI.J* &,,,0B!C!LN AE@X@X!2 3&r*)	 AY A+=FGZ
  &&v. ''0#**:6%,,\:~f&7Fi "j \D,,|5::F"\\/L  MQNN?MOIK [

C,,|5::F"\\/L.ejjA^5::>	 	C & *fQRj)*s   $'i&&jjc                    [        [        [        U 5      5      nUR                  US   5      [        U5      :w  Ga;  [	        U5      nU  Vs/ s H>  n[
        R                  " U[
        R                  " U[        U5      -
  [        S945      PM@     n nU Vs/ s H>  n[
        R                  " U[
        R                  " U[        U5      -
  [        S945      PM@     nnU Vs/ s H>  n[
        R                  " U[
        R                  " U[        U5      -
  [        S945      PM@     nnU V	s/ s HB  n	[
        R                  " U	[
        R                  " SXYR                  S   -
  4[        S94SS9PMD     nn	XX#4$ s  snf s  snf s  snf s  sn	f )Nr   r   r   r   r   )
listr   r	   r    r   rr   r   r   rx   shape)
r   r   r   r  seq_lengths
max_lengthry   r   r   r   s
             r   r    ConstructBlockStrategy.pad_batchO  s    3sK01[^,K0@@[)J * *F RXXj3v;&>cJKM)    ,  ,G bhhzCL'@LMO+   #2	 #2J XXj3z?:#FHI #2	   1B!
 1B	 !XXZ*<*<Q*??@MN 	 1B  ! /LL/

!s   AE;AF !AF,A	F
)r&   r+   r.   r/   r4   r1   r0   r    r5   r   r;   r'   r,   r8   r*   r(   r)   r=   r!   r<   r#   r7   r2   r6   r3   r   r%   N)r   r         ?r  333333?r     (   r   r   r  r   r   TFTFFFF)r   )r   )__name__
__module____qualname____firstlineno__rD   rO   staticmethodrd   r   r   r   r   r   r  r   __static_attributes__ri   r   r   r   r       s     #&!$" $(&'"$!$$'$% #"%)-!& $ % !& /H
T4  :D %90 $rPr "(2$Lxt M Mr   r   )N)r   mathr   numpyrr   r   torch.utils.datamegatron_utilr   r   scipy.statsr   r   r   r   ri   r   r   <module>r%     s7          + L	M L	Mr   