
    9iKF                        S r SSKrSSKrSSKrSSKJr  SSKJr  SSKr	SSK
r
SSKr
SSKJrJr  SSKJr  SSKJr  SS	KJr   " S
 S\
R*                  R,                  R.                  5      r " S S5      rS r  SS jrS rS rS rS rS r g)z&parses arguments and preps data loader    N)bisect_right)
accumulate)mpuprint_rank_0   )
data_utils)ConstructBlockStrategy)make_tokenizerc                   N   ^  \ rS rSr   SU 4S jjrS r\S 5       rS rSr	U =r
$ )MultiTaskDataset    c           	      
  > [         [        U ]  5         Xl        X l        X0l        X@l        U Vs/ s H  n[        U5      PM     snU l        [        R                  " U R                   Vs/ s H  n[        Xu5      U-  PM     sn5      U l        [        U R                  5      U l        [        [!        U R                  5      5      U l        U R
                  (       a>  [%        [        ['        U R                  U R                  U R                  5      5      5        O2[%        [        ['        U R                  U R                  5      5      5        U =R                  U R                  R                  5       -  sl        g s  snf s  snf N)superr   __init__tasksdatasetsreweighttemperaturelenlensnparrayminweightssum	total_lenlistr   cumulative_lensr   zip)	selfr   r   r   r   	max_limitdatasetlength	__class__s	           i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/mglm/configure_data.pyr   MultiTaskDataset.__init__"   s     	.0
  &19:gS\:	xx?CyyIyVS#[0yIKTYY#Jtyy$9:==c$**diiFGHc$**dii89:((** ;Is   E;+F c                      U R                   S-  $ )Ni  )r   )r!   s    r&   __len__MultiTaskDataset.__len__8   s    ~~$$    c                    U S   nU S   nU S   nU S   nU S   nU S   n[        UR                  5      S:X  a  X   nX&   nX6   nXF   nXV   nOX6   nUR                  (       d  UR                  [        U5      5      nUUUUUS.$ )	Ntext
logit_masktargetmaskpositionlabel   )r-   r/   	loss_maskposition_idattention_mask)r   shaperepeat)datar-   r4   r/   r6   r5   r2   s          r&   pet_wrapperMultiTaskDataset.pet_wrapper;   s    F|&	hf:&Wtzz?a;D!(I]F+2N%,K]F||]]3t9-F"&,
 	
r+   c           	         U R                   (       a  [        R                  " U5      n[        R                  R	                  [        S5       Vs/ s H  o2R                  SS5      PM     snS9nUR                  [        R                  " [        U R                  5      5      U R                  S9nU R                  U   nUR                  [        R                  " [        U5      5      5      nU R                  U   U   nOE[        U R                  U5      nUS:X  a  UnOXR                  US-
     -
  nU R                  U   U   nU R                  U5      nU$ s  snf )N   r   l    )seed)pr   )r   randomRandomr   RandomStaterangerandintchoicearanger   r   r   r   r   r:   )r!   idxrng_dataset_idxr#   
sample_idxitems           r&   __getitem__MultiTaskDataset.__getitem__U   s   ==--$C))''9>rCAkk!Y/C ( EC**		#dmm,- % ?KmmK0GBIIc'l$;<J==-j9D&t';';SAKa 
 #7#7a#HH
==-j9D% Ds   E)r   r   r   r   r   r   r   r   )Tg?i@ )__name__
__module____qualname____firstlineno__r   r)   staticmethodr:   rM   __static_attributes____classcell__r%   s   @r&   r   r       s6    
  !+,% 
 
2 r+   r   c                   >   ^  \ rS rSrSU 4S jjrS rS rS rSrU =r	$ )
DataConfigj   c                 @   > [         [        U ]  5         Uc  0 nXl        g r   )r   rX   r   defaults)r!   r[   r%   s     r&   r   DataConfig.__init__l   s     j$(*H r+   c                     [         R                  R                  5       S:X  a  [        S5        U R	                  U5        [        X5      $ )Nr   zconfiguring data)torchdistributedget_rankprintapply_defaultsmake_loaders)r!   args	tokenizers      r&   applyDataConfig.applyr   s;    %%'1,$%D!D,,r+   c                 R    UR                  5        H  u  p#X0R                  U'   M     g r   )itemsr[   )r!   kwargskvs       r&   set_defaultsDataConfig.set_defaultsx   s     LLNDA MM! #r+   c                     U R                   R                  5        H5  u  p#UR                  SS5      n[        X5      (       a  M)  [	        XU5        M7     g )N-rI   )r[   ri   replacehasattrsetattr)r!   rd   rk   rl   s       r&   rb   DataConfig.apply_defaults|   s@    MM'')DA		#s#A4### *r+   r[   r   )
rO   rP   rQ   rR   r   rf   rm   rb   rT   rU   rV   s   @r&   rX   rX   j   s    !-!$ $r+   rX   c                 .   SnU R                   (       a  U R                  n[        U R                  S U R                  U R
                  U R                  U R                  U R                  UU R                  U R                  S:  =(       d    U R                  S:  S9
n[        R                  " 5       S:X  a  UR                  nUR                  S5      R                   nXBR                  S5      R                   :X  d   eUnUnU R"                  nXg-  S:w  a  US-  nXg-  S:w  a  M  [%        SR'                  XVU-
  U5      5        [%        SR'                  U5      5        [(        R*                  R-                  Xd/5      nO![(        R*                  R-                  SS/5      n[(        R.                  R1                  U[        R2                  " 5       [        R4                  " 5       S	9  US   R7                  5       nUS   R7                  5       nX4sU l        U l        U$ )
Nr           )add_block_symbols	cache_diradd_sentinel_tokenadd_task_maskadd_decoder_maskeospadr   z=> padded vocab (size: {}) with {} dummy tokens (new size: {})z!> found end-of-document token: {}group)sentinel_tokenmax_position_embeddingsr
   tokenizer_typetokenizer_path
vocab_sizetokenizer_model_typeblock_lmry   	task_maskblock_mask_probcontext_mask_ratior   get_model_parallel_rank
num_tokensget_commandIdmake_vocab_size_divisible_byr   formatr^   cuda
LongTensorr_   	broadcastget_model_parallel_src_rankget_model_parallel_grouprL   	eod_token)	rd   rz   re   r   r   beforeaftermultipletoken_countss	            r&   prepare_tokenizerr      s   !99!!--..-nn--3 )""S(*I ""$)))
))%033	11%8;;;;;44A%QJE A% --3VFFN49.;	< 	8??	JKzz,,e-?@zz,,aV4	'')**,   . a%%'JQ$$&I&0#DOT^r+   c           	         [         R                  R                  [        R                  " 5       S9n[         R                  R                  [        R                  " 5       S9nUR                  b*  XR                  -  nXtR                  -  nX$R                  -  nUS:  n	UR                  (       a+  [        R                  R                  [        U 5      X2X5      n
OU(       a9  [        R                  R                  U SX$R                  -  UR                  -  S9nO)[         R                  R                   R#                  U 5      nU	nU	(       a-  [        R                  R%                  UUUUUUR                  S9n
O*[         R                  R                   R'                  XU5      n
S nU(       Ga  [)        UUUR*                  40 SUR,                  _SUR.                  _SUR0                  _S	UR2                  _S
UR4                  _SUR6                  _SUR8                  _SUR:                  _SUR<                  _SUR>                  _SUR@                  (       + _SURB                  (       + _SURD                  _SURF                  _SURH                  _SURJ                  _SURL                  _6RN                  n[         R                  R                   RQ                  U U
URR                  SUS9nU$ )Nr   r   T)replacementnum_samples)gradient_accumulation_steps	bert_probgap_sentence_probgap_sentence_ratiogpt_infill_probaverage_block_lengthgpt_min_ratior   r   short_seq_probsingle_span_probshuffle_blocksblock_position_encodingr   encoder_decoderr   random_position	masked_lm)batch_samplernum_workers
pin_memory
collate_fn)*r^   r_   get_world_sizer   get_data_parallel_groupr`   loader_scattertransformer_xlr   samplersDistributedSequentialSamplerr   RandomSamplertrain_itersr   utilsr9   SequentialSamplerDistributedBatchSamplerBatchSamplerr	   
seq_lengthr   r   r   r   avg_block_lengthr   r   r   r   r   no_shuffle_blockno_block_positionr   r   r   r   r   construct_blocks
DataLoaderr   )r#   re   
batch_size	num_itersrd   shuffleblock_collate
world_sizerankr_   r   sampler	drop_lastr   data_loaders                  r&   make_data_loaderr      s    ""11))+ 2 -J%%C,G,G,I%JD&***#6#66
#6#66
q.K"++HHL)C  ))77 &)9)99223 8 4G kk&&88AG	&//GG,0,L,L H NM "KK,,99Y0MJ+OO& nn	&
 #44&  $66& !00& "&!6!6& ,,& !00&  $66&  ..& "22&  $444& )-(>(>$>&   ..!&" !00#&$ nn%&& !00'&( nn)&( '7&6) 	* ++""--#$$ . K r+   c           	      
   SSK nU R                  U R                  U R                  S[	        U R
                  S5      U R                  U R                  -   S-   U R
                  S:  S.nUR                  R                  " U R                  40 UD6nSUS'   U R                  b  U R                  US'   U R                  b  U R                  US	'   SnU R                  b'  UR                  R                  " U R                  40 UD6nSnU R                  b'  UR                  R                  " U R                  40 UD6nUR                  " U R                   UU R"                  U R$                  U R&                  U R(                  S
9nX4U4U4$ )z3Load train/val/test dataset from shuffled TFRecordsr   NTr   )r   max_seq_lenmax_preds_per_seqtrainr   r>   threaded_dlFr   r   r   )ry   )data_utils.tf_dlr   r   r   maxr   r>   r   tf_dlTFRecordDataLoader
train_dataeval_seq_lengtheval_max_preds_per_seq
valid_data	test_datar
   r   r   r   r   ry   )rd   r   data_set_argsr   validtestre   s          r&   make_tfrecord_loadersr      s~    oo!334++Q/		DII%)''!+M // A2?AE"M'''+';';m$"".-1-H-H)*E"  33DOO E6CED~~!224>> D5BD))!!.."I $**r+   c           
         U R                   (       a  [        U 5      $ [        R                  R	                  [
        R                  " 5       S9nU R                  b  X R                  -  S:X  d   eU R                  U-  nUnU R                  b  U R                  U-  nU R                  nUS:  a  XR-  nU R                  nUb
  US:  a  Xb-  n[        U 5      n0 SU R                  _SU_SU R                  _SU R                  _SU R                   _SS	_S
U R"                  _SU_SU R$                  _SU R&                  _SU R(                  _SU R*                  _SU R,                  _SU R.                  (       + _SU_SU R0                  _SU R2                  _U R4                  U R6                  U R                  [
        R8                  " 5       U R:                  U R<                  S.En[>        R>                  " U5      n	S/U	S'   U(       a  XiS'   U R@                  (       a  U R@                  U	S'   U RB                  b  U RB                  U	S'   Su  pnU R                  b:  [D        RF                  " S0 UD6n
[D        RH                  " U5      (       a  U
u  pnXS'   Uc6  U RJ                  b)  U RJ                  U	S'   [D        RF                  " S0 U	D6nXS'   Uc2  U RL                  b%  U RL                  U	S'   [D        RF                  " S0 U	D6nU RN                  =(       d    U RP                  nU
b;  U R                  S:  a+  [S        U
UUU RT                  U U RV                  US9n
SU l,        OSU l,        US:w  a  UOUnUb+  [S        UUUU RT                  U U RV                  US9nSU l-        OSU l-        Ub0  [S        UUU[]        U5      U-  S-   U U RV                  US9nSU l/        OSU l/        XU4$ )zmakes training/val/testr   r   pathr   
mem_lengthdelimtext_key	label_keyr2   ds_typesplitlooser   presplit_sentencessample_one_documentfilter_englishpre_tokenizere   save_splitsload_splits)save_test_datano_lazy_loaderr   data_parallel_ranknon_sentence_starthalf_lazy_loader      ?)NNN)r   r   TFr    )0use_tfrecordsr   r^   r_   r   r   r   r   r   eval_batch_sizer   r   	get_splitr   r   r   r   data_set_type
loose_jsonr   r   r   r   no_pre_tokenizer   r   r   r   get_data_parallel_rankr   r   copyr   eval_text_keyr   make_datasetshould_splitr   r   r   r   r   r   r   do_traindo_validr   do_test)rd   re   r   r   r   r   r   r   r   eval_set_argsr   r   r   	use_blocks                 r&   rc   rc     s:    $T**""11))+ 2 -J&///1444:-J O'..;JA~,
**O"':)6dOEj 	doo 		
 	DMM 	W 	4%% 	 	 	T33 	d55 	t77 	$-- 	D000 	Y  	t''!" 	t''#$ ------!88:"55 11/M4 IIm,M TM' &5l#""-1-H-H)*%$($6$6j! *E$"''8-8""5))!&E$%.k" }4 $f''8-8%.k"|2 $f&&77 5!5!5IT__q0 LL#% )8A)=o:O LL#% I(1,LL#% r+   c                 6   SSSSSSSSS	S
SSS.nSu  p4[         R                  " 5       S:X  Gaj  U R                  nU R                  b  U R                  n/ / pvU R                   H{  nUR                  5       n[        R                  R                  U R                  X(   5      n	UR                  [        U UU	USUSS95        UR                  [        U UU	USUSS95        M}     [        U R                  U5      n[        U R                  U5      n[        R                  R                  [         R                   " 5       S9n
U R"                  U
-  nU R$                  b  U R$                  U
-  n['        UUUU R(                  U SS9n['        UUUU R(                  U SS9nX44$ )NMNLICoLAMRPCQNLIQQPzSST-2Agnewsyelp_review_polarity_csvyelp_review_full_csvYahooSQuADRACE)mnlicolamrpcqnliqqpsst2agnewszyelp-polarityz	yelp-fullyahoosquadrace)NNr   r   T)pattern_ensembledevr   )r   )r   r   r   multi_seq_lengthmulti_task_datalowerosr   joindata_dirappendSuperGlueDatasetr   r^   r_   r   r   r   multi_batch_sizer   r   )rd   re   	task_dirsr   r   r  train_datasetsvalid_datasetstaskr"  r   r%  s               r&   build_multi_task_datasetr*    s   3+I LE
""$)??  ,#44)+R((D::<Dww||DMM9?CH!! $%)+, !! $%)+, )* !!5!5~F !5!5~F&&55--/ 6 1
??Z7  ,#44zA  ! <r+   c                    / nU R                   R                  S5      S:w  a5  U R                   R                  S5       Vs/ s H  n[        U5      PM     nnOjU R                   R                  S5      S:w  a5  U R                   R                  S5       Vs/ s H  n[        U5      PM     nnO[        U R                   5      /n[        U5      nUS:  a  UR	                  SU-
  5        [        U5      S:  a"  UR	                  S5        [        U5      S:  a  M"  USS nU R                  b  SUS'   U R                  b  SUS	'   [        U5      nU Vs/ s H  o"U-  PM	     sn$ s  snf s  snf s  snf )
z5
Get dataset splits from comma separated string list
,/r   r      rw   Nr3   )r   findfloatr   r#  r   r   r   )rd   splitsssplit_total	final_sums        r&   r   r     s=    Fzzsr!$(JJ$4$4S$9:$9q%($9:			#$(JJ$4$4S$9:$9q%($9:

#$f+KRa+o&
f+/b f+/BQZF"q	~~!q	FI#)*6a	M6**! ;: +s   E E%E*c            
      ,    SSSSSSSSSS.	n [        U S	9$ )
z*add cmdline flags for configuring datasetsr   r-  r   F
supervised   d   )	r   r   persist_statelazy	transposer   r   r   samples_per_shardru   )rX   ru   s    r&   configure_datar>    s3     % 
H x((r+   )FF)!__doc__r   r   r@   bisectr   	itertoolsr   numpyr   r^   torch.utils.datamegatron_utilr   r    r   blocklm_utilsr	   data_utils.tokenizationr
   r   r9   Datasetr   rX   r   r   r   rc   r*  r   r>  r   r+   r&   <module>rI     s    -  	        +  1 3Gu{{''// GT$ $2(` ##(FR$+Ntn?D+2)r+   