"""argparser configuration"""

import argparse
import json
import os

import deepspeed
import torch

from .utils import get_hostname


def add_model_config_args(parser):
    """Model arguments"""

    group = parser.add_argument_group('model', 'model configuration')

    group.add_argument('--transformer-xl', action='store_true',
                       help='use transformer-xl for training')
    group.add_argument('--pretrained-bert', action='store_true',
                       help='use a pretrained bert-large-uncased model instead '
                       'of initializing from scratch. See '
                       '--tokenizer-model-type to specify which pretrained '
                       'BERT model to use')
    group.add_argument('--encoder-decoder', action='store_true',
                       help='use the encoder-decoder architecture for blocklm')
    group.add_argument('--attention-dropout', type=float, default=0.1,
                       help='dropout probability for attention weights')
    group.add_argument('--num-attention-heads', type=int, default=16,
                       help='num of transformer attention heads')
    group.add_argument('--hidden-size', type=int, default=1024,
                       help='transformer hidden size')
    group.add_argument('--intermediate-size', type=int, default=None,
                       help='transformer embedding dimension for FFN; '
                       'set to 4*`--hidden-size` if it is None')
    group.add_argument('--num-layers', type=int, default=24,
                       help='num decoder layers')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='layer norm epsilon')
    group.add_argument('--hidden-dropout', type=float, default=0.1,
                       help='dropout probability for hidden state transformer')
    group.add_argument('--output-dropout', type=float, default=0.1,
                       help='dropout probability for pooled output')
    group.add_argument('--max-position-embeddings', type=int, default=512,
                       help='maximum number of position embeddings to use')
    group.add_argument('--vocab-size', type=int, default=30522,
                       help='vocab size to use for non-character-level '
                       'tokenization. This value will only be used when '
                       'creating a tokenizer')
    group.add_argument('--deep-init', action='store_true',
                       help='initialize bert model similar to gpt2 model. '
                       'Scales initialization of projection layers by a '
                       'factor of 1/sqrt(2N). Necessary to train bert '
                       'models larger than BERT-Large.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficiency reasons.')
    group.add_argument('--cpu-optimizer', action='store_true',
                       help='Run optimizer on CPU')
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')

    return parser


def add_fp16_config_args(parser):
    """Mixed precision arguments."""

    group = parser.add_argument_group('fp16', 'fp16 configurations')

    group.add_argument('--fp16', action='store_true',
                       help='Run model in fp16 mode')
    group.add_argument('--fp32-embedding', action='store_true',
                       help='embedding in fp32')
    group.add_argument('--fp32-layernorm', action='store_true',
                       help='layer norm in fp32')
    group.add_argument('--fp32-tokentypes', action='store_true',
                       help='embedding token types in fp32')
    group.add_argument('--fp32-allreduce', action='store_true',
                       help='all-reduce in fp32')
    group.add_argument('--hysteresis', type=int, default=2,
                       help='hysteresis for dynamic loss scaling')
    group.add_argument('--loss-scale', type=float, default=None,
                       help='Static loss scaling, positive power of 2 values '
                       'can improve fp16 convergence. If None, dynamic '
                       'loss scaling is used.')
    group.add_argument('--loss-scale-window', type=float, default=1000,
                       help='Window over which to raise/lower dynamic scale')
    group.add_argument('--min-scale', type=float, default=1,
                       help='Minimum loss scale for dynamic loss scaling')
    group.add_argument('--attention-scale', type=float, default=1.0)

    return parser

*DMr   c                    U R                  SS5      nUR                  S[        SSS9  UR                  S[        SS	S9  UR                  S
[        SS	S9  UR                  S[        SSS9  UR                  SSSS9  UR                  S[        SSS9  UR                  SSSS9  UR                  S[        SSS9  UR                  S[        SSS9  UR                  S[        SSS9  UR                  S [        S!S"9  UR                  S#[        S$S%S9  UR                  S&[        S'S(S9  UR                  S)[        S*S+S9  UR                  S,SS-S9  UR                  S.SS/S9  UR                  S0[        SS1S9  UR                  S2[        S3/ S4QS5S69  UR                  S7[        S8S"9  UR                  S9[        S:S;S9  UR                  S<[        SS=S9  UR                  S>SS?S9  UR                  S@[        SSAS9  UR                  SBSSC9  UR                  SD[        SSES9  UR                  SF[        SGSHS9  UR                  SISSJS9  UR                  SKSSLS9  UR                  SM[        SSNS9  UR                  SOSSPS9  UR                  SQSSRS9  UR                  SSSSTS9  UR                  SUSSVS9  UR                  SWSSXS9  UR                  SYSSZS9  UR                  S[S\S]S\S^/S_9  UR                  S`Sa/ SbQScSd9  UR                  Se[        SSfS9  UR                  SgSShS9  UR                  SiSSjS9  UR                  Sk[        SlS"9  UR                  Sm[        SlS"9  UR                  Sn[        SlS"9  UR                  So[        S!S"9  UR                  Sp[        SqS"9  UR                  Sr[        SsS"9  UR                  St[        S!S"9  UR                  Su[        S!S"9  UR                  SvSSwS9  UR                  SxSSyS9  UR                  SzSS{S9  UR                  S|SS}S9  UR                  S~[        S!S"9  UR                  S[        S!S"9  UR                  SSSS9  U $ )zTraining arguments.trainztraining configurationsz--experiment-namezgpt-345Mz.The experiment name for summary and checkpointr   z--batch-size   zData Loader batch sizez--gradient-accumulation-stepsr   z--weight-decayg{Gz?z.weight decay coefficient for L2 regularizationz--checkpoint-activationsr   zLcheckpoint activation to allow for training with larger models and sequencesr   z--checkpoint-num-layersz/chunk size (number of layers) for checkpointingz$--deepspeed-activation-checkpointingz,uses activation checkpointing from deepspeedz--epochsNz>Number of finetunning epochs. Zero results in evaluation only.z--clip-gradr"   zgradient clippingz--train-itersr   z:total number of iterations to train over all training runsz--label-smoothing        r#   z--log-intervald   zreport intervalz--summary-dir z"The directory to store the summaryz--seedi  zrandom seedz--reset-position-idsz/Reset position ids after end-of-document token.z--reset-attention-maskz7Reset self attention maske after end-of-document token.z--lr-decay-iterszUnumber of iterations to decay LR over, If None defaults to `--train-iters`*`--epochs`z--lr-decay-stylelinear)constantr+   cosineexponentialzlearning rate decay functionr   r   choicesr
   z--lr-decay-ratior   z--lrg-C6?zinitial learning ratez--warmupzNpercentage of data to warmup on (.01 = 1% of all training iters). Default 0.01z--switch-linearz'Switch to linear decay for cosine decayz--savez(Output directory to save checkpoints to.z--new-save-directoryr	   z--save-epochznumber of epochs between savesz--save-intervali  z"number of iterations between savesz--no-save-optimzDo not save current optimizer.z--no-save-rngzDo not save current rng state.z--loadz2Path to a directory containing a model checkpoint.z--no-load-optimz.Do not load optimizer when loading checkpoint.z--no-load-rngz.Do not load rng state when loading checkpoint.z--no-load-lr-schedulerz1Do not load lr scheduler when loading checkpoint.z--no-deepspeed-loadz)Not use deepspeed when loading checkpointz
--finetunezLoad model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint.z--resume-dataloaderzResume the dataloader when resuming training. Does not apply to tfrecords dataloader, try resumingwith a different seed in this case.z--distributed-backendncclzBwhich backend to use for distributed training. One of [gloo, nccl]gloo)r   r
   r0   z
--DDP-impltorch)localr4   nonez4which DistributedDataParallel implementation to use.)r   r0   r
   z--local_rankz+local rank passed from distributed launcherz
--block-lmz$whether use the BlockLM pre-trainingz--masked-lmz whether to use the mlm objectivez--bert-probg      ?z--gpt-infill-probz--gpt-min-ratioz--gap-sentence-probz--gap-sentence-ratiog333333?z--avg-block-length   z--short-seq-probz--single-span-probz--task-maskz3Use different mask for generation and blank fillingz--no-shuffle-blockz-not shuffle the blocks when filling the blankz--no-block-positionz9Use (rough) absolute positions instead of block positionsz--sentinel-tokenz:Use sentinel (mask) tokens to replace 2d position encodingz--block-mask-probz--context-mask-ratioz--random-positionz>Use random start position to cover all the position embeddingsr   r   strr   r   r   s     r   add_training_argsr:      sn    %%g/HIE	=	  ?
 
S!2J  L	'%	  '
 
=	  ?
 
"+  ,
 
!>	  @
 
.;  = 
M	  O
 
E35H  J	I	  K
 
*D	sC6G  I	1	  3
 
xc4mL	>  @ 
 !  " 
:	  ; 
?+  - 
)sC	UF1H  J	(	  ) 
6  8
 
7	  9
 
-lC	-	  /
 
1	  3
 
-  / 
-  / 
A	  C
 
=  ? 
=  ? 
 @  B 
8  : 
5  6 
.  / 
L   " 
*C	  E 
:	  < 
3  5 
/  1 
}5#>	*D	(ucB	,5#F	-E4H	+#qA	)sC	+%E	B  D 
<  > 
H  J 
I  K 

def add_evaluation_args(parser):
    """Evaluation arguments."""

    group = parser.add_argument_group('validation', 'validation configurations')

    group.add_argument('--eval-batch-size', type=int, default=None,
                       help='Data Loader batch size for evaluation datasets. '
                       'Defaults to `--batch-size`')
    group.add_argument('--eval-iters', type=int, default=100,
                       help='number of iterations to run for evaluation '
                       'validation/test for')
    group.add_argument('--eval-interval', type=int, default=1000,
                       help='interval between running evaluation on '
                       'validation set')
    group.add_argument('--eval-epoch', type=int, default=1,
                       help='epoch between running evaluation on '
                       'validation set')
    group.add_argument('--eval-seq-length', type=int, default=None,
                       help='Maximum sequence length to process for '
                       'evaluation. Defaults to `--seq-length`')
    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use for '
                       'evaluation. Defaults to '
                       'math.ceil(`--eval-seq-length`*.15/10)*10')
    group.add_argument('--overlapping-eval', type=int, default=32)

    return parser


def add_text_generate_args(parser):
    """Text generate arguments."""

    group = parser.add_argument_group('Text generation', 'configurations')

    group.add_argument('--temperature', type=float, default=1.0)
    group.add_argument('--top_p', type=float, default=0.0)
    group.add_argument('--top_k', type=int, default=0)
    group.add_argument('--out-seq-length', type=int, default=256)
    group.add_argument('--num-beams', type=int, default=1)
    group.add_argument('--length-penalty', type=float, default=0.0)
    group.add_argument('--no-repeat-ngram-size', type=int, default=0)
    group.add_argument('--min-tgt-length', type=int, default=0)
    group.add_argument('--select-topk', action='store_true')
    group.add_argument('--blank-maskratio', type=float, default=0.1)

    return parser


def add_data_args(parser):
    """Train/valid/test data arguments."""

    group = parser.add_argument_group('data', 'data configurations')

    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='size of the model parallel.')
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                       'based on seed and current epoch.')
    group.add_argument('--filter-english', action='store_true')
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated filenames or corpora names '
                       'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='Filename for validation data.')
    group.add_argument('--test-data', nargs='*', default=None,
                       help='Filename for testing')
    group.add_argument('--data-dir', type=str, default=None,
                       help='The data path to all the data files')
    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
                       help='the filename containing all the shards sizes')
    group.add_argument('--delim', default=',',
                       help='delimiter used to parse csv data files')
    group.add_argument('--text-key', default='sentence',
                       help='key to use to extract text from json/csv')
    group.add_argument('--eval-text-key', default=None,
                       help='key to use to extract text from json/csv '
                       'evaluation datasets')
    group.add_argument('--split', default='1000,1,1',
                       help='comma-separated list of proportions for training, '
                       'validation, and test split')
    group.add_argument('--no-lazy-loader', action='store_true',
                       help='whether to lazy read the data set')
    group.add_argument('--half-lazy-loader', action='store_true')
    group.add_argument('--loader-scatter', type=int, default=None,
                       help='Number of scatters to use for dataloaders')
    group.add_argument('--loose-json', action='store_true',
                       help='Use loose json (one json-formatted string per '
                       'newline), instead of tight json (data file is one '
                       'json string)')
    group.add_argument('--presplit-sentences', action='store_true',
                       help='Dataset content consists of documents where '
                       'each document consists of newline separated sentences')
    group.add_argument('--num-workers', type=int, default=2,
                       help='Number of workers to use for dataloading')
    group.add_argument('--tokenizer-model-type', type=str, default=None,
                       help="Model type to use for sentencepiece tokenization "
                       "(one of ['bpe', 'char', 'unigram', 'word']) or "
                       "bert vocab to use for BertWordPieceTokenizer (one of "
                       "['bert-large-uncased', 'bert-large-cased', etc.])")
    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
                       help='path used to save/load sentencepiece '
                       'tokenization models')
    group.add_argument('--tokenizer-type', type=str,
                       default='BertWordPieceTokenizer',
                       choices=['CharacterLevelTokenizer',
                                'SentencePieceTokenizer',
                                'BertWordPieceTokenizer',
                                'GPT2BPETokenizer',
                                'ChineseSPTokenizer'],
                       help='what type of tokenizer to use')
    group.add_argument('--no-pre-tokenize', action='store_true')
    group.add_argument('--cache-dir', default=None, type=str,
                       help='Where to store pre-trained BERT downloads')
    group.add_argument('--use-tfrecords', action='store_true',
                       help='load `--train-data`, `--valid-data`, '
                       '`--test-data` from BERT tf records instead of '
                       'normal data pipeline')
    group.add_argument('--seq-length', type=int, default=512,
                       help='Maximum sequence length to process')
    group.add_argument('--mem-length', type=int, default=0,
                       help='The memory length to preserve')
    group.add_argument('--max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use per '
                       'sequence. Defaults to '
                       'math.ceil(`--seq-length`*.15/10)*10. MUST BE '
                       'SPECIFIED IF `--use-tfrecords` is True.')
    group.add_argument('--non-sentence-start', type=float, default=0.0)
    group.add_argument('--sample-one-document', action='store_true',
                       help='only sample one document in one sample')
    group.add_argument('--load-splits', type=str, default=None,
                       help='The path to load split indices from')
    group.add_argument('--save-splits', type=str, default=None,
                       help='The path to save split indices to')
    group.add_argument('--save-test-data', type=str, default=None,
                       help='The path to save the test data')
    group.add_argument('--multi-task-data', nargs='*', default=None,
                       help='Downstream task names for multi-task pre-training')
    group.add_argument('--multi-task-ratio', type=float, default=0.0,
                       help='Ratio for multi-task pre-training')
    group.add_argument('--multi-seq-length', type=int, default=None)
    group.add_argument('--multi-batch-size', type=int, default=None)

    return parser

+#tD	+#tDMr   c                    U R                  SS5      nUR                  S[        SS9  UR                  S[        SS S9  UR                  S	[        / S
QSSS9  UR                  SSSS9  UR                  SSSS9  UR                  S[        SSS9  UR                  S[        / SQSS9  UR                  S[        SS9  UR                  SSS S9  UR                  S![        SS9  UR                  S"SS#S9  UR                  S$SS%9  UR                  S&SS'S9  UR                  S([        S S9  UR                  S)SS*S9  UR                  S+[        S S9  UR                  S,[        S S9  UR                  S-[        S.S9  UR                  S/[        S0S9  UR                  S1[        S2S9  UR                  S3[        S4S5/S4S9  UR                  S6SS%9  UR                  S7SS%9  UR                  S8SS%9  UR                  S9SS:S9  UR                  S;[        SS9  UR                  S<S=/ S>QS?9  UR                  S@SSASB9  UR                  SC[        S S9  UR                  SD[        SS9  UR                  SESSASB9  U $ )FNfinetunezfinetune configurationsz--taskz
Task name.)r   r
   z--load-pretrainedzLoad pretrained model)r   r
   r   z--pool-token)startpadclsz-The token to pool the sequence representationrT   )r   r0   r
   r   z--cloze-evalr   z"Evaluation dataset with cloze taskr   z--multi-tokenz$Use multi token for cloze evaluationz--segment-lengthr   z/The maximum segment length for cloze evaluationr   z--loss-func)cross_entropyhinge
generativemixrU   )r   r0   r   z--block-lm-ratior(   r#   z--adapetz.Use the decoupled cross entropy loss in AdaPETz--pattern-idz--fast-decodezRFast decode for multi-token cloze. Can only be used without checkpoint activation.z--few-supergluer1   z--eval-validz!Whether evaluate on the valid setz--validation-metricz--unidirectionalz$Use the left to right language modelz--src-seq-lengthz--tgt-seq-lengthz--adam-beta1g?z--adam-beta2g+?z
--adam-epsg:0yE>z--optimizeradam	adafactorz--wsc-negativez--overwritez--no-validationz--continuous-promptzUse continuous prompt for PETz--num-prompt-tokensz--prompt-funclstm)r[   mlpr6   )r   r0   z--freeze-transformerF)r	   r   z--tune-prefix-layersz--prefix-promptz--prompt-initr8   r   s     r   add_finetune_config_argsr]   _  s4   %%j2KLE	xc=	$	  
 
'<   
1  3 
3  5 
>	  @
 
?	  !
 
)sC	=  ? 
~C;	\	   
(>	0  2 
,3E	3  5 
)TB	)TB	~E3?	~E5A	|%>	C&+)>  P	'=	}\:	(>	,  . 
def get_args():
    """Parse all the args."""

    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_data_args(parser)
    parser = add_finetune_config_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    # parse with an empty argv so the module can be driven programmatically
    args = parser.parse_args(args=[])

    if not args.train_data and not args.data_dir:
        print('WARNING: No training data specified')

    args.cuda = torch.cuda.is_available()

    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        mpi_define_env(args)
    elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data
        # parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The fp32_* flags are only meaningful when fp16 is set, so default
    # them all to False otherwise.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    if hasattr(args, 'deepspeed') and args.deepspeed \
            and args.deepspeed_config is not None:
        with open(args.deepspeed_config, encoding='utf-8') as file:
            deepspeed_config = json.load(file)
        if 'train_micro_batch_size_per_gpu' in deepspeed_config:
            args.batch_size = deepspeed_config['train_micro_batch_size_per_gpu']
        if 'gradient_accumulation_steps' in deepspeed_config:
            args.gradient_accumulation_steps = deepspeed_config[
                'gradient_accumulation_steps']
        else:
            args.gradient_accumulation_steps = 1
        if 'optimizer' in deepspeed_config:
            optimizer_params_config = deepspeed_config['optimizer'].get(
                'params', {})
            args.lr = optimizer_params_config.get('lr', args.lr)
            args.weight_decay = optimizer_params_config.get(
                'weight_decay', args.weight_decay)
    return args

N:c           
         SSK Jn  UR                  nUR                  5       nUR	                  5       nS nUS:X  a
  [        5       nUR                  USS9nUR                  5       nUR                  U5      n[        US U  Vs/ s H  oU:H  PM	     sn5      n	[        U5      [        R                  S'   [        U5      [        R                  S'   Xl        X@l        X0l        U[        R                  S'   S[        R                  S'   [!        S	R#                  [        R                  S   U R                  [        R                  S   [        R                  S   [        R                  S   5      5        g s  snf )
Nr   )MPI)rootra   rc   MASTER_ADDR29500MASTER_PORTzfDiscovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={})mpi4pyr   
COMM_WORLDGet_rankGet_sizer   bcastGet_processor_name	allgathersumr9   r{   environr   r~   r}   rx   r   )
r`   r   commr}   r~   master_addr	proc_name	all_procsir   s
             r   r   r     s2   >>D==?DJKqy"n**[q*1K &&(Iy)Ii.>?.>9n.>?@JTBJJv":BJJ| O OI +BJJ}  JJ 
p	

6"DOORZZ5M

=)2::m+D
FG @s   E4)__doc__rr   r{   rj   r   r4   utilsr   r   r$   r:   r>   rB   rO   r]   r   r    r   r   <module>r      sV      	    Wt(VQh+\"jZM`K\Gr   