
    9iE                         S SK r S SKrS SKJr  S SKJr  S SKJrJ	r	J
r
  S SKJr  SSKJr  SSKJrJrJrJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SS jr    SS jrS rS rS r     SS jr!S r"SS jr#  SS jr$g)    N)	FusedAdam)mpu)DynamicLossScalerFP16_ModuleFP16_Optimizer)distributed   )DistributedDataParallel)GLMForMultiTokenClozeGLMForMultiTokenClozeFastGLMForSequenceClassificationGLMForSingleTokenClozeGLMModel)PyTorchDistributedDataParallel),glm_get_params_for_weight_decay_optimization)get_checkpoint_iterationget_checkpoint_nameprint_rank_0c                 8  ^ [        U5      u  pEpg[        XEU5      n[        R                  " 5       S:X  a7  [	        SR                  [        R                  R                  5       U5      5        [        R                  " USS9n	TR                  (       a  U R                  n [        U [        5      (       a  U R                  n [        U [        5      (       a  U R                  n [        U S5      (       a  U R                   n U4S jn
TR"                  (       a  SU	S   ;   am  U	S   S	   nTR$                  S
-   UR&                  S   :  aE  U
" UU R)                  5       S	   R*                  5      U	S   S	'   [-        STR$                  S
-    35        SU	S   ;   am  U	S   S   nTR$                  S
-   UR&                  S   :  aE  U
" UU R)                  5       S   R*                  5      U	S   S'   [-        STR$                  S
-    35        [/        U R)                  5       R1                  5       5       H^  n[	        U5        U R)                  5       R3                  U5      U R)                  5       UR5                  SS5      R5                  SS5      '   M`     U R7                  U	S   SS9u  pU(       d  U(       a  [-        SU SU 35        TR8                  (       aM  TR:                  (       a;  U R<                  R?                  U R@                  RB                  R*                  U5        g g g )Nr   z-global rank {} is loading pretrained model {}cpu)map_locationmodelc                 x   > U R                   S   nUTR                  S-   ::  d   eUR                  5       nXS U& U$ )Nr   r	   )shapemax_position_embeddingsclone)state_weightsmodel_weightsoriginal_lengthnew_weightsargss       f/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/nlp/mglm/train_utils.pyextend_embedding_weights1load_pretrained.<locals>.extend_embedding_weights%   sK    '--a0$">">"BBBB#))+(5$_%    z,transformer.block_position_embeddings.weightmodulez&transformer.position_embeddings.weightr	   zExtend position embedding to z#Extend block position embedding to z@mixins.block_position_embedding.block_position_embeddings.weightz"transformer.word_embeddings.weightzword_embeddings.weightF)strictzMissing keys z, unexpected keys )"r   r   r   get_data_parallel_rankprintformattorchr   get_rankload	deepspeedr&   
isinstanceTorchDDPr   hasattrr   block_lmr   r   
state_dictdatar   listkeyspopreplaceload_state_dictcontinuous_promptprompt_initprompt_spellinit_embeddingword_embeddingsweight)r   checkpoint_pathr!   task_tokensload_dirtagreleasesuccesscheckpoint_namesdr#   position_weightsblock_position_weightskeymissing_keysunexpected_keyss     `             r"   load_pretrainedrM      s   &>&O#H7)(AO
!!#q(=DD&&(/; 	< 
O%	8B~~%""%%%ug }}9R\I!(|8 :++a/2B2H2H2KK@X(((*ACCG4AI 8<>
 3D4P4PST4T3UV :R\I%'\>&@"++a/2H2N2N3  G_.((*GIIMGO 8BD
 9$:V:VYZ:Z9[\ E$$&++-.c

 .3-=-=-?-C-CC-H	 	3;;N:<<CG4(=*	+ / %*$9$9
8U %: %$!LL>);O;LM	O$"2"2))%*?*?*F*F*K*K*5	7 #3r%   c                 	   [        S5        U R                  (       a  US:X  aK  [        R                  U R                  U R
                  U R                  U R                  U R                  S9nGOUS:X  aL  [        R                  U R                  U R
                  U R                  U R                  U R                  US9nGO4[        eSu  pgUS:X  d  US:X  a  U R                  (       d  SnUb  SnUb  [        SU 35        [        S(0 S	U R                  _S
U R                  _SU R                  _SU R                   _SU R"                  _SU R$                  _SU R"                  _SU R&                  _SU R(                  _SU R*                  _SU R,                  _SU_SU R.                  _SU R0                  =(       a    U R2                  (       + _SU_SU_SU R4                  _SU R6                  _6nU R8                  (       a  UR9                  U R:                  S9  Ub  US:X  a  U R                  (       aT  U(       a9  U R<                  (       a  [?        XPR@                  S9nO[C        XPR@                  S9nO[E        XPRF                  S9nOp[I        UU R                  U RJ                  U RL                  US9nODUS:X  a,  [I        UU R                  U RJ                  U RL                  US9nOUS:X  a  O[        U5      e[N        RP                  " 5       S :X  ab  [S        S!RU                  [N        RV                  " 5       [Y        UR[                  5        Vs/ s H  oR]                  5       PM     sn5      5      S"S#9  U R^                  (       a  URa                  5         URc                  [d        Rb                  Rg                  5       5        U R^                  (       a  [i        U5      nU Rj                  (       d  U Rl                  (       d  U Rn                  (       ax  U Rp                  S$:X  a@  [d        Rb                  Rg                  5       n	[s        UU	/U	[N        Rt                  " 5       S%9nU$ U Rp                  S&:X  a  [w        U5      nU$ [        S'5        U$ s  snf ))zBuild the model.zbuilding GPT2 model ...multiple_choice)	cache_dirfp32_layernormfp32_embeddinglayernorm_epsilonclassification)rP   rQ   rR   rS   
num_labels)TTFzContinuous spell length 
num_layers
vocab_sizehidden_sizenum_attention_headsembedding_dropout_probattention_dropout_proboutput_dropout_probmax_sequence_lengthmax_memory_lengthcheckpoint_activationscheckpoint_num_layersparallel_outputrelative_encodingblock_position_encodingoutput_predictspell_length
spell_funcattention_scale)tune_prefix_layers)length_penalty)take_softmax)	num_class
generationr   z5 > number of parameters on model parallel rank {}: {}T)flushr+   )
device_idsoutput_deviceprocess_grouplocalzSkip DDP model )<r   pretrained_bertBertForMultipleChoicefrom_pretrainedtokenizer_model_typerP   rQ   rR   rS   BertForSequenceClassificationNotImplementedError
cloze_evalr   rV   rW   rX   rY   hidden_dropoutattention_dropoutr   
mem_lengthr_   r`   transformer_xlr2   	masked_lmprompt_funcrg   freeze_transformerrh   fast_decoder   ri   r   r   adapetr   output_dropout
pool_tokenr   r(   r)   r*   get_model_parallel_ranksum
parametersnelementfp16halfcudar+   current_devicer   r.   train_itersepochsDDP_implr0   get_data_parallel_groupLocalDDP)
r!   
model_typemulti_tokenrU   re   r   rd   paralle_outputpis
             r"   	get_modelr   X   sA    *+**)99))..#22#22"&"8"8 : :E ++1AA))..#22#22"&"8"8% B 'E &%)3&++!114??"N!"N#3L>BC 222 ((2 !% 8 8	2
 $(#6#62 $(#9#92 !% 3 32 !% < <2 #oo2 $(#>#>2 #'"<"<2 +2 #112 %)MM$H$..6H2 *2  &!2" ''#2$ !00%2& ""$$#'#:#: % <!..??"++$= %6I6I%KE %: %6I6I%KE !7!!= 9((++",.E //4$$''OO(* |+)*55
!!#q(CJJ++-5+;+;+=>+=aZZ\+=>?A 		 yy

 
JJuzz((*+ yyE" >>t//4;;==G#

))+A3!99;	=E L	 ]]g%UOE L )*L7 ?s   Rc                    [        U [        [        [        45      (       a.  U R                  n [        U [        [        [        45      (       a  M.  [        U 5      nU H)  nUS    H  n[        US5      (       a  M  SUl        M     M+     U$ )Nparamsmodel_parallelF)r/   r   r0   r   r&   r   r1   r   )r   param_groupsparam_groupparams       r"   get_optimizer_param_groupsr      sx    
UXx=
>
> UXx=
>
>?FL $ *E5"233',$ + $ r%   c           	         UR                   (       aP  UR                  (       a  [        R                  R                  nOSSKJn  UnU" XR                  UR                  S9nOUR                  S:X  aB  [        U UR                  UR                  UR                  UR                  4UR                  S9nO0UR                  S:X  a  SSKJn  U" U UR                  SSS	9nO[         e[#        S
UR$                  R&                   35        [)        US5      (       a  UR*                  (       a  [         eUR,                  (       aB  [/        UUR0                  UR2                  UR4                  UR6                  UR8                  S.S9nU$ )zSet up the optimizer.r   )DeepSpeedCPUAdam)lrweight_decayadam)r   r   betaseps	adafactor)	AdafactorF)r   relative_stepwarmup_initzOptimizer = r.   )scale_window	min_scaledelayed_shift)static_loss_scaledynamic_loss_scaledynamic_loss_args)cpu_optimizercpu_torch_adamr+   optimAdamWdeepspeed.ops.adamr   r   r   	optimizerAdam
adam_beta1
adam_beta2adam_epstransformersr   rx   r)   	__class____name__r1   r.   r   r   
loss_scaler   loss_scale_windowr   
hysteresis)r   r!   cpu_adam_optimizerr   r   r   s         r"   get_optimizerr      s;   !&!2!2;!1&WW43D3DF	 >>V#77!..8MM#I ^^{*.!77#!	#I &%	L,,556
78t[!!dnn!!
 yy""oo#66 $ 6 6!^^!%		 r%   c           
      (   UR                   b  UR                   nOUR                  nUR                  (       a  X!R                  -  n[	        SU5      nSnUR
                  U-  n[        U UR                  UX$-
  UR                  UUR                  S9nU$ )z"Build the learning rate scheduler.r	   )start_lrwarmup_iter	num_itersdecay_style	last_iterdecay_ratio)
lr_decay_itersr   finetunegradient_accumulation_stepsmaxwarmupAnnealingLRr   lr_decay_stylelr_decay_ratio)r   r!   r   	init_stepr   lr_schedulers         r"   get_learning_rate_schedulerr     s     &''	$$	}}!A!AA	Ay!II++	)K)'''')L r%   c                 ^   [        U UUUUS9n[        U5      nU R                  c-  U R                  bt  U R                  S:  d  U R
                  S:  aT  U R                  (       a,  [        S5        [        R                  " UUU [        SS9u  pW  nO[        X`5      n[        Xp5      n	OSu  pyXWU	4$ )zSetup model and optimizer.)r   r   rU   re   r   zDeepSpeed is enabled.F)r   model_parametersr!   r   dist_init_required)NN)r   r   
train_datadata_dirr   r   r.   r   
initializer   r   r   )
r!   r   r   rU   re   r   r   r   _r   s
             r"   setup_model_and_optimizerr   .  s     !#E .e4L"dmm&?KK!Ot//!3>>01%.%9%9!-#(&*"Ea &l9I29C",	\))r%   c                    UnUR                   (       a  UR                  U5        O2UR                  (       a  U R                  USS9  OUR                  5         UR                   (       d  UR                  S:X  a  U" S5      R	                  5         OFU" S5      R                  5         UR                  SUR                  S9  U" S5      R                  5         UR                   (       d  UR                  (       a  U R                  5         UR                  S:  a]  UR                  (       d1  [        R                  " UR                  5       UR                  5        U$ U R                  UR                  5        U$ )zBackward step.F)update_master_gradsr+   	allreduce)reduce_afterfp32_allreducer   )r.   backwardr   r   resetstartallreduce_paramsr   stopr   	clip_gradr   clip_grad_normr   clip_master_grads)r   r   lm_lossr!   timerslosss         r"   backward_stepr   Q  s    D ~~t 99t?MMO~~'1 	{!!#{!!#t/B/B 	 	D{  " >>99))+ >>A99""5#3#3#5t~~F N ++DNN;Nr%   c                    U(       d  g [         R                  " 5         [         R                  " 5       S:X  a  [        U 5        [        S[        R
                  R                  5       S-  S5        [        S[        R
                  R                  5       S-  S5        [        S[        R
                  R                  5       S-  S5        [        S[        R
                  R                  5       S-  S5        [        S5        g g )	Nr   zMemory Allocated i   @	GigaByteszMax Memory Allocated zCache Allocated zMax cache Allocated  )
distbarrierr,   r)   r+   r   memory_allocatedmax_memory_allocatedmemory_cachedmax_memory_cached)messageforces     r"   see_memory_usager   z  s    LLN}}!g!jj))+/AB	 	%jj--/3EF	 	 jj&&(,>?	N$jj**,0BC	 	c
 r%   c	                 p   Su  pUc  / OUnUR                   (       d  UR                  5          Su  pU" S5      R                  5         U" XUXW5      u  pnU" S5      R                  5         UR                   (       d  XR                  -  nUR                  5       R                  5       R                  S5      n[        R                  R                  UR                  [        R                  " 5       S9  UR                  UR                  UR                  -  -  Ul        [         R"                  " U5      (       GdL  X-  n	U
S-  n
U" S5      R                  5         [%        X!XU5        U" S5      R                  5         U" S5      R                  5         UR                   (       an  UR'                  5       (       aH  UR)                  5         SnUR*                  (       a  UR,                  (       d  UR)                  5         OjSnOgUR)                  5         OVXR                  :X  aG  UR)                  5         SnUR*                  (       a  UR,                  (       d  UR)                  5         OSnU" S5      R                  5         U(       a  OO[/        S	5        AA/ nU(       a  OGMq  UR                   (       a  X-  n	XU4$ )
zSingle training step.)g        r   T)r   Fforwardr	   )groupr   r   zFound NaN loss, skip backward)r.   	zero_gradr   r   r   detachr   viewr+   r   
all_reducer4   r   r   
world_sizemodel_parallel_sizer   _has_inf_or_nanr   !is_gradient_accumulation_boundarystepr   overflowr   )data_iteratorr   r   r   r!   r   forward_step_funcmemssingle_steplm_loss_totalcountskipped_itercompleter   r   reduced_losss                   r"   
train_stepr    s>    "M24D>>
!)y!,]4-3;qy ~~777G~~'--/44Q7$$S%@%@%B 	% 	D(--OOd6668 !00>>)MQJE :$$&)G6B:##% ;%%'~~::<<JJL#H II)*<*<$))+'(JJL<<<NN$#H II)*<*<$))+'(;$$&  89Dq r ~~%-,,r%   )N)NTNN)F)NF)%r.   r+   apex.optimizersr   r   megatron_utilr   megatron_util.fp16r   r   r   r   r   r   r
   r   r   r   r   r   r   r   r0   r   utilsr   r   r   rM   r   r   r   r   r   r   r   r  rr   r%   r"   <module>r     s      -  M M % 6  > ? N NB7L 	wt 0f4 *.*.)-+/	 *F&R8  I-r%   