
    9iv                     *   S SK r S SKrS SKrS SKJr  S SKJr  S SKJr  S SK	J
r
  S SKJrJrJr  S SKJr  S SKJr  S S	KJrJr  S S
KJr  S SKJr  S SKJr  S SKJr  S SKJr   " S S\5      r \
RB                  " \RD                  S9 " S S\5      5       r"g)    N)mpu)Hooks)EpochBasedTrainer)HOOKS)BestCkptSaverHookCheckpointHookCheckpointProcessor)LoadCheckpointHook)Hook)load_checkpointsave_checkpoint)DistributedParallelType)create_device)
get_logger)is_megatron_initialized)get_local_rankc                   J    \ rS rSrSrS rS rS rS r  SS jr	S	 r
S
 rSrg)MpuProcessor   modelc                      [         R                  " 5       nUS:X  a  g[         R                  " 5       nSR                  U5      $ ! [        [
        4 a     gf = f)N    z_mp_rank_{:02d})r   $get_tensor_model_parallel_world_sizeget_tensor_model_parallel_rankformatImportErrorAssertionError)selftp_world_sizemp_ranks      s/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/trainers/hooks/distributed/megatron_hook.py	rank_nameMpuProcessor.rank_name   sV    	DDFM!88:G$++G44^, 		s   A %A AAc                 Z    [         R                  " 5       nSR                  U5      nSU S3$ )Nz{:02d}mp_rank_z_model_states.pt)r   r   r   )r   r!   ranks      r"   get_bin_filenameMpuProcessor.get_bin_filename&   s.    446w'$/00    c                     [         R                  R                  5       (       + =(       d    [        R                  " 5       S:H  $ Nr   )torchdistributedis_initializedr   get_data_parallel_rankr   trainers     r"   should_save_on_rank MpuProcessor.should_save_on_rank+   s2    %%4466 7//1Q6	7r*   c                     UR                   n[        R                  " XUU R                  5        [        R
                  " [        R                  R                  X R                  5      SS9  g )NT)exist_ok)cfgr	   copy_files_and_dump_config_BIN_FILE_DIRosmakedirspathjoin)r   r2   
output_dirconfigs       r"   prepare_outputMpuProcessor.prepare_output0   sO    66w7=7;7I7I	K 	GGLL%7%784	Ir*   Nc           
      |   UR                  UR                  5      nX R                  5       -   [        R                  -   n[        UUU(       a  UR                  OS U(       a  UR                  OS USS9  [        R                  R                  U5      n[        R                  R                  U5      n	U R                  5       n
[        R                  R                  XS-   U
-   5      n[        XkSS9  Un[        R                  R                  X0R                  U
5      n[        R                  R                  U5      (       a  [        R                   " U5         [        R"                  " X5        g ! [$         aC  n['        5       R)                  SU SU SU S35        [*        R,                  " X5         S nAg S nAff = f)	NF)meta
with_model_)	with_metazLink z to z error: z@, changing to copy the bin file, this may case more space usage.)unwrap_moduler   r#   r	   TRAINER_STATE_SUFFIXr   	optimizerlr_schedulerr:   r<   dirnamebasenamer(   r=   r9   isfileunlinklinkOSErrorr   errorshutilcopyfile)r   r2   checkpoint_path_prefixr>   rC   save_optimizersr   _train_state_filesave_dirprefixbin_fileprefix_bin_filesrc_file	dest_filees                  r"   save_checkpointsMpuProcessor.save_checkpoints8   sf    %%gmm42^^ 6
 
445 	!0Gd$3G  	 77??#9:!!"89((*'',,x#1HI%@"GGLL-?-?J	77>>)$$IIi 	1GGH( 	1Lzi[ <Q Q OOH00	1s   E. .
F;89F66F;c                    X R                  5       -   [        R                  -   n[        R                  R                  U5      (       a  [        R                  " U5        [        R                  R                  U5      n[        R                  R                  U5      nU R                  5       n[        R                  R                  XES-   U-   5      n[        R                  R                  U5      (       a  [        R                  " U5        g g NrE   )r#   r	   rH   r:   r<   rM   removerK   rL   r(   r=   )r   r2   rT   rV   rW   rX   rY   absolute_files           r"   remove_checkpointsMpuProcessor.remove_checkpoints^   s    2^^ 6
 
44577>>+,,II'(77??#9:!!"89((*X|h/FG77>>-((IIm$ )r*   c                 v   UR                  UR                  5      n[        R                  R	                  U5      (       a?  UnU R                  5       n[        R                  R                  Xg5      n[        XS S 5        g XR                  5       -   [        R                  -   n	[        R                  " X)U5      n
[        R                  R                  U5      n[        R                  R                  U5      nU R                  5       n[        R                  R                  XkS-   U-   5      n[        XS S 5        U
$ ra   )rG   r   r:   r<   isdirr(   r=   r   r#   r	   rH   r
   load_trainer_staterK   rL   )r   rT   r2   load_all_statestrictr   rW   rY   
model_filerV   rC   rX   s               r"   load_checkpointsMpuProcessor.load_checkpointsk   s    %%gmm477==/00-H,,.Hh9JJtT: 6 : !#88!9%88N<D ww'=>HWW%%&<=F,,.Hhx0GHJJtT:Kr*    NT)__name__
__module____qualname____firstlineno__r9   r#   r(   r3   r@   r^   rd   rl   __static_attributes__rn   r*   r"   r   r      s4    M	1
7
I #)-$1L%r*   r   )module_namec                   D    \ rS rSrSrS rS\4S jrS rS r	S r
S	 rS
rg)MegatronHook   r   c                     SU l         g )NFwrapped)r   s    r"   __init__MegatronHook.__init__   s	    r*   r2   c                 8   [        5       nUR                  [        5      n[        U5      S:  a6  [	        US   R
                  [         5      (       d  US   R                  U5        UR                  [        5      n[        U5      S:  a6  [	        US   R
                  [         5      (       d  US   R                  U5        UR                  [        5      n[        U5      S:  a8  [	        US   R
                  [         5      (       d  US   R                  U5        g g g r,   )	r   get_hookr   len
isinstance	processorset_processorr   r
   )r   r2   r   	ckpt_hookbest_ckpt_hookload_ckpt_hooks         r"   register_processorMegatronHook.register_processor   s     N	$$^4	y>Aj11G1G1='? '?aL&&y1 ))*;<~":q!++\,; ,;1++I6 ))*<=~":q!++\,; ,;1++I6,;"r*   c                    [        5       (       d   e[        5       n[        SU 35      Ul        UR                  R                  UR                  5        [        R                  " 5       UR                  [        R                  '   [        R                  " 5       UR                  [        R                  '   [        R                  " 5       UR                  [        R                  '   g )Nzcuda:)r   r   r   devicer   tor   get_data_parallel_groupparallel_groupsr   DPget_tensor_model_parallel_groupTP!get_pipeline_model_parallel_groupPP)r   r2   
local_ranks      r"   
after_initMegatronHook.after_init   s    &((((#%
&zl';<(*-*E*E*G 	#&&	( '*&I&I&K 	 7 "	$ '*&K&K&M 	 7 "	$r*   c                 &    U R                  U5        g Nwrap_moduler1   s     r"   
before_runMegatronHook.before_run       !r*   c                 &    U R                  U5        g r   r   r1   s     r"   
before_valMegatronHook.before_val   r   r*   c                     UR                   (       a:  U R                  (       d(  UR                  UR                  5      Ul        SU l        g g g ro   )_distr{   to_parallelr   r1   s     r"   r   MegatronHook.wrap_module   s7    ==<< ' 3 3GMM B#   r*   rz   N)rp   rq   rr   rs   r9   r|   r   r   r   r   r   r   rt   rn   r*   r"   rw   rw      s0     M7*; 7
N""$r*   rw   )#r:   rR   r-   megatron_utilr   modelscope.metainfor   modelscope.trainersr   !modelscope.trainers.hooks.builderr   4modelscope.trainers.hooks.checkpoint.checkpoint_hookr   r   r	   9modelscope.trainers.hooks.checkpoint.load_checkpoint_hookr
   modelscope.trainers.hooks.hookr   modelscope.utils.checkpointr   r   modelscope.utils.constantr   modelscope.utils.devicer   modelscope.utils.loggerr   modelscope.utils.megatron_utilsr   modelscope.utils.torch_utilsr   r   register_modulerw   rn   r*   r"   <module>r      sz    	    % 1 3< < / H = 1 . C 7h& hV 5#5#56,$4 ,$ 7,$r*   