
    9iH                     v    S SK r S SKrS SKJr  S SKJr  S SKJrJ	r	  S SK
Jr  S SKJr  S rS r " S S	\5      rg)
    N)mpu)_flatten_dense_tensors_unflatten_dense_tensors)Variable)Modulec                    ^ ^ U U4S jnU$ )Nc                 V   > [         R                  R                  R                  U TTS9$ N)meanstdtorchnninitnormal_tensorr   r   s    `/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/utils/nlp/distributed.pyinit_!normal_init_method.<locals>.init_   #    xx}}$$V$C$@@     )r   r   r   s   `` r   normal_init_methodr      s    A Lr   c                 P   ^ ^ T[         R                  " SU-  5      -  mU U4S jnU$ )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                 V   > [         R                  R                  R                  U TTS9$ r
   r   r   s    r   r   !scaled_init_method.<locals>.init_&   r   r   )mathsqrt)r   r   
num_layersr   s   ``  r   scaled_init_methodr!   "   s(    
		#
*+
+CA Lr   c                   B   ^  \ rS rSrU 4S jrS rSS jrSS jrSrU =r	$ )	DistributedDataParallel,   c                 ^  >^ ^ [         [        T ]  5         [        R                  [        R
                  R                  :X  a  SOST l        UT l        [        R                  " 5       T l        [        R                  " 5       nT R                  R                  5        H?  n[        R                  " U5      (       d  M   [        R                   " X2T R                  S9  MA        SU 4S jjm/ T l        / T l        ['        T R                  R                  5       5       H	  nU4S jnM     TT l        g )NTFgroupc                   > TR                   (       Ga  STl         0 nTR                  R                  5        H]  u  pEUR                  (       d  M  UR                  c  M'  UR
                  R                  5       nXc;  a  / X6'   X6   R                  U5        M_     TR                  (       a1  [        R                  R                  U;   a  [        SS5        STl        U GH  nX6   nU Vs/ s H  oUR                  R
                  PM     nn[        U5      n	U(       a  U	R                  5       n	U(       d(  U (       d!  U	[        R                   " TR"                  S9-  n	[        R$                  " U	TR"                  S9  [        R                  R'                  5         U(       d(  U (       a!  U	[        R                   " TR"                  S9-  n	[)        U[+        X5      5       H  u  pU
R-                  U5        M     GM     g g s  snf )NFzEWARNING: gloo dist backend for half parameters may be extremely slow.z7It is recommended to use the NCCL backend in this case.r&   )needs_reductionmodulenamed_parametersrequires_gradgraddatatypeappendwarn_on_halfr   cuda
HalfTensorprintr   floatdistget_world_sizedata_parallel_group
all_reducesynchronizezipr   copy_)reduce_afterno_scalefp32_allreducebucketsnameparamtpbucketgrads	coalescedbufsyncedselfs               r   allreduce_params:DistributedDataParallel.__init__.<locals>.allreduce_params9   s    $$$',$#';;#?#?#AKD***uzz/E#jjoo/,*,GK**51 $B $$zz,,7cU -2)!B$[F:@A&ZZ__&EA 6u =I%$-OO$5	#L!T%8%8"&":":&< <	OOIT5M5MNJJ**,#!T%8%8"&":":&< <	'*!#;I#M(O		&)(O "! %$ Bs   (G<c                  D   > [         R                  R                  T5        g N)r   _execution_enginequeue_callback)unusedrJ   s    r   allreduce_hook8DistributedDataParallel.__init__.<locals>.allreduce_hookb   s    **99:JKr   )TFF)superr#   __init__r6   _backenddist_backendGLOOr1   r*   r   get_data_parallel_groupr8   get_tensor_model_parallel_rank
parametersr   	is_tensor	broadcasthook_handleshookslistrJ   )rI   r*   src_rankprB   rQ   rJ   	__class__s   `     @r   rT    DistributedDataParallel.__init__.   s    %t57$(MMT5F5F5K5K$KDQV#&#>#>#@ 557'')Aq!!q$2J2JK * +/&+,1#	*J 
$++0023EL 4
 !1r   c                 4    SU l         U R                  " U0 UD6$ )NT)r)   r*   )rI   inputskwargss      r   forwardDistributedDataParallel.forwardg   s    #{{F-f--r   c                 >    U R                   R                  XU5      nU$ rM   )r*   
state_dict)rI   destinationprefix	keep_varssds        r   rj   "DistributedDataParallel.state_dictk   s    [[##KC	r   c                 6    U R                   R                  XS9  g )N)strict)r*   load_state_dict)rI   rj   rq   s      r   rr   'DistributedDataParallel.load_state_dictp   s    ##J#>r   )rJ   r8   r]   r^   r*   r)   r1   )N F)T)
__name__
__module____qualname____firstlineno__rT   rg   rj   rr   __static_attributes____classcell__)rb   s   @r   r#   r#   ,   s    71r.
? ?r   r#   )r   r   torch.distributeddistributedr6   megatron_utilr   torch._utilsr   r   torch.autogradr   torch.nn.modulesr   r   r!   r#   r   r   r   <module>r      s4          I # #E?f E?r   