
    9i                   @   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	J
r
  S SKJr  S SKJrJrJrJr  S SKrS SKJr  S SKJr  S SKJr  S	S
KJrJr  S	SKJr  \(       a  S	SKJrJr  S	SK J!r!  S	SK"J#r#J$r$J%r%J&r&J'r'  S	SK(J)r)J*r*J+r+J,r,J-r-J.r.J/r/  S	SK0J1r1  \Rd                  " \35      r4\Rj                  Rm                  \3S5      r7\(       a  S SK8J9r9    S1S jr:S2S jr;S2S jr<    S2S jr=    S2S jr>\ " S S5      5       r?S3S jr@S4S jrAS rBS5S6S jjrCS rDS7S jrES  rF    S8S! jrG    S9S" jrH          S:S# jrI    S;S$ jrJ\ " S% S&5      5       rK    S<S' jrL    S2S( jrMS=S) jrNS* rOS+ rP    S2S, jrQS>S- jrRS?S. jrSS/ rT        S@S0 jrUg)A    )annotationsN)defaultdict)	dataclass)AnyOptionalTYPE_CHECKINGUnion)trace_structured)StorageWeakRef)
OrderedSet   )configir)WeakDep)IRNode	Operation)SchedulerBuffer)estimate_peak_memoryestimate_peak_memory_allocfreeFreeableInputBufferget_freeable_input_bufSNodeMemory)contains_collectivecontains_waitfind_recursive_deps_of_nodefind_recursive_users_of_nodeis_collectiveis_fallback_opis_wait)Voverlap)BaseSchedulerNodec                   0 nU  H  nUR                  5       X'   M     SS KJn  SSKJn  UR                  5       nU" 5       n[        U5       Vs/ s H  n/ PM     nnUR                  U[        UR                  5       5      U5        [        R                  " [        R                  " U5      SS9R                  R                  5       n	[        [        U 5      5       H  n
X   X
   l        M     g s  snf )Nr   )_get_default_group)dim)get_estimated_runtimetorch.distributeddistributed"torch.distributed.distributed_c10dr$   get_world_sizerangeall_gather_objectlistvaluestorchmediantensortolistlenoverride_estimated_runtime)snodesruntime_estimationssnodedistr$   
world_sizepg_gathered_runtime_estimationsmedian_runtime_estimationsis              U/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/torch/_inductor/comms.py6align_runtime_estimations_across_all_distributed_ranksr@   7   s     %*%@%@%B" $E$$&J		BCHCT6UCTarCT 6U$d+>+E+E+G&H" "'12"fVVX  3v;/I/L	,   7Vs   C.c                    [        U SSSS9$ )z/
Greedily schedules waits as late as possible.
FTraise_comms
sink_waitsreorder_for_overlap_schedule_for_commr5   s    r?   rD   rD   M   s     Ed     c                    [        U SSSS9$ )z0
Greedily schedules comms as early as possible.
TFrB   rF   rH   s    r?   rC   rC   V   s     DU rI   c                    [        U SSSS9$ )a  
This achieves the following overall scheduling procedure:
    Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
        that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
    Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
        Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
        We prioritize compute nodes that are needed sooner.
    Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
    Step 4: We schedule comm N + 1.
    Repeat this for subsequent comm nodes.
TrB   rF   rH   s    r?   reorder_compute_for_overlaprL   _   s     DTt rI   c                     [        U 5      u  pU$ )a+  
Reorders communication ops relative to computation ops to improve communication-compute overlapping and hide comm
latency.  Stops moving a particular op if it reaches a point that would have increased the peak memory footprint.

Currently, follows these heuristics (subject to change or tune):
- never reorders collectives relative to one another, for SPMD safety
- has an option for per-collective prefetch limit, but does not enable it by default
- limits the total number of reorder steps to some factor of the graph size to prevent worst-case quadratic
  performance

Prerequisite: sink_comms_and_waits - ensure comm and wait nodes are scheduled as late as possible, respecting data
dependencies.  That allows reorder_communication_preserving_peak_memory to take a best case peak-memory snapshot,
and then monotonically improve latency by moving collectives backward in time.

Peak memory impact is computed in an iterative fashion.  First, memory use at each timestep is computed, and global
peak memory is computed as a max over timesteps.  Then, when swapping any two adjacent nodes, only the curr-memory
for the earlier of the nodes after the swap is affected.  This enables checking step by step whether a swap is
peak-memory-safe, and bailing out if not.  Example:

0   n0      C0
1   n1      C0 + Allocs(n1) - Frees(n1)
2   n2      C0 + Allocs(n1) - Frees(n1) + Allocs(n2) - Frees(n2)

0   n0      C0
1   n2      C0 + Allocs(n2) - Frees(n2)    <-- After moving n2 to Time 1, only time1 memory changes
2   n1      C0 + Allocs(n2) - Frees(n2) + Allocs(n1) - Frees(n1)

)6_reorder_communication_preserving_peak_memory_internal)r5   reordered_snodes
node_statss      r?   ,reorder_communication_preserving_peak_memoryrQ   r   s    @ 	?vF ! rI   c                  ~    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S	'   S
r	S\S'   S
r
S\S'   SrS\S'   \S 5       rSrg)ReorderInfo   z=
Debug info describing how an individual snode was reordered
floatinitial_exposedfinal_exposedNonestrlimiting_factorr   intmovesgrouped grouped_infoc                4    U R                   U R                  -
  $ N)rW   rX   )selfs    r?   improvementReorderInfo.improvement   s    ##d&8&888rI    N)__name__
__module____qualname____firstlineno____doc__rW   __annotations__rX   r[   r]   r^   r`   propertyrd   __static_attributes__rf   rI   r?   rS   rS      sV      OUM5!OS!E3NGSL#9 9rI   rS   c                    U c  g[        U [        R                  R                  R                  R
                  5      (       a  g[        U SS 5      =n(       a  SU;   a  gg)NFTpython_kernel_nameextern_kernels)r   r/   opsaten#_scaled_dot_product_flash_attentiondefaultgetattr)noderp   s     r?   is_gemm_likerx      s]    |		::BB   &d,@$GGG
0
0rI   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7frb   )contains_gemm_like).0xs     r?   	<genexpr>%contains_gemm_like.<locals>.<genexpr>   s     ?,Q%a((,   )torch._inductor.schedulerr{   
isinstanceanyr5   rx   rw   )r7   r{   s     r?   r}   r}      s4    >%..?%,,???EJJ''rI   c                    SSK Jn  [        X5      (       a-  U R                  (       a  U R                   H  nU" U5        M     g U" U 5        g )Nr   rz   )r   r{   r   temp_groupingr5   )r7   fnr{   _snodes       r?   _temp_group_visit_leavesr      s6    >%..53F3FllFvJ # 	5	rI   c                    SnU R                    HG  nU(       a  US-  nX#R                  5       -  nU(       d  M*  U[        U R                  5       5       -  nMI     U$ )Nr_   r;   )r5   get_namer-   get_buffer_names)r7   	with_bufsretns       r?   _group_namer      sV    
C\\3JCzz|9d5113456C  JrI   c                H    [        U [        5      =(       a    U R                  $ rb   )r   r   is_fake)ds    r?   _is_fake_depr      s    a!/aii/rI   c                j    SR                  U  Vs/ s H  oR                  5       PM     sn5      $ s  snf )N~)joinr   )gnsgns     r?   _group_namesr      s'    88S1Sr[[]S1221s   0c                v    [        X5      n[        XU5      u  pEpg[        [        X5      5      nSUS'   UUUUU4$ )z*Initialize memory tracking data structures)r   r   N)r   r   dictzip)	r5   graph_inputsgraph_outputsname_to_freeable_input_bufpeak_memorysnodes_curr_memorysnodes_allocfreebuf_to_snode_last_use_curr_memorys	            r?   _initialize_memory_trackingr      sY    !7!M&	
 MK%5
 F78LL" rI   c                    0 n0 n[        U 5       H3  u  p4US:  a  XS-
     OSX'   U[        U 5      S-
  :  a  XS-      OSX$'   M5     U S   nXU4$ )z/Create double-linked list structure from snodesr   r   N)	enumerater3   )r5   _prev_nextr>   r7   _heads         r?   _initialize_double_linked_listr      sh     EEf%()Av!e}4()CK!O(;v!e} & 1IErI   c                  ^2^3^4^5^6^7^8^9^:^;^<^=^>^? SnU  H  n[        U5      (       d  M  Sn  O   U(       d  U 0 4$ SSKJn  [        U 5      n[	        [
        R                  R                  R                  5       5      n[	        [
        R                  R                  5       5      n[        XU5      u  nm2m?m6nU  Vs0 s H  o"[        U5      _M     snm=0 n	      S)U=4S jjn
Sn[        U 5      u  m5m4m3      S*U44S jjnU3U4U54S jnU2U7U8U9U:U;4S jnU2U6U7U8U;U?4S	 jn[        R                  nSnT3n[        R                  nSnT4U   Gb&  U(       a  GO[        U5      (       Ga  Ub  UU:  a  GOUS-  n[!        5       =nU	U'   U
" UU" T4U   S
5      5      =Ul        Ul        T5U   nUnUm;T2U   S   m:UGb  [        U5      (       a	  SUl        GOU" UT;5      m9U" UR(                  T9SS9nUR*                   Vs0 s H"  n[-        U5      (       a  M  UR.                  U_M$     nnUR1                  5       nS
nU H.  nUR3                  UR5                  5       S
5      =n(       d  M,  Un  O   Ub      S+S jnU" U5      u  nn U(       aA  Un[7        T:T2U   S   5      m:U=R8                  S-  sl        [;        T95      Ul        T5U   nGM  SU S[?        UR                  5       5       SUR5                  5        SURA                  5       / S[;        T95       SU  3n!U!Ul        GO?T?U   m7T7RB                  T7RD                  -
  m8[G        [>        5      n"T6RI                  5        HT  u  n#m>U#RJ                  RL                  n$UU$;  a  M$  [O        U>4S jT9 5       5      (       d  M@  U"T>   RQ                  U#5        MV     U" UT9U"5      u  n%n&U%U:  a  SU% SU 3Ul        OU=RR                  S-  sl)        US-  nU" UUT;5        U
" UU" T4U   S
5      5      Ul        U" UT9U"U&5        U(       a0  SSK*J+n'  U'" UT9[;        T95      U" T3S
5      UUUT2T?SU"5      nU(       a  OT5U   nUb  GM  T4U   nT4U   b  GM&  U	n(U( Vs0 s H  o"U(U   RX                  _M     n)n[[        U) Vs/ s H  nU)U   PM
     sn5      n*[[        U( Vs/ s H  nU(U   RR                  PM     sn5      nSU* SU S3m</ SQn+U(RI                  5        VV,s/ s H_  u  nn,[]        U5      U,R"                  U,R$                  U,RX                  U,R&                  U,RR                  U,R8                  U,R<                  /PMa     n-nn,[^        R`                  Rc                  S5      (       a  SSK2J2n.  T<U." U-U+S 9-  m<O8T<S!-  m<T<[g        U+5      S"-   -  m<T<S"Ri                  [k        [f        U-5      5      -  m<U" T3S
5      n/[        U/5      U:X  d   e[m        U/X5      u  n0    n1T<S#U 3-  m<T<S$U0 3-  m<[n        Rq                  T<5        [s        S%S& U<4S' jS(9  U/U	4$ s  snf s  snf s  snf s  snf s  snf s  sn,nf ),z|
Internal testing helper that also returns debug info.
Returns:
    - reordered snodes list
    - dict {snode: ReorderInfo}
FTr   rz   c                   >^ [        U 5      nSmU H;  n[        U5      (       a  M  [        U5      (       a    OSUU4S jjn[        X45        M=     [	        SUT-
  5      $ )N        c                   > TTU    -  mg rb   rf   )r   compute_timeruntimess    r?   accumulate_times_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time.<locals>.accumulate_time7  s     00rI   r   )r   r"   returnrY   )estimate_op_runtimer   r   r   max)collective_snoderemaining_snodes	comm_timer7   r   r   r   s        @r?   exposed_communication_timeZ_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time)  se     ((89	%E"5))U## 1 1 %U< & 1i,.//rI   c                T   > / nU n Ub  UR                  U5        X1:X  a   U$ TU   nM#  rb   appendheadtailr   r   r   s       r?   _group_nodesL_reorder_communication_preserving_peak_memory_internal.<locals>._group_nodesB  ?     }

1y
 aA rI   c                   > TU    nU(       a  UTU'   UTU'   TU   nU(       a  U TU'   UTU '   UTU '   U TU'   TU :X  a  Umg g rb   rf   )	candidate
group_head
group_tailcandidate_prevgroup_tail_nextr   r   r   s        r?    _perform_double_linked_list_swap`_reorder_communication_preserving_peak_memory_internal.<locals>._perform_double_linked_list_swapO  ss     y)$.E.!*j  
+%.E/"*i &i%j IE rI   c                z  > 0 nSnU(       d(  [        TT-
  TT   S   T-
  TR                  -   5      nXC4$ T* nT HU  nTU   S   U-   nXsU'   [        XG5      nUR                  US 5      nUc  M4  U H  n	XYR                  R                  -  nM     MW     TT   S   U-   TR                  -   n
XU '   [        XJ5      nXC4$ Nr   r   )r   
size_allocget
mpi_buffer	size_free)r   group_ns/group_n_to_bufs_after_swap_dealloc_by_candidate_post_alloc_updatepotential_peakmem_after_reorder_deltar   gn_post_alloc_membufsbufcandidate_mem_post_allocr   candidate_allocfreecandidate_delta_memr   group_peak_memoryr   s              r?    _calculate_potential_peak_memory`_reorder_communication_preserving_peak_memory_internal.<locals>._calculate_potential_peak_memoryi  s   
 <>> !$77Z(+%&%001N "55 )<';B ,R 0 36M M%6r" CNBFFr4PDC+~~/G/GG+    $Q'%&!,,- 	!
 )A9%^F11rI   c                  > U(       dM  U H  nTU   nUS   T-
  US   T-
  4TU'   M     TT   S   TR                   -   nUTR                  -
  nUU4TU '   g UR                  5        H  u  nnU H  n	U TU	'   M
     M     Sn
U HO  nX;   n[        S X+    5       5      nX-  n
TU   =R                  U-  sl        UTU   R                  -
  nX4TU'   MQ     X0   nTU    =R                  U
-  sl        UTU    R                  -
  nUU4TU '   g )Nr   r   c              3  N   #    U  H  nUR                   R                  v   M     g 7frb   r   r   r~   r   s     r?   r   u_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap.<locals>.<genexpr>  s!      6MC ((M   #%)r   r   itemssum)r   r   r   r   r   cm_candidate_post_alloc_mem_candidate_post_free_memr   r   "size_free_to_move_to_candidate_sumr   _gn_post_alloc_memsize_free_to_move_to_candidategn_post_free_memcandidate_post_free_memr   r   r   r   r   r   s                   r?   "_update_memory_tracking_after_swapb_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap  s    ?!"%qE//qE//$R   Z(+.A.L.LL & *,?,I,II % *('L#  =BBD
-6%c*  E 34*A&8&;25 6JM6 3* /P.Q))-KK)$69I!9L9V9V$V1DLO  %7$A!#--1SS-%(8(C(M(MM 	  &##
YrI   Nr   zcollective orderingr   c                H    [        U 5      (       a  g[        U 5      (       a  gg)N)Fr   )Fr}   TN)r   r}   )r   s    r?   is_groupableL_reorder_communication_preserving_peak_memory_internal.<locals>.is_groupable   s#     /y99#?-i88#>)rI   data dependency (dep_names:)
 candidate:z(outs:)dep on 
 non_group_reason:c              3  ,   >#    U  H	  oT:H  v   M     g 7frb   rf   )r~   r   snode_last_uses     r?   r   I_reorder_communication_preserving_peak_memory_internal.<locals>.<genexpr>:  s     Bc^3cs   peak memory new:	 vs base:!_debug_iterative_memory_recomputerQ   zAreorder_communication_preserving_peak_memory improved overlap by z
 ns after z reorders.
)zCollective nodezinitial exposedzfinal exposedrd   limiting factorr]   r^   r`   tabulater  headers>Please `pip install tabulate` to nicely render overlap stats.

z
 peak_memory_before:z
 peak_memory_after:artifactc                     SSS.$ )NrQ   stringnameencodingrf   rf   rI   r?   <lambda>H_reorder_communication_preserving_peak_memory_internal.<locals>.<lambda>  s    B 
rI   c                    > T $ rb   rf   )reorder_log_strs   r?   r  r    s    ?rI   metadata_fn
payload_fn)r   r"   r   list[BaseSchedulerNode]r   rV   r   Optional[BaseSchedulerNode]r   r  r   r  )r   r"   r   ztuple[bool, Optional[str]]):r   r   r{   r3   r   r    graphr   keysget_output_namesr   r   r   r   (reorder_iterative_debug_limit_to_reorder(reorder_iterative_debug_memory_recomputerS   rW   rX   r[   	schedulerunmet_dependenciesr   r  get_outputsr   r   r   r^   r   r`   r-   r   r   r   r   r   r   
succ_nodesr   r   r]   comms_debugr  rd   r   node_summary	importlibutil	find_specr  rZ   r   mapr   overlap_loginfor
   )@r5   has_collectivesr7   r{   original_snodes_numr   r   r   r   statsr   total_movesr   r   r   r    debug_num_collectives_to_reordernum_processed_collectivescurr debug_iterative_memory_recomputeiterative_recompute_errorr-  r   r   groupr   	data_depscandidate_outsdata_depor   is_groupable_resultgrouping_reasonmsgr   r   r%  r   r   r  rP   rd   total_improvementr
  	node_inforowsr  
new_snodesnew_peak_memoryr;   r   r   r   r   r   r   r   r   r   r   r  r   r   r   s@                                                     @@@@@@@@@@@@@@r?   rN   rN     sn    Ou%%"O  rz>f+$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$F-H" 8>07=e"5))v0H 35E0+0?V0	0* K8@E5%)1L	 4&2 &2P5
 5
p 	77 % &'D'-'V'V$ %
+
!$t$$/;)-MM%*%!,.D5;8Rl5;59 D 4#5 dIJJ ,T 21 5'&y11+@D(/;J
/S,NN"& (-'?'?'?!|TUIAFFAI'?   "+!6!6!8'A%MM!**,==q=#$ (
 '	*#4	*3	* <H	;R8'*%.
,/-|I/Fq/I-) ),8,=)$))$4	  /xjDIYDZC[ \,,5,>,>,@+AIcIcIeHfGg h&&23&7%82?2CE  03,3CI3N#'225H5R5RR $"  % @ +002"!$!:!:J 
2 BcBBB C&fSk 3 6Vs$S62 2 "K/*>*:)K=Q ( 

a
q 0J
S%?,uT{D9&" 3C&	 4 O0Q!$S)$UD12%#$(FG1- 1!*-	W 'X T{ +
!B JEOPZE*U+777ZKP[I[E[/[IJJGJ5z%(..JGHK LL]K^ _l	, 	G* !+ 0 0 2 !3E9 %%##!!%%OO""		
 !3 	  ~~
++%8
 	

 	M	
 	3w<$..499Sd^44eT*Jz?1111=. OQ1 /}==O..?@@O_%
 + u_0`@ QIG s+   8X39X8X86X=Y9Y9A&Yc                  ^^^^^^^^^^^^ 0 n0 m0 0 0 smmm[        U 5       H|  u  pVUR                  5        H  nXdU'   M	     UR                  5        H  nUTU'   M
     UTUR                  5       '   UR                  5       n	[        R
                  TU	'   STU	'   UTU	'   M~     Sn
U  H  nU(       ab  [        U5      (       aR  U
TUR                  5       '   UR                   H(  nTU   R                  5       n[        TU   U
5      TU'   M*     U
S-  n
Ml  U(       d  Mu  [        U5      (       d  M  STUR                  5       '   M      " UUUU4S jS5      mU  Vs0 s H   nU[        S UR                   5       5      _M"     snm/ m[        [        5      mU  Vs0 s H  of[        U5      _M     snmTR                  5        HN  u  pm[        U5      S:X  a  [         R"                  " TT" U5      5        U H  nTU   R%                  U5        M     MP     / mUUUUU4S jmU4S jmUUUU4S jn[        T5      (       aZ  [         R&                  " T5      R(                  nU(       a  [        U5      (       a	  U" U5        OT" U5        [        T5      (       a  MZ  TR                  5        H  u  pm[        U5      S:X  a  M   S	T 35       e   T$ s  snf s  snf )
a  
Schedule `snodes` for various comm optimization objectives.

Args:
    snodes: the nodes to be scheduled.
    raise_comms: whether to greedily schedule collectives as early as possible
    sink_wait: whether to greedily schedule waits as late as possible
    reorder_compute_for_overlap: whether to reorder compute nodes to
        optimize for compute/communication overlapping.

Returns:
    The new schedule order.

Some notes on the synergy between different options:
    - `raise_comms` provides more overlapping oppurtunies for `reorder_compute_for_overlap`.
    - When both `raise_comms` and `sink_waits` is `True`, `raise_comms` is prioritized.
r   r   c                  2   > \ rS rSrSU UUU4S jjrS rSrg)$_schedule_for_comm.<locals>.Runnablei  c                   > Xl         [        [        UR                  5       5      5      nTU   R	                  5       nTU   TU   TU   4U l        g rb   )r7   nextiterget_operation_namesr   score)rc   r7   r  
fused_namename_to_fused_nodescores_0scores_1scores_2s       r?   __init__-_schedule_for_comm.<locals>.Runnable.__init__  sT    JU6689:D+D1::<J$$$DJrI   c                4    U R                   UR                   :  $ rb   rK  )rc   others     r?   __lt__+_schedule_for_comm.<locals>.Runnable.__lt__  s    ::++rI   )rK  r7   N)r   rY   )rg   rh   ri   rj   rQ  rV  rn   )rM  rN  rO  rP  s   r?   RunnablerF    s    	 		,rI   rX  c              3  8   #    U  H  oR                   v   M     g 7frb   )r  )r~   deps     r?   r   %_schedule_for_comm.<locals>.<genexpr>  s     G.Fs((.Fr   c                   > TR                  U 5        U R                  5        HT  nTU    HH  n TU    R                  U5        [        TU    5      S:X  d  M+  [        R
                  " TT" U 5      5        MJ     MV     g)zE
Schedules `snode` and put all unblocked nodes onto the ready queue.
r   N)r   r   remover3   heapqheappush)r7   buf_namerX  buffer_usersready	scheduled
unmet_depss     r?   schedule$_schedule_for_comm.<locals>.schedule  sl     	..0H%h/5!((2z%()Q.NN5(5/: 0 1rI   c                    > T V s/ s H=  n [        U R                  5      (       a  M  [        U R                  5      (       a  M;  U PM?     nn [        U5      S:X  a  g[	        US S9$ s  sn f )zP
Return the next node in the ready queue that's neither a collective or
a wait.
r   Nc                    U R                   $ rb   rT  r   s    r?   r  G_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<lambda>,  s    QWWrI   key)r   r7   r   r3   min)r   
candidatesrb  s     r?   get_overlapping_candidate5_schedule_for_comm.<locals>.get_overlapping_candidate   sf     
&qww/ 8Eagg8N  	 

 z?a:#455
s   A'A'A'c                  > [        U 5      (       d   eT" U 5        TU    nUS:  aQ  T" 5       =nbG  TR                  U5        T" UR                  5        UTUR                     -  nUS:  a  T" 5       =nb  MG  [        R                  " T5        g)z
Schedules collective node `snode`, along with one or more compute nodes
to overlap with it. The strategy is described in the comment of
`reorder_compute_for_overlap`.
r   N)r   r]  r7   r^  heapify)r7   collective_costr   ro  rb  re  snode_to_costs      r?   schedule_collective_for_overlap;_schedule_for_comm.<locals>.schedule_collective_for_overlap.  s     #5))))'.a799FLL#Y__%}Y__==O a799F
 	erI   z;Detected unscheduled nodes. Nodes with unmet dependencies: )r   r   rJ  r   sysmaxsizer   	ancestorsrm  r   r   r#  r   r   r   r3   r^  r_  addheappopr7   )r5   rC   rD   rE   buf_name_to_snodeidxr7   r`  op_name	node_namecomm_idxancestoranc_fused_namedepsrZ  ru  rX  ra  ro  rM  rb  re  rc  rN  rO  rP  rt  rd  s                   @@@@@@@@@@@@r?   rG   rG     s   L #%r2 Hh'
..0H*/h' 1 002G*/w' 3/45>>+,NN$	!kk! ( H.u55)1HU^^%&!OO!3H!=!F!F!H+.x/G+R( , MHZM%00)*HU^^%& , ,  <E 	zGe.F.FGGG<J
 E=H=TLDJKF5/66FKM!'')t9>NN5(5/2C!!%(  * I	; 	;6 & e**e$**#6u#=#=+E2UO e** "'')4yA~ 	
I*V	
~ * Q< Ls   'KKc           
        [         R                  R                  5       (       d  U $ U  Vs/ s H  n[        U5      (       d  M  UPM     nn[	        S[        U5      5       H^  n[        [        XE   R                  5       5      5      nXES-
     R                  5        H  nXE   R                  [        XvSS95        M      M`     U $ s  snf )z
Decide global ordering of comms, by just enforcing the ordering that's in the input graph
(might not be the same ordering as the eager mode program).
TODO: Come up with a better approach
r   Tmutating_bufr   )r/   r(   is_availabler   r+   r3   rH  rI  r   add_fake_depr   )nodesname_to_bufrM  r   
comm_nodesr>   r  r   s           r?   decide_global_ordering_of_commsr  O  s     ))++"=U&9!&<!UJ=1c*o&D!?!?!ABC!e$557CM&&E 8 ' L >s   CCc                  \    \ rS rSr% SrS\S'   SrS\S'   SrS\S'   SrS\S	'   S
r	S\S'   Sr
g)SinkWaitInfoig  r   r\   r^   r_   rZ   r`   r]   
moves_inforY   r[   rf   N)rg   rh   ri   rj   r^   rl   r`   r]   r  r[   rn   rf   rI   r?   r  r  g  s3    GSL#E3NJ!OS!rI   r  c                  ^1^2^3^4^5^6^7^8^9^:^; SSK Jn  [        U 5      nUS:X  a  U 0 4$ [        [        R
                  R                  R                  5       5      n[        [        R
                  R                  5       5      n[        XU5      u  nm1m;nn[        U 5      u  m4m3m20 n      S(U34S jjn	U1U5U6U7U8U9U;4S jn
U2U3U44S jnU1U5U6U;4S jnU S   n[        5       n[        R                  n[        R                  nSnT4U   Gb  U(       a  GOxUb  [        U5      U:  a  GOd[        U5      (       GaE  X;  Ga?  UR                  U5        [!        5       =nX'   T3U   nUnUm8UnT1U   S   m9UGb	  U(       a  GO U	" T8U5      m7U" UR"                  T7S	S
9nUR$                   Vs0 s H"  n['        U5      (       a  M  UR(                  U_M$     nnUR+                  5       nS nU H.  nUR-                  UR/                  5       S 5      =n(       d  M,  Un  O   Uc$  [1        U5      =(       a    [1        U5      =n(       a  S nU" U5      u  nnU(       aA  Un[3        T9T1U   S   5      m9U=R4                  S-  sl        [7        T75      Ul        T3U   nGM$  Uc-  W(       a&  S[7        T75       SUR/                  5        3Ul        GOSU S[=        UR                  5       5       SUR/                  5        SUR?                  5       / ST7 SU Vs/ s H  nUR/                  5       PM     sn SU 3Ul        GOHT;U   m5T5R@                  T5RB                  -
  m6[E        [<        5      n URG                  5        HR  u  n!n"U!RH                  RJ                  n#U"U:w  a  M$  S n$T7 H  n%U%U#;   d  M  U%n$M     U$c  M>  U U$   RM                  U!5        MT     U
" UT7U 5      u  n&n'n(U&U:  a  SU& SU 3Ul        OU=RN                  S-  sl'        U=RP                  SUR/                  5        3-  sl(        U" UT8U5        U" UT7U U'U(5        U(       a0  SSK)J*n)  U)" UT7[7        T75      U	" T2S 5      UUUT1T;SU 5      nU(       a  OT3U   nUb  GM	  T4U   nT4U   b  GM  / SQn*URG                  5        V+Vs/ s HI  u  n+n[W        U+5      UR4                  UR8                  URN                  URP                  UR:                  /PMK     n,n+nSm:[X        RZ                  R]                  S5      (       a  SSK/J/n-  T:U-" U,U*S9-  m:O8T:S -  m:T:[a        U*5      S!-   -  m:T:S!Rc                  [e        [`        U,5      5      -  m:[f        Ri                  T:5        U	" T2S 5      n.[        U.5      U:X  d   e[k        U.Xt5      u  n/    n0T:S"U 3-  m:T:S#U/ 3-  m:[m        S$S% U:4S& jS'9  U.U4$ s  snf s  snf s  snn+f ))Nr   rz   c                T   > / nU n Ub  UR                  U5        X1:X  a   U$ TU   nM#  rb   r   r   s       r?   r   4_sink_waits_iterative_internal.<locals>._group_nodes  r   rI   c                  > TT   S   TT   R                   -
  n0 n0 nSnU(       d   [        TT-   UTR                   -   5      nXdU4$ UTR                   -   nXtU '   Un[        S [        R                  R                  UR                  5       5       5       5      nU* XP'   TU-   n	T HQ  n
TU
   S   U	-   nXU
'   [        Xk5      nSnX;   a)  X*   nU H  nXR                  R                  -  nM     XU
'   X-  n	MS     XdU4$ )Nr   c              3  N   #    U  H  nUR                   R                  v   M     g 7frb   r   r   s     r?   r   [_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory.<locals>.<genexpr>  s%      *
 NN$$r   )	r   r   r   	itertoolschainfrom_iterabler.   r   r   )r   r   7group_n_to_bufs_after_swap_dealloc_instead_of_candidatepre_group_memr   _size_free_delta_updater   candidate_post_alloccandidate_size_free_to_move	delta_memr   gn_post_allocgn_size_free_to_addr   r   r   r   r   r   r   r   r   s                  r?   r   H_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory  sY    $Q'*::*F*Q*QQ 	 <>@BF !$77 3 > >>N "7NNN,/B/M/MM(<9%-&) *
 44GNNP*
 '
# /J-I*'*EE	B(,Q/);M%2r" ?N"#LNRC'>>+C+CC'  .A+,I  3JJJrI   c                   > TU   nU(       a  U TU'   UTU '   TU    nU(       a  UTU'   UTU'   U TU'   UTU '   UT:X  a  U mg g rb   rf   )r   r   r   group_head_prevcandidate_nextr   r   r   s        r?   r   H_sink_waits_iterative_internal.<locals>._perform_double_linked_list_swap  sq      
+%.E/"*i y)$.E.!*j &j%iE rI   c                Z  > US   nTU   S   TU   R                   -
  nU(       dE  UTR                   -   nUUTR                  -
  4TU '   U H  nTU   n	U	S   T-   U	S   T-   4TU'   M     g U /UQ H8  n
X:   nTU
   =R                  XJ   -  sl        UUTU
   R                  -
  4TU
'   M:     g r   )r   r   )r   r   r  r   r  r   r  r  r   r   r   
post_allocr   r   r   r   s               r?   r   J_sink_waits_iterative_internal.<locals>._update_memory_tracking_after_swap  s     V
$Q'*::*F*Q*QQ 	 G#03F3Q3Q#Q $$':'D'DD'L# !"%qE//qE//$R   "c"A+.JQ))-D-GG)-a0:::LO #rI   rU   FTr   c                    [        U 5      (       a  SSU R                  5        34$ [        U 5      (       a  SSU R                  5        34$ g)NFzcandidate contains collective zcandidate contains gemm_like r   )r   r   r}   r7   s    r?   r   4_sink_waits_iterative_internal.<locals>.is_groupable0  s]    .u55 %"@AQ@R S$  .e44 %"?@P?Q R$   *rI   r   zcollective ordering z with candidate:r   r   r   z(os:r   z
 outs:r   r  r  +r  sink_waits_iterative)z	Wait noder^   r`   r]   r  r  r_   r  r  r	  r  r  z*
 sink_waits_iterative peak_memory_before:z)
 sink_waits_iterative peak_memory_after:r  c                     SSS.$ )Nsink_waits_iterative_infor  r  rf   rf   rI   r?   r  0_sink_waits_iterative_internal.<locals>.<lambda>  s    / 
rI   c                    > T $ rb   rf   )log_strs   r?   r  r    s    7rI   r  r  )7r   r{   r3   r   r    r  r   r  r  r   r   r   r!  (sink_waits_iterative_debug_limit_to_sinkr   rz  r  r"  r#  r   r  r$  r   r   r   r   r^   r   r`   r[   r-   r   r   r   r   r   r   r%  r   r]   r  r&  r  r'  r(  r)  r*  r  rZ   r   r+  r,  r-  r   r
   )<r5   r{   r/  r   r   r   r   r   r0  r   r   r   r   r4  processed_waitsr5  debug_num_sink_waits_to_reorderr6  r-  r   
wait_snoder   r7  r   r8  
group_outsr:  r;  both_contain_commsr   is_grp
grp_reasonr  r   r   r%  last_succ_gnr   r   r   r  r  r
  r7   rA  r  rB  rC  r;   r   r   r   r   r   r   r   r   r   r  r   s<                                                    @@@@@@@@@@@r?   _sink_waits_iterative_internalr  p  s    ?f+arz$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$F-H" 9@E5%35E)1L	 )K )KV* B ":D lO'-'V'V$77 $ !&
+
!$+7O$(GG4#>%!-/D5;dIJJJ ,T 21 5',/;J
/S,(("& '999'? AFFAI9   #..0
#A%MM!**,==q=#$ $ '+E2U7J97U& 
* *6i)@&FJ%.
,/-|I/Fq/I-) ),8,=)$))$4	 "*0B2<3D2E.y/A/A/C.DF ,  /xjDIYDZC[ \,,5,>,>,@+AyGaGaGcFdEe f&&)U&j'Ij

j'I&J2:,	@ , 3CI3N#'225H5R5RR $  % H +002"!$!:!:J%2 #'L!++-L " $+  L$fSk! 3& 5!O L 24K "K/*>*:)K=Q ( 

a
Qy'9'9';&<#==0J
S2K&+ 4N0Q!$S)$UD12%#$(.O1- 1!*-	o 'p T{S +
!VG" !;;=
 )KE4 LLJJOO  	
 ) 	 
 G~~
++%8
 	

 	TT3w<$&&499Sd^,,WeT*Jz?1111=. OQ1 <[MJJG;O;LMMG
 # uyn (J|
s   !W9WWAWc                    [        U 5      S   $ )Nr   )r  rH   s    r?   r  r    s     *&1!44rI   c                    [         R                  S:X  a  U R                  5       nU$ [        [         R                  5      (       d   e[         R                  " U 5      nU$ )z2
Returns estimated op runtime in nanoseconds (ns)
ru   )r   r   r&   callable)r7   runtimes     r?   r   r     sU     !!Y.--/ N 223333,,U3NrI   c           
        U R                  5       n[        U5      S:X  Ga  Sn[        U R                  [        R
                  [        R                  45      (       a  SU R                  5        Vs/ s H  o3R                  5       PM     sn 3nSU R                   Vs/ s H  oUR                  PM     sn 3nSU R                  5        SU R                  R                   SU SU S	3	nU R                  5        Vs/ s H  owR                  R                  5       PM     nnS
R                  U V	s/ s HA  n	[        U	[        R                  5      (       a  SU	R                   SU	R                    S	3OSPMC     sn	5      n
 U R                  R#                  5       nU R                  R&                  R(                   U U
 SU SU R+                  5       S S3$ / nU H  nUR-                  [/        U5      5        M     U R&                  R(                   SSR                  U5       3$ s  snf s  snf s  snf s  sn	f ! [$         a    Sn Nf = f)Nr   r_   zouts:zins: z (z)
 z
 (),z (size=z	, stride=z.0fz ns): z, )	get_nodesr3   r   rw   r   ExternKernelOut_CollectiveKernelr$  r   r#  r  rp   get_output_specr   Layoutsizestridemaybe_get_nameAttributeError	__class__rg   r&   r   r'  )r7   r5   detailr;  outs_strr   ins_strchildlayoutslayoutout_tensor_infor  	summarieschild_snodes                 r?   r'  r'    s&   __F
6{aejj2#5#5r7K7K"LMMe6G6G6IJ6I

6IJKLHe.F.FG.Fff.FGHIG)*"UZZ-J-J,K4PXzY]^e]ffghF=B__=NO=NE::--/=NO((
 &	 &F fbii00 &++ia@ &	
	

113I **&&//08II;VXY^YtYtYvwzX{{  A  	A Ik23 oo&&'r$))I*>)?@@/  KGO  	I	s+   /H"H'9#H,-AH1=H6 6IIc                X   SnS nS n[        U 5       H  u  pEUci  [        U5      (       a  U[        U5      -  nUR                  nO)[	        UR                  5      (       a  OU[        U5      -  nU" U[        U5       5        Mq  [        U5      (       a/  U[        U5      -  nUR                  nU" U[        U5       5        M  [	        UR                  5      (       a  U" U[        U5       5        S nM  U" US[        U5       35        M     [        R                  SUS-  S-   35        g )Nr   c                :    [         R                  U S SU 35        g )Nz>6r  )r,  debug)stepr>  s     r?   step_log#visualize_overlap.<locals>.step_log  s    T"IRu-.rI   z| zEst. runtime (ms): i  )r   r   r   rw   r   r'  r,  r  )ordertotal_est_runtimecur_comm_noder  r  r7   s         r?   visualize_overlapr    s#     #M/ !' "5))!%8%??! %

$$ !%8%??!Tl5124"5))!%8%??! %

,u"5!68$$,u"5!68 $L$7#89:- (. 
/$6=>?rI   c                @   U n[        [        R                  R                  R	                  5       5      n[        [        R                  R                  5       5      n[        R                   GHX  n[        U[        5      (       a  U[        5       ;   a  [        5       U   n[        U5      (       d   SU S35       e[        U [        X5      U5      u  pV[        R                  R!                  5       S:X  a)  ["        R%                  SU SU< S35         ['        U5        [*        R*                  " 5       nU" U5      n[*        R*                  " 5       U-
  n	[        R                  R!                  5       S:X  a(  ["        R%                  S	U S
U	 S35         ['        U5        [        U [        X5      U5      u  pV[-        SU< 35        GM[     U$ ! [(         a  n["        R%                  SUS9   S nANS nAff = f! [(         a  n["        R%                  SUS9   S nAN|S nAff = f)Nz3Invalid reorder_compute_and_comm_for_overlap pass: z is not callabler   z.==== Visualize overlap before reordering pass z, peak_memory=z ====r_   )exc_infoz-==== Visualize overlap after reordering pass z	 (ran in z	 sec)====zfinal peak_memory=)r   r    r  r   r  r  r   'reorder_for_compute_comm_overlap_passesr   rZ   globalsr  r   r   r/   r(   get_rankr,  r  r  	Exceptiontimeprint)
r5   r  r   r   pr   r;   et0ts
             r?   $reorder_compute_and_comm_for_overlapr  2  s    E$.qww/C/C/H/H/J$KL%/0H0H0J%KM;;a!wy.	!A{{ 	
A!DTU	
{ .*6@-
 %%'1,@?k^SXY2!%( YY[%IIK"%%'1,?s)A3iX2!%( .*6@-
 	#{n%&? <@ L#  2!!"q!12  2!!"q!12s0   G
G5

G2G--G25
H?HHc           
     ^	  ^^^^^^ [        U R                  5      m[        [         5      m[        [         5      m[        T5       H  u  pUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  :X  d  MU  UR                  S   R                  S:X  d   SU SUR                  S    S35       eUR                  S   nUR                  S   nUS:  a  TU   R                  U5        M  TU   R                  U5        M     UUU4S jn[        [         5      n[        T5       H  u  pUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  :X  d  MU  UnUR                  S   mTR                  S:X  d   S	T S
U  S35       eU" T5      (       d  M  UT   R                  U5        M     S nS mT H  nUR                  S:X  d  M  [        UR
                  [        R                   R"                  5      (       d  MJ  UR
                  R$                  R&                  (       d  Mq  U" U5      (       a  M  T" X&R)                  5       5      (       d  M   SU S35       e   UR+                  5        GH3  u  mn	[        U	5       GH  u  pTU   nUR                  S   TL d   eUR                  u  nmUS-   nU
[-        U	5      S-
  :  a  XS-      O[-        T5      S-
  nTX n[/        UU4S jU 5       5      (       a   ST SU SU  S35       eU H  nUR                  S:X  d  M  TUR                  ;   d  M'  UR
                  [        R                  R                  R                  R                  :w  d  Me  [1        UU4S jUR                   5       5      nUUl        M     GM     GM6     UR+                  5        H0  u  mn	[        U	5       H  u  pTU   nU R3                  U5        M     M2     T Hy  nUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  :X  d  MS  UR                  S   U;   d  Mh  U R3                  U5        M{     g)ab  
This FX graph pass replaces uses of FSDP2 unsharded params with their corresponding
graph intermediates that were fsdp.copy_ into the unsharded params in the original graph.

NOTE: Can only apply this pass to any of the FSDP2 unsharded params that have this pattern
(or repetition of): `resize_(full) -> copy_ -> resize_(0)`. Because of this, for partial-graph case
where `resize_(full) -> copy_` is in one graph and `resize_(0)` is in another graph, we can't
remove these resize and copy ops and thus we will have worse performance there.

In other words, "do we try to remove all the resize_(full) -> copy_ -> resize_(0) nodes for this unsharded param"
is actually a per-unsharded-param decision, since for each unsharded param, we look at its resize sequence pattern
(in `check_resize_pattern()`) to determine if its set of resize and copy nodes can be removed.
call_functionr   placeholderz1Resize can only operate on graph inputs, but got z# which is resizing non-graph-input r  r   c                n  > TR                  U / 5      nTR                  U / 5      n[        U5      [        U5      :X  d2  [        R                  SU  S[        U5       S[        U5       S35        g[	        X5       H7  u  p4X4:  d  M  [        R                  SU  STU    SU S	TU    SU S
35          g   g)NzH
Unequal number of resize-to-full and resize-to-0 nodes for graph input z:
z vs. zK.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass.
Fz
For graph input z: resize-to-full node z
 at index z 
happens after resize-to-0 node zd.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass for that unsharded param.
T)r   r3   logwarningr   )graph_inputresized_to_full_idxesresized_to_0_idxesresize_to_full_idxresize_to_0_idx&graph_input_to_resized_to_0_node_idxes)graph_input_to_resized_to_full_node_idxes	node_lists        r?   check_resize_patternLremove_fsdp2_unsharded_param_graph_input_usage.<locals>.check_resize_pattern}  s    !J M M!
 DGGUWX()S1C-DDKKHHS} U E#&8"9!: ;  47!4
/ "43I>P4Q3RR\]o\p q  )/ :;:oEV W 4
 rI   z\
Assumed all FSDP2 `unsharded_param`s to be graph input, but it's not true!
Offending node: z	. Graph: c                    U R                   [        R                  R                  R                  R
                  :H  =(       d;    U R                   [        R                  R                  R                  R
                  :H  $ rb   )targetr/   rr   fsdpcopy_ru   inductorresize_storage_bytes_)rw   s    r?   is_allowed_mutationKremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_allowed_mutation  sO    KK599>>//777 O{{eii00FFNNN	
rI   c           	        [        U R                  [        R                  R                  5      (       aj  [        U R                  R                  R                  5       VVs/ s H3  u  p#UR                  c  M  UR                  R                  (       d  M1  UPM5     snnO/ n[        U Vs/ s H6  n[        U R                  U   R                  S   R                  5       5      PM8     sn5      n[        U Vs/ s H)  n[        UR                  S   R                  5       5      PM+     sn5      n[        XW-  5      S:  $ s  snnf s  snf s  snf )Nvalr   )r   r  r/   _ops
OpOverloadr   _schema	arguments
alias_infois_writer   r   argsmetauntyped_storager3   )rw   unsharded_paramsr>   r   mutated_arg_idxesmutated_node_arg_storagesunsharded_paramstorages_of_unsharded_paramss           r?   -is_node_mutating_unsharded_param_or_its_aliaseremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias  s)    $++uzz'<'<== &dkk&9&9&C&CDDDA<< 010E0E D  	 %/ +*A tyy|007GGIJ*%
! (2 (8'7O 33E:JJLM'7(
$ ,KLqPP)s    D=7D=D=)=E60EzdUser mutation on FSDP2 unsharded param is not allowed when Traceable FSDP2 is used. Violating node: c              3  8   >#    U  H  nT" UT/5      v   M     g 7frb   rf   )r~   rw   r  r  s     r?   r   Aremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>  s%      *D >d_DUVV*s   z(Assumed no ops mutating unsharded param z in subgraph z, but it's not true!
Graph: c              3  6   >#    U  H  nUTL a  TOUv   M     g 7frb   rf   )r~   argreplacementr  s     r?   r   r    s$      %#,C (+o'=3F#,s   N)r-   r  r   r   opr  r/   rr   r  r  ru   r  r   r  r  r   r
  r  r  
is_mutabler  r   r3   r   tuple
erase_node)r  r}  rw   r  new_sizer  'unsharded_param_to_fsdp_copy_node_idxesfsdp_copy_noder  fsdp_copy_node_idxesr>   fsdp_copy_node_idxr;   subgraph_start_idxsubgraph_end_idxsubgraph_nodesnew_argsr  r  r  r  r  r  s                    @@@@@@r?   .remove_fsdp2_unsharded_param_graph_input_usager,  [  sb    U[[!I 1<D0A--8->*y)	GG&uyy11GGOOO99Q<??m3  :2267Z[_[d[def[gZh i6 3 ))A,Kyy|H!|9+FMMcR6{CJJ3O *"J /:$.?+y)	77o%$++9M9M9U9U*U!N"iilO"%%6  = !5' 29 6 $O447HOOPST *
Q4 GG&4;;

(=(=>>##...'--DBBD  eeidj k  : 
1	6	6	8	%./C%D!A&'9:N!&&q)_<<<+00NA{!3a!7 s/0144 %U+^a' 
 ''9KN *   ))8(9~FV Ww   'GG.'4994uyy'9'9'O'O'W'WW$ %#'99%  H !)DI ') &E 
9J 
1	6	6	8	%./C%D!A&'9:N^, &E 
9 GG&uyy11GGOOO		! GGT" rI   c                  ^	  SS K m	T	R                  R                  5       (       d   eT	R                  R                  R
                  (       a%  T	R                  R                  R                  (       d   e SSK
JnJnJnJnJn   U	4S jnU" 5       nU" U" T	R                  R                  R
                  R                   U" ["        R$                  U" T	R                  R&                  R(                  R                   U" S5      U" S5      U" S5      U" S5      U" S	5      5      U" S
5      5      U" S5      U" S5      5      US S9SU	4S jj5       nU" U 5        UR+                  U 5        g ! [        [        [        4 a     g f = f)Nr   r   )CallFunction
KeywordArgMatchPatternMatcherPassregister_graph_patternc                Z  > [        U R                  5      nU H  nUR                  [        R                  :X  d  M#  UR
                  S   R                  TR                  R                  R                  R                  L d  Mi  UR
                  S   S:X  d  M~  U R                  U5        M     g r   )r-   r  r  operatorgetitemr  rr   r  all_gather_copy_inru   r"  )gr  r   r/   s      r?   remove_unused_getitem8reinplace_fsdp_all_gather.<locals>.remove_unused_getitemD  sp    M	AH,,,FF1I$$		(I(I(Q(QQFF1INQ rI   all_gather_inputsall_gather_outputinp_split_sizesall_gather_input_numelrankitem_idx
group_size
group_namec                &    U R                   S   S:H  $ )Nr?  r   )kwargs)matchs    r?   r  +reinplace_fsdp_all_gather.<locals>.<lambda>d  s    %,,z":a"?rI   )	pass_dictextra_checkc                n   > U4S jnU R                  UUS   US   US   US   US   US   US   /5        g )	Nc                    > U S S nU S   nU S   nTR                   R                  R                  R                  " U6 nUS   nUS   nTR                   R                  R
                  R                  XRX6S9nU$ )NrU   r   r   )out)rr   r  r6  ru   _c10d_functionalall_gather_into_tensor_out)	r  copy_in_argsr@  rA  r6  r5  	getitem_1all_gather_into_tensorr/   s	           r?   replEreinplace_fsdp_all_gather.<locals>.reinplace_all_gather.<locals>.replg  s      9LbJbJ!&!B!B!J!J" )+G*1-I		**EEMM N  #
 *)rI   r:  r;  r<  r=  r>  r@  rA  )replace_by_example)rD  r  rC  rQ  r/   s       r?   reinplace_all_gather7reinplace_fsdp_all_gather.<locals>.reinplace_all_gatherQ  s[    ,	*$ 	  *+*+()/0v|$|$	
rI   )rD  r0  )5torch.distributed.fsdp._fully_shard._fsdp_collectivesr(   r  rr   rL  rP  rM  ImportErrorr  AssertionErrorpattern_matcherr.  r/  r0  r1  r2  ru   r4  r5  r  r6  apply)
r  r.  r/  r0  r1  r2  r8  
graph_passrT  r/   s
            @r?   reinplace_fsdp_all_gatherr\    sg   
D  --//// II&&==		**EE	
FE
  	  $%JII&&==EE  IINN55==23230178v& :& |$|$	
" ?'*
+*
@ % U} 8 s   A1E) )F Fc                    [        U [        R                  R                  R                  [        R                  R                  R
                  45      (       a   e[        U R                  5       SS  5      $ )N   )r   r/   	_inductorr"  FusedSchedulerNoder{   r\   r   r  s    r?   
get_op_idxra    sb    OO%%88OO%%::	
    u~~#$$rI   c           
     	  ^^^ ^! SSK Jm   / n[        [           " 5       nSnSn0 n0 n0 m!U U!4S jn	U  GH"  n
[	        U
R
                  [        R                  R                  R                  R                  S9(       Ga  [        U4S jU
R                   5       5      (       Ga  SnU
n[        5       n[        UUUT5        [        [        R                  R                  R                  R                  [        R                  R                  R                  R                  [        R                  R                  R                   R                  /5      m[#        UUUTUU 4S jS	9  [%        US
 S9n['        U5      nSn[)        ['        U5      5       H^  nUU   n[+        UR
                  [        R                  R                  R                   R                  5      (       a  US-  nUS:  d  M\  Un  O   US U nS n[)        ['        U5      S-
  5       H9  n[-        UUS-      R
                  [.        R0                  5      (       d  M4  US-   n  O   Uc   eU	" US U 5      nU	" UUS  5      nUUU'   GM;  [+        U
R
                  [        R                  R                  R2                  R                  5      (       d  GM  SnU
n[        5       n[#        UUUT5        [%        US S9nS n[)        ['        U5      S-
  5       H9  n[-        UUS-      R
                  [.        R0                  5      (       d  M4  US-   n  O   Uc   eU	" US U 5      nU	" UUS  5      nUUU'   GM%     ['        T!5      S:  d   eU(       a  ['        U5      S:  d   eU(       a  ['        U5      S:  d   eU  HS  n
U
R5                  5       T!;   a  T!U
R5                  5          n
X;   a  M1  UR7                  U
5        UR9                  U
5        MU     S nUR;                  5        Hl  u  nnUba  [=        [?        URA                  5       5      5      nURC                  5        H+  nURE                  [G        UR5                  5       USS95        M-     UnMn     S nUR;                  5        Hl  u  nnUba  [=        [?        URA                  5       5      5      nURC                  5        H+  nURE                  [G        UR5                  5       USS95        M-     UnMn     U$ )Nr   )r"  Fc                   > TR                   R                  U 5      nU  H  nUTUR                  5       '   M     UTUR                  5       '   U$ rb   )r{   creater   )snodes_to_group
group_noder7   r"  snode_name_to_final_snodes      r?   _create_group_node:enforce_comm_ordering_for_fsdp.<locals>._create_group_node  sO    33::?K
$E:D%enn&67 %;E!*"5"5"78rI   )r  c              3     >#    U  HJ  n[        TU   R                  [        R                  R                  R
                  R                  5      v   ML     g 7frb   )r   rw   r/   rr   r  r6  ru   )r~   r   rM  s     r?   r   1enforce_comm_ordering_for_fsdp.<locals>.<genexpr>  sJ      
 % "1%**EIINN,M,M,U,U  %s   AATc                   > [        U TR                  5      =(       d6    [        U TR                  5      =(       a    U R                  R                  T;   (       + $ rb   )r   NopKernelSchedulerNodeExternKernelSchedulerNoderw   op_overload)r   allowed_opsr"  s    r?   r  0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  sD    q)"B"BC "1i&I&IJ >FF..+=	'rI   )criteria_cbc                    [        U 5      $ rb   ra  ri  s    r?   r  rq        JqMrI   rk  r   c                    [        U 5      $ rb   rt  ri  s    r?   r  rq    ru  rI   r  )$r_   r"  r   r   r   rw   r/   rr   rL  rM  ru   r   ry  r   wait_tensorr  split_with_sizes_copyr   sortedr3   r+   r   r   r   _WaitKernel	chunk_catr   r   rz  r   rH  rI  r   r$  r  r   )"r5   r  rM  	new_orderrc  	ag_exists	rs_exists$ag_grouped_node_to_wait_grouped_node$rs_grouped_node_to_wait_grouped_noderh  r7   ag_snodeag_related_snode_setag_related_snodesend_idx_of_current_ag_blockcopy_out_countr>   	cur_snodewait_node_idxag_group_nodeag_wait_group_noders_snoders_related_snode_setrs_related_snodesrs_group_noders_wait_group_nodeprev_ag_waitwait_group_noder  r;  prev_rs_waitrp  r"  rg  s"     `                            @@@r?   enforce_comm_ordering_for_fsdpr    s   
 )+I3!III+-(+-( " JJ59955PPXX
 
 
 __	
 
 
 IHLVL  ($"	 %II..IIQQII..::BBIINN88@@K )$" !'$*A! +..?*@'N3012-a0	!NNEIINN$H$H$P$P  #a'N!A%23/ 3 !22N3N O !M301A56/A6;;R^^LL$%EM 7 !,,,./@-/PQM "44Emn4U!VBT0? EJJ		(@(@(H(HIIIH MWL ($"	 !'$*A!
 !M301A56/A6;;R^^LL$%EM 7 !,,,./@-/PQM "44Emn4U!VBT0?] ` ()A---781<<<781<<< >>88-enn.>?Ee  L*N*T*T*V&#]%C%C%E FGL!--/**AJJL|TR 0 ' +W L*N*T*T*V&#]%C%C%E FGL!--/**AJJL|TR 0 ' +W rI   )r5   r  )r5   r  r   r  )rw   z"Optional[Union[IRNode, Operation]]r   bool)r7   r"   r   r  )F)r   rZ   )r   r  r   rZ   )r5   r  r   ztuple[dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], BaseSchedulerNode])r5   r  r   zDtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, ReorderInfo]])
r5   r  rC   r  rD   r  rE   r  r   r  )r  r  r   r  )r5   r  r   zEtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, SinkWaitInfo]])r7   r"   r   rV   )r  torch.fx.Graph)r  r  r   rY   )r5   1list[torch._inductor.scheduler.BaseSchedulerNode]r  z4dict[str, torch._inductor.scheduler.SchedulerBuffer]rM  zdict[str, BaseSchedulerNode]r   r  )V
__future__r   r^  r(  r  loggingr4  rw  r  collectionsr   dataclassesr   typingr   r   r   r	   r/   torch._loggingr
    torch.multiprocessing.reductionsr   torch.utils._ordered_setr   r_   r   r   dependenciesr   r   r   r"  r   memoryr   r   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr    	getLoggerrg   r  _logginggetArtifactLoggerr,  r   r"   r@   rD   rC   rL   rQ   rS   rx   r}   r   r   r   r   r   r   rN   rG   r  r  r  r  r   r'  r  r  r,  r\  ra  r  rf   rI   r?   <module>r     sB   #      
  # ! 6 6  + ; /  ! %*     !nn..xC;M#M,#&####L 9 9 9""(03&#"m#mIm`W#WW W 	W
 Wt"0 " " "h#hJhV5#55	A>#L&#&&RA#HhV%n=nEn 5n 7	nrI   