
    9ipk                   (   % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SKJrJrJrJrJrJrJr  S SKJrJr  \(       a  S SKJrJr  S SKJ r   S SK!r!S SK"r"S SK#r"S SK$J%s  J&r'  S SK(J)r)J*r*  S S	K+J,r,J-r-  S S
K.J/r/  S SK0J1r1J2r2  S SK3J4r4  S SK5J6r6  S SK7J8r8J9r9J:r:  S SK;J<r<  SSK=J>r>J?r?J@r@JArAJBrBJCrC  SSKDJErE  SSKFJGrGJHrHJIrI  SSKJJKrKJLrL  SSKAJMrMJNrNJOrOJPrP  SSKQJRrRJSrS  SSKTJUrU  SSKBJVrVJWrWJXrXJYrYJZrZ  SSK[J\r\  SSK]J^r^J_r_  SSK`JaraJbrb  SSKcJdrd  SSK%JereJfrfJgrgJhrhJiriJjrjJkrkJlrlJmrmJnrnJoroJprpJqrqJrrrJsrsJtrtJuru  SSKvJwrw  \R                  " \y5      rz\"R                  R                  \yS5      r}\"R                  R                  \yS 5      r~\"R                  R                  \yS!5      r\S"   rS#\S$'   \" S%5      r\" S&5      r\GR
                   " S' S(5      5       r\GR
                   " S) S*\5      5       r " S+ S"5      r\GR                  SLS, j5       rSMS- jrSNS. jrSOS/ jr " S0 S15      rSPS2 jr " S3 S45      r        SQS5 jr " S6 S7\5      r " S8 S9\5      r " S: S;\5      r    SRS< jr        SSS> jr " S? S@\5      r " SA SB\5      r " SC SD\5      r ST       SUSE jjr\GR
                   " SF SG5      5       r\GR8                  " 5       rSVSH jr " SI S=5      r " SJ SK5      rg)W    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)	ParamSpec	TypeAlias)IteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitmaybe_log_cudagraph_partitionsympy_product)Vfusionloop_orderingcompute_dependenciesBaseSchedulerNoder   PartitionType_T_Pc                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg) SchedulerBuffer_   	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr:   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ N)r_   get_name)selfops     Y/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/torch/_inductor/scheduler.pydefining_op_name SchedulerBuffer.defining_op_namei   s#    ~~{{}    c                @    [        U R                  R                  5      $ re   )hashr]   namerg   s    ri   __hash__SchedulerBuffer.__hash__n   s    DIINN##rl   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r    z	.users = z
.users = [,])rG   rf   	writelinetyper]   __name__layoutget_aliasespformatget_mutationslenrb   indentgetrawvalue)rg   resultro   users       ri   	debug_strSchedulerBuffer.debug_strq   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ re   r]   rf   rp   s    ri   rf   SchedulerBuffer.get_name       yy!!##rl   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)r]   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr%   CommBufferLayoutrP   graphwrapper_codecodegen_allocationhasattrkernelrf   inplace_update_buffersr\   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rg   input_buffer_nameinput_buffers      ri   allocateSchedulerBuffer.allocate   sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rl   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)r]   r   rz   r%   r8   rK   rb   
OutputNode)rg   uses     ri   can_freeSchedulerBuffer.can_free   sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  rl   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g re   )idr]   mergelistvaluesrb   )rg   rb   r   r   s       ri   	set_usersSchedulerBuffer.set_users   sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
rl   c                T    U R                   c   eU R                   R                  5       $ re   )r]   r   rp   s    ri   r{   SchedulerBuffer.get_aliases   s%    yy$$$yy5577rl   c                T    U R                   c   eU R                   R                  5       $ re   )r]   r   rp   s    ri   r}   SchedulerBuffer.get_mutations   %    yy$$$yy++--rl   c                R    U R                   R                  5       R                  5       $ re   )r]   r   
get_devicerp   s    ri   r   SchedulerBuffer.get_device   s    yy((*5577rl   )rb   Nreturnstrr   intr   Noner   bool)rb   ra   r   r   r   zSequence[str]r   Optional[torch.device])ry   
__module____qualname____firstlineno____annotations__dataclassesfieldr   rb   r:   rc   rj   rq   r   rf   r   r   r   r{   r}   r   __static_attributes__ rl   ri   rY   rY   _   sv    
O,,'--dCE>C.9.?.?3/J+ 
$$($?B
+8.8rl   rY   c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBuffer   Nr^   r_   r   )ry   r   r   r   r_   r   r   r   rl   ri   r   r      s    /3K,3rl   r   c                     \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   SrS\S'   SES jrSFS jrSGS jrSGS jr	SGS jr
SHS jrSGS jrSIS jr      SJS jrSKS jrSLS jrSMS jrSNS jr      SOS jrSIS jrSPS jrSPS  jrSIS! jrSIS" jr    SQS# jrSGS$ jrSGS% jr\SPS& j5       r\SPS' j5       r\SMS( j5       r\SMS) j5       r SRS* jr!SSS+ jr"STS, jr#SUS- jr$SMS. jr%SMS/ jr&SMS0 jr'SMS1 jr(SMS2 jr)SMS3 jr*SMS4 jr+SVS5 jr,SMS6 jr-SIS7 jr. SW     SXS8 jjr/\SYS9 j5       r0\SYS: j5       r1\SYS; j5       r2      SZS< jr3      S[S= jr4\S\S> j5       r5S]S? jr6\S]S@ j5       r7S^SA jr8S_SB jr9\:    S`SC j5       r;SDr<g)arT      z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr;   mpi_nodeNOptional[float]override_estimated_runtimec                     Xl         S U l        g )Nc                     / $ re   r   )r   kwargss     ri   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>   s    Brl   )r\   debug_device_str)rg   r\   s     ri   __init__BaseSchedulerNode.__init__   s    $-& 	rl   c           	     ^   Xl         [        5       U l        [        [           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l	        U R                   Vs0 s H  o3R                  5       U_M     snU l        0 U l        g s  snf s  snf )NF)r\   r]   r_   )r]   r   	ancestorsr   
last_usagewrittenget_outputsrY   r\   outputsrf   outputs_by_namemutation_renames)rg   r]   outputbufs       ri   _init_from_node!BaseSchedulerNode._init_from_node   s    ,0	*4,$
   **,/
 - .. 
 -/
 ,0<<<
+7CLLNC<<
 13#/
<
s   B%;B*c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)rx   ry   rf   rp   s    ri   __repr__BaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAArl   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsrt   (r]   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rv   Ignoring error in debug_str()Texc_info)rf   rG   splicerx   ry   getattrr|   r   writesr   readsr   r   r   rw   debug_str_extra	Exceptionlogwarningr   rstrip)rg   ro   r   outs       ri   r   BaseSchedulerNode.debug_str  sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N r   rp   s    ri   r   !BaseSchedulerNode.debug_str_extra      rl   c                $    U R                  U 5      $ re   )r   rp   s    ri   _debug_str_for_device'BaseSchedulerNode._debug_str_for_device  s    $$T**rl   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Ndatar  z, F)shorten	multiline)r   r]   r   torch	_inductorr%   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rg   
maybe_datadata_strs      ri   debug_str_short!BaseSchedulerNode.debug_str_short   s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""rl   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rp   s    ri   log_detailsBaseSchedulerNode.log_details/  s,    6####		
rl   c                    gNFr   )rg   self_dep	other_deps      ri   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair7       rl   c                    S U R                   R                  5        5        Vs0 s H  nX!;   d  M
  X!U   _M     snU l        U R                  U R                   R	                  U R                  5      5        g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fre   ro   .0deps     ri   	<genexpr>9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>?  s     Q-Pc-P   )r   reads_and_writesr   set_read_writesrename)rg   renamesro   s      ri   update_mutated_names&BaseSchedulerNode.update_mutated_names<  ss     RT-=-=-N-N-PQ!
Q  D$-Q!

 	T--44T5J5JKL!
s
   	A7	A7c                X    U R                  U R                  R                  U5      5        g re   )r1  r   	with_readrg   r,  s     ri   add_fake_depBaseSchedulerNode.add_fake_depD  s!    T--77<=rl   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7fre   )r{   r}   )r+  r   s     ri   r-  =BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>H  s*      
@ROO4!2!2!44@Rs   35)anyr   rp   s    ri   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutationG  s%     
@D@P@P@R
 
 	
rl   c                f    Xl         U R                   R                  U l        U R                  5         g re   )r   r   r   
prune_deps)rg   rws     ri   r1  !BaseSchedulerNode.set_read_writesL  s&    "&"2"2"8"8rl   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7fre   )get)r+  kmutation_real_names     ri   r-  3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>U  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r   )rg   future_used_buffersrI  used_bufferss     ` ri   set_last_usage BaseSchedulerNode.set_last_usageQ  s-     88:!!U!UU&<rl   c                J    U R                    H  nUR                  5         M     g re   )r   r   )rg   r   s     ri   mark_runBaseSchedulerNode.mark_runX  s    <<CLLN  rl   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7fre   r)  r*  s     ri   r-  6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>]  s      
W HHW   )r   	itertoolschainr   r   r   rp   s    ri   used_buffer_names#BaseSchedulerNode.used_buffer_names\  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rl   c                >  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H  nUR                  PM     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R                  U5      (       aD  UR                  U4S j[        R                  R                  U   R                  5        5       5        [        U5      S:  a  M  T$ s  snf )Nr   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7fre   r   )r+  alias
used_namess     ri   r-  ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>m  s(      "5 J.	 E"5s   
	)r   rX  rY  r   r   r   ro   r~   popaddrP   r   name_to_bufferrG  extendr   )rg   r,  depsr_  s      @ri   rL  .BaseSchedulerNode.used_or_aliased_buffer_namesb  s    &0l
 !t'7'7'='=t?O?O?V?VW
W HHW 	 
 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m 
s   Dc                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7fre   )ro   r\   available_buffer_namesr+  r,  rg   s     ri   r-  /BaseSchedulerNode.prune_deps.<locals>.<genexpr>w  s0      -
.xxt~~DDD C.s   (8	8r   r   rp   s   `ri   rB  BaseSchedulerNode.prune_depsv  s#    ", -
..-
 #
rl   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                   > [        U [        5      (       d  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ r!  )	r   r0   r\   r   ro   rj   rP   r   removed_operations)r,  op_namerg   s     ri   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sI    c7++nn00:KKMGagg8888rl   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fre   r   r+  r,  rr  s     ri   r-  4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
1C\#5FCC1   !	!r,  r-   r   r   )r   r   r   r1  remove_reads)rg   	to_removerr  s   ` @ri   prune_weak_deps!BaseSchedulerNode.prune_weak_deps}  sN    	9  
++11
 
	 	T--::9EFrl   c                D    [        XU R                  R                  5        g re   )_prune_redundant_depsr\   r   )rg   name_to_fused_nodes     ri   prune_redundant_deps&BaseSchedulerNode.prune_redundant_deps  s     	d8R8RSrl   c                T    U R                   c   eU R                   R                  5       $ re   )r]   get_operation_namerp   s    ri   rf   BaseSchedulerNode.get_name  r   rl   c                "    U R                  5       $ re   rf   rp   s    ri   get_first_name BaseSchedulerNode.get_first_name  s    }}rl   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   r  r+  r]   s     ri   r-  8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     G6Fd--//6F   )r   	get_nodesrp   s    ri   get_operation_names%BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrl   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   r  r+  r  s     ri   r-  5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     ALS,,..Lr  )r   r   rp   s    ri   get_buffer_names"BaseSchedulerNode.get_buffer_names  s    ADLLAAArl   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNr   SchedulerNoder'   r+  ns     ri   r-  ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s6      
 & q-( G+AFG%s   .0allr  rp   s    ri   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rl   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7fre   r  r  s     ri   r-  @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1r  rp   s    ri   r'   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rl   c                    U /$ re   r   rp   s    ri   r  BaseSchedulerNode.get_nodes  s	    vrl   c                    U R                   $ re   )r   rp   s    ri   r   BaseSchedulerNode.get_outputs  s    ||rl   c                     U R                   U   $ re   )r   )rg   buf_names     ri   
get_outputBaseSchedulerNode.get_output  s    ##H--rl   c                T    U R                   c   eU R                   R                  5       $ re   )r]   r   rp   s    ri   r   BaseSchedulerNode.get_device  s%    yy$$$yy##%%rl   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r   rx   rg   devices     ri   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rl   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ re   )r   rJ   rx   r  s     ri   rJ   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rl   c                    gr!  r   rp   s    ri   is_reductionBaseSchedulerNode.is_reduction      rl   c                    gr!  r   rp   s    ri   is_split_scanBaseSchedulerNode.is_split_scan  r  rl   c                    gr!  r   rp   s    ri   is_templateBaseSchedulerNode.is_template  r  rl   c                    gr!  r   rp   s    ri   	is_externBaseSchedulerNode.is_extern  r  rl   c                    gr!  r   rp   s    ri   
is_foreachBaseSchedulerNode.is_foreach  r  rl   c                    gr!  r   rg   read_deps     ri   can_inplaceBaseSchedulerNode.can_inplace  r  rl   c                    gr!  r   rp   s    ri   has_side_effects"BaseSchedulerNode.has_side_effects  r  rl   c                X  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GHQ  nUR2                  nUc   eUR5                  5       (       aV  UR7                  5       (       dA  UR9                  5       (       d,  UR;                  5       [        R                  R<                  ;   a  M  T R>                  R@                   GH  nURB                  T R,                  RD                  ;   a$  T R,                  RD                  URB                     nO/T R,                  RF                  RI                  URB                  5      nU(       d  M  [        R                  RJ                  RM                  UT 5      (       d  M  [        URN                  [P        5      (       a  M  URR                  c   eURR                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	n[U        U	5      S:X  d  GM3  U	S   RV                  (       d  GMJ  U	S   R2                  T L d  GM_  UR2                  c  GMo  [        UR2                  RY                  5       [Z        R\                  [Z        R^                  [Z        R`                  45      (       a  GM  URN                  (       am  [        URN                  R2                  [Z        Rb                  [Z        Rd                  45      (       a*  [U        UR2                  R7                  5       5      S:  a  GMF  U" UR2                  UR2                  5      (       d  GMk  U" U5      (       d  GM{  [        R                  Rf                  Ri                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rj                  Rm                  UR;                  5       5        [        R                  Rj                  Rm                  UR;                  5       5        UR;                  5       [        R                  Rn                  UR;                  5       '     GMO     GMT     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r    )can_match_buffer_size	mutationsNr   c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H  nUR
                  n[        U[        5      (       d  M&  UR                  5       U R                   R                  ;  d  U R                   R                  U5      ULa  Mn  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7fre   r)  )r+  or  s     ri   r-  ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>
  s&      Evv) AEs   $	$r    FT)r\   get_fused_noderf   r   rb   r]   r   rT   r  r  r   r0  r~   )buf_to_be_inplaced
fused_nodere  r   	user_noder  rg   s        @ri   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= ' 1* rl   r   )r  rY   r   r   )8codegen.wrapperr  r   r  r"   inplace_buffersrP   r   has_featurer   r(   INPLACE_BUFFERSr   r  r  codegensimd
SIMDKernelr   r   r   rp  r\   completed_operationsr   r]   r   r   r   rf   removed_buffersr   r   ro   r   r   rG  r   	can_reuser_   NopKernelSchedulerNoderb   r~   r  r   r%   r8   r7   MutationLayoutSHOULDREMOVEFallbackKernelr6   r   make_inplacer  rb  r   )
rg   r  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         ri   decide_inplace_update'BaseSchedulerNode.decide_inplace_update  s   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 	 	D ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G q / &0&s   "V'V'c                   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH5  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	S
S9S   nUR                  SUR                  SS5      R                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM8     [        U5      S:X  a  g UR                  U5        SU l        g )Nr   r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r    )maxsplit{z{{}z}}r   \z\\z#pragma CMT END ORIGINr   T)r"   comment_originr   r]   get_originsrh   appendtargetmetarsplitreplacer~   
writelines)	rg   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            ri   codegen_originating_info*BaseSchedulerNode.codegen_originating_infoY  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3 6 y>Q 	)$rl   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrp   s    ri   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rl   c                "    U R                  SSS9$ )NTFr  r  rp   s    ri   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rl   c                "    U R                  SSS9$ )NFTr  r  rp   s    ri   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
rl   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)sumget_read_write_buffer_accessesr   )rg   r  r  s      ri   r  3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s1     //+ 0 fh	
 	
rl   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ [        T [        5      (       af  [        T R                  [
        R                  5      (       a=  T R                  R                  [        R                  R                  R                  L a  0 $ SS jm[        T [        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[         R"                  " [$        5      nU(       a:  T R&                  R(                   H   nX4R*                     R-                  U5        M"     U(       a:  T R&                  R.                   H   nX4R*                     R-                  U5        M"     U(       a&  [1        S T R&                  R(                   5       5      O	[1        5       nU(       a&  [1        S T R&                  R.                   5       5      O	[1        5       nSU 4S jjm[        T [2        5      (       a  [1        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	[5        U4S	 jX9    5       5      mU	[6        R8                  R:                  ;   a  [6        R8                  R:                  U	   n
O>U	[6        R8                  R<                  ;   a  [6        R8                  R<                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   fallback)rP   r   sizevars	size_hint)ss    ri   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<rl   r   r        eAc              3  8   #    U  H  oR                   v   M     g 7fre   r)  r*  s     ri   r-  CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     B+ACxx+Ar/  c              3  8   #    U  H  oR                   v   M     g 7fre   r)  r*  s     ri   r-  r-    s     C+BCxx+Br/  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7fre   r]   )r+  r   s     ri   r-  \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>))r/  r   )r\   r   rb   r   r~   )r   snodesrb   buf_usesrg   s       ri   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599rl   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7fre   r3  )r+  r,  r5  rg   s     ri   r-  r-    s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7fre   r   )r+  r,  
node_numels     ri   r-  r-    s     $R;QCZ;Qs   c                R  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [        U4S jU R!                  5        5       5      $ T	" [#        U R%                  5       5      5      n['        U R)                  5       5      [+        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7fre   )rP   r   
get_buffer)r+  mut_nameget_buf_bytess     ri   r-  ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  s/      (@H &agg&8&8&BCC(@   25)r   r%   TorchBindObjectr?  rz   r7   r\   r   rf   rb   r]   rT   r6   r   r8   r  r   rO   r  rD   	get_dtypemin)
r   rb   totr   	sched_buf	buf_elemsbuf_accessed_elemsr?  rg   r)  s
         ri   r?  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sG    c2#5#566,,..

,=>> !NN66s||~FLLEC %)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rl   )r(  z
sympy.Exprr   r   )r   r   r3  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )r   r  ExternKernelSchedulerNoder]   r6   r%   r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater  rO   
get_rangesr   collectionsr   r   r   r   ro   r  r   r   FusedSchedulerNoder  rP   r   rc  graph_inputs)rg   r  r  buf_accessesr,  r   r   r  buf_byte_accessesr  r   	buf_bytesrH  r?  r5  r:  r)  s   `           @@@@@ri   r   0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233Id566:II{<
 <
 It677499b&7&788		%%||%%BBC I	= dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I0.7(+!+y8+c 'f ! rl   c                    U R                   c  g U R                   R                  5       nUc  g [        U5      nUc  g [        R                  R
                  R                  USS9n[        S   S==   U-  ss'   U$ )Nr   r$  inductor
flop_count)r]   get_origin_noder3   rP   r   r&  r'  r   )rg   fx_nodeflopsresolved_flopss       ri   estimate_flops BaseSchedulerNode.estimate_flops0  su    99))++-?w'=))33EA3F\*n<*rl   c                T    U R                   b  U R                   $ U R                  5       $ re   )r   _get_estimated_runtimerp   s    ri   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime@  s)    **6222**,,rl   c                ,   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       a  [        U R                  [        R                  5      (       d   e [        R                  (       av  [        U 5      n[        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ [!        U 5      nUc  [#        U R                  5      nUR%                  X6S9  U$ [#        U R                  5      $ [/        U R                  5      (       a  g[1        U 5      nUb  U$ UR                  R3                  5       n	 [5        5       n
[7        U	5      S-  nU
S::  a  [9        SU
 35      eUS::  a  [9        SU 35      e U R=                  5       nUS:X  d  Uc  U R?                  5       U
-  nUS-  nU$ SnU R?                  5       nUc  SOUnX-  U-  S	-  nX-  n[A        UU5      nUS-  nU$ ! [&         a  n[(        R+                  U5         SnAgSnAf[,         a  n[(        R+                  U5         SnAgSnAff = f! [:         a     gf = f)
z3
Returns estimated op runtime in milliseconds (ms)
r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r+  )!r  r   r]   r   rJ   r4   rH   r   r%   IRNoder#   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr,   r+   	set_value
ValueErrorr   r  	TypeErrorrM    maybe_estimate_runtime_benchmarkmaybe_get_dtyperE   rC   AssertionErrorr   r_  r  max)rg   r   rz   	cache_keycache	cache_valmseretdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     ri   rb  (BaseSchedulerNode._get_estimated_runtimeF  s   
 nnq!--/2))+of-.. ##dii3333LL I$ OI68E %Y 7I ,))U;;;;((HNBz=diiHOOIO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%< }-#X	o    :  		sD   AH3 63H3 *H3 A J 3
J=IJ$I>>J
JJc                    g re   r   rp   s    ri   get_template_node#BaseSchedulerNode.get_template_node      rl   c                0    U R                  5       nUc   eU$ re   r  )rg   templates     ri   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rl   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7fre   r  )r+  ir  s      ri   r-  CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     P,<DAaa,<s   *	*Nr    )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        ri   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epilogue  sH     PIe,<PP.)-!+-.00rl   )r   r   r   r   r]   r   r   r   r\   r   r   )r\   r[   r   r   )r]   ir.Operationr   r   r   )r   z	list[str]r   r"  r.   r#  r.   r   r   r3  dict[str, str]r   r   )r,  r-   r   r   r   )rC  r   r   r   rM  OrderedSet[str]rI  r  r   r   r   r  r  dict[str, BaseSchedulerNode]r   r   r   rJ  )r   zSequence[SchedulerBuffer])r  r   r   rY   r   r  zdependencies.Depr   r   T)r  rG   r  r   r   r   r   )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int]r   z
int | None)r   rm  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])=ry   r   r   r   r   r   r   r   r   r   r   r
  r  r  r$  r4  r9  r?  r1  rO  rR  rZ  rL  rB  r{  r  rf   r  r@   r  r  r  r'   r  r   r  r   r  rJ   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r_  rc  rb  r  r  staticmethodr  r   r   rl   ri   rT   rT      s}   BB(('' NN''266
34B*2+#
!.7	
M>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X  - U Un
 1&1	S1 1rl   c                 R    [         R                  R                  R                  5       $ re   )r  r  	codecache
LocalCacher   rl   ri   rk  rk    s    ??$$//11rl   c                  ^ [        U R                  SS5      nU R                  R                  nU R                  R                  / UQU R                  R                  QU R                  R
                  5      nU R                  R
                  n[        R                  " X#45      u  pESS jm[        U4[        U4S jU 5       5      -   5      nU$ )Npython_kernel_namer  c                    [        U [        R                  5      =(       a    [        U [        R                  5      (       + $ re   )r   r%   rh  GeneratorStater  s    ri   _is_tensor_ir@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPrl   c              3  t   >#    U  H-  nT" U5      (       a  [        UR                  5       5      OS v   M/     g 7fre   )tupler  )r+  ar  s     ri   r-  <get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>  s+     U9a}Q'7'7ajjl#TA9s   58r   )
r   r]   inputsfill_non_provided_argsconstant_argsr   pytreetree_flattenr   r  )snoder  r   r   	flat_argsflat_args_pytree_specru  r  s          @ri   rj  rj    s     -A2F::D::,,*$*))*

D ZZF'-':':D>'J$IQ 	
U9U
U	VI rl   c                   [        U [        5      (       d  g [        R                  R                  R
                  [        R                  R                  R                  [        R                  R                  R                  S.n[        U R                  SS5      nX!;  a  g [        U R                  [        R                  5      (       d  g X   $ )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  r  )r   rK  r  opsatenmmbmmaddmmr   r]   r%   ExternKernel)r  mms_fnsr  s      ri   _get_mm_like_fnr    s    e677"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//22&&rl   c                `  ^ ^^	^
^ S m	S n[         R                  (       a  [        T 5      nUc  g Um	U U4S jnOg [        T 5      n[	        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ SSKJ	m  U" 5       u  mm
SSK
Jn  U" UU	U
4S j5      nUR                  X7S9  U$ )Nc                    > T" T 5      $ re   r   )r  snode_args_kwargss   ri   r   2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    !25!9rl   r    )r  r   )do_benchc                    > T" T 0 TD6$ re   r   )r   bench_fnr   s   ri   r   r    s    (D3F3rl   rf  )r"   !runtime_estimations_mms_benchmarkr  rj  rk  rl  r   rm  utilsr  triton.testingr  rn  )r  args_kwargs_fnmm_fnru  rv  rw  r  rx  r   r  r   r  s   `       @@@@ri   rq  rq    s    HN//&=99%@I&(EY'I)U++++(!#LD&'	3	4B	OOIO(Irl   c                  P    \ rS rSr% / SQrS\S'   S\S'   SS jrSS jrSS	 jrS
r	g)	WhyNoFusei  name1name2reasonr   r   r  ztuple[Any, ...]r   c                X    UR                  5       U l        UR                  5       U l        g re   )rf   r  r  rg   node1node2s      ri   r   WhyNoFuse.__init__  s    ^^%
^^%
rl   c                F    Xl         X l        [        R                  U 5        g re   )r  r   
fusion_logdebug)rg   r  r   s      ri   __call__WhyNoFuse.__call__  s    	rl   c                p    SU R                    SU R                   S3U R                  U R                  -  -   $ )Nzcannot fuse z with rt   r  rp   s    ri   __str__WhyNoFuse.__str__  s6    djj\

|2>KK$))#
 	
rl   )r   r  r  r  Nr  rT   r  rT   r   r   )r  r   r   r   r   r   r   )
ry   r   r   r   	__slots__r   r   r  r  r   r   rl   ri   r  r    s#     5IK
&

rl   r  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )r   r       )	r   r   setsortedr   pprintr|   textwrapr   )objr   s     ri   r|   r|     sU    #
C())Sc"^^C*Fv~HOOFG4566Mrl   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r   i  c                &    [        U/5      U l        g re   rl  r8  s     ri   r   OutputNode.__init__  s    ",cU"3rl   c                    gr!  r   rp   s    ri   r  OutputNode.is_reduction   r  rl   c                    g)Nr   r   rp   s    ri   r   'OutputNode.get_inputs_that_alias_output#  r  rl   c                    g)NOUTPUTr   rp   s    ri   rf   OutputNode.get_name&  s    rl   )r   N)r,  r/   r   r   r   r   r   )
ry   r   r   r   r   r  r   rf   r   r   r   rl   ri   r   r     s    4 Hrl   r   c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r    c                   > [        U [        5      (       aI  TU R                     R                  5       nTTU   R	                  5          S:  nTU   T:H  nU=(       d    U$ g)Nr   F)r   r0   ro   rj   rf   )r,  rq  is_redundantis_self_depr   name_to_dep_countr  r]   s       ri   rr  +_prune_redundant_deps.<locals>.should_prune@  se    c7##!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.rl   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fre   r   ru  s     ri   r-  (_prune_redundant_deps.<locals>.<genexpr>L  s      .,s2C.rw  Nrx  )rQ  r   r   r   r0   ro   rj   rf   r   r1  r   ry  )r]   r  r   r,  rq  deps_to_pruner  rr  s   ```   @@ri   r~  r~  ,  s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '

 
  .. M "&"9"9M"IT--::=IJ rl   c                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
rK  iU  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g re   superr   r   r1  get_read_writesrg   r\   r]   	__class__s      ri   r   "ExternKernelSchedulerNode.__init__V  5    #T"T1134rl   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = r  )rf   r   r]   rp   s    ri   r   )ExternKernelSchedulerNode.debug_str_extra[  s*    --/"/'$))EY[_2`1abbrl   c                    gNTr   rp   s    ri   r  #ExternKernelSchedulerNode.is_extern^  r  rl   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nr  )r]   r   r  rp   s    ri   r  *ExternKernelSchedulerNode.has_side_effectsa  s6    yy$$$tyy"45V$)):T:T:VVrl   r   r\   r[   r]   r  r   r   r   r   )
ry   r   r   r   r   r   r  r  r   __classcell__r  s   @ri   rK  rK  U  s    5
cW Wrl   rK  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  if  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g re   r  r  s      ri   r   NopKernelSchedulerNode.__init__g  r  rl   r   r  )ry   r   r   r   r   r   r  r  s   @ri   r  r  f  s    5 5rl   r  c                    ^  \ rS rSr% SrS\S'   S\S'         SU 4S jjr  S      S!S jjr  S      S"S	 jjr      S#S
 jr	S$S jr
      S%S jrS&S jr      S'S jrS(S jrS)S jrS*S jrS*S jrS*S jrS+S jrS,S jr    S-S jrS.S jr S/   S0S jjr\S1S j5       r\S1S j5       rS2S jr\S3S j5       r\S*U 4S jj5       rSrU =r$ )4r  im  zi
A SchedulerNode is a node for scheduling that encapsulates either
a ComputedBuffer or a TemplateBuffer.
z tuple[Sequence[sympy.Expr], ...]_sizesr9   _bodyc                f   > [         TU ]  U5        U R                  U5        U R                  5         g re   )r  r   r   _compute_attrsr  s      ri   r   SchedulerNode.__init__v  s,    
 	#T"rl   c                   [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        nX0l        U R                  R                  5       nU R                  R                  U5      R                  nXE" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer)  )r   r]   r%   ComputedBufferTemplateBuffersimplify_and_reorderr   r!  get_device_or_errorr\   get_backendgroup_fnr   r"   loop_ordering_after_fusionrJ   rx   r1  extract_read_writesr$   )rg   r'  r(  bodyr  r/  should_normalizes          ri   r#  SchedulerNode._compute_attrs  s;   
 $))b&7&79J9J%KLLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hrl   c                $    U R                  UUS9  g )Nr&  )r#  )rg   r'  r(  s      ri   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rl   c                   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      R                  U R                  5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R!                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7fre   )r   r0   r/   r*  s     ri   r-  5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s$      0
1CZgwEW5XCC1s   .	.r)  r    SIMDScheduling)r   r   r   r1  r$   r1  r!  r   r7  r2  r   pointwise_read_writesclear_cachecodegen.simdr<  candidate_tilingscache_clear)rg   r)  need_clear_tiling_cache	fake_depsr<  s        ri   refresh_dependencies"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rl   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr)  rB  )r!  reorder_iter_loopssizesr   rD  )rg   	new_orders     ri   apply_new_loop_order"SchedulerNode.apply_new_loop_order  sA    ZZ22

 jj&&!!E4!Prl   c                   [        U R                  [        R                  [        R                  45      (       d   eU R
                  R                  X5      U l        U R
                  R                  U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        U R                  SSS9  g )NTrG  )r   r]   r%   r*  r+  r!  #expand_dimension_for_pointwise_noderI  r   r-  r\   r.  r/  r   rD  )rg   	dimension	new_ranger  r/  s        ri   rN  1SchedulerNode.expand_dimension_for_pointwise_node  s     $))b&7&79J9J%KLLLLZZCC

 jj&&..0>>--f5>>ht{{34
 	!!D$!Orl   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFrG  )r!  merge_loopsrI  r   rD  rp   s    ri   rS  SchedulerNode.merge_loops  s<    ZZ++-
jj&& 	!!D%!Prl   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g[        R                  SU R                  5       5        g)Nr   r    z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r   r~   num_varsdecide_loop_order_to_matchr&   num_loop_reorderingloop_ordering_logr  rf   rK  )rg   r"  r#  rJ  
self_sizess        ri   r$  'SchedulerNode.reorder_loops_by_dep_pair  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rl   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r    z	.sizes = z
_layout = zclass z_loop_body:r  r   )rf   r   r   r   r0  r   r0   ro   rP   r   r=  r%   rB  r  r|   rz   r!  r9   r  r   r   r]   rd  r
  join)rg   ro   linesr,  r  r   s         ri   r   SchedulerNode.debug_str_extra  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyrl   c                    U R                   $ re   )r   rp   s    ri   rP  SchedulerNode.get_ranges      {{rl   c                    [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      $ Nztype(self.node)=)r   r]   r%   r*  r+  rx   r   r  rp   s    ri   r  SchedulerNode.is_reduction  s^    $))b&7&79J9J%KLL 	
tDII !	
L DII00233rl   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ rd  )r   r]   r%   r*  r+  rx   r  	SplitScanrp   s    ri   r  SchedulerNode.is_split_scan"  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rl   c                J    [        U R                  [        R                  5      $ re   r   r]   r%   r+  rp   s    ri   r  SchedulerNode.is_template*  s    $))R%6%677rl   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ re   rj  rp   s    ri   r  SchedulerNode.get_template_node-  s'    &tyy"2C2CDDtyyN$Nrl   c                f    U R                  5         U R                  5         U R                  U5        g re   )r  rR  r  )rg   
index_varss     ri   runSchedulerNode.run0  s#    ""$Z rl   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ re   )	r   r  mapr~   dictziprX  rY  from_iterable)rg   ro  rI  
var_rangess       ri   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars5  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rl   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f! [         a"    [        R                  SU R                  5        e f = f)a  
Generate code for this node using the provided index variables.

This method sets up the appropriate context for code generation, including
simplifying indexing expressions based on the variable ranges, and then
calls the node's body function with the index variables.

Args:
    index_vars: A sequence of sequences of sympy expressions representing
                the index variables for each dimension of the computation.
NzError in codegen for %s)rx  rP   set_ops_handlerr>   get_ops_handlerr   set_current_noder!  r   r   fatalr]   )rg   ro  rw  s      ri   r  SchedulerNode.codegenB  s     00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r   reversedr$   r1  r!  sympySZeror~   )rg   	pointwise
keep_sizesignore_sizess       ri   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writesY  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
rl   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
Tr  r  rp   s    ri   r=  #SchedulerNode.pointwise_read_writesd  s    
 666FFrl   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr  r  rp   s    ri   reduction_read_writes#SchedulerNode.reduction_read_writesk  s    
 666GGrl   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7fre   )r{   r  s     ri   r-  ,SchedulerNode.can_inplace.<locals>.<genexpr>u  s     ?,>S  ,>r  r    ztype(write_dep)=)r  r>  r   r~   r   r   r   r$   r.   r  iterrx   indexsize)rg   r  	write_deps      ri   r  SchedulerNode.can_inplacer  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXrl   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_add   r  ro      r    r  )r   r   r!  r9   r  rh   r  r   r~   r   rb  )rg   buffers_store_as_atomic_addr]   s      ri   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*rl   c                |   > U R                   b!  U R                   R                  S5      (       a  g[        TU ]  5       $ )Ndevice_assert_asyncT)r!  has_opr  r  rg   r  s    ri   r  SchedulerNode.has_side_effects  s5     ::!djj&7&78M&N&Nw'))rl   )r!  r   r   )r\   r[   r]   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r'  *Optional[tuple[dict[Any, Any], list[Any]]]r(  zOptional[Callable[_P, _T]]r   r   )r'  r  r(  zOptional[Callable[..., Any]]r   r   )r)  r   rB  r   r   r   )rJ  Sequence[int]r   r   )rO  r   rP  r   r   r   r   r  r   )r   Sequence[Sequence[sympy.Expr]]r   r  )ro  Sequence[sympy.Expr]r   r   )ro  r  r   zdict[sympy.Expr, sympy.Expr])ro  r  r   r   r  )r  r   r   r   )r   r   r  r  ) ry   r   r   r   __doc__r   r   r#  r6  rD  rK  rN  rS  r$  r   rP  r  r  r  r  rp  rx  r  r  r@   r=  r  r  r  r  r   r  r  s   @ri   r  r  m  s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QPP),P	P"
Q!.7	, ,4
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rl   r  c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fre   ro   r  )r+  r,  group_snodes     ri   r-  2refresh_group_node_dependencies.<locals>.<genexpr>  s/      
Pxx{;;== CP   "2	2)
r3  r1  r$   
ReadWrites
merge_listr   r   unionr   r   )r  r3  r  s   `  ri   refresh_group_node_dependenciesr    s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8r[   c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fre   r   r+  r  s     ri   r-  "init_group_node.<locals>.<genexpr>       H5G5Gr/  c              3  8   #    U  H  oR                   v   M     g 7fre   )r   r  s     ri   r-  r    r  r/  )r   rR  GroupedSchedulerNoder3  r\   r]   r   r  r   r  rD  r   rt  r   r   rf   r   )r  r\   r3  r  r   s        ri   init_group_noder    s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      S S j5       r\S!S j5       r	      S"S jr
S#U 4S jjr\S$S	 j5       rS$S
 jr\S%S j5       rS&S jrS$S jrS$S jr      S'U 4S jjr\S%S j5       r\S%S j5       rS(S jrS$S jr\S)S j5       r\S)S j5       r\S)S j5       r\S*S j5       rS+S jr\S)S j5       rS,S jrS-S jrS.S jrS$S jr \S)U 4S jj5       r!Sr"U =r#$ )/rR  i  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
r  r3  c           	        UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       Ga  [        U[
        5      (       Ga  [        UR                  [        5      (       d   e[        UR                  R                  5      S:X  d   e[        [        [        UR                  R                  5      5      [        5      (       d   e[        [        UR                  R                  5      5      R                  nUR                  5        Vs/ s H  oDR	                  5       (       d  M  UPM     nn[        U5      S:X  d   eUS   n[        UR                  R                  5      S:X  d   e[        [        UR                  R                  5      5      n[        U[         5      (       d   e[#        [!        X7R$                  UR&                  UR(                  UR*                  5      /5      UR                  l
        O[        U[        [        45      (       d   e[-        [.        R0                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ s  snf )Nr    r   )r\   r   r  rR  r  rK  r]   r6   r~   r   r   r  r  r/   ro   r  r.   r   r  	var_namesr  r  r   rX  rY  )	clsr  r  ro   r]   template_nodesr  writer  s	            ri   fuseFusedSchedulerNode.fuse  s    %//111%-1C!DEEEE:e5N#O#O ejj+6666u((//0A555d4(9(9(@(@#ABGLLLLU..5567<<D/4/@W/@tDTDTDVd/@NW~&!+++*1-M}00778A===m77>>?@EeY////'1kk5??EJJ

(E$ em5G%HIIIIY__U__%68IJK5??E**! Xs   ,JJc                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fre   r  r  r_  r  s     ri   r-  4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  =       0''))T^^-= *D'')) 0
   .AAr   r   filterr  r~   r  rg   fpsrz  s      ri   r_  !FusedSchedulerNode.estimate_flops  K      $ 0	
 s8q=#h
rl   c                   U R                  5       (       a  gSnU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          gUR
                  S   nMj     SnUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g[        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g)	z0
Return true if a loop reordering is performed.
FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr    z-Reorder loops for fused node %s with order %sT)r  r3  r   r  r  r   rY  r  r~   rV  rW  rf   r&   rX  rK  r  )rg   r"  r#  rZ  r  rJ  s         ri   r$  ,FusedSchedulerNode.reorder_loops_by_dep_pair   sK    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-rl   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ re   )r   r  r  s    ri   r   -FusedSchedulerNode.__init__.<locals>.<lambda>-  s    s1>>3C/Drl   r  )r  r   r  rb   rt  r   )rg   r\   r3  r  s      ri   r   FusedSchedulerNode.__init__)  s6    #0%'
%DEKK
rl   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r]  r3  rf   rg   r  s     ri   rf   FusedSchedulerNode.get_name/  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ Nr   r3  rf   rp   s    ri   r  !FusedSchedulerNode.get_first_name3      {{1~&&((rl   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf re   r   r  r3  r  r  s     ri   r  #FusedSchedulerNode.get_buffer_names6  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ re   r3  rd  r   rg   r   r]   s      ri   r   FusedSchedulerNode.get_outputs:  /    (*KKDMM$**,-  rl   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r   r  )r  r3  rf   r   r]   rd  r
  r  r   r]  r  )rg   r  r]   r^  s       ri   r   "FusedSchedulerNode.debug_str_extra@  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r3  r  )rg   r]   
snodes_strs      ri   r  "FusedSchedulerNode.debug_str_shortK  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g re   )r  rO  r   r  r3  updater   )rg   rM  rI  r]   r  s       ri   rO  !FusedSchedulerNode.set_last_usageO  sQ    
 	2G 0:|T[[)D 3H&&t7 *rl   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf re   )r   r  r3  rZ  r  s     ri   rZ  $FusedSchedulerNode.used_buffer_names\  s0    !MA"5"5"7!MNN!Mr  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf re   )r   r  r3  rL  r  s     ri   rL  /FusedSchedulerNode.used_or_aliased_buffer_names`  s5    8<D1,,.D
 	
Dr  c                    U R                   $ re   r8  rp   s    ri   r  FusedSchedulerNode.get_nodesf  rb  rl   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r   r   rp   s    ri   r   FusedSchedulerNode.__repr__i  s'    t*%%&gdmmo->a@@rl   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   )r  r  s     ri   r-  2FusedSchedulerNode.is_reduction.<locals>.<genexpr>n  s     9[>>##[r  r>  r3  rp   s    ri   r  FusedSchedulerNode.is_reductionl  s    9T[[999rl   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   )r  r  s     ri   r-  3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  s     :k??$$kr  r  rp   s    ri   r   FusedSchedulerNode.is_split_scanp  s    :dkk:::rl   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   r  r  s     ri   r-  1FusedSchedulerNode.is_template.<locals>.<genexpr>v  s     8Kq==??Kr  r  rp   s    ri   r  FusedSchedulerNode.is_templatet  s    8DKK888rl   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g re   )r3  r  r  rg   r]   s     ri   r  $FusedSchedulerNode.get_template_nodex  s3    KKD!!--//   rl   c                     U R                   S   $ r  )r   rp   s    ri   r   FusedSchedulerNode.get_device  s    zz!}rl   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   )r?  r  s     ri   r-  >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//r  r  rp   s    ri   r?  +FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErl   c                    [         ere   NotImplementedError)rg   r3  s     ri   r4  'FusedSchedulerNode.update_mutated_names      !!rl   c                    [         ere   r  )rg   ro   s     ri   r9  FusedSchedulerNode.add_fake_dep  r  rl   c                    [         ere   r  r  s     ri   r  FusedSchedulerNode.can_inplace  r  rl   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r   ru   c              3  L   #    U  H  n[        U5      R                  v   M     g 7fre   )rx   ry   r  s     ri   r-  /FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     F+QQ 0 0+s   "$rt   r   r   r   r   r   r   z.outputs = [
            Nrv   r   Tr   )rf   r]  r3  rG   r   rx   ry   r|   r   r   r   r   r   r   r   rw   r   r   r   r  r   r  )rg   ro   node_typestrr   r  s        ri   r   FusedSchedulerNode.debug_str  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F)c                r   > U R                   b  [        S U R                    5       5      $ [        TU ]  5       $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fre   )r  r  s     ri   r-  6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>  s     G;4,,..;r  )r3  r>  r  r  r  s    ri   r  #FusedSchedulerNode.has_side_effects  s0    ;;"G4;;GGGw'))rl   )r   rb   r  rT   r  rT   r   rR  r  r  )r\   r[   r3  r  r   r   r   r  r   zlist[SchedulerBuffer]r  r  r   r  )r   torch.devicer  )ro   r-   r   r   r  )$ry   r   r   r   r  r   classmethodr  r@   r_  r$  r   rf   r  r  r   r   r  rO  rZ  rL  r  r   r  r  r  r  r   r?  r4  r9  r  r   r  r   r  r  s   @ri   rR  rR    s    $#+%+.?+	+ +B  "'!'.7'	'RL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""*4 * *rl   rR  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g re   )r   rf   read_to_node)rg   producerr   s      ri   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for  sG     '')C||~!2!22((88 * rl   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr    )r   rT   r   r   ro   r\   r   rj   name_to_noderb  r~   r  r  )rg   consumer	producersrd	node_names        ri   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((rl   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7fre   )r\   can_fuse)r+  lrr3  s      ri   r-  6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  typingcastr0  r~   r3  r  ru  r  r4  r\   rA  r=  rs  )r  r3  r9  whyforeach_matchconsumer_subnodeproducer_subnodes    `     ri   rA  #ForeachKernelSchedulerNode.can_fuse  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
rl   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  rF  rG  r0  rN  rQ  ru  r3  rR  r  r=  r  r4  rs  r\   )r  r3  r9  rN  rQ  rO  rP  rB  rC  fused_nodesrK  r]   new_noderJ  s                 ri   r  ForeachKernelSchedulerNode.fuse  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     T R                   VVVs0 s H(  oR:                  R=                  5         H  u  pX_M	     M*     snnnT l        UT l        US   RA                  5       nU(       d   eU[B        RD                  " S5      444T l#        [         [H        RJ                  RL                     " 5       T l'        UT l(        g s  snnnf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fre   r  rj  s     ri   r-  6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>f  s5        xxt'<'<'>>	 C r  r   combo_kernel))r2  r8  r  r   r   r   ro   r  r\   r3  r]   rb   r1  r$   r  r  r   r  r   r   rD  r   rt  r   r  r   r0  r   r  r   itemsrN  r   r  Exprr   r  fxNoder	  rQ  )rg   r\   r3  rN  rO  rP  rQ  r]   r  ro   foreach_node
other_noder  rH  vr  r  s   `               ri   r   #ForeachKernelSchedulerNode.__init__D  s    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 #'++@"-:O:O:U:U:W$!:W+@D  *C&%%'v

> :<>?
!%((--02.@s   /Lc           	     x   U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H"  n[        U[        [        45      (       a  M   UPM$     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     nnU(       a   [        R                  S[	        U5      U5        U Vs/ s H  o"U;  d  M
  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)
r   rK  r   r  r~   r]   r   r  r0  r  )r  r  r  externr]   filtered_nodesforeach_nodesr  s           ri   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes  s|    #OUj4M&N!UOIIAF5;UVTyy(&&(VU 
a"8:S!TU  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GIIBN#
 &4O^7N!^O9 P
 V




 H PsR   FFF#FF#/F#;F(F(
F-'F-3F2F2	F7F7c           
         U R                  5       n/ nSnU H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXEXS-    PM     sn5        M?     U$ s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   r   )_topological_sort_nodesrd  ranger~   )r\   sorted_nodesgrouped_nodesmax_num_nodesr  r  s         ri   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  ss     !88:!E   #1c%j-@@ a/0@ " s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g re   r0  rq  )custom_group_algorithms    ri   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#Drl   c                ,    [         R                  U 5      $ re   rs  r\   s    ri   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVrl   c                    [         ere   r  rp   s    ri   rR  #ForeachKernelSchedulerNode.mark_run  r  rl   c                    [         ere   r  rp   s    ri   r  "ForeachKernelSchedulerNode.codegen  r  rl   c                    gr  r   rp   s    ri   r  %ForeachKernelSchedulerNode.is_foreach  r  rl   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r   r3  rp   s    ri   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DKK  rl   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7fre   )r  r  s     ri   r-  7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s     1UA++--r  )r   rX  rY  rv  r3  rp   s    ri   r  $ForeachKernelSchedulerNode.get_nodes  s(     IOO111U1UUVVrl   c                <    U R                   S   R                  5       $ r  )r3  r  rp   s    ri   r  )ForeachKernelSchedulerNode.get_first_name  s    {{1~,,..rl   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g re   )r~  r\   r   r3  r  )rg   r  r]   s      ri   r  /ForeachKernelSchedulerNode.prune_redundant_deps  s5     	d8R8RSKKD%%&89  rl   )r   rQ  r   r   r   r8  r]   r	  r   r2  r\   r3  r   rN  rb   )r3  rT   r   r^   )r9  rT   r   r^   r3  rT   r9  rT   r   r   )r3  rT   r9  rT   r   r0  )NNF)r\   r[   r3  r  rN  r   rO  r^   rP  r^   rQ  r   r   r   r  r  r   r  )r\   r[   r   list[list[BaseSchedulerNode]])rt  rp  r   r   r   r   r   r  r  r   r  )ry   r   r   r   r  r4  r=  r.  rA  r  r   re  r  rn  rq  r   ru  ry  rR  r  r  r  r  r  r  r   r  r  s   @ri   r0  r0    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/ F/P +	  B 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :rl   r0  c                     ^  \ rS rSr% SrS\S'   \SS j5       r S       SU 4S jjjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jr\SS j5       rSS jr\SS j5       rSrU =r$ )r  i  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
r  r3  c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7fre   rx  )r+  r]   r\   s     ri   r-  .GroupedSchedulerNode.create.<locals>.<genexpr>  s     B64>>Y.6s   )r\   r  r  rf   )r  r3  grouped_snoder  r\   s       @ri   createGroupedSchedulerNode.create  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>rl   c                H   > [         TU ]  U5        [        XU5        X0l        g re   )r  r   r  temp_grouping)rg   r\   r3  r  r  s       ri   r   GroupedSchedulerNode.__init__	  s$     	#0 +rl   c                B   U R                   (       a  U R                  $ U R                   H)  nXR                  R                  UR	                  5       '   M+     U R                  R                  U R	                  5       	 U R                  R                  U R                  5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)r  r3  r\   r  rf   
fuse_nodes)rg   r  s     ri   unpackGroupedSchedulerNode.unpack  so    
 ;;[[EBGNN--enn.>? !NN--dmmo>~~((55rl   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g re   )r1  r   r7  r   rb  )rg   fake_deps     ri   r9  !GroupedSchedulerNode.add_fake_dep%  s5    T--77AB##H-rl   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf r  r  r  s     ri   rf   GroupedSchedulerNode.get_name)  r  r  c                <    U R                   S   R                  5       $ r  r  rp   s    ri   r  #GroupedSchedulerNode.get_first_name-  r  rl   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf re   r  r  s     ri   r  %GroupedSchedulerNode.get_buffer_names0  r  r  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ re   r  r  s      ri   r    GroupedSchedulerNode.get_outputs4  r  rl   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fre   r  r  s     ri   r-  6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>@  r  r  r   r  r  s      ri   r_  #GroupedSchedulerNode.estimate_flops:  r  rl   c                    U R                   $ re   r8  rp   s    ri   r  GroupedSchedulerNode.get_nodesL  rb  rl   c                    gr!  r   )r  r3  r9  s      ri   rA  GroupedSchedulerNode.can_fuseO  r&  rl   )r  )r3  r  r   r  F)r\   r[   r3  r  r  r   r   r   r  )r  r-   r   r   r   r  r,  r  r  r  )ry   r   r   r   r  r   r.  r  r   r  r9  r@   rf   r  r  r   r_  r  rA  r   r  r  s   @ri   r  r    s     $#  $	++ (+ 	+
 
+ +6. = =) N N  "  rl   r  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr    c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr   Nr   r+  sl_asl_bs      ri   r-  5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>l  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr  r   r  s      ri   r-  r  o  r  r  r  )rA   absr  ru  )	r  bslstride_len_astride_len_ba_firstb_firstrI  stride_lengthss	          ri   	index_cmp"pick_loop_order.<locals>.index_cmp_  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )r  r   r  r   r   r   )		functools
cmp_to_keyr   r  rj  r~   r"   pick_loop_orderssort)r  rI  priority_idxr  orderpis   ``    ri   pick_loop_orderr  U  s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri  $Union[BaseSchedulerNode, OutputNode]r]   Fr   r  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ re   )rn   r]   rf   r  r  rp   s    ri   rq   NodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrl   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ re   )r   r  rf   r  r  rg   others     ri   __eq__NodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rl   c                6    U R                   R                  5       $ re   r   rp   s    ri   rf   NodeUser.get_name  r   rl   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ re   )r]   r  r  r  r  s     ri   r   NodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rl   r   Nr   )r  objectr   r   r   )r  r  r   r  )ry   r   r   r   r   r  r  rq   r  rf   r   r   r   rl   ri   r  r    s3    
..K GTL
$
rl   r  c                 "    [         R                  $ re   )r"   r  r   rl   ri   *used_non_deterministic_runtime_estimationsr    s    333rl   c                    ^  \ rS rSrSrSOS jrSOU 4S jjrSPS jr\SQS j5       r	\	R                  SRS j5       r	SSS jrSTS	 jrSUS
 jrSSS jrSSS jrSSS jrSSS jr    SVS jrSWS jrSXS jrSSS jrSSS jrSVS jrSSS jr    SYS jr SZ       S[S jjr      S\S jrSSS jrS]S jr      S^S jrS_S jr    SVS jr SZS`S jjr!SaS jr"    SbS  jr#      ScS! jr$      ScS" jr%        SdS# jr&      ScS$ jr'        SeS% jr(      SfS& jr)SgS' jr*        ShS( jr+      SiS) jr,ScS* jr-      ScS+ jr.        SjS, jr/SkS- jr0SlS. jr1      SfS/ jr2    SmS0 jr3    SnS1 jr4SSS2 jr5SSS3 jr6SSS4 jr7SoS5 jr8SpS6 jr9SqS7 jr:SrS8 jr;      SsS9 jr< St     SuS: jjr=  SvS; jr>    SwS< jr?      SxS= jr@      SyS> jrA    SzS? jrB    SVS@ jrC    SVSA jrD    SVSB jrE  S{SC jrFSSSD jrG      S|SE jrH      S}SF jrI      S~SG jrJSSSH jrKSaSI jrL    SSJ jrMSSK jrNSSL jrOSSSM jrPSNrQU =rR$ )r[   i  z
A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
optimizations such as fusion, reorder, and graph partition.
c                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initrg   r  s     ri   r   Scheduler.__init__  s#    ./JJu 0//s   '
5c           
     \  >^  [         TT ]  5         T [        R                  l        0 T l        [        [        5      T l        [        R                  " 5       T l        [        5       T l        [        / [        R                  R                  R                  5       Q[        R                  R                   R                  5       Q[        R                  R"                  R                  5       Q5      T l        U Vs/ s H  nT R'                  U5      PM     snT l        S T l        T R-                  5         T R$                  R/                  [        R                  R                   R                  5       5        T R(                   H  nUR1                  5         M     S T l        T R5                  5       T l        T R(                   Vs0 s H  o"R9                  5       U_M     snT l        T R(                   VVs0 s H*  o3R=                  5         H  oDR9                  5       U_M     M,     snnT l        T R:                  RA                  5       T l!        0 T l"        0 T l#        [H        RJ                  " T R(                  T R>                  T RB                  5      T l        T RM                  5         T RO                  T R(                  5      T l        T RQ                  5         T R(                   Vs0 s H  o"R9                  5       U_M     snT l!        T RS                  5         [T        =RV                  [Y        T R(                  5      -  sl+        SSK-J.nJ/n  U" T R(                  5        [Y        T R(                  5      T l0        T Rc                  5         T RO                  T R(                  5      T l        [        [d        [f        [f        4      " 5       T l4        [j        Rl                  b%  [j        Rl                  " T R(                  5      T l        T Ro                  T R(                  5      T l        [j        Rp                  b%  [j        Rp                  " T R(                  5      T l        T Rs                  5         T Ru                  5         [j        Rv                  (       a#  [y        SSSS9   T R{                  S S9  S S S 5        [j        R|                  (       a  SSK?J>n  U" T R(                  T R>                  T RB                  [        [        R                  R                  R                  5       5      [        [        R                  R                  5       5      5      T l        [j        R                  (       a  [j        R|                  (       d#  SS	K?JBn  U" T R(                  T R>                  5        [        5       (       a-  [        R                  (       a  SS
K$JFn	  U	" T R(                  5        SSKGJHn
  U
" SS U 4S jS9  [H        R                  " T R(                  5      T l        T R                  5         [        R                  Rj                  R                  (       as  [        R                  Rj                  R                  R                  (       a@  T R                  T R(                  5      T l        T R                  T R(                  5      T l        T R                  5         [        R                  Rj                  R                  R                  (       a  T R                  5         U" T R(                  5        [        R                  R                  T R(                  5        T R                  5         [        5       T lY        0 T lZ        [        S5      R                  U 4S j5        g s  snf s  snf s  snnf s  snf ! , (       d  f       GN= f)Nr   )log_ir_post_fusionlog_ir_pre_fusionz#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodesr    )reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     SSS.$ )N#scheduler_nodes_before_comm_overlapstring)ro   encodingr   r   rl   ri   r   !Scheduler._init.<locals>.<lambda>4	  s    A (%rl   c            
        > SR                  [        TR                  5       V Vs/ s H0  u  pSU  S3UR                  5       -   SUR	                  5        3-   PM2     snn 5      $ s  snn f )Nz

zsnode[rv   z buffer_names:)r]  r  r  r   r  )r  r  rg   s     ri   r   r  8	  so    6;;
 %.djj$9	 %:DA !1++-(*1+=+=+?*@AB %:	$s   7A$
)metadata_fn
payload_fngraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr~   r  rp   s   ri   r   r  \	  s%     33+/+>+>*-djj/rl   )]r  r   rP   r   r\   backendsr  _post_grad_graph_counterr  rX  count_graph_partition_counterr   r  rS  keys	constantstorchbind_constantsri  create_scheduler_noder  current_nodeupdate_zero_dim_cpu_tensorr  rB  default_device_contextget_donated_buffersr   rf   r8  r   r   copyr  rI  r   r!   decide_global_ordering_of_commsrS   topological_sort_scheduledead_node_eliminationcompute_ancestorsr&   ir_nodes_pre_fusionr~   torch._inductor.debugr  r  r   create_foreach_nodesr  r   logged_slow_fusionr"   _pre_fusion_custom_passr  _post_fusion_custom_passrS  finalize_multi_template_bufferscombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_names reorder_for_compute_comm_overlapr  r  r#   6runtime_estimations_align_across_all_distributed_ranksr  torch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesr  r  graph_partitiontriton
cudagraphs&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)rg   r  r  r]   r   r  r  r  r  r  r  r  s   `          ri   r  Scheduler._init  sy    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCUd003UC
9='')##**177+<+<+A+A+CDJJDOO  ?C# $$& 	# &*ZZ;
%/JJL!OZ;
 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
**688DDJ,,.5&* $
 ..D.A ))70

  ''177//44671773356DJ 2211UAJJ 0 0
 ;<< WW GtzzR7 CCDJJODJ""$ OO""22&&--88DDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
o D;
8
B #H* s$   6^>^/1^^^
^+c                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)r_   )rP   r   graph_inputs_originalr   r%   DonatedBufferr   )rg   name_to_donated_bufro   s      ri   r  Scheduler.get_donated_buffersc	  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"rl   c                6    [         R                  R                  $ re   rP   r   current_devicerp   s    ri   r8  Scheduler.current_devicen	  s    ww%%%rl   c                .    U[         R                  l        g re   r7  r  s     ri   r8  r9  r	  s    !'rl   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r    )draw_buffersT)print_graph)osenvironrG  r  r>  r  )rg   r>  s     ri   r,  Scheduler.debug_draw_graphv	  s1    ::>>:DASH+6 Irl   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r   isEnabledForloggingINFOr  r  r  )rg   labelr]   s      ri   debug_print_nodesScheduler.debug_print_nodes}	  sD    GLL))HHUE"

  " # *rl   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r   is_no_opr  r   r%   r*  r+  r  r  rK  r  r  s     ri   r  Scheduler.create_scheduler_node	  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++rl   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr    FrN  rQ  )r   r  r  rP   r   listsr   r   r8  r  r  r"   combo_kernels_autotuner0  r  r  rf   r   )
rg   removed_node_namesfe_nodeskept_node_namesnamesro   r3  rQ  fe_noder]   s
             ri   r  Scheduler.create_foreach_nodes	  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                  ^ ^$^%^&  " U$4S jS[         [           5      m$[        R                  " T$5      m%T R                   H  nUR                  5        H  nUR                  5       n[        UR                  R                  [        R                  5      (       a  [        UR                  5       5      S:  a  Me  UR                  5        He  nUT%;   aD  UT%;   a>  T%U   nT%U   nXV-   nT%R                  5        H  nT%U   UL d
  T%U   UL d  M  UT%U'   M     MM  UT%;   a
  T%U   T%U'   M]  T%U   T%U'   Mg     M     M     SU&U 4S jjm&  S         SU%U&4S jjjn	0 n
[        R                   R"                  R%                  5        H  u  p[        U[&        R(                  5      (       a  UR*                   H  nSX'   M	     M=  [        U[        R,                  5      (       d  M^  UR/                  5        Vs/ s H&  n[        U[&        R(                  5      (       d  M$  UPM(     nnU H  nUR*                   H  nSX'   M	     M     M     SnT R                   Hz  nUR                  c   e[1        UR                  R3                  5       S S	9nU H?  n[        U[&        R4                  5      (       d   eS
nX;  d  M-  UR                  5       X'   MA     M|     T R                   GH  n[6        R9                  SUR                  5        U(       a  UR                  c   e[1        UR                  R;                  S
S9S S	9nU Hi  nX;   d   U SU
 35       eX   =nc  M  T R<                  U   R                  5        H+  nUR?                  [A        UR                  5       5      5        M-     Mk     [        URB                  RD                  5      S:X  aQ  [G        [I        URB                  RD                  5      5      =n(       a"  [        U[J        5      (       a  URL                  nOSnUR                  5        GH  n[        URO                  5       5      S::  d   eURO                  5        H  nT&" U5      nU	" UU5        UR?                  [A        UUS95        T%U   R$                   H  nUR                  5       UR                  5       :X  a  M'  [        UR                  [P        5      (       d   eUR                  RS                  5        H:  nT&" U5      nUR?                  [U        UUR                  5       S95        U	" UUS
S9  M<     M     M     GM     URB                  RV                   H<  n[        U[T        5      (       a  M  U	" URX                  XR[                  U5      5        M>     UR]                  T R^                  5        UR                  5        H  nURO                  5        Hz  nUR                  5       T R^                  T&" U5      '   UR                  5       T R^                  U'   T R`                  Rc                  UU5      T R`                  UR                  5       '   M|     M     GM     [        R                   Re                  5        H4  n[6        R9                  SU5        U	" U[g        [A        U5      5      5        M6     U(       a  [        R                   Rh                   H  nUR;                  S
S9 H  nX;   d   U SU
R                  5        35       eX   =n(       d  M/  T R<                  U   RS                  5        H5  n[6        R9                  SUU5        U	" U[g        [A        U5      5      5        M7     M     M     T R^                   H  nU[        R                   R"                  ;   aF  U	" U[g        [A        U5      5      5        [        R                   Rj                  Rm                  U5        Mg  U[        R                   Rn                  ;   d  M  U	" U[g        [A        U5      5      5        M     [q        [        R                   R"                  R                  5       5       VVs0 s H  u  noU_M
     nnn[        R                   Rj                   Vs/ s H  nUU   PM
     sn[        R                   l9        T R                   HF  nUR                  5        H/  nURu                  T%UR                  5          R$                  5        M1     MH     T Rv                   H.  nT Rv                  U   Ru                  T%U   R$                  5        M0     [y        5       nUR{                  S5        T%R%                  5        Ha  u  nn UR}                  5          U R$                   V!s/ s H  n!U!R                  5       PM     n"n!UR{                  SU SU" S35        SSS5        Mc     UR{                  S5        UR                  5       R                  5       n#[        R9                  S5        [        R9                  SU#5        gs  snf s  snnf s  snf s  sn!f ! , (       d  f       M  = f)zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
c                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi	  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g re   )rY  r   
membership)rg   rY  r[  s      ri   r   :Scheduler.compute_dependencies.<locals>.DedupList.__init__	  s    
 #[b
","<
rl   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g re   )r[  rY  r  rb  )rg   	node_users     ri   r  8Scheduler.compute_dependencies.<locals>.DedupList.append	  s3    /

!!),##I.rl   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf re   )r   r  r[  rY  )rg   r  new_membershipr  	new_items	DedupLists        ri   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__	  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)rY  r[  r  )rY  zOptional[list[_T]]r[  zOptional[OrderedSet[_T]]r   r   )r^  rV   r   r   )r  DedupList[_T]r   rf  )	ry   r   r   r   r  r   r  rd  r   )rc  s   ri   rc  rY  	  s@     -17;=)= 5= 	=/< <rl   rc  r    c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ re   )r   )r  r2  rg   s    ri   r2  .Scheduler.compute_dependencies.<locals>.rename	  s,    D)))d33A677Hrl   Fc                N   > TT" U 5         R                  [        XU5      5        g re   )r  r  )used_by_namer  r  r  name_to_usersr2  s       ri   add_user0Scheduler.compute_dependencies.<locals>.add_user
  s'     &./669rl   Nc                    U R                   $ re   r)  r  s    ri   r   0Scheduler.compute_dependencies.<locals>.<lambda>#
  s    AFFrl   r  Tzscheduling %s)unbacked_onlyc                    U R                   $ re   r)  r  s    ri   r   ro  6
  s    !&&rl   z not in )r  )mutating_buf)r  zscheduling output %sz+scheduling output %s for unbacked symint %sr  'z': ru   r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   r   r   )FF)
rj  r   r  r  r  r   r  r   r   r   )Br   rV   rQ  r   r  r   rf   r   r]   rz   r%   r8   r~   r{   r  rP   r   rS  rY  r  rZ  r   	TensorBoxr  r  get_unbacked_symbol_defsSymbolr   r  get_free_symbol_usesr8  r9  r/   r   r   r  r  r.   r  r}   rT   r  r0   r   ro   r  r4  r   rI  rG  r  r   graph_outputsmutated_inputsrb  r  r  mutated_input_idxsr   r   rG   r   r   r   r  compute_dependencies_log)'rg   r]   buf1	buf1_name	buf2_namelist1list2combinedr  rl  unbacked_symbol_to_origin_nodero   valfsr(  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesrC  r   r,  	node_modealt_namer   
other_namer  r  r  r  	inp_nameslogbufrg  r_  rb   r   rc  rk  r2  s'   `                                   @@@ri   rS   Scheduler.compute_dependencies	  s   	< 	<> @K?V?V@
 JJD((* MMO	 tyy//??D,,./!3!%!1!1!3I M1i=6P -i 8 -i 8#(=#0#5#5#7C -c 2e ;#0#5#>5=c 2 $8 #m33@3Ki03@3Ki0 "4 + :	 	 !&!			;	 	 		
 	 	 MO&
 --335ID#uzz****B9=26 +C.. (+||~S~!Auzz9RA~S!Ann=A6: - " 6 ',#JJD99((( $*		224:J$  *!!U\\2222 /3+:8<25 *   JJDIIotyy1*yy,,,'-II222F(($
 .A> #X&D%EF> <>>K#'#4#4Q#7#C#C#EC --gclln.EF $F . D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG*.))*D*D*FJ)/
);J -- '
 P %ZtD +G !> !4 *, ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *u F 002HII,h7Xz'(*;<= 3
 'ww,,111EA> #X&D&I&I&K%LM> ;==q=(,(9(9!(<(M(M(OHII M ( !
 %Xz'(:K/LM )P F - ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0  !c'--/JC/4{{;{!{;#c%23 ! 0 	c  "))+ &&';< &&'I3O_ Tl
&
" < !s6   2#h3h3!h8h>	ii2ii
i	c           
     2  ^ ^ SSK JnJnJnJn  [        [        R                  R                  R                  5       5      nU" T R                  U5      n[        R                  R                  R                  (       d  U" T R                  T R                  5        [        [        R                  R!                  5       5      nU" T R                  UU5      u  n  n	[#        [%        T R                  5      5       V	s/ s H  n	/ / 4PM	     sn	mU H  n
U
R&                  S:X  a  U
R(                  S:X  a  M%  U
R*                  R-                  5       nTU
R.                     S   R1                  U5        TU
R2                     S   R1                  U5        M     SSKJn  U" 5               SU U4S jjn/ n[9        T R                  5       HE  u  nnUR1                  U5        UR1                  U" X[%        T R                  5      S-
  :H  S95        MG     UT l
        g s  sn	f )Nr    )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                N  > TU    S   nTU    S   nX#U/n[         R                  " [        [        R                  " S5      S9[        R
                  R                  R                  R                  / US S9nSTR                  U    R                  5        3Ul        [        TU5      $ )Nr   r    r  )r  c                $    U US   US   US   S.4$ )Nr   r    r  )alivedeadis_final_stepr   )tensor_argsr  s     ri   r   WScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>
  s(    !.q!1 -a 0)6q)9Crl   )rz   r   r  nontensor_argsunflatten_args
mem_check_)r%   MemoryCheckKernelr8   r  r  r  _inductor_debugcheck_memory_stepdefaultr  rf   operation_namerK  )step_idxr  expected_newly_aliveexpected_newly_deadr  r]   rg   step_allocs_deallocss         ri   construct_mem_check_nodeEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node
  s     $8#A!#D "6x"@"C2WN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rl   )r  )r  r   r  r   r   rK  )r  r  r  r  r  r   rP   r   rS  r  r  r  r  r"   r  r   r  rj  r~   
size_alloc	size_freer  rf   
start_stepr  end_step#torch._inductor.runtime.debug_utilsr  r  )rg   r  r  r  r  rS  name_to_freeable_input_bufrx  buf_info_listr  buf_infor  r  r  	new_nodesr  r]   r  s   `                @ri   r*  #Scheduler.insert_memory_check_nodes
  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
4RH4C
 &H""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG & 	N	9	9*.	9&	9 	92 	 ,GAtT"(DJJRS@S;SU - 
eC
s   8Hc                  ^	 / n[        U R                  5       GH  nSS jm	SnUR                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        UR                  R                    H  nUR"                  U R$                  ;   d  M  U R$                  UR"                     R                  nU Vs/ s H2  oR&                  R                  5       UR                  5       :w  d  M0  UPM4     snU R$                  UR"                     l        M     GM     [)        [        U5      5      U l        U R                   H  nUR+                  5         M     gs  snf )	z 
Remove any nodes without users
c                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ re   )r  rf   rP   r   rp  )r   s    ri   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user   s&    ||Tt}}!'':T:T'TTrl   Fc              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   )r+  ur  s     ri   r-  2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %sN)r   r  r   r   )r  r  r   r  rb   r   r  rf   rP   r   r  rb  r  r  rp  r   r   ro   r   r]   r   r{  )
rg   updated_nodesr]   active_buffersr   can_eliminater  rb   r  r  s
            @ri   r  Scheduler.dead_node_elimination
  s    TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   4/I'Ic                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ re   r)  )ds    ri   r   DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>-  s    affrl   r  )rb  r  r   ro   r  )r  r,  r8  r   seenvisits     ri   r  2Scheduler.topological_sort_schedule.<locals>.visit*  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  rl   )r  rT   r   r   )r   rT   rt  r  )rg   r  r]   ro   r8  r   r  r  s       @@@@ri   r  #Scheduler.topological_sort_schedule   sj     +,.59V*,	! 	! D--/%)T" 0  D$K rl   c                D  ^  [        5       n[        U[        [        [        [
        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7fre   )r   rj   rj  s     ri   r-  1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>L  s(     XZc))#.??AAZrE  c              3  B   >#    U  H  nTR                   U   v   M     g 7fre   r  )r+  r  rg   s     ri   r-  r  M  s     Q=at66q9=s   )r   r   r  rK  r  rR  r   rb  ro   RuntimeErrorrx   r   )rg   r  
unmet_depsr,  unmet_dep_opss   `    ri   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes;  s    &0l
)&"	
 
 //sxx( 0 =d5k]!L  YZXJQ=QQRRrl   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r    zTopological sort failed!)	rt  fromkeysr  r  r~   rG  r  rY  ra  )rg   r  r  childrenr]   re  r,  cr  r_  zero_deg_nodesr   s               ri   ri  !Scheduler._topological_sort_nodesO  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                j   0 nU R                    Hw  n[        5       nUR                   HB  nU R                  UR                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l        My     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)r  r   r   r   ro   rj   rb  rf   r   r  r   r   )rg   name_to_ancestorsr]   r   r,  dep_node_namer  s          ri   r  Scheduler.compute_ancestorsi  s    
 9;JJD)3I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1rl   c                   [         R                  (       d  g U R                   H  n[        U[        [
        45      (       a)  UR                  5       (       d  [         R                  S:w  a  MI  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)r"   r0  r  r   r  rR  rJ   cpu_backendr  r  rS  )rg   r]   r  s      ri   rS  Scheduler.merge_loops|  s    00JJD d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * rl   c                z   [        SSSS9   [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  U5      n[        U5      n[        R	                  SUS-   UU5        XC:X  d  US:X  d  Ml  [        R	                  SUS-   5          O   UsS	S	S	5        $ ! , (       d  f       g	= f)
z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodesTr  
   z/===== attempting fusion (%d/10): %d nodes =====r    z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   rj  r~   r  r  fuse_nodes_once)rg   r  r  old_lennew_lens        ri   r  Scheduler.fuse_nodes  s     #4QU
 2Ye*  EE
 ,,U3e*  TE	 %A$$Eq1u ' ( /
 
 
s   A4B,B,,
B:c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)r  rd  r   r  r  )rg   r  r]   s      ri   r!  Scheduler.process_grouped_nodes  sF     .0	JJD!+D2F!G!GdV  
rl   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r~   r   r8  r.  r   r  )rg   r  r  backends       ri   r  Scheduler.benchmark_fused_nodes  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
s   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX@l        U R                  U5      n[	        S5         UR                  XUS9sSSS5        $ ! , (       d  f       g= f)r  r   r  hint_overrideN)r~   r   r8  r.  r   generate_kernel_code_from_nodes)rg   r  benchmark_kernelr  r  r  s         ri   r  )Scheduler.generate_kernel_code_from_nodes  si     5zA~~q$$&$""6*12::} ;  322s   A!!
A/c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)r  r  N)r8  r.  r   benchmark_codegened_module)rg   moduler  r  s       ri   r  $Scheduler.benchmark_codegened_module  s=     %""6*1255f= 322s	   >
Ac           
     (  ^       S	S jn[        U R                  5       GH  u  p#[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pVO [        S UR                  5        5       5      n[        U[        R                  R
                  R                  5      (       a  [        R                   (       a  0 nXWS'   [        R                    Hm  nUR                  US9n	U	R#                  5        V
Vs0 s H  u  p[        U
[        5      (       d  M  X_M      nn
n[%        UR#                  5       S S9S   nXU'   Mo     UR                  R'                  U5        OUR                  R)                  U5        GM  UR+                  5       nUR,                  n[        U[
        R.                  5      (       d   eUR,                  n[        U[
        R0                  5      (       d   eUR2                  Ul        U" UU5        U R5                  U5      nUU R                  U'   UU R6                  UR9                  5       '   UU R:                  UR9                  5       '   0 m[<        R>                  " UR@                  RB                  URD                  5       HA  nU RF                  RI                  URJ                  S5      =n(       d  M2  URJ                  TU'   MC     S
U4S jjnU" URD                  5      Ul"        U" UR@                  RB                  5      UR@                  l!        [M        URO                  5       URO                  5       5       H4  u  nnUU RP                  UR9                  5       '   URR                  Ul)        M6     URT                  Ul*        URV                  Ul+        URX                  Ul,        GM     gs  snn
f )aP  
Finalize a backing choice for MultiTemplateBuffers which did not already have a
choice finalized through fusion. In the case of an extern choice, this will result
in replacing the SchedulerNode.

If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
will force completion of compilation and benchmarking.
c                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g re   )rf   r   r   r  rP   r   rc  ro   
name_to_opr  buffersr  remove
operations)	orig_noderS  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          ri   replace_operation_bufferKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer  s_    !) 1 1 3%..0MmS11jARTW6X6XXX'::<$779LlC00Z@PRU5V5VVV&&'89)M""#34&2#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,rl   c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7fre   )r   r  r  select_algorithmExternKernelCaller)r+  timings     ri   r-  <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>  s7      *E) & % @ @ S S  #F*Es
   7A	ANr  c                    U S   $ r7  r   r  s    ri   r   ;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>7  s	    qQRtrl   r  r   c                .   > [        U4S jU  5       5      $ )Nc              3  D   >#    U  H  oR                  T5      v   M     g 7fre   )r2  )r+  r,  r   s     ri   r-  QScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>V  s     %Sdsjj1A&B&Bds    r   )re  r   s    ri   rename_deps>Scheduler.finalize_multi_template_buffers.<locals>.rename_depsU  s    %%Sd%SSSrl   )r   zir.MultiTemplateBufferrS  zir.OperationBufferr   r   )re  r   r   r   )-r  r  r   r  r]   r%   MultiTemplateBufferr"   r(  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   multi_kernel_hintsrY  rD  finalize_as_triton_callersfinalize_as_triton_calleroutput_noder  
StorageBoxOperationBufferrz   r  r8  rf   r  rX  rY  r   r   r   rI  rG  ro   ru  r   r   rb   r   r   r   )rg   r  r  r]   
multi_nodemin_node_unfusedr  callershinttimingsrH  r_  triton_timingschoiceout_tensorboxout_storage
out_buffernew_scheduler_noder,  	real_namer  new_outold_outr   s                          @ri   r  )Scheduler.finalize_multi_template_buffers  sx   	8-	89K	8	86 !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&??  00QS(8$*$=$=D&0&?&?d&?&SG -4MMO.,;DA#-a1I#J !%,; + .
 &))=)=)?^%TUV%WF,2DM %> 		<<WE		;;<LM 0 < < >+00!+r}}====(--
!*b.@.@AAAA$.$5$5
!(Z@%)%?%?
%K" 2

15G!!$--/2;M''8 $& $??$$**D,C,CC %)$;$;$?$?$$OOyO69hh(3	T 9D&999"5 8C&22888"..4 ),&224d6F6F6H)$GW <CD$$W%5%5%78$+MMGM	) 04~~",/3~~",04"-e -:.s   P
*P
c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r  Nscatter_moder  )r   r]   r  r/  r  s     ri   r-  ,Scheduler._any_atomic_add.<locals>.<genexpr>j  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r>  rg   	node_lists     ri   _any_atomic_addScheduler._any_atomic_addi  s     

 
 
 	
rl   c                \  ^ ^^^^^^^^^^^ ^!^"^#^$^% [        S TT4 5       5      n[        R                  (       d  U(       d  gTR                  5       (       a-  [	        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  gTR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a  gTR                  5       n[        [        R                  " XE5      5      nT R                  U5      (       a  gSSKJm  [%        TT5      m%US   R                  5       mTc   eSUU4S jjm![&        R(                  R*                  R-                  5       m S     SUU 4S jjjnU(       Ga;  [        S	 TT4 5       5      (       Ga!  TR                  5       SLmT(       a  TR                  5       OTR                  5       m$[	        T$[        R.                  5      (       d   e0 m / m[        R0                   GH@  nT$R3                  U5      n	[5        U	R7                  5       S
 S9 Hw  u  p[	        U
[&        R(                  R8                  R:                  5      (       d  M:  T$R=                  U
5         TR?                  U
/U" XjR@                  S9Q75        SSS5        My     [C        S5      nSn0 nT HW  u  pn Ub  URE                  5         T$R=                  U
5         T RU                  UT5      u  nnUX'   UU:  a  UnU
nSSS5        MY     UT$RV                  U'   [	        U[X        5      (       d   eUT U'   GMC     T$R3                  5       n	T$R[                  5       u  nm"T(       a  T R]                  U5      OT R]                  U5      u  m#n/ mSn[5        U	R7                  5       [^        R`                  " S5      S9 H  u  p[	        U
[&        R(                  R                  RX                  5      (       d  M:  T(       d-  [c        U
S5      (       a  U
Rd                  T$Rd                  :w  a  Mn  UT"T#-   :  a    OTUS-  nU[        Rf                  :  a    O9T$R=                  U
5         TR?                  U
/U" U5      Q75        SSS5        M     [i        T5      S:X  a  gSUUUU U!U"U#U$U 4	S jjnU$ U" U5      mU" U5      mU" U5      mSUUUUUU!U U%4S jjnU$ ! , (       d  f       GM  = f! [F         a]  n[H        RK                  [L        RN                  5      (       a)  [H        RQ                  ST(       d  SOS[S        U5      5         SnAGM  SnAff = f! , (       d  f       GM  = f! , (       d  f       GM  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7fre   )r  r   r  r%   r  r  s     ri   r-  .Scheduler.speedup_by_fusion.<locals>.<genexpr>z  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   r  CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  rD  rE  DEBUGr  r  r<   r=   )ms_fusedms1ms2r  r  s      ri   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rl   c                   > TR                  U SUS9n[        R                  " U5      nTR                  5       (       d  S nXC4$ TR	                  SUS9n[        U[        5      (       d   eXC4$ )NT)r  r  triton_)kernel_namesource_code)r  r   loaduse_process_poolr#  r   r   )r  r  src_codemodfutasync_compilerg   s        ri   compile_kernel3Scheduler.speedup_by_fusion.<locals>.compile_kernel  s     ;;M < H ""8,C 1133
 : $**yh*W!#|4444:rl   c              3  D   #    U  H  oR                  5       S Lv   M     g 7fre   r  r  s     ri   r-  r8    s      %
7E!!-~s    c                    U S   $ r7  r   r  s    ri   r   -Scheduler.speedup_by_fusion.<locals>.<lambda>  s    !A$rl   r  r  infException in compiling %s: %sr  r  r    allowed_prologue_inpsFc            	       >	 [        S5      n S n0 nT HU  u  p4n Ub  UR                  5         TR                  U5         TR                  UT	5      u  pxXrU'   Xp:  a  Un UnS S S 5        MW     T" U TT5        U TT-   :  aP  UbM  [        R                  (       a  UTS '   TR                  T5        OTR                  U5        UTR                  S '   gg! [         a]  n[        R	                  [
        R                  5      (       a)  [        R                  ST
(       d  SOS[        U5      5         S nAGM  S nAff = f! , (       d  f       GM7  = f)NrR  rS  r  r  TF)rm  r   r   r  rD  rE  r=  r  r   swap_as_triton_callerr  r"   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr$  future	mod_fusedry  r>  pathr  epilogue_fusionfuture_choices hint_override_best_fusion_choicerA  r?  r@  r  rg   s            ri   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready-  sC   $U|"& 1?-FI!!-"MMO $99&A)-)H)H%v* /7F+#2+3L.4O BA 2@0 <c239-/2M00AP8>"==< #<<_M7BJ..t4 ? % !%227==AA&,, ?2A
z #A
 !! BAs#   C"D:
D7AD22D7:
E
	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr    z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r?  r@  r>  path1path2
path_fuseds   ri   r   KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s&    053605365?8@3;sSy3I%rl   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsrd  r   r  mathisinfr   r  rb  r   r/  r   )rd  rK  ry  r?  r@  r>  rn  ro  rp  r:  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rA  rg   rH  s      @@@@@@ri   ra  rb  c  s   ; *!,)!,/2 
 ?JJL  "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFF)r>  rm  r?  rm  r@  rm  r   r   re   )r  rJ  r  Optional[int]r   z)tuple[Optional[LambdaFuture], ModuleType]r   )5r>  r"   benchmark_fusionr  r   r  r%   TritonTemplateBufferr  r  r   rx   r   rX  rY  r3  triton.compiler.errorsr:  r  r  r  rL  AsyncCompiler  r  r  r  rY  r	  TritonTemplateCallerrV  r  r  rm  r   r   r  rD  rE  r=  r  r   r  rW  r   r  r  operator
itemgetterr   rT   max_epilogue_benchmarked_choicesr~   )&rg   r  r  is_multi_templatenode_list_1node_list_2node_list_fusedrM  r  r  r$  unfused_timerX  rY  rZ  r[  r\  ry  r>  r]  r  ro  triton_choicesra  r:  rL  r  r^  rv  rw  rx  r_  r`  rA  r?  r@  r  rH  s&   ```                     @@@@@@@@@@@@@@ri   speedup_by_fusionScheduler.speedup_by_fusionr  sP       
 U^ 
 

 &&/@ u668":Q:QRR!!!! oo'Q**,v ;;%oo'y{HI
 00;u% #..0!!!	 	" 55BBD PT	.	?L	6	 	  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA  - TVN!'!:!:!+!:!:=!I,2"((*-(F & @ @ U U  !#99&A&-- &!/$3CWCW"" BA-"  %U|FJ 1?-FI
!!-"MMO $99&A)-)H)H%v*$ /7+#l2+3L.4O BA 2@( =H
**=9!/3KLLLLBQ0?Y ";^ (668N..0FAs # **;7//< C TVNN(.$$&H,?,?,B)$ "&%//*<*<*U*UVV ((?@@44
8X8XX39,!#!F$K$KK55f="))6*TN?4S*TU >=3)8 >"a',! ,!\ (' !/{ ; .{ ;&4_&E#@ @D ('Q BA" % !%227==AA&,, ?2A
z #A
 !! BAb >=s=   ($T0T$VV
T
V(AV  V
V
V+	c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     ri   r  Scheduler.get_fused_node  s    &&t':':'<==rl   c                ,  ^ ^^^ [        U5      m[        R                  [        R                  5      (       aB  [        R                  S5        T H'  n[        R                  SUR                  5       5        M)     0 m      SUU 4S jjm      SUUU 4S jjnT R                  U5       H  u  pEU" XE5        T R                  U5      nT R                  U5      nT R                  XE5      (       d  MG  T R                  XE5      (       a  M_  T R                  XE5      n[        U5      (       a  XdU4TU'   XdU4TU'   M  U(       d  M  T" XE5        M     [        5       nTR                  5        Hx  u  pn
X;   a  M  UR                  U5        T R                  U	5      U	L d   eT R                  U
5      U
L d   eU" 5       (       d  MX  T R                  X5      (       a  Mp  T" X5        Mz     [        TS S9nT R!                  U5      nT R#                  U5        U$ )	z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  %sc                  > [         R                  SU R                  5       UR                  5       5        U R                  5       nUR                  5       U:X  d   eTR	                  U5      R                  X5      nTR                  U 5        TR                  U5        TR                  U5        TR                  R                  UR                  5        Vs0 s H  oDR                  5       U_M     sn5        U$ s  snf )Nzfusing %s with %s)r  r  rf   r   r.  r  r  rb  r  r  r  )r  r  r  node3r  rR  rg   s        ri   fuse_two_nodes1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@.?u$.?@ L As   C8c                D  > TR                  U 5      T;   d  TR                  U5      T;   a  TR                  TR                  U 5      TR                  TR                  U5      S 5      5      nUc   eUu  p4nTR                  US 5        TR                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  M  T" XE5        TR                  U 5      T;   a  M  TR                  U5      T;   a  M  g g re   )r  rG  ra  will_fusion_create_cycle)	r  r  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsrg   s	         ri   resolve_pending_fusions:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
y##It4##It4**95BBB**95BBB!||t'D'DU'R'Ry4' ##E*o=&&u-@rl   c                    U R                   $ re   r  r  s    ri   r   +Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !++rl   r  )r  rT   r  rT   r   rT   r  )r   r  rD  rE  r=  r  r  get_possible_fusionsr  rA  r  r  callabler   rb  r  r  r  )rg   r  r]   r  r  r  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  rR  r  s   `          @@@ri   r  Scheduler.fuse_nodes_once  s    !'""7==11;<#  )=)=)?@ $  	
	$	->		 	 	5$	5->	5	5 	52 !55e<LE $E1''.E''.E}}U**43P3P4 4 00>G$$.5e-DOE*.5e-DOE*u,) =, @J|3B3I3I3K/Mi4 $$]3&&y1Y>>>&&y1Y>>>t'D'D( ( y4 4L {(=>..u5!!%(rl   c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %s...r  Nz)ComboKernels: Not speeding up %d-th groupr    TrN  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ re   r  r  s    ri   r   5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>;  s    q{{rl   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r~   r   r  r  r0  ry  re  speedup_by_combo_kernelr"   rP  r\   r  r  rb  r  r  r  rf   r  r  r  )rg   r  rR  r  num_nodes_orignumr2  rQ  r  r]   r  s              ri   r  #Scheduler.create_combo_kernel_nodes  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
R

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g re   )r  r  )rg   r  r]   s      ri   r  Scheduler.prune_redundant_depsE  s     D%%d&=&=> rl   c                  ^ ^	^
 / m	[         [        [        [        4      " 5       m
SU	U
U 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nX5   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      nU(       d  M  Xx   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T	5      m	T	R                  T R                  SS9  [         R#                  S[%        T	5      5        T	$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pU US-   US-   [        R                  -     H  nX#4nUT;   a  M  TR                  U5        TR	                  X#5      (       a  TR                  U5        MH  UR                  5       (       d  UR                  5       (       d  Mt  TR	                  X25      (       d  M  TR                  X245        M     M     g r7  )r  r"   )max_fusion_buffer_group_pairwise_attemptsrb  rA  r  r  r  )r  node1_indexr  r  r  possible_fusionsr  rg   s        ri   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairsR  s    &/&6""!Ok'FF'GE
 !.Cd{ HHSM}}U22(//4++--1A1A1C1CJ J )//?! '7rl   r   NT)r  reversezfound %d possible fusionsr  r  r   r   )r   r  rT   rQ  r   r   unfusable_noderZ  r  r   r"   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r~   )rg   r  r  buffer_names_groupingr]   r   node_groupinggroup_groupingr   r  r  s   `        @@ri   r  Scheduler.get_possible_fusionsI  sV    % 13D DEFH	@ 	@( !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLrl   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fre   r  r+  r  
found_pathrg   s     ri   r-  IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s,      H!DA #4#:#:1#=>>!D   "%)r   rR  rb  r  issubsetr   r   r>  )r]   combined_ancestorscombined_namesr  rg   visiteds    ri   r  6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  rl   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fre   r  r  s     ri   r-  5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s&     WDVqJt66q9::DVr  zwill create cycler]   rT   r   r   )r   rR  r  _dictr  r   r>  r  )rg   r  r  cycler  r  r  r  s   `   @@@@ri   r  "Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78rl   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two nodes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r    )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r7  )
r   r   r   rG  ro   r~   rb   r]   has_tensor_outputr  )r]   r   r;  r   rg   s       ri   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mrl   c              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   r+  r   r  s     ri   r-  <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #S]c$4S$9$9]r  c              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   r  s     ri   r-  r    r  r  r   r  F    T)r]   rT   r   zlist[ir.Buffer])r  r  r   intersectionr   ro  score_fusion_memoryrP   r   r&  statically_known_gt)rg   r  r  r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @ri   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                   ^  UR                   R                  UR                   R                  -  UR                   R                  UR                   R                  -  -
  n[        U 4S jU 5       5      U:  $ )Nc              3  F   >#    U  H  nTR                  U5      v   M     g 7fre   dep_size_hintrj  s     ri   r-  :Scheduler.fusion_accumulate_large_reads.<locals>.<genexpr>  s     @is4%%c**irK  )r   r   r   r  )rg   r  r  	threshold	all_readss   `    ri   fusion_accumulate_large_reads'Scheduler.fusion_accumulate_large_reads  sd     &&,,u/@/@/F/FF$$u'8'8'?'??
	 @i@@9LLrl   c                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heurisitic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )rt  r  r   r   )rg   r  r  proximity_scores       ri   are_long_distant_nodes Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##rl   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r0  ro   rP   r   r=  r   r.   rx   	get_numelrO   r  
get_offsetnormalize_with_stride_orderr%   rB  rz   r   )rg   r  r  common_buf_namesreasonsr,  node1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  ri   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reason	  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  5       (       d  UR                  5       (       a  gUR                  R                  5       nUR                  R                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[!        U	["        R$                  " S5      S9u  pn['        U[(        5      (       a  ['        U[(        5      (       d  gUR*                  UR*                  :w  a4  UR-                  5       UR-                  5       :X  a  U R/                  U5      $ gSnUR1                  5       (       d  UR3                  X5      nOZUR1                  5       (       d  UR3                  X5      nO3[4        R7                  SUR9                  5       UR9                  5       5        U(       a  U R;                  X5      $ S$ s  snf s  snf )as  
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatible with node1 if that's more efficient.

Return the amount of shared data re-computed in this method.
If no such recomputation happens, return -1 (not return 0 since 0 is a valid
amount of shared data).

c              3  @   #    U  H  oR                  5       v   M     g 7fre   )r  r  s     ri   r-  >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>W  s      8
 .1HHJJr  r  r   r$  r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)r"   r0  r>  r  r   buffer_namesr0  ro   r  r  rP   r   r&  r'  r  r~   rt  r  r  r   r.   rV  r)  r  r  r$  rY  r  rf   r  )rg   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr,  r  r  
candidatesbuffer_namer  r  _numel	reordereds                  ri   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loopG  sz     00C 8
!&8
 5
 5
 
 %"3"3"5"5"..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a $'zx7J7J17M#N '9--Z5S5Sw///
   "g&7&7&99))'22	!!##77II##%%77II##Q   :Ct''5JJg YXs   6K*Kc                    [        U[        [        45      =(       a6    UR                  5       (       + =(       a    [	        UR
                  5      (       + $ )z.
Is this node unfusable under any conditions.
)r   rK  r  r  rL   r]   r  s     ri   r  Scheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
rl   c                   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnU" UR                  5       R                  5      (       a  UR                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)r]   r   rh   r  )r+  r  ry  s      ri   r-  EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )Nr  )itemsizeis_floating_point)r{  s    ri   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBrl   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r{  ztorch.dtyper   r   )r  rP   r   invoke_quant_opsr  r  r  r  r  r  r  constant_pad_ndr  r  r{  r  )	rg   prologue_noder  rH  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr	  r  s	            ri   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHII!>>@@h rl   c                  ^  [        U[        5      (       a  [        U[        5      (       d  g[        UR                  [        R                  5      (       a)  [        UR                  [        R                  5      (       d  gUR                  5       (       d  UR                  5       (       a  g[        R                  S:X  a  gUR                  UR                  pCUu  pVUu  pxUR                  5       (       d2  UR                  5       (       d  Xh:w  d  [        U5      [        U5      :w  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  gT R                  [        [        UR                  R                  5      5      5      n	T R                  [        [        UR                  R                  5      5      5      n
[!        X5      [        R"                  :  a  gSU 4S jjnU" U5      (       d  U" U5      (       a  g/ n[%        ['        XW5      5       H   u  nu  pX:w  d  M  UR)                  U5        M"     [        U5      S:w  a  gUS   nUU   UU   nn[*        R,                  R.                  R1                  UU5      (       a  UUU4$ [*        R,                  R.                  R1                  UU5      (       a  UUU4$ g)a?  
Fusing two small pointwise nodes significantly reduces kernel overhead
and launch overhead. However, slightly different sizes would prevent fusion.
Here, we decide if expanding sizes of one node is profitible by allowing
fusion, and returns the dimension to expand, node with smaller sizes,
and new size after expand.
Nr  r    c                  > U R                   R                   H  nUR                  TR                  ;   a  TR                  UR                     nO%TR                  R                  UR                  5      nU(       d  Me  [        R                  R                  R                  X 5      (       d  M  [        UR                  [        5      (       a  M    g   g)NTF)r   r   ro   r   r   rG  rP   r   r   r  r   r_   r  )r]   r  r  rg   s      ri   has_reusable_bufferIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    ((..99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I I,,66yGG&y'<'<>TUU / rl   r   r  )r   r  r]   r%   r*  r?  r"   r  r   r  r~   r   r   r  r  r  rt  small_memory_access_thresholdr  ru  r  rP   r   r&  statically_known_lt)rg   r  r  n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr'  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  ri   "get_expand_dim_for_pointwise_nodes,Scheduler.get_expand_dim_for_pointwise_nodes  sl    %//z%7W7W uzz2#4#4555::r'8'899 ))++u/M/M/O/O ) #\\5<<()1&)1&  !!##1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"7223 	  u%%)<U)C)C !'0]1R'S#C#'!#**3/ (T "#q(*1-,',' ' 77//OO66WW11..QQ66rl   c                $  ^ XL a  g[        X5      nUR                  5       (       a4  U R                  UR                  5       5      R	                  X5      (       a  g[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR                  5       (       Gac  [        R                  (       d	  U" S5        gUR                  5       (       d  UR                  5       (       a	  U" S5        gUR                  5       n[        U[        R                   5      (       d	  U" S	5        gUR#                  5       n[%        S
 UR&                   5       5      U-
  nUR)                  5       U-  (       a	  U" S5        gUR+                  5       (       d  UR+                  5       (       a	  U" S5        gUR-                  5       mTSS  HK  nUR/                  5       nU H2  n	[1        U4S jU	R2                   5       5      (       a  M)  U" S5            g   MM     [        U[4        5      (       d  U/O2UR6                   V
s/ s H  oR                  5       (       d  M  U
PM     sn
n[9        U5      S:X  d   eUS   n[9        TS   R:                  5      S:X  aU  [9        TS   R:                  S   R2                  5      S:X  a,  TS   R:                  S   R2                  S   R<                  UL d	  U" S5        gU R?                  XU5      (       d  gUR                  5       (       aH  UR+                  5       (       d*  UR                  5       (       d  [        R@                  (       d	  U" S5        gUR)                  5       [B        RD                  RF                  -  (       d0  UR)                  5       [B        RD                  RF                  -  (       a	  U" S5        gUR                  5       nUR                  5       nX:w  a
  U" SX5        gAU RI                  X5      nU[        RJ                  :  a.  [        RL                  (       a  U RO                  X5      nUS:  a  Un[        RP                  (       aA  U RS                  X5      =n(       a)  Uu  nnnURU                  UU5        U RI                  X5      n[V        RY                  [Z        R\                  5      (       a4  [V        R_                  SURa                  5       URa                  5       U5        [B        Rb                  Re                  XX/5      (       d  gUR                  5       UR                  -  (       a_  U Rg                  X5      =(       aG    [B        Rb                  Rg                  XX/5      =(       a     U R                  U5      Rg                  X5      $ [B        Rb                  Ri                  XX/5      =(       a     U R                  U5      Ri                  X5      $ s  sn
f )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  @   #    U  H  oR                  5       v   M     g 7fre   r  )r+  inps     ri   r-  %Scheduler.can_fuse.<locals>.<genexpr>{  s     E_c<<>>_r  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  @   >#    U  H  oR                   T;   v   M     g 7fre   r1  )r+  r   prologue_nodess     ri   r-  r?    s     QytyyN:ys   z7template prologue can only fuse nodes with a single user    r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)5r  r  r.  r   can_fuse_multi_outputs_templater   r  rK  r  r  r   r"   prologue_fusionr  r  r%   r{  get_allowed_prologue_inpsr   r  r  r?  r  r   r  rb   rR  r3  r~   r   r]   r#  r^  rP   r   no_fuse_buffer_namesr  score_fusion_memory_thresholdr0  r  $expand_dimension_for_pointwise_nodesr:  rN  rY  rD  rE  r=  r  rf   choicesrA  can_fuse_verticalcan_fuse_horizontal)rg   r  r  rH  r  rT  unsupported_prologue_argsr]   	node_outsr  r  template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerA  s                        @ri   rA  Scheduler.can_fuseE  sx   
 >%4#3#3$

)
)%
7$8 e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-))01!!##u'8'8':':HI779Hh(?(?@@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$,f> 44UB D DD11$($J$J5$X!$)$9!66#FFuTTOT6E3Z{<<ZU $ 8 8 F))'--88##.  !	 yy!!$uHH$$&8 &&u4 MII//UVM$$V,>>uL 9900U M""6*>>uLMS Bs   Z3Zc                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        S [$        R&                  R)                  UR+                  5       5       5       5      nX-  (       a	  U" S5        gUR-                  5       nU HJ  nU R.                  U   R1                  5       nXR2                  U   R4                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7fre   r)  r*  s     ri   r-  .Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
U HHUrW  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   r   r   rG  ro   r   r0   fusable_weak_depr  r   r   r.   fusable_read_and_writer  r   rX  rY  rv  r   r  r   rj   r  r   )rg   r  r  node1_buf_namesrH  remaining_deps_by_namer,  ro   cd	remainingr;  remaining_depsnode1_op_namesrq  s                 ri   rI  Scheduler.can_fuse_vertical  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # rl   c                P  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nUR                  R                   Vs/ s H  owR                   U:X  d  M  UPM     nn[        U4S jU 5       5      $ s  snf s  snf )NFr    r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7fre   )r   r.   r   r  r   TMPr  )r+  r  r  s     ri   r-  -Scheduler.fusable_weak_dep.<locals>.<genexpr>-  sn      

 '	 tY' ('

DHH==(

ekk)( 		UZZ'( 's   BB)ro   r  r   r   rr  r~   r   r.   r   r  r   re  rI  r   r  )	rg   weak_depr  r  r  mutating_writesr)  r  relevant_readss	       `    ri   rZ  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"%++++u{{DHH55++H,A,AB	"..44
4T		Y8ND4 	 
  

 '
 
 	
#

s   DD*D#D#c                8   [        U[        5      (       Gab  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nUR                  UR                  :H  =(       aa    [        UR                  5      [        UR                  5      :  =(       a/    UR                  S [        UR                  5       UR                  :H  $ [        U[        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                   UR                   :X  a  UR                   b  X4:X  a  ggr   )r   r.   r   rG  ro   r   r  r   re  r"   r0  rV  r)  r~   r  r/   r  )rg   r  r  	read_name
write_names        ri   r[   Scheduler.fusable_read_and_write9  sh   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rl   c                @    [         R                  R                  U5      $ re   )rP   r   get_dep_size_hintr8  s     ri   r  Scheduler.dep_size_hint[  s    ww((--rl   c                B  ^  [        UR                  R                  5      [        UR                  R                  5      -   n[        UR                  R                  5      [        UR                  R                  5      -   n[	        X45      S-  [        X45      :  a  X4:  a  UnUnUnUR                  R                  UR                  R                  -   Vs/ s H9  nXbR                  R                  ;   d  XbR                  R                  ;   d  M7  UPM;     nn[        U 4S jU 5       5      $ UR                  R                  UR                  R                  -  UR                  R                  UR                  R                  -  -  n[        U 4S jU 5       5      $ s  snf )zV
The first term in our fusion score that estimates number of saved
memory operations.
r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7fre   r  rj  s     ri   r-  0Scheduler.score_fusion_memory.<locals>.<genexpr>u  s     ?$3t))#..$rK  c              3  F   >#    U  H  nTR                  U5      v   M     g 7fre   r  rj  s     ri   r-  rt  z  s!     I6Hs4%%c**6HrK  )r~   r   r   r   rD  rt  r  )	rg   r  r  node1_dep_lennode2_dep_lentmpr,  re  common_memory_depss	   `        ri   r  Scheduler.score_fusion_memory^  sa    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT, !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   6FFc                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r    )
r~   r   r   r.  get_fusion_pair_priorityr  rD  rY  r  r  )rg   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           ri   r  4Scheduler.get_possible_fusions_with_highest_priority|  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55rl   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rP   rH  score_fusionr  s     ri   r  Scheduler.score_fusion_key  s     yy%%d3U33rl   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rP   r   r  r  r  rO  rI  r  r   )rg   rM  r]   s      ri   r'  Scheduler.compute_last_usage  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )rl   c                   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  [        U[        R                  5      (       a  GM  UR                   n[        U[        R"                  5      (       a  UR%                  5       (       d   e[        R                  R
                  R                  UR                   5        GM     U R                  R'                  5         g)z*Free any buffers that are no longer neededN)r  r-  rP   r   r  r   freedr   r   codegen_freer]   rS  r   r%   rB  r  r  r  is_input_bufferclear)rg   ro   r   r>  storages        ri   free_buffersScheduler.free_buffers  sU   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:R%6%677!hhG"7BMM::w?V?V?X?XXGG((55gllC)
, 	!!'')rl   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g re   )r  r   flushr  )rg   r  s     ri   r  Scheduler.flush  s.    }}++-GMMO .rl   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)NrY  extern_callsr    F)increase_kernel_countztype(node)=)r   rK  r   rP   set_kernel_handlerr*   r  rR  r]   r%   r  rx   r  r   r   r  )rg   scheduler_noder]   s      ri   codegen_extern_callScheduler.codegen_extern_call  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rJ   rx   r  rP   r   add_device_infor)   r  r   r  r  get_device_propertiesmajorr1   inspectcurrentframer2   )rg   r  device_schedulingdevice_propss       ri   create_backendScheduler.create_backend  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&rl   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ re   )r  r  r  s     ri   r.  Scheduler.get_backend  s@    !!!&$($7$7$?DMM&!}}V$$rl   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf re   )r.  r  r  r   r  )r  r  rg   s     ri   	get_order*Scheduler.enter_context.<locals>.get_order  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )r  ztorch.fx.Noder   r   )r  r]   r   r   r  rt  r  r  rP   r   r   enter_context)rg   r]   r  r  ry  r	  r  lasts   `       ri   r  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7fre   )r  rf   )r+  r   fused_node_namess     ri   r-  AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUrA  )r   rb   KeyErrorr  r   rI  )rg   ro   r  rb   s     ` ri   $can_buffer_be_removed_through_fusion.Scheduler.can_buffer_be_removed_through_fusion
  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                  ^  UR                   n[        U[        R                  R                  R
                  5      (       a  UR                  =n(       a  UR                  5       n[        U[        R                  R                  5      (       a  U SUR                   3OUnU[        R                  ;   d  U[        R                  ;   a,  [        U[        R                  R                  5      (       d   eg[        R                  R                  R                  R                  (       d  [        R                   c  gSS jnU(       a  ["        OUn[        U[$        5      (       a  ['        U 4S jUR(                   5       5      $ UR                   c   eUR+                  5       (       d  U" SUS9  g[        UR                   [        R,                  5      (       a  U" SUS9  g[        UR                   [        R.                  5      (       a  U" S	US9  g[1        UR                   S
S5      (       a  U" SUS9  g[3        UR                   5      (       a  U" SUS9  gg)zBReturn True if we should partition the inductor graph on this noder  TNc                    g re   r   )msgr]   s     ri   noop_log,Scheduler.should_partition.<locals>.noop_log:  s    rl   c              3  F   >#    U  H  nTR                  U5      v   M     g 7fre   )should_partition)r+  r  rg   s     ri   r-  -Scheduler.should_partition.<locals>.<genexpr>@  s     Mt,,U33rK  znon gpu opsr1  zDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opsF)r  r   r]   r^   r   r   )r]   r   r  r  r%   r  rL  ro   _ops
OpOverload_overloadnamer"   custom_should_partition_opsr#  r$  r?   wrapperrN   rR  r>  r3  rJ   
DeviceCopyConditionalr   rI   )	rg   r]   
should_logir_noderh   op_overload_packet_nameop_overload_namer  log_partition_reasons	   `        ri   r  Scheduler.should_partition  s    ))gu11@@AA%%%B%&(ggi# b%**"7"788 ++1R-=-=,>?,  (6+M+MM#v'I'II!"ejj&;&;<<<< &&--886>>F	 AK<PXd.//MMMMyy$$${{}} T:dii// !1=dii00 !2>4991488 !7dC!$)),, !>TJrl   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rP   r   rS  r  r   rY  r]   )rg   r8  r]   ro   scheduler_buffers        ri   get_name_to_nodesScheduler.get_name_to_nodes[  sd     UWAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  rl   c           	        [        [        R                  R                  5       VVs0 s H  u  p#X2_M	     nnn[        [        R                  R	                  5       5       VVs0 s H  u  p#X2_M	     nnn/ [        R                  l        [        U5       H  u  pgUR                  (       a  M  / nUR                   H#  nUR                  UR                  U5      5        M%     / n	UR                   H1  n
U	R                  UR                  U
R                  5       5      5        M3     [        R                  R
                  R                  [        UUU	UR                  5      5        M     gs  snnf s  snnf )zj
computes a mapping from partition input/output indices to graph input/output
indices for each partition.
N)r  rP   r   rS  r  partition_mapsskip_cudagraphinput_nodesr  rG  output_nodesrf   rF   constant_names)rg   
signaturesr4  ro   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingr]   s              ri   compute_graph_partition_maps&Scheduler.compute_graph_partition_mapsk  s;    (11E1E'F%
'F)#DI'F 	" %
 (11I1I1K'L&
'L)#DI'L 	# &
 "$'0'<#L''
 M!--$$%>%B%B4%HI .  N!..%%&@&D&DT]]_&UV / GG""))! !",,	! (=%
&
s   E'"E-c                  ^^	^
 S	U	4S jjm	    S
U	U
4S jjm
    SU	4S jjm    SS jn[        5       R                  " U
4S jU 5       6 nUR                  " U4S jUR                  5        5       6   U" U5      n[        5       nU HG  n[        R                  R
                  R                  U5      nUR                  UR                  5        MI     [        [        U[        R                  " S5      S95      $ )a9  
Returns all symbol inputs which are required to be in scope to successfully
perform codegen for this graph partition, including:
- free symbols used in partition nodes
- free symbols in partition input/node shapes, strides, and offsets. This is needed
  for recording cudagraphs for tensors with dynamic shapes.
c                  > [        5       nU R                  5       n[        U[        R                  5      (       a  UR                  [        UR                  5      [        UR                  5      -  [        UR                  5      -  5        [        U[        R                  5      (       a!  UR                  T" UR                  5      5        U$ Ub
   SU 35       eU$ )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r%   Layoutr  r   r  strideoffsetr  r  )r]   free_symbol_usesrz   get_layout_symintss      ri   r  GScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symints  s    9C**,F&")),, '' -"6==12"6==12
 fb&C&CDD$++,>v}},MN
 $# ~ @I~ $#rl   c                <  > [        U [        5      (       a+  [        5       R                  " U4S jU R                   5       6 $ U R
                  c   eU R
                  R                  5       nUR                  " U4S jU R
                  R                  5        5       6   U$ )z
Gets symbols used in node.
c              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   )r+  r  get_scheduler_node_symbol_usess     ri   r-  fScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>  s     U4U;;r  c              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   )r+  r  r  s     ri   r-  r    s     U=T'$W--=Tr  )	r   rR  r   r  r3  r]   rw  r  r   )r]   r  r  r  s     ri   r  SScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses  s     $ 233!|))UU  99(((#yy==?##UTYY=R=R=TU $#rl   c                   > [        U [        R                  5      (       a
  [        5       $ [        U [        R                  5      (       a  T" U 5      $ [        S[        U 5       35      e)z?
Gets symbols used in input node shapes, strides, and offsets.
zUnsupported input node type: )r   r%   rB  r   rh  r  rx   )r]   r  s    ri   get_input_node_symbolsKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sU     $ 2 233!|#D")),,)$// *,I$t**VWWrl   c                &    [        S U  5       5      $ )z
Filters a set of symbols that are required for codegen. Skip symbols
that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
and SymT.R0_INDEX.
c              3     #    U  HV  n[        U[        R                  [        R                  [        R                  [        R
                  45      (       d  MR  Uv   MX     g 7fre   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOAT)r+  r(  s     ri   r-  VScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sI       A!		

))++	  s   AA 	A r   )symbolss    ri   filter_symbolsCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         rl   c              3  4   >#    U  H  nT" U5      v   M     g 7fre   r   )r+  r]   r  s     ri   r-  >Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     Iyt,T22yr  c              3  8   >#    U  H  u  pT" U5      v   M     g 7fre   r   )r+  r  r]   r  s      ri   r-  r    s     N:Mwq$T**:Ms   ro   r  )r]   z	ir.IRNoder   OrderedSet[sympy.Symbol])r]   rT   r   r  )r]   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   r  )r  r  r   r  )r   r  rY  rP   r   r&  simplifyr  r   r  r  
attrgetter)rg   	partitionr  r  candidate_symbolsresr(  symplified_sr  r  r  s           @@@ri   !get_graph_partition_symbol_inputs+Scheduler.get_graph_partition_symbol_inputs  s    	$$	$#	$%	$ 	$"	XB	X%	X 	-	%	, 7Al6H6HIyI7
 	N+:K:K:MN	
 ++<=(2"A77++44Q7LJJ|001 #
 &(*=*=f*EFGGrl   c           	       ^ ^ / n[        [        R                  R                  5       5      nT R	                  5       nSUU 4S jjm[        [        U5      [        U5      5       GHq  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H(  nT" UR"                  5      (       a  M  UR"                  PM*     sn5      U-
  n[        U 4S jU 5       5      n[        5       nU H  n	UR                  U	R$                  5        M      X-
   Vs/ s H  nX;   d  M
  UPM     nnUR                  U5        U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   a  SOS_M     nnU Vs/ s H  nX;   d  M
  X;  d  M  UPM     nnU
R                  U5        [        U 4S jU
 5       5      n
U
 Vs/ s H  nT" U5      (       a  M  X_   PM     nnU Vs/ s H$  o[        R                  R&                  ;   d  M"  UPM&     nnT R)                  UU5      n[+        UUUUUU5      nUR-                  U5        UR/                  XJ-
  5      nGMt     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf s  snf s  snf s  snf )	z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
c                B  > TR                   R                  U S5      nUc  g[        UR                  R                  [
        5      (       aU  [        UR                  [        R                  5      (       a+  TR                  R                  U S5      =n(       a  T" U5      $ gg)z
Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
so graph partition should not take it as inputs or outputs.
NFT)	r   rG  r   r]   rz   r8   r%   MutationOutputrI  )r  r   r)  is_none_layoutrg   s      ri   r	  ?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  s    
 ""&&x6C{#((//:66chh(9(9::!%!8!8!<!<Xt!LLIL))44rl   c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fre   rI  rG  r+  ro   rg   s     ri   r-  :Scheduler.get_graph_partition_signature.<locals>.<genexpr>8  ,      /1D ''++D771   (+TFc              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fre   r  r  s     ri   r-  r  b  r  r  Nr  )r  r   r   r   )r   rP   r   r  r  ru  r  r  r   r  r  r$   r  r  r   r   r   ro   r   r  r  r5   r  r  )rg   
partitionsskip_cudagraphsr  unmet_output_namesr8  r   r  output_namesr]   returned_output_namesr   r  partition_input_namesr-  ro   extra_input_namesr  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturer	  s   `                       @ri   get_graph_partition_signature'Scheduler.get_graph_partition_signature  sa    
'(@(@(BC--/	 	( *-Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K  "-!2!2[5G5G!G!GA-aff5 !G  " %/ /1/ %!
 5?L !$++DOO< " 2@!@D' @  !
 "(():; 21D' )4((1   2"1D' F&::dE1  " 2"1D' ,0,L 1  " "(();<$. /1/ %! 21D%d+ #"1   "7!6!''BSBS:S!6   !BB;M #:"# 12!6!<!<":"K*
R $B$y 9*!
""s`   K!
K&
.K&
	K+K+;	K0	K0	K5$K58	K:K:K:>K?K?!!LLc                   UR                   R                  5        VVs0 s H'  u  p#U[        R                  R                  ;  d  M%  X#_M)     nnnUR
                  R                  5        VVs0 s H'  u  p%U[        R                  R                  ;  d  M%  X%_M)     nnnUR                   Vs/ s H3  nUR                  5       [        R                  R                  ;  d  M1  UPM5     nnUR                   Vs/ s H%  nU[        R                  R                  ;  d  M#  UPM'     n	n[        UR                  UUUUR                  U	5      $ s  snnf s  snnf s  snf s  snf )z
Updates the partition signature by removing buffers specified in
V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
)r  rY  rP   r   r  r  r  maybe_get_namer  r5   r  r  )
rg   r  ro   r  r  r  r  r]   r  r  s
             ri   .clean_removed_buffer_from_partition_signatures8Scheduler.clean_removed_buffer_from_partition_signatures  sR    !* 5 5 ; ; =
 =177222 DL = 	 
 '99??A
A	177222 DIA 	 
 "..
.""$AGG,C,CC . 	 
 "00
0177222 0 	 

 '##$$
 	
)






s/   $EE,$EE+0EE5"EEc                  ^ ^^	^
^^^ SSK m	[        5       m/ m/ m[        U5       VVs0 s H  u  p#X2_M	     snnmSUU	UUU 4S jjm
SU
U4S jjnU H8  n[        UR                  R
                  5      TU'   TU   S:X  d  M0  T
" U5        M:     / nSnU[        U5      :  a  T(       d  T(       a  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  US-  nU[        U5      :  a  T(       a  M  T(       a  M  U[        U5      :  a  [        S5      eU$ s  snnf )ad  
Reorder nodes to minimize the number of partitions via a bfs
topological sort. This is the optimal reordering such that the
number of partitions cannot be reduced further. This may be
sub-optimal for other metrics such as peak memory. This does not
change relative orders of two cudagraphable nodes, nor the
relative order of two non_cudagraphable nodes.
r   Nc                   > TU    U 4nTR                  U 5      (       a  TR                  TU5        g TR                  TU5        g re   )r  heappush)r]   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrg   s     ri   insert_pending_nodesHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  sA    ,T2D9O$$T**6H2ODrl   c                   > U R                   R                   H.  nTU   S:  d   eTU==   S-  ss'   TU   S:X  d  M&  T" U5        M0     g )Nr   r    )r   
succ_nodes)r]   	succ_noder+  node_to_indegrees     ri   update_indegreeCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sO    !]]55	'	2Q666 +q0+#I.!3(3	 6rl   r    z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r]   rT   r   r   )	r(  rt  r  r~   r   
pred_nodesheappopr  r  )rg   r  r4  r]   r1  schedule	num_itersr  r'  r(  r+  r0  r)  r*  s   `       @@@@@@ri    reorder_for_minimizing_partition*Scheduler.reorder_for_minimizing_partition  s_    	9=CEGI4=e4DE4Dys4DE	E 	E	4 	4 D%()A)A%BT"%*$T* 
 -/	#e*$#':)--(?@%% *)
 &--(;<%% &%
 NI #e*$##':': s5z!  ] Fs   E(c           	     R   SSK JnJn  [        [        R
                  R                  5       5      nU" UU R                  U R                  [        [        R
                  R                  R                  5       5      U5      u  pVU R                  U5      nU" XvU5      u  pXS-  :  a  U$ U$ )z`
Reorder nodes to minimize the number of partitions if this only slightly
increase peak memory.
r    )estimate_peak_memoryprepare_planning_infor  )r  r;  r<  r   rP   r   r  r   r  rS  r  r8  )
rg   r  r;  r<  rx  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             ri   r%  0Scheduler.maybe_reorder_for_minimizing_partition  s     	H"177#;#;#=>:O##qww++0023;
7 ??F!5"

 s!::""rl   c                0   / n/ n/ nSS jnU H  nU R                  U5      nU(       a,  [        UR                  5      S:X  a  UR                  U5        MG  U(       a   U" U5      (       a  UR                  U5        Mn  UR                  U5        M     X#-   U-   $ )z
Reorder a node if it should be partitioned and has simple dependency:
1. move a partitioned node to the front if it has no dependency
2. move a partitioned node to the back if it is only used by OutputNode
3. otherwise do not reorder
c                    U R                  5        H8  nUR                   H%  n[        UR                  [        5      (       a  M$      g   M:     gr   )r   rb   r   r]   r   )r]   r   r   s      ri   only_output_userPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user  s<    '')99C%chh
;;$ % * rl   r   r  )r  r~   r   r  )rg   r  frontmiddlebackrC  r]   r  s           ri   r&  6Scheduler.reorder_for_partition_with_simple_dependency
  s     *,*,(*	 D#44T:C(?(?$@A$ET"!&6t&<&<D!d#  ~$$rl   c                v   / nSn/ n/ nU R                    HV  nU R                  USS9nU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        MX     U(       a"  UR                  U5        UR                  U5        U R                  XS9nU R	                  U5        X4$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
T)r  )r  r  )r  r  r  r  r  )rg   r  r  cur_partitionr  r]   r  r  s           ri   r"  Scheduler.graph_partition*  s     +-
')JJD#44Td4K!C!!-0&&~6 "-N  &  m,"">277! 8 

 	))*5%%rl   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r  r  r"   r"  _codegen_partitions_codegenr  rp   s    ri   r  Scheduler.codegenJ  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                J   SSK Jn  [        R                  R                  n[        U R                  5      n[        R                  R                  5          [        R                  R                  SSU 3UUS9  U R                  U5        [        [        R                  R                  U5      (       d   eU R                  U5      nU[        R                  R                  l        [        R                  R                  R                  5         [        R                  R                  R                  [        R                  R                  5      u  pgSSS5        [        R                  R                  R!                  WR"                  5        [        R                  R                  R%                  XR5        [        R                  R                  R&                  R)                  UR*                   Vs/ s H  oR-                  5       PM     sn5        g! , (       d  f       N= fs  snf )z,Codegen a partition given its inputs/outputsr    )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  rQ  rP   r   r   r  r  set_current_wrapper_codeinit_wrapper_coderN  r   r!  rV  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnrg  codegen_partition_call	allocatedr  r  rf   )	rg   r   r  rQ  rU  graph_partition_idpartition_coder  r]   s	            ri   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapperR  sz    	Bgg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQQKKIVI8AAGG  5GG  --/ ! 4 4 = =agg>R>R SN- 00 	
889M9MN	334FR	&&--)2)?)?@)?]]_)?@	
7 0/8 As   C;H.H 
Hc                P   ^ ^^ [         R                  SUU U4S jj5       nU" 5       $ )Nc               3    >#    TR                  T T5        TR                  (       a  [        TR                  R                  5      (       a[  TR                  R                  c   S5       e[
        R                  R                  R                  TR                  R                  5         S v   TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        g ! TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        f = f7f)Ndevice should have an index)
%update_graph_partition_default_devicer  rB   rx   r  rP   r   r   codegen_device_guard_entercodegen_device_guard_exit)r  rg   r  s   ri   ctx1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 0 2288D 1D $$??//553..3D//444 4 GG((BBD.2+	 ..3D//444 4 GG((BBD.2+s    B#E9'D +A%E9A&E66E9)r   zIterator[None])
contextlibcontextmanager)rg   r  r  ri  s   ``` ri   use_default_device_context$Scheduler.use_default_device_context|  s+     
	"	"	3 	3 
#	3* url   c                N   [        U5      S:X  a  US   R                  (       d  g SS jn      SS jnS n[        X5       H   u  pgUR                  (       a  M  U" U5      n  O   Uc  g [        X5       H'  u  pgUR                  (       d  M  U" Xe5      (       a  M'    g    XPl        g )Nr    r   c                6    U S   R                  5       nUc   eU$ r  r   )r   partition_devices     ri   get_cudagraph_partition_deviceWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##rl   c                D    U  H  nUR                  5       nX1:w  d  M    g   gr   rq  )r   target_devicer]   r  s       ri   all_on_target_deviceMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s(     "**  " rl   )r   rU   r   r-  )r   rU   rv  r-  r   r   )r~   r  ru  r  )rg   r  r  rs  rw  cudagraph_partition_devicer   r  s           ri   rf  /Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
$? I+++-KI-V* %@ &-$'
$? I'''0D1 1 	 %@ 'A#rl   c                   U R                  5       u  p[        U5      S:  a  S[        U5       S3n[        USS9  U R                  X5         [	        X5       H\  u  pE[        U5      S:  d   S[        U5       35       eUR
                  (       a  U R                  U5        MK  U R                  XE5        M^     SSS5        [        U R                  5      n[        R                  R                  R                  U5        US:  as  [        R                  R                  c   eU[        [        R                  R                  5      :X  d.   S	U S
[        [        R                  R                  5       35       egg! , (       d  f       N= f)z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r    zcudagraph partition into z partitionsr  )r  prefixz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r"  r~   rN   rm  ru  r  rN  ra  r  r  rP   r   r   set_all_partition_namesr  )rg   r  r  r  r   r  num_partitionss          ri   rM  Scheduler._codegen_partitions  sR    "&!5!5!7
z?Q-c*o->kJC)c"=,,ZD(+J(C$	9~* KCPYNK[\* ++MM),33II )D E d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@  EDs   A,E88
Fc                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     U R                  U l        U R                  (       aG  [         R                   R"                  (       a(  [$        R&                  R(                  R+                  5         U GHl  n[,        R/                  [0        R2                  5      (       a4   [,        R5                  SUR7                  5       UR9                  5       5        U R=                  U5        UR?                  5       =n(       Ga  XR                  :w  d*  URA                  5       (       d  URC                  5       (       a  U RE                  5         XR                  :w  a  U R                  (       aL  [G        U R                  RH                  5      (       a(  [$        R&                  R(                  RK                  5         Xl        [G        URH                  5      (       aG  URL                  c   S5       e[$        R&                  R(                  RO                  URL                  5        Xpl(        U RR                  RU                  URV                  5        URC                  5       (       aN  URY                  [[        UR]                  5       5      5      u  pnU R_                  U5      Ra                  XU	5        GO1URA                  5       (       a-  [b        Rd                  " [f        U5      nU Ri                  U5        OURk                  5       (       aw  [b        Rd                  " [l        U5      nU R_                  U5      nS	S
K7J8n  S	SK9J:n  [w        XU45      (       a  UnO[y        S[I        U 5      < 35      eUR{                  U5        Oc[w        U[|        [~        45      (       a!  U R_                  U5      R                  U5        O'[w        U[        5      (       d   eUR                  5         [         R                   R                  (       a  U R_                  U5      R                  5         U R                  RU                  UR                  5       5        U R                  RU                  UR                  5       5        [w        U[        5      (       a  GM  UR?                  5       nUc  GM"  URH                  S:w  d  GM5  U R_                  U5      R                  5       (       d  GM\  U RE                  5         GMo     U R                  U R                  :w  a[  U R                  c   e[G        U R                  RH                  5      (       a(  [$        R&                  R(                  RK                  5         U RE                  5         g ! [:         a(    [,        R5                  SUR7                  5       5         GNf = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0re  r    )CUDACombinedSchedulingr;  ztype(self)=r  )Jr"   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r  ro   filename_dynamoconvert_frame__file__linenorb  r  r8  r#  autotune_at_compile_timerP   r   r   write_get_raw_stream_headerr   rD  rE  r=  r  rf   rc  r   r  r   r  r  r  rB   rx   rh  r  rg  r	  r-  r  r   r  r   r  r.  codegen_templaterF  rG  rK  r  r  r0   codegen.cuda_combined_schedulingr  r?  r<  r   rs  codegen_combo_kernelrR  r  codegen_noder  rR  debug_sync_kernelcodegen_syncri  r  r  r  ready_to_flush)rg   r  r  stackr  framer  r]   r  r  r  r  backend_r  r<  r  s                   ri   rN  Scheduler._codegen  s   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #99&&6==+Q+QGG  <<>D..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU $%%,,T__=!!484W4W)*51   (99!X !!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D#5}"EFF  (55d;!$(>????}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*&v-((0??AAJJLY \ $"="== &&222 !4!4!9!9:: $$>>@

a ! IIPs   33X==.Y/.Y/c                    US   R                  5       nU [        R                  l        X l        Uc   eU R                  U5      nUR                  U5      $ )r  r   )r   rP   r   r\   r8  r.  benchmark_combo_kernel)rg   r2  r  r  s       ri   r   Scheduler.benchmark_combo_kernel[  sU     1((* $!!!""6*--i88rl   c                2   [         R                  (       d  gUnUS   R                  5       nUb  UR                  S:X  a  gSSKJn  S/ pe[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  p[        R                  " U
5      (       a  [        R                  SU5          g	 XZ-  nUR                  U5        M      U R                  U5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  X]:  d  U(       a$  [        R                  S['        X]-  S 5      5        O#[        R                  S[)        X]-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r6  Tr   Nr  r9  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFrr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr<  z3cannot fuse (benchmark): fusing causes %sx slowdown)r"   r  r   rx   r|  r:  r  r  r3  r  r  r  rt  ru  r   r  rD  rE  r=  r<   r=   )rg   r  subkernel_nodesr  r:  r?  
path1_listr  r  r2  rx  r]  ry  r@  	ms2_clone_path2_listsmall_kernels                    ri   r  !Scheduler.speedup_by_combo_kerneli  s   
 ,, #..0 >V[[E1;rZ!/2HA)I ##I..  R55i@::b>>$$U ! " ICd#7 3:
	*.*E*Eo*V'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44M $ *c!f4$$]     	&#a&0  Y 	s=   AF(6G! (G.$GGG!H'$HHHc                r    U R                   U   nUR                  c   eUR                  R                  5       $ re   )r   r]   
get_layout)rg   r  r   s      ri   get_buffer_layoutScheduler.get_buffer_layout  s5    x(xx###xx""$$rl   c                   U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        [        45      (       a  Mw  UR                  5       / :X  d  M  [        R
                  R                  R!                  UR                  5        M     M     g r  )r  rJ   r   r   rP   r   rc  rG  ro   r4   r   rz   r8   r7   r  zero_dim_cpu_tensor_listrb  )rg   r]   r  r  s       ri   r
  $Scheduler.update_zero_dim_cpu_tensor  s    JJD{{}} ,,22DWW3377		BF+F3u< *"MMJ8I+J! ! #OO-388<<TYYG 3 rl   )r  ri  r  r-  r  r8  r	  r  r  rI  r   r   r   r  r8  r  r   r.  r  )r  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r   )r  r   r   r   r   )rG  r   r   r   )r]   r  r   rT   r  )r  rT   r   r  )r   r  r  rJ  r   tuple[float, str]re   r  rJ  r  r   r  ry  r   r   )r  r   r  r-  r   r  )r2  rJ  r   r   )r  rT   r  rT   r   zUnion[bool, Callable[[], bool]])r]   rT   r   rT   )r  ry  r   r   r  )r  r  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  rT   r  rT   r   r   )r  rT   r  rT   r  r   r   r   )r  rT   r  rT   r  z"Union[tuple[str], OrderedSet[str]]r   r   r  rT   r  rT   r   r   r  )r  rT   r  rT   rH  r  r   r   )r  rT   r  rT   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])rg  r0   r  rT   r  rT   r   r   )r  r-   r  r.   r   r   )r,  r-   r   r   )r  r  r   r  )r  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  rK  r   r   )r  r-  r   BaseScheduling)r  r   r   r  r3  )ro   r   r  r  r   r   r  )r]   rT   r  r   r   r   )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   r   )r   rU   r  r  r   r  )r  list[PartitionType]r  z
list[bool]r   r  )r  r5   r   r5   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r   rU   r  r5   r   r   )r  r  r  r  r   z'contextlib.AbstractContextManager[None])r  r  r  r  r   r   r2  rJ  r   z(tuple[float, float, list[Optional[str]]])r  r  r   r   )r  r   r   z	ir.Layout)Sry   r   r   r   r  r   r  r  propertyr8  setterr,  rH  r  r  rS   r*  r  r  r  ri  r  rS  r  r!  r  r  r  r  r3  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r:  rA  rI  rZ  r[  r  r  r  r  r'  r  r  r  r  r.  r  r  r  r  r  r  r  r!  r8  r%  r&  r"  r  ra  rm  rf  rM  rN  r  r  r  r
  r   r  r  s   @ri   r[   r[     s#   
m
^	# & & ( (7#,"HsPjKZ(#T,	 6S(4#&$6:	808	8, (,	*  %	
 
&
> 
>*6
>	
>w@r
s(&s(/@s(	(s(j	>h,h	 hT..`?4 ,4 	:4 l,&,/@,	,\7&7/@7	7rM&M/@MMPM	M$&$/@$	$6< < !< =	<
 
<|UK&UK/@UK	UKn
9(9 )9 	9
 
9v`&`/@`	8`DXMt3&3/@3	3j

(9
BS
	
J D.J&J/@J	J<6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
 ;@B%B37B	BH	D '1' 
'ReH eH QeH 
"	eHNI -I @JI 	&I V"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@&	B&@(
 (
 +(
 
	(
T-;X	06-A--A;X-A	-A^Brh949	19I5V%
H Hrl   c                  N  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr S       SS jjrSS jrSS jrSS jrSS jr    S S jrS!S jr      S"S jr    S#S jrSrU =r$ )$r  i  c                .   > [         TU ]  5         Xl        g re   )r  r   r\   )rg   r\   r  s     ri   r   BaseScheduling.__init__  s    "rl   c                \    U R                   (       a  U R                   R                  5         g g re   )r\   r  rp   s    ri   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') rl   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   r  s     ri   get_backend_features#BaseScheduling.get_backend_features  s
    |rl   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r  s      ri   rI   BaseScheduling.can_fuse_vertical  
     "!rl   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r  s      ri   rJ  "BaseScheduling.can_fuse_horizontal  r  rl   c                    g)aE  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.
Fr   r  s      ri   rB  .BaseScheduling.can_fuse_multi_outputs_template  s     rl   c                    UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R                  X5      $ )z
Fuse two nodes
)r  r0  r  rR  r  s      ri   r  BaseScheduling.fuse  sC     !1!1!3!3-225@@%**588rl   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )rg   rI  s     ri   r/  BaseScheduling.group_fn  r  rl   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )rg   r  epilogue_nodesrA  s       ri   r  BaseScheduling.codegen_template  s
     "!rl   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )rg   r  r  r  s       ri   r  .BaseScheduling.generate_kernel_code_from_nodes  s
     "!rl   c                    [         er  r  r  s     ri   r  BaseScheduling.codegen_node  
     "!rl   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  rp   s    ri   r  BaseScheduling.codegen_sync$  r  rl   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Fr   rp   s    ri   r  BaseScheduling.ready_to_flush*  s    
 rl   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  rp   s    ri   r  BaseScheduling.flush1  r  rl   c                    [         e)r  r  r  s     ri   r  $BaseScheduling.benchmark_fused_nodes7  
     "!rl   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )rg   r  s     ri   r  )BaseScheduling.benchmark_codegened_module@  s
    
 "!rl   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   r   r  s      ri   r|  'BaseScheduling.get_fusion_pair_priorityG  s     rl   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  r1  s     ri   r  %BaseScheduling.benchmark_combo_kernelP  r  rl   rx  )r\   zOptional[Scheduler]r   )r  r-  r   zOrderedSet[BackendFeature]r  r+  )rI  r  r   z"tuple[tuple[sympy.Expr, ...], ...])r  rT   r  rJ  rA  rJ  r   zOptional[str]re   r  )r]   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r  )r  r   r   r  r  r  )ry   r   r   r   r   r  r  rI  rJ  rB  r  r/  r  r  r  r  r  r  r  r  r|  r  r   r  r  s   @ri   r  r    s[   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	"""""0"	""&/@	"4"	1" "rl   r  )r   z$torch._inductor.codecache.LocalCache)r  rT   r   r   )r  rT   r   zOptional[Callable[[Any], Any]])r  rT   r   r   )r  r   r   r   )r]   rT   r  r  r   zdict[str, SchedulerBuffer]r   r   )r  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )r  r  r\   r[   r3  r  r   r   )r   )r  zlist[list[int]]rI  r  r  r  r   z	list[int]r   )
__future__r   rQ  rk  r   r  r  rX  rE  rt  r  r@  r  r  r  rF  r   r   r   r   r   r	   r
   r   r   typing_extensionsr   r   collections.abcr   r   typesr   r  r  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r  r!   r"   r#   r$   r%   r&   analyze_preserves_zero_maskr'   codegen.commonr(   r)   r*   comm_analysisr+   r,   r-   r.   r/   r0   excr1   r2   fx_utilsr3   r4   r5   r6   r7   r8   	loop_bodyr9   r  r:   r;   runtime.runtime_utilsr<   r=   r&  r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   virtualizedrP   	getLoggerry   r   _logginggetArtifactLoggerr  rY  r{  r   rU   r   rV   rW   	dataclassrY   r   rT   rv  rk  rj  r  rq  r  r|   r   r~  rK  r  r  r  r  rR  r0  r  r  r  r  r  r  r[   r  r   rl   ri   <module>r 	     s$   "          	     , R R R 2 2    $ $ $ 6 ? 7 M > / O O * D D D M M ; : 2 $    J 7 &    &  !^^--hA
NN44XO  >>;;$    34y 4T]t_ h8 h8 h8V 4_ 4 4b1 b1J 2 2(' <
 
,  &K
&K4&K ,&K 
	&KRW 1 W"5. 5k*% k*\	@	$@ $ 
	,l** l*^~:!3 ~:B
_, _J #%+#++  + 	+\ 
 
 
> %??, 4\4H \4H~hN" N"rl   