
    9i^                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJrJrJrJr  S SK J r   S SK	J!r!  S SK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-  S SK.J/r/J0r0J1r1J2r2J3r3J4r4  S SKJ5r5  S SK6r6S SK7r7S SK8J9s  J:r;  S S	K<J=r=  S S
K>J?r?  S SK@JArA  S SKBJCrC  S SK8JDrDJErE  SS/rFS SKGJHrHJIrIJJrJJKrK  \+(       aV  S SKJLrLJMrMJNrN  S SK7JOrOJPrPJQrQ  S SKRJSrS  S SKTJUrU  S SKVJWrW  SSKXJYrY  SSKZJ[r[  SSK\J]r]  SSK^J_r_J`r`JaraJbrbJcrcJdrd  SSKeJfrf  SSKgJhrhJiri  / SQrj\," S5      rk\R                  GS S j5       rmS S KnJoro  S S!KpJqrq  S S"KrJsrs  S S#KtJuru  S S$KvJwrw  S S%KxJyry  S S&KzJ{r{J|r|J}r}J~r~Jr  S S'KJrJr  S S(KJrJr  SS)KJr  SS*KJr  \R                  S+:H  r\GR                  " \5      r\7GR                  GR!                  \S,5      r\," S-5      r\\6GR(                  \6GR(                  4   r\)\-\7GR,                  \\7R                  4      rS.S/S0.rS1rS1rS1rS2rS3r\\S-
  -  S :X  a  \S4:  d   S55       eGS!S6 jrGS"S7 jr " S8 S9\6GRB                  5      r\GRF                  " S:S;9 " S< S=5      5       rGS#GS$S> jjr GS#       GS$S? jjr\R                  GS%S@ j5       rGS&SA jrGS'SB jrGS(SC jrGS)SD jr      GS*SE jrGS+SF jr    GS,SG jrGS-SH jr    GS.SI jrGS/SJ jrSK 4     GS0SL jjr        GS1SN jrGS2GS3SO jjr  GS4         GS5SP jjr     GS6             GS7SQ jjrGS8SR jrGS9SS jrGS:ST jrGS;SU jrGS<SV jr\1" SW5      r\," SXS:SY9r\$\/\#\4   \4   r " SZ S[\*\&\\4   5      rGS=S\ jrGS>S] jr    GS?S^ jr    GS@S_ jr      GSAS` jr      GSBSa jr GSC     GSDSb jjr      GSESc jrGSFSd jrGSGSe jrGSHSf jrGSISg jrGSJSh jrGSKSi jrGSLSj jrGSMSk jrGSNSl jr    GSOSm jrGSPSn jrGSQSo jrS SKrGSRSp jr/ rSM\Sq'   GSSSr jrGSRSs jr\GR                     GST       GSUSt jj5       r\r\r\rGSVSu jr      GSWSv jr\GR                  " S45      GSXSw j5       r " Sx Sy\(5      r\GRF                   " Sz S{5      5       r " S| S}5      r " S~ S\5      r\GR                  GSYS j5       r " S S5      r " S S\5      r\R                  GSZGS[S jj5       r\GR                  GS\S j5       r\GR                  GS%S j5       rGS\S jr GSC       GS]S jjr      GS^S jrGS_S jrGS_S jrSSS:S.         GS`S jjrSS.GSaS jjrSS.GSaS jjrGSbS jrGScS jr\-\\6GR(                  4   rS\S'   \R                  GSdS j5       r\R                  GSdS j5       r\R                  GSeS j5       r\R                  GSfS j5       r\R                  GSgS j5       rGShS jrGSbS jrGSbS jrGShS jrGShS jGr         GSiS jGr    GSj               GSkS jjGrGS%S jGr " S S5      Gr        GSlS jGr        GSlS jGrGSmS jGrGSnS jGrGSoS jGr	        GSoS jGr
        GSpS jGr\GR                        GSqS j5       Gr GSC     GSrS jjGrGSsS jGrGStS jGrGSuS jGrGSuS jGrGSvS jGrGSwS jGr\GR                  GSxS j5       GrGS\S jGr\R                  GS\S j5       Gr\R                  GSyS j5       Gr\R                  GS\S j5       GrGS\S jGrGSzS jGrGS{S jGrGS%S jGrGS%S jGrGS|S jGrGSNS jGr " S S\GR@                  5      Gr!          GS}S jGr"GS~S jGr#    GS~S jGr$ GSC     GSS jjGr%GSS jGr&GSS jGr'GSS jGr(      GSS jGr)        GSS jGr*S 4           GSS jjGr+S 4           GSS jjGr,GSS jGr-GSS jGr.\GRF                   " S S5      5       Gr/\GR                  GSS j5       Gr0GSS jGr1GSS jGr2GSS jGr3GSS jGr4              GSS jGr5GSS jGr6GSS jGr7GSS jGr8GSS jGr9        GSS jGr:GSS jGr;        GSS jGr<GSS jGr= GSC       GSS jjGr>      GSS jGr?GSS jGr@      GSS jGrAGS%S jGrBGSS jGrCSSSSSSSS.GrDG\DGR                  5        V Vs0 s H  u  pX_M	     snn GrF\GR                  " S5      GrHGSS jGrIGSS jGrJGSS jGrKGSS jGrL\R                  GSS j5       GrM\GRF                   " S S5      5       GrN0 GrOS\S'           GSS jGrP\C" 5       GrQS\S'   GSS jGrRGS S jGrSGSS jGrT\," S5      GrU\," GS 5      GrV " GS GS\G\UG\V4   5      GrW\0" S:GS9GSCS:S;.GSGS jjj5       GrXGSGS jGrY " GS GS\GR@                  5      GrZ\R                  GSGS j5       Gr[GS%GS	 jGr\GSGS
 jGr]GS GS jGr^GSGS jGr_GS%GS jGr`GSGS jGraGSGrbGSGS jGrcGSGS jGrdGSGS jGre  GS         GSGS jjGrfGSGS jGrgGS%GS jGrhGSGS jGri  GS       GSGS jjGrjGSGS jGrk\GRF                  " S:S;9 " GS GS5      5       Grl\$GS\#4   Grm\$G\mG\l/G\m4   Grn " GS GS5      GroG\o" 5       GrpGSGS jGrqGSGS jGrrgs  snn f (      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     [          V s/ s H*  n [        [        U 5      R                  5       (       d  M(  U PM,     nn [	        U5      S::  d   e[	        U5      S:X  a  SnU$ UR                  5       nU$ s  sn f )Nr3   r   rC   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      U/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/torch/_inductor/utils.pyget_gpu_typerS   i   sh    &KY'%*;*H*H*J!YJKz?aZA-vHO 4>>>3CHO Ls
   'A2A2)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rC   rE         @      zmust be power of 2c                *    U [         -   S-
  [         * -  $ )z/Round up to the nearest multiple of ALIGN_BYTESr3   )ALIGN_BYTES)nbytess    rR   _alignrp      s    [ 1$44    c                   [        U [        R                  [        R                  45      (       a#  [	        [        [        U R                  5      5      $ [        U [        5      =(       d"    [        R                  " U [        5      [        :H  $ )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrn   )vs    rR   ry   ry      sT    !eii+,,3{AFF+,,aK599Q#<#KKrq   c                  4    \ rS rSrSrSrSr\SS j5       rSr	g)	r{      z<Symbolically round up to the nearest multiple of ALIGN_BYTESr3   Tc                    [        U[        [        R                  45      (       a  [	        [        U5      5      $ [        U5      (       a  U$ g N)rs   intrt   Integerrp   ry   )clsvalues     rR   eval
align.eval   s<    ec5==122#e*%%uL rq    N)r   
sympy.ExprreturnzOptional[sympy.Expr])
__name__
__module____qualname____firstlineno____doc__nargs
is_integerclassmethodr   __static_attributes__r   rq   rR   r{   r{      s!    FEJ rq   r{   Tfrozenc                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)GraphPartitionMap   zH
Mapping from the partition info (e.g., input/output) to the graph info
r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesr   Nr   r   r   r   r   __annotations__r   r   rq   rR   r   r      s$    
 	G -,-- rq   r   c           
        U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R
                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[         R                  R                  [         R                  R                  R                  /S9 n
[         R                  R                  5         [        U	5       Hp  nUR                  5         XK   R                  5         [         R                  R                   R                  S	5         U " 5         S
S
S
5        X[   R                  5         Mr     [         R                  R                  5         [         R"                  " [%        XE5       VVs/ s H  u  pUR                  U5      PM     snn5      nS
S
S
5        [         R&                  " W5      R)                  5       n[*        R-                  S5        [*        R-                  W
R/                  5       R1                  SSS95        [3        U
R5                  5        Vs/ s HI  nUR6                  [8        R                  :X  d  M#  [:        R<                  " SUR>                  5      c  MG  UPMK     sn5      nU(       a#  U[@        R&                  " S U 5       5      S-  -  n[*        R-                  SU5        U$ s  snf s  snf ! , (       d  f       GN= fs  snnf ! , (       d  f       GNK= fs  snf ):  
Returns benchmark results by examining torch profiler events.
This could be more accurate as it doesn't count CPU side overhead.
However, this also requires manually excluding irrelevant event, e.g.
vectorized_elementwise_kernel which is used to fill L2 cache,
various CUDA events, etc, so could also be fragile.
    ArC   dtypedeviceTenable_timing   r3   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  8   #    U  H  oR                   v   M     g 7fr   device_time_total.0events     rR   	<genexpr>fp8_bench.<locals>.<genexpr>	  s     Q33        @@profiling results: %s ms)!rK   rC   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestablerW   eventsdevice_typerV   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rR   	fp8_benchr      s>    D	JJKKJu}}VLE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  BGxQA5::##$#7KQ?DXO!!!!5IO			NN++00
 
  
 


 xAKKMN!!#&&7 8L! ! 	

 +.{+FG+F41Q^^A+FG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
#!!Z__4  HH0%**=	 #	
	O OOQQQ	

 II(#.JO RO 87
 H
 
*	
sP   'P'P!;A8P>3P&;AP>P82P>"Q) QQ&
P50P>>
Qc                X   U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [         R                  R                  5         [         R                  R                  [         R                  R                  R                  /S9 n
[        U	5       H  nUR                  5         U " 5         M     [         R                  R                  5         S	S	S	5        [        R!                  S
5        [        R!                  W
R#                  5       R%                  SSS95        ['        U
R)                  5        Vs/ s H7  nUR*                  [,        R                  :X  d  M#  UR.                  S:w  d  M5  UPM9     sn5      n[1        U5      U	-  S:w  a  [3        S[1        U5      U	5      e[1        U5      U	-  n['        [5        U5       VVs/ s H  u  pX-  S:w  d  M  UPM     snn5      nUR7                  5         UR#                  5       n[        R!                  S5        [        R!                  UR%                  SS95        [9        S U 5       5      S-  U	-  n[        R!                  SU5        U$ ! , (       d  f       GN= fs  snf s  snnf )r   r   rC   r   Tr   r   r3   r   Nr   r   r   r   zContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  8   #    U  H  oR                   v   M     g 7fr   r   r   s     rR   r   +do_bench_using_profiling.<locals>.<genexpr>b  s     A=%%%=r   r   r   )rK   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   r   r   rV   r   rM   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   num_event_per_groupactual_eventsr   s                    rR   do_bench_using_profilingr     s    D	JJKKJuyyHE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  
JJ			NN++00
 
  
 
xAKKMD	 ! 	

 
 IIlIIann$$-EQS$TU 	
#  JOO3 8=

n8T #	
O ?h&!+- 	
 	
 o.9 &o6	
6&!+ 6	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_
 
$	
	
s+   -AN"N! N!N!(N&
:N&

Nc                     SSK Jn   [        R                  R	                  SS5        U S L=(       a%    [        [        [        R                  SS 5      S5      $ ! [         a     g[         a  nS[        U5      ;   d   e S nAgS nAff = f)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rK   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrJ   opsImportErrorr   str)r   r   s     rR   has_torchvision_roi_alignr  g  s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 
B$	B-BBc                t   U c   [         R                  " S5      R                  $ [        U [        5      (       a  [         R                  " U 5      n U R
                  S;  aY  U R                  cL  [        U R
                  5      n[         R                  " U R
                  UR                  R                  5       S9$ U $ )Ng        )cpumeta)index)
rK   r   r   rs   r   typer  rT   Workercurrent_devicer   device_interfaces     rR   decode_devicer  w  s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrq   c                ~    [         R                  " [        R                  U [        R
                  R                  5      $ r   )	functoolsreduceoperatormulrt   SOne)its    rR   sympy_productr    s#    HLL"eggkk::rq   c           	         [        U 5      [        U5      :X  d   e[        R                  " [        S [	        X5       5       5      5      $ )Nc              3  .   #    U  H  u  pX-  v   M     g 7fr   r   )r   abs      rR   r   sympy_dot.<locals>.<genexpr>  s     >odaAEos   )rM   rt   expandr   r   )seq1seq2s     rR   	sympy_dotr    s6    t9D	!!!<<>c$o>>??rq   c                b    U  Vs0 s H  n[        U5      U_M     snR                  5       $ s  snf r   )r   values)r  rO   s     rR   uniquer     s+     !bBqE1Hb!((**!s   ,c           
        [        U [        R                  5      (       d  [        U[        R                  5      (       a4  [        [        R                  " U 5      [        R                  " U5      5      $ [        U [
        5      (       a  [        U[
        5      (       d$   U  S[        U 5       SU S[        U5       35       e[        X5      $ )Nz: , )rs   rt   ExprrZ   sympifyr   r  runtime_ceildiv)numberdenoms     rR   re   re     s     &%**%%E5::)F)Fu}}V,emmE.BCC fc""z%'='= ("T&\N"UG2d5k];= 6))rq   c                t   U c  g[        U 5      R                  S5      S   n0 SS_SS_SS	_S
S_SS_SS_SS	_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_SS#S$S%S&.EnUR                  [        UR	                  5       5       Vs0 s H  o3U_M     sn5        [        U [         5      (       a  U $ S'X!    3$ s  snf )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r   splitupdatelistr  rs   )key	dtype_strtysr}   s       rR   _type_ofrV    sW   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<0101012S#&&3@a/?,@@ 2s   B5c                Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf )z
Gets the shape and stride of a tensor. For non-symbolic tensors, this is
trivial. But for symbolic tensors, we need to map from SymIntNode into
sympy.Expr.
)rt   r$  lstr   s     rR   convert_shape_to_inductorrZ    s%     '**cEMM!c***s    (c                    SSK Jn  [        U [        5      (       a  U $ [        U [        R
                  5      (       a  [        U 5      $ UR                  R                  R                  R                  U SS9$ )zD
Like convert_shape_to_symint, but operates on a single expression.
r3   VN)hint)
virtualizedr]  rs   r   rt   r   graphsizevars	shape_envcreate_symintnode)r   r]  s     rR   convert_to_symintrd    sk      a 	

 !U]]++ F	 !!++==ad=Krq   c                D    U  Vs/ s H  n[        U5      PM     sn$ s  snf )zn
Takes a list of shapes from Inductor and converts them into symints (or just
ints if all shapes are static).
)rd  rX  s     rR   convert_shape_to_symintrf    s"     +..#Qa #...s   c                N    [        S U R                  R                   5       5      $ )z%
Does this op overload have aliasing
c              3  <   #    U  H  oR                   S Lv   M     g 7fr   )
alias_infor   r  s     rR   r   is_view.<locals>.<genexpr>  s     F1EA||4'1Es   )any_schema	argumentsops    rR   is_viewrq    s     F1E1EFFFrq   c                    gNFr   )r   s    rR   <lambda>rt    s    rq   c                  ^ U R                   S:X  d  g[        U R                  [        R                  R
                  5      (       d  U R                  [        R                  L d  g[        [        R                  R
                  U R                  5      nU[        R                  L d  [        U5      (       a  [        U4S jU R                   5       5      $ [        R                  R                  UR                  ;   =(       d    T" U5      $ )z
Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

Uses in views ops will follow the views uses
call_functionFc              3  <   >#    U  H  n[        UT5      v   M     g 7fr   )is_pointwise_use)r   uis_pointwise_fns     rR   r   #is_pointwise_use.<locals>.<genexpr>  s     KA#A77s   )rp  rs   targetrK   _ops
OpOverloadr  getitemr   rq  rw   usersTag	pointwisetags)userz  r|  s    ` rR   rx  rx    s     66_$3::uzz4455xGWGW9W%**''4F!!!WV__KKKK99&++-H1HHrq   	list[Any]c           	       ^^ [         R                  R                  5       m/ mSUU4S jjnTR                  " U /[	        [         R
                  X1U45      Q76 n[        U R                  R                  5      S:X  a3  [        U R                  R                  S   R                  5      S:X  a  U4nTR                  U5        [         R                  R                  0 T5      nUT4$ )Nc                `   > TR                  U 5        TR                  S[        T5       35      $ )Narg)appendplaceholderrM   )r  g
graph_argss    rR   add_tensor_arg)gen_gm_and_inputs.<locals>.add_tensor_arg  s,    #}}s3z?"3455rq   r3   r   Tensor)r  torch.Tensorr   r2   )rK   fxGraphrv  r#   r  rM   rm  returnsr   r  outputr1   )r|  rz   kwargsr  nodegmr  r  s         @@rR   gen_gm_and_inputsr    s     	A%'J6 6 ??u||^F^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rq   c                t    U S:X  a  g [        U 5      nUR                  5       (       a  UR                  5         g g Nr  )rT   rL   r   r	  s     rR   r   r      s7    /7$$&&$$& 'rq   c                    [        U5        [        R                  " S5        [        R                  " 5       n[        U5       H  nU " U6 n[        U5        M     [        R                  " 5       nWc   eXt-
  $ )Ni9  )r   rK   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rR   timedr  (  sk     	d				B5\'F  
			B7Nrq   c                    [         R                  " [        U5       Vs/ s H  n[        XX%5      PM     sn5      n[         R                  " U5      U-  n[        X-  S 5        UR                  5       $ s  snf )Nz.6f)rK   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rR   print_performancer  :  se     ll>CFmLmue	4mLG << 5(D	T_S!#99;	 	Ms   A3c                F   ^ [        X5      " 5       m[        XU4S j5        g)zKReplace obj.method() with a new method that returns a precomputed constant.c                    > T $ r   r   )r  s   rR   rt  #precompute_method.<locals>.<lambda>M  s    rq   N)rJ   setattr)objmethodr  s     @rR   precompute_methodr  J  s    S!#FC(rq   c                ,    U H  n[        X5        M     g)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rR   precompute_methodsr  P  s    #& rq   c                8    [        X:  5      [        X:  5      -
  $ r   )r   )r  r  s     rR   cmpr  V  s    qu:AE
""rq   c                    [        U [        5      (       a  U /U-  $ [        U 5      S:X  a  [        U 5      " U S   /5      U-  $ U $ )Nr3   r   )rs   r   rM   r  )rO   sizes     rR   pad_listliker  Z  sD    !SsTz
1v{Aw!v%%Hrq   c                @    [        U 5      S:X  a  / $ SS jn[        XS9$ )Nr   c                    [        U [        5      (       a  U $ SSKJn  [        X5      (       d   eU R	                  5       $ )Nr3   )rA   )rs   r   	schedulerrA   get_name)elemrA   s     rR   	sort_functuple_sorted.<locals>.sort_funcg  s4    dC  K0$2222}}rq   rS  )r  rh   r   r   )rM   sorted)rO   r  s     rR   tuple_sortedr  c  s$    
1v{	 !##rq   PRV)	covariantc                  2    \ rS rSr\SS j5       rSS jrSrg)CachedMethodix  c                    g r   r   )r   s    rR   clear_cacheCachedMethod.clear_cachey  s    ),rq   c                    g r   r   selfrz   r  s      rR   __call__CachedMethod.__call__|  s    rq   r   N)r   r   r   None)rz   P.argsr  P.kwargsr   r  )r   r   r   r   staticmethodr  r  r   r   rq   rR   r  r  x  s    , ,Drq   r  c           	        ^ U R                   nSU S3mSU 0n[        SU ST ST S3R                  5       U5        [        R                  " U 5      " X! S3   5      nS
U4S	 jjnXCl        U$ )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                B   > [        U T5      (       a  [        U T5        g g r   r   delattrr  rS  s    rR   r  "cache_on_self.<locals>.clear_cache  s    4D# rq   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rS  s        @rR   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH oob!#n&=">?G &Nrq   c                    [        U 5      $ )zU
Variant of cache_on_self for properties. The only difference is the type signature.
)r  )r   s    rR   cache_property_on_selfr    s    
 rq   c                    ^      SU 4S jjnU$ )Nc           	        >^ ST SU R                    S3mSU 0n[        ST ST ST S3R                  5       U5        [        R                  " U 5      " US	   5      nSU4S
 jjnX2l        U$ )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                B   > [        U T5      (       a  [        U T5        g g r   r  r  s    rR   r  <cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!!c" "rq   r  r  )r   r  r  r  rS  
class_names       @rR   r  'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, #CL1	# (rq   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rR   cache_on_self_and_argsr    s     
$$	$L Nrq   c           
        SSK Jn  [        U [        5      (       ay  [        R
                  " [        R                  U  Vs/ s H?  n[        US5      (       d  M  UR                  (       d  M)  UR                  R                  PMA     sn[        5       5      $ [        XR                  5      (       a  U R                  $ [        5       $ s  snf )Nr3   irr  ) r  rs   rR  r  r  r  or_r   r  originsr!   r:   )node_scheduler  r  s      rR   aggregate_originsr    s     -&&LL *)D4( "-1YY "		!!)
 L
 	
 
M??	3	3$$$|s   C
C
+C
c                &   [        U 5      nUS:X  a~  U Vs/ s H\  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   c  M9  UR                  S   R                  R                  PM^     nn[        [        U5      5      nOUS:X  a  / nU H  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   S   n[        US   [        5      (       a  UR                  US   5        Mg  UR                  US   R                  5        M     [        [        U5      5      nO:US:X  a.  U Vs/ s H   o3R                  S:X  d  M  UR                  PM"     nnO[        eUnSR                  S	/U-   5      $ s  snf s  snf )
Noriginal_atenrv  rK   source_fn_stackr   r3   inductor_noder   fused)r  rp  r  _overloadpacketr   r  r!   rs   r   r  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         rR   get_fused_kernel_namer    s    $M2KO+ &
%yyO+ B  6;;. B O,	 BFKK(88AA% 	 
 G,-	g	%!FyyO+0AV[[0P"KK(9:2>	ilC00NN9Q<0NN9Q<#8#89 " G,-	o	-&1
&1FYY/5QKFKKk 	 
 "!G88WI'((5
(
s"   F	F	 F	'F	FFc                N
  ^^^  [        U 5      nU Vs/ s H  o3R                  S:X  d  M  UPM     nn[        R                  " [        5      n[        R                  " [        5      nSm[        U5      (       a  [        S U 5       5      n[        U5      S:X  ac  US   R                  m[        TS5      (       d0  [        TR                  5       VV	s0 s H  u  pX_M	     n
nn	U
Tl        UR                  U4S jS9  U H  nS	UR                  ;   aO  UR                  S	   b?  [        UR                  S	   R                  5      nXl   R!                  UR"                  5        S
UR                  ;   d  Mt  UR                  S
   S   R"                  nX\   R!                  UR"                  5        M     Tb  SOSnUR$                   SU SSR'                  UR)                  5       5       SSR'                  UR)                  5       5       S3nUR$                   S3/n[+        UR-                  5       5       HA  u  nnUR!                  UR$                   SU SSR'                  [+        U5      5       35        MC     TGb  SSKJm  UR!                  UR$                   S35        [        5       n/ n[3        U TR4                  5      (       Gd  SSKJn        S'U4S jjnS(S jm S)U 4S jjnU  GH  n	[        U	S5      (       a  U	R:                  c  M$  [        U	R:                  S5      (       a  U	R:                  R<                  b  U	R:                  R<                   H  nUR"                  U;   a  M  UR?                  UR"                  5        UR                  RA                  UR"                  5      nUc  MZ  U" UUR"                  5      u  nnUR!                  UR$                   SU SU" U5       SU S35        M     [        U	R:                  S 5      (       d  GM+  U	R:                  RB                  c  GME  U	R:                  RB                   HW  nUR                  RA                  UR"                  5      nUc  M-  U" UUR"                  5      u  nnUR!                  S!U-   5        MY     GM     U H0  nUR!                  UR$                   SURE                  S"S#9 35        M2     UR!                  UR$                   S$S%R'                  U5       35        US&R'                  U5      4$ s  snf s  sn	nf )*a  
Retrieves metadata information for a kernel.
Args:
    node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
        Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
    wrapper (PythonWrapperCodegen):
        An instance of PythonWrapperCodegen, used to define the code comment format.
Returns:
    tuple[str, str]:
        A tuple containing two strings:
            - The first string represents the kernel's metadata.
            - The second string represent the kernel's detailed metadata.
rv  Nc              3  8   #    U  H  oR                   v   M     g 7fr   )r`  )r   ns     rR   r   &get_kernel_metadata.<locals>.<genexpr>)  s     "CNq77Nr   r3   r   )_inductor_kernel_metadata_node_to_idx_mapc                "   > TR                   U    $ r   )r  )r  single_graphs    rR   rt  %get_kernel_metadata.<locals>.<lambda>1  s    lTTUVWrq   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r"  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:r\  c                R  > [        U TR                  5      (       aF  [        U R                  TR                  5      (       a!  U R                  R                  R                  nOU R                  nUc  UnOUR
                  n U R                  5       nX44$ ! [         a    S n X44$ f = fr   )rs   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer  r   layoutr  s        rR   get_buffer_info,get_kernel_metadata.<locals>.get_buffer_infoR  s     fbll33
KK9 9 #)++"2"2">">K"("4"4K&"D&++D"#..0F |# + "!F|#"s   B B&%B&c           	     j    SSR                  U  Vs/ s H  n[        U5      PM     sn5       S3$ s  snf )N[r"  r  )r  r   )shaperO   s     rR   stringify_shape,get_kernel_metadata.<locals>.stringify_shapef  s1    499e%<ec!fe%<=>a@@%<s   0
c                   > U c  gT" U R                   5       nT" U R                  5       nU R                   nS[        U R                      U U U S3$ )Nr  ")r  strider   r    r   )r  shape_annotationstride_annotationdevice_annotationr#  s       rR   stringfy_layout,get_kernel_metadata.<locals>.stringfy_layouti  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?rq   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r  r   r   ztuple[str, ir.Layout | None])r"  zIterable[int]r   r   )r  zir.Layout | Noner   r   )#r  rp  collectionsdefaultdictrR  rM   r!   r`  r   r   nodesr  sortr  r   r   r  r   commentr  keysr  itemsr  r  rs   r:   r_  r]  r-  r.  addtry_get_bufferr/  format_node)!r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_mapr  rS  sort_strmetadatadetailed_metadataoriginal_noder6  	all_reads
all_writesr]  r  r+  rr  
input_namer  woutput_namer   r  r  r#  s!                                 @@@rR   get_kernel_metadatarN    s   $ $M2K+6W;)):Vf;NW ,,T2N$006
 L
>""CN"CC}")!,22L<)TUU8A,BTBT8U"V8Ufc168U"VIXFW    dii'DIIo,F,Rdii0@@AC#**4995$))#))K(+00C&&tyy1  *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= >u  s=/diiu6N5OP	
 !?   GOO#44D!EF%/\	 "
-99&$J$UX$-$(A
 #q-00AMM4I1=='22q}}7J7J7V]]0066Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ 1 AMM844,,8]]11!"!7!7!?!>$)8)HQ"))#*;< 2- #< #D$$??#3t'7'7PT'7'U&VW #
 	  GOO#4Jsxx
?S>T!UVTYY0111w X #Ws   TTT!c                   [        U 5      n [        U 5      nU (       ak  U R                  5       nUR                   HB  nU(       a  U" U5      (       a  M  XB;  d  M   UR	                  U5        U R                  U5        MD     U (       a  Mk  U$ )zJReturns the set of nodes whose values depend on those within initial_queue)rR  r!   rN   r  r;  r  )initial_queueskip_filterdominated_setr  users        rR   dominated_nodesrT    sx    
 'M}-M
  "JJD{400(!!$'$$T*  - rq   c                Z  ^^	 SSK Jm  SUU	4S jjm	[        U5      u  p#U Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[        U 5      u  pcU Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[	        [
        R                  " / UQUQ76 5      $ s  snf s  snf )Nr3   r  c                l  > [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      =(       a=    [        U TR
                  TR                  TR                  TR                  45      (       + $ r   )	rs   r  r  r  r;   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rR   r[  *gather_origins.<locals>.is_unrealized_node  s    a&&%aff--a''%aff--!RYY' 

!!!!	1
 -
 	
rq   )r  r;   r   r*  )r  r  r"   r  r!   	itertoolschain)
rz   r  kwargs_flattenr   valkwargs_originsargs_flattenargs_originsr  r[  s
           @@rR   gather_originsrd    s     
 
" %V,N-;W^c?QRU?Vkckk^NW"4(OL+7S<C;Mc;RKCKK<LSiooE|EnEFF XSs   B#B#B(0B(c                X   ^^^^ SS jmSUU4S jjmSUU4S jjmSU4S jjmT" U 5      $ )z
Normal sympy str is very slow, this is a lot faster.  The result are
somewhat worse, as it doesn't do as much simplification.  So don't
use this for final codegen.
c                    [        U [        R                  5      =(       a1    [        U R                  5      S:H  =(       a    U R                  S   S:H  $ )N   r   r   )rs   rt   MulrM   rz   )exprs    rR   is_neg_leadsympy_str.<locals>.is_neg_lead  s:    tUYY'VC		Na,?VDIIaLTVDV	
rq   c                v  > [        U [        R                  5      (       a  [        U R                  5      S:X  aT  T" U R                  S   5      (       a:  T" U R                  S   5       ST" U R                  S   R                  S   5       3$ SR                  [        TU R                  5      5      $ T" U 5      $ )Nrg  r3   r   z - z + )rs   rt   ru   rM   rz   r  rx   )ri  rj  sympy_str_muls    rR   sympy_str_add sympy_str.<locals>.sympy_str_add  s    dEII&& 499~"{499Q<'@'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&rq   c                   > [        U [        R                  5      (       aJ  T" U 5      (       a  ST" U R                  S   5       3$ SR	                  [        TU R                  5      5      $ T" U 5      $ )N-r3   z * )rs   rt   rh  rz   r  rx   )ri  rj  sympy_str_atoms    rR   rm   sympy_str.<locals>.sympy_str_mul  sa    dEII&&4   >$))A,7899zz#ndii"@AA!$''rq   c                  > [        U [        R                  5      (       a  U R                  $ [        U [        R                  [        R
                  45      (       a  ST" U 5       S3$ [        U [        [        [        [        45      (       aC  U R                  R                   SSR                  [        [        U R                  5      5       S3$ [!        U 5      $ )N()r"  )rs   rt   Symbolr   ru   rh  r^   r[   r\   r]   funcr   r  rx   	sympy_strrz   r   )ri  rn  s    rR   rr  !sympy_str.<locals>.sympy_str_atom  s    dELL))99uyy%))455}T*+1--(HMNNii(()499SDII5N+O*PPQRRt9rq   )ri  r   r   r*  ri  r   r   r   r   )ri  rj  rn  rr  rm  s    @@@@rR   ry  ry    s.    

	' 	'	( 	( rq   c                    SSK Jn  [        R                  (       a9  [	        UR
                  SS 5      =n(       a  UR                  S:w  a  [        U 5      $ [        R                  " 5       $ )Nr3   r\  current_node
index_expr)
r_  r]  rd   compute_all_boundsrJ   interpreterr|  ra   rb   unknown)r  r]  fx_nodes      rR   get_bounds_index_exprr    sN     	!!~tDDWDNNl*5!!""$$rq   c                    U S   S:H  $ )Nr   rJ  r   )prefixs    rR   prefix_is_reductionr    s    !9rq   c                D    U [         R                  :w  d   e[        XSSS9$ )1
Used to generate an integer-nonnegative symbol.
Tintegernonnegative)r`   SIZEr_   )r  rB  s     rR   sympy_index_symbol_with_prefixr    s'     TYY vDdCCrq   c                b    U =(       d    [         R                  =(       a    [         R                  $ r   )rd   debug_index_assertsassert_indirect_indexing)checks    rR   generate_assertr    s    /V//TV5T5TTrq   c                D    U S   S:w  d   e[         R                  " U SSS9$ )r  r   r   Tr  )rt   rw  r   s    rR   sympy_index_symbolr    s)     7c>> <<d==rq   c                          SS jn[         R                  " U 5      R                  UR                  5        VVs0 s H  u  p4X2" X45      _M     snn5      $ s  snnf )z
When the passed replacement symbol v is a string, it is converted to a symbol with name v that
have the same replaced expression integer and nonnegative properties.
c                    [        U [        R                  5      (       d   e[        U[        5      (       a*  [        R                  " UU R
                  U R                  S9$ U$ )Nr  )rs   rt   r#  r   rw  r   is_nonnegative)replacedreplacements     rR   	to_symbolsympy_subs.<locals>.to_symbol1  sV     (EJJ////k3''<< ++$33  rq   )r  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)rt   r$  xreplacer:  )ri  replacementsr  kr}   s        rR   
sympy_subsr  +  sh    +A	 ==''(4(:(:(<=(<IaO	(<= =s   A
c                   [        U [        R                  5      =(       dd    [        U [        R                  5      =(       aC    [	        S [
        R                  " U R                  5       U R                  5       5       5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   is_symbolicr   rO   s     rR   r   is_symbolic.<locals>.<genexpr>G  s     N(M1A(Mr   )	rs   rK   r/   r  rl  r]  r^  r  r'  )r  s    rR   r  r  D  sS    a& 1ell# 	ON	!((*(MNNrq   c                 &    [        S U  5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   r  rj  s     rR   r   "any_is_symbolic.<locals>.<genexpr>L  s     ,t!{1~~tr   rl  )rz   s    rR   any_is_symbolicr  K  s    ,t,,,rq   c                   SSK Jn  [        / SQ5      n[        R                  " 5       (       a  UR                  S5        U R                  R                   H  n[        UR                  5      U;   a  Us  $ [        R                  R                  R                  (       ds  [        UR                  [        R                  R                  5      (       a@  [        R                   R"                  R$                  UR                  R&                  ;   a  Us  $ UR(                  R+                  S5      =nc  M  U" U5      (       d  M  Us  $    g )Nr   )r'   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr`  )%torch.fx.experimental.symbolic_shapesr'   r!   rK   $are_deterministic_algorithms_enabledrQ  r`  r6  r   r|  	_inductorrd   graph_partitionrs   r}  r~  r   r  cudagraph_unsafer  r  get)r  r'   forbidden_setr  r`  s        rR   %get_first_incompatible_cudagraph_noder  O  s     L	
M  1133	
" t{{},K &&664;;

(=(=>>--1A1AA
 K99==''C49Ns9S9SK " rq   c                    [        [        [        U R                  R                  5      5      5      nUR
                  S:X  d   eU$ )z$Get the output node from an FX graphr  )nextiterreversedr`  r6  rp  )r  	last_nodes     rR   output_noder    s6    T(288>>234I<<8###rq   c                    U R                   R                  SS9n[        S U 5       5      n[        U 5      R                  S   n[        U[        5      (       a  UOU4n[        S U 5       5      nX%-  $ )Nr  ro  c              3     #    U  HX  n[        UR                  R                  S 5      [        R                  5      (       d  M=  UR                  S    R
                  v   MZ     g7fr`  N)rs   r  r  rK   r  r   )r   r  s     rR   r   "get_all_devices.<locals>.<genexpr>  sC      9%DdiimmE*ELL9 	 		%%s   <A" A"r   c              3    #    U  H  n[        U[        R                  R                  5      (       d  M.  [        UR                  R                  S 5      [        R                  5      (       d  Mh  UR                  S    R                  v   M     g7fr  )rs   rK   r  r2   r  r  r  r   )r   r  s     rR   r   r    s[      7Cc588==) 	 sxx||E*ELL9 	s   -B6B- B)r`  
find_nodesr!   r  rz   rs   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rR   get_all_devicesr    s~    ++}+=.8 9%9 /M "o""1%G$We44w7*H,6 77 -K &&rq   c                    [        [        R                  R                  5       5       GHC  n U R	                  S5      (       d  M  [        R                  U    nUR
                  R                  5        H  nUR	                  S5      (       d  M  [        X5      n[        U[        R                  R                  R                  R                  5      (       d  Me  UR                   Hp  n[        U[        R                  R                  R                  R                  5      (       d  MB  UR                  R                   R"                  R%                  5         Mr     M     [        R                  U 	 GMF     S[        R                  ;   aR  [        R                  S   n['        UR(                  R*                  R,                  5      ?UR(                  R*                  ?[0        R2                  " 5         g )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rR  sysmodulesr9  
startswith__dict__rJ   rs   rK   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rR   unload_xpu_triton_pydsr    sQ   CKK,,./%%&NOOKK$*I##I.. .EOO33EEVV  #)"8"8%"!OO33EEYY  #MM--1199; #9 + KK$! 0& #++-kk12""(()2JJ#JJLrq   _registered_cachesc                    [        U S5      (       a  [        U R                  5      (       d  [        U  S35      e[        R                  U 5        U $ )z\
Use this decorator to register any caches that should be cache_clear'd
with fresh_cache().
cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  s    rR   clear_on_fresh_cacher    sE    
 3&&hs.G.Gu$GHIIc"Jrq   c                 >    [          H  n U R                  5         M     g)z
Clear all registered caches.
N)r  r  r  s    rR   clear_cachesr    s     " "rq   c              #    ^#    [        5         SSKJn  U" [        R                  " US95      m [
        R                  R                  [        R                  ST05         [        R                  ST5        U" [        R                  R                  TS5      5      n[
        R                  R                  [        R                  SU05         Sv   [        U [        5      (       a  [        U 5      S:X  d   S	5       e[        R                  R!                  U5      (       a{  [        R"                  " U5      nU R%                  U Vs0 s HH  nS
U;  d  M  U[        R                  R'                  [        R                  R                  XF5      5      _MJ     sn5        SSS5        SSS5        U(       a^  [)        5       (       a-  [*        R,                  R/                  5       (       a
  [1        5         [2        R4                  " T[)        5       U4S jS9  [        5         gs  snf ! , (       d  f       N= f! , (       d  f       N= f! [6         a    [        R9                  ST5        e f = f! [        5         f = f7f)z
Contextmanager that provides a clean tmp cachedir for pt2 caches.

Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
generated with this cache instance.
r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                .   > [         R                  STUS9$ )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)rx  pathr  inductor_cache_dirs      rR   rt  fresh_cache.<locals>.<lambda>  s    S[[@&% 6A 6rq   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictosenvironr   r   r  r  rs   rM   existslistdirrQ  getsize
is_windowsrK   rE   rL   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rR   fresh_cacher    s     ND1(2B2Bs2KL)ZZ__JJ24FG
 II35GH7/:  .@BR-STmT22}-2W4WW2ww~~&677 "

+; <%,, */).A#*!#3 !V277??277<<@P3T#U U). U
( ||		 6 6 8 8&(MM" )l  	5 UT
 
H  >@RS 	st   +I60I A-H1A9H 
HAHH H1#A-I I6H  
H.	*H11
H?;I "I$$I' 'I33I6c           
     z    U R                   n[        [        U 5      5      n[        [	        [        X!SS95      5      $ )NT)rS  reverse)__getitem__r   rM   rR  r  r  )seqgettera_rs      rR   argsortr    s/    __F
C/C>?@@rq   c           	     D  ^  SU 4S jjn[        U5       VVs/ s H>  u  p4U[        U[        R                  5      (       a  UR                  R
                  OU4PM@     nnn[        U[        R                  " U5      S9nU VVs/ s H  u  p6UPM	     nnnU$ s  snnf s  snnf )Nc                ~   > U u  p#Uu  pESU4S jjnU" X5:  5      (       a  gU" X5:  5      (       a  gX$:  a  gX$:  a  gg)Nc                R   > [        U [        5      (       a  U $ TR                  U SS9$ )NT)size_oblivious)rs   r*  evaluate_expr)ri  rb  s    rR   evaluate*argsort_sym.<locals>.cmp.<locals>.evaluate,  s+    $%%**4*EErq   r   r3   r   )ri  z%Union[bool, torch.SymInt, sympy.Expr]r   r*  r   )r  r  a_idxa_valb_idxb_valr  rb  s          rR   r  argsort_sym.<locals>.cmp(  sN    	F
 EM""EM""
 ==rq   r  )r  tuple[int, sympy.Expr]r  r"  r   r   )	r   rs   rK   r/   r  ri  r  r  
cmp_to_key)rb  r  r  rB  r   exprsr   r  s   `       rR   argsort_symr%  %  s    4  n$FC 
Z5<<88affkka@$ 
  5i22378E %&fccF&M
 's   ABBc                r    U [         R                  :X  a  g[         R                  " SU S9R                  5       $ )Nrl   r   r   )rK   rN  r   element_sizer'  s    rR   get_dtype_sizer)  I  s-     ;;r'4466rq   c                       \ rS rSr% S\S'   Srg)LineContextiR  r   contextr   Nr   r   r   r   r   r   r   rq   rR   r+  r+  R  s    Lrq   r+  c                  *    \ rS rSr% S\S'   S\S'   Srg)ValueWithLineMapiV  r   r   zlist[tuple[int, LineContext]]line_mapr   Nr-  r   rq   rR   r/  r/  V  s    J++rq   r/  c                     \ rS rSrSrSSS jjr\R                  SS j5       rSS jr	SS jr
SS jrSS jrSS	 jrSS
 jrSS jrSS jr    S S jrS!S"S jjrS!S#S jjrS!S#S jjr S$     S%S jjrS&S jrSS jrS'S jrS(S jrSrg))IndentedBufferi\     c                    / U l         Xl        g r   )_lines_indent)r  initial_indents     rR   __init__IndentedBuffer.__init___  s    GI%rq   c              #  \   #    U R                   n Xl         S v   X l         g ! X l         f = f7fr   )tabwidth)r  r;  prevs      rR   set_tabwidthIndentedBuffer.set_tabwidthc  s%     }}	!$M MDMs   ,
! ,),c                   [        5       nSn/ nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O5[        U[        5      (       a  UR                  X$R                  45        MX  Un[        U[        5      (       d   eUR                  U5        UR                  S5        USUR                  S5      -   -  nM     [        UR                  5       U5      $ )Nr3   r3  )r   r5  rs   DeferredLineBaser+  r  r,  r   writecountr/  getvalue)r  bufr   linemaplilines         rR   getvaluewithlinemap"IndentedBuffer.getvaluewithlinemapl  s    j13++B".//t<  B,,::/dC((((IIdOIIdOTZZ%%%A   88rq   c                6    U R                  5       R                  $ r   )rH  r   r  s    rR   rC  IndentedBuffer.getvalue  s    '')///rq   c                   [        5       nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O[        U[        5      (       a  M<  Un[        U[
        5      (       d   eUR                  S5      (       a  UR                  US S 5        M  UR                  U5        UR                  S5        M     UR                  5       $ )N\r   r3  )	r   r5  rs   r@  r+  r   endswithrA  rC  )r  rD  rF  rG  s       rR   getrawvalueIndentedBuffer.getrawvalue  s    j++B".//t<  B,,dC((((}}T""		$s)$		$		$   ||~rq   c                8    U R                   R                  5         g r   )r5  clearrK  s    rR   rS  IndentedBuffer.clear  s    rq   c                ,    [        U R                  5      $ r   )r*  r5  rK  s    rR   __bool__IndentedBuffer.__bool__  s    DKK  rq   c                :    SU R                   U R                  -  -  $ )Nr  )r6  r;  rK  s    rR   r  IndentedBuffer.prefix  s    dllT]]233rq   c                &    U R                  S5        g )Nr3  	writelinerK  s    rR   newlineIndentedBuffer.newline  s    trq   c                   [        U[        5      (       a  U R                  R                  U5        g [        U[        5      (       a9  U R                  R                  UR                  U R                  5       5      5        g UR                  5       (       a.  U R                  R                  U R                  5        U 35        g U R                  R                  S5        g Nr  )rs   r+  r5  r  r@  with_prefixr  stripr  rG  s     rR   r\  IndentedBuffer.writeline  s    dK((KKt$.//KKt//>?ZZ\\KK$++-78KKr"rq   c                8    U H  nU R                  U5        M     g r   r[  )r  linesrG  s      rR   
writelinesIndentedBuffer.writelines  s     DNN4  rq   c                L   ^ ^ [         R                  SUU 4S jj5       nU" 5       $ )Nc               3     >#    T=R                   T -  sl          S v   T=R                   T -  sl         g ! T=R                   T -  sl         f = f7fr   r6  )offsetr  s   rR   r  "IndentedBuffer.indent.<locals>.ctx  s8     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  rl  r  s   `` rR   indentIndentedBuffer.indent  s$    		"	"	' 
#	' urq   c                .    U =R                   U-  sl         g r   rk  r  rl  s     rR   	do_indentIndentedBuffer.do_indent      rq   c                .    U =R                   U-  sl         g r   rk  ru  s     rR   do_unindentIndentedBuffer.do_unindent  rx  rq   c           	        [        U[        5      (       a  [        S5      nUR                   HR  n[        U[        5      (       a  M  U(       d  M#  [        U[        U5      [        UR                  5       5      -
  5      nMT     [        R                  " U5      (       a  SnUR                   HV  n[        U[        5      (       a  U R                  R                  U5        M5  [        R                  X[        U5      S  5        MX     g [        R                  " U5      nU(       a  UR                  5       nU(       d  g UR                  5       nUR!                  S5       H  nU R                  U5        M     g )Ninfr   r3  )rs   r2  floatr5  r+  minrM   r  mathisinfr  r\  r   textwrapdedentrstriprP  )r  
other_coderb  r  rG  r   s         rR   spliceIndentedBuffer.splice  s    j.115\F"))!$44 TS5G)GHF * zz&!!"))dK00KK&&t,",,TF3FG	 * "4J'..0
#**,J%%d+q! ,rq   c                    [        U R                  S9nU R                   Vs/ s H
  o1" U5      PM     snUl        U$ s  snf N)r7  )r2  r6  r5  )r  rx  r   rG  s       rR   rx   IndentedBuffer.map  s8    DLL9-1[[9[Td4j[9

 :s   =c                @    [        U 5       SU R                  5        S3$ )Nru  rv  )r  rC  rK  s    rR   __repr__IndentedBuffer.__repr__  s     t*Qt}}/q11rq   c                    U R                   UR                   :X  d   e[        U R                   S9nUR                  U R                  5        UR                  UR                  5        U$ r  )r6  r2  rg  r5  )r  otherr   s      rR   __add__IndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
rq   c                    XR                   ;   $ r   )r5  )r  new_lines     rR   containsIndentedBuffer.contains  s    ;;&&rq   )r6  r5  r;  Nr   )r7  r   r   r  )r;  r   r   ro  )r   r/  r   r   r   r  r   r*  )rG  z)Union[LineContext, DeferredLineBase, str]r   r  )rf  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )rl  r   r   'contextlib.AbstractContextManager[None])rl  r   r   r  )F)r  zUnion[IndentedBuffer, str]rb  r*  r   r  )rx  zCallable[[Any], Any]r   r2  )r  r   r   r2  )r  z)Union[DeferredLineBase, LineContext, str]r   r*  )r   r   r   r   r;  r8  rp  rq  r=  rH  rC  rP  rS  rV  r  r]  r\  rg  rr  rv  rz  r  rx   r  r  r  r   r   rq   rR   r2  r2  \  s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"2
2'rq   r2  c                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )FakeIndentedBufferi  c                "   > [         TU ]  5         g r   )superr8  )r  	__class__s    rR   r8  FakeIndentedBuffer.__init__  s    rq   c                V    US:X  a  [         R                  X5      $ [        SU S35      e)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rR   r  #FakeIndentedBuffer.__getattribute__  s9    ;**466!$ (= =
 	
rq   r   r  )r   r   r   r   )r   r   r   r   r8  r  r   __classcell__r  s   @rR   r  r    s    
 
rq   r  c               #     #    [         R                  [         R                  p S v   Xs[         l        [         l        g ! Xs[         l        [         l        f = f7fr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rR   restore_stdout_stderrr     s9     %(ZZN@!/
CJ
CJs    A> AAAc                  h    \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSrg)r@  i	  z.A line that can be 'unwritten' at a later timec                >    UR                  5       (       d  SnXl        g r`  )rb  rG  rc  s     rR   r8  DeferredLineBase.__init__  s    zz||D	rq   c                    [         e)zJReturns either self.line or None to indicate the line has been 'unwritten'r  rK  s    rR   r  DeferredLineBase.__call__      !!rq   c                    [         e)z3Returns a new deferred line with the same conditionr  rc  s     rR   	_new_lineDeferredLineBase._new_line  r  rq   c                @    U R                  U U R                   35      $ r   r  rG  )r  r  s     rR   ra  DeferredLineBase.with_prefix  s    ~~455rq   c                T    U R                  U R                  R                  5       5      $ r   )r  rG  r  rK  s    rR   r  DeferredLineBase.lstrip  s    ~~dii..011rq   c                >    U R                  U R                  U   5      $ r   r  )r  r  s     rR   r  DeferredLineBase.__getitem__  s    ~~dii.//rq   c                ,    [        U R                  5      $ r   )r*  rG  rK  s    rR   rV  DeferredLineBase.__bool__"  s    DIIrq   c                ,    [        U R                  5      $ r   )rM   rG  rK  s    rR   __len__DeferredLineBase.__len__%  s    499~rq   )rG  N)rG  r   )r   zUnion[str, None])rG  r   r   r   )r  r   r   r   )r   r   )r  zUnion[int, slice]r   r   r  r   r   )r   r   r   r   r   r8  r  r  ra  r  r  rV  r  r   r   rq   rR   r@  r@  	  s-    8
""620rq   r@  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DelayReplaceLinei)  z6At end of codegen call `line.replace(key, value_fn())`c                <   > [         TU ]  U5        Xl        X l        g r   )r  r8  rS  value_fn)r  rS  r  rG  r  s       rR   r8  DelayReplaceLine.__init__,  s     rq   c                j    U R                   R                  U R                  U R                  5       5      $ r   )rG  replacerS  r  rK  s    rR   r  DelayReplaceLine.__call__1  s#    yy  4==?;;rq   c                D    [        U R                  U R                  U5      $ r   )r  rS  r  rc  s     rR   r  DelayReplaceLine._new_line4  s    $-->>rq   )rS  r  )rS  r   r  zCallable[[], str]rG  r   r  )rG  r   r   r  )
r   r   r   r   r   r8  r  r  r   r  r  s   @rR   r  r  )  s    @!
<? ?rq   r  c                   [        U [        R                  5      (       a  U nO[        R                  " [        5       U 5      n[        R
                  " U5      n[        R                  R                  (       aF  UR                  c   eUR                  S:  d  UR                  S:X  a  [        R                  S5        ggUR                  S:X  a  SOSnUR                  nXC:  a  [        R                  S	X4S
.S9  gg)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrE   ri   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rs   rK   r   rS   r   createversionhipmajorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rR   
is_big_gpur  8  s    /5<<00 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I:%> 	 	
 rq   c                     [         R                  R                  5       (       a(  [         R                  R                  5       R                  $ [         R
                  R                  S5      R                  $ )NrC   )rK   rE   rL   get_device_propertiesgpu_subslice_countrC   r  r   rq   rR   get_max_num_smsr  U  sI    yyyy..0CCC::++F3IIIrq   c                     [         R                  R                  5       (       d  g[         R                  R                  [         R                  R	                  5       5      n U R
                  S:H  $ )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rK   rC   rL   r  r  r  )device_propertiess    rR   
using_b200r  \  sM     ::""$$

889R9R9TU""b((rq   c                     [         R                  R                  5       (       a
  [        5       $ [         R                  R                  5       n [        5       U b  U -
  $ S-
  $ )zFHandle experimental carveout if set otherwise return hardware SM countr   )rK   rE   rL   r  r   _get_sm_carveout_experimental)carveouts    rR   get_num_smsr  f  sM     yy  xx557HH,@HHaHHrq   c                    SSK JnJn  Uc
  [        5       nUR	                  S5      nX -  [
        -  nU" UUUUR                  " 5       S9$ )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r3   )r4   WorkspaceZeroModeF)rB  	zero_moder   
outer_name)codegen.commonr4   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr4   r  r  r  s          rR   get_tma_workspace_argr  o  sU     @"}!++E2I-0CCD++-	 rq   c                   U R                   U;  a!  [        R                  SU R                   U5        [        U R                  R
                  5      =(       a+    U R                   U;   =(       a    [        U R                  5      $ )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r  allowed_layout_dtypess     rR   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%rq   c                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf Nr2  )upperrd   max_autotune_gemm_backendsrP  rb  backendrO   s     rR   _use_autotune_backendr    P    ==?!<<BBDJJ3OOa	O      Ac                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf r  )r  rd   max_autotune_conv_backendsrP  rb  r  s     rR   _use_conv_autotune_backendr    r  r  F)enable_int32enable_float8check_max_autotunec                  SSK JnJn  [        R                  [        R
                  [        R                  /nU(       a>  [        R                  [        R
                  [        R                  [        R                  /nU(       a/  UR                  [        R                  [        R                  /5        [        U R                  R                  5      =(       a    [        X5      =(       d/    U R                  R                  S:H  =(       a    U R                  U;   =(       ak    [         R"                  =(       d    [         R$                  =(       d    U(       + =(       a/    ['        S5      =(       a    U" U R                  UR(                  5      $ )Nr3   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rK   r   r:  r<  rD  extendr4  r5  r  r   r  r  r   rd   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r  r  r  r  r  r  layout_dtypess          rR   use_triton_templater    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOrq   )
add_guardsc                   ^ ^^^^^ SSK Jn  SSKJm  S	U4S jjmS
UUU 4S jjmS
U4S jjmU" 5       =(       a    [	        UUU4S jU 5       5      $ )u  
Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
that Triton relies on today.
* https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

A tensor is accepted when:
  * 2 ≤ rank ≤ 5
  * dtype ∈ {FP16, BF16, FP8-E4M3FN}
  * Every logical size ≥ 2
  * Base pointer 16-byte aligned
  * All "outer" dims have 16-byte aligned strides
  * The “inner” dim has stride 1 (contiguous)
  * For FP8 tensors, inner dim ≥ 32
r   )has_triton_tma_devicer3   r\  c                X   > TR                   R                  R                  U [        5      $ r   )r`  ra  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesr]  s    rR   _alignedcan_use_tma.<locals>._aligned  s     ww<<ZWWrq   c                  > U R                  5       nU R                  5       n[        U5      nU R                  5       nUR                  nUS:  d  US:  a  gU[
        R                  [
        R                  [
        R                  4;  a  gU R                  5       TR                  R                  ;   a  gT(       aK  TR                  R                  R                  U5      nTR                  R                  R                  U5      nOjU Vs/ s H(  nTR                  R                  R                  U5      PM*     nnU V	s/ s H(  n	TR                  R                  R                  U	5      PM*     nn	[        U4S jU 5       5      (       a  g[!        U5       V
V	s/ s H4  u  pTR                  R                  R#                  U	S5      (       d  M2  U
PM6     nn
n	[        U5      S:w  a  gUS   n[!        U5       H  u  pX:X  a  M  T" X-  5      (       a  M    g   Xl   nT" X-  5      (       d  gU[
        R                  :X  a,  TR                  R                  R%                  US5      (       d  ggs  snf s  sn	f s  sn	n
f )	Nrg  r   Fc              3  z   >#    U  H0  nTR                   R                  R                  US 5      (       + v   M2     g7frg  N)r`  ra  statically_known_geq)r   r   r]  s     rR   r   Bcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>  s.     P1177##88A>>>s   8;r3   r       T)get_size
get_striderM   	get_dtypeitemsizerK   r   r:  r4  r  r`  unaligned_buffersra  guard_int_seqsymbolic_hintrl  r   statically_known_equalsr  )rO   sizesstridesrankr   r"  sizes_i	strides_ir   str   r  	inner_idx	inner_dimr]  r  r  s                 rR   _is_tma_compatible_default/can_use_tma.<locals>._is_tma_compatible_default  s   

,,.5z>> !8tax 8K8KLL ::<177444gg&&44U;G((66w?IBGH%Qqww''55a8%GHFMNg))77;gIN PPPP
 #9-
-ww77A> - 	 

 u:?!H	 y)EA~BM**	 * &		,-- E'''0@0@0U0Ur1
 1
 G IN
s   	/I,>/I11I6I6c                ^  > U R                  5       nU Vs/ s H(  nTR                  R                  R                  U5      PM*     nn[	        U5       VVs/ s H4  u  pBTR                  R                  R                  US5      (       d  M2  UPM6     nnn[        U5      S:w  a  ggs  snf s  snnf )Nr3   FT)r   r`  ra  r%  r   r&  rM   )rO   r(  r,  r+  r   r  r]  s         rR   _is_tma_compatible_xpu+can_use_tma.<locals>._is_tma_compatible_xpu  s    ,,.BIJ'BQWW%%33B7'	J #9-
-ww77A> - 	 

 u:? K
s   /B$1B)
B)c              3     >#    U  H8  nUR                  5       =mb  TR                  S:w  a  T" U5      OT" U5      v   M:     g 7f)NrE   )
get_devicer  )r   r  r/  r2  m_devices     rR   r   can_use_tma.<locals>.<genexpr>  sL      + A &H/8==E3I 	#1%#A&	' s   A A)r  Union[int, sympy.Expr]r   r*  rO   r;   r   r*  )torch.utils._tritonr  r_  r]  rw   )r  matricesr  r]  r  r/  r2  r6  s   `  @@@@@rR   can_use_tmar<    sI     :X: :x !" s + 	+ ( rq   c                    [        S U 5       5      =(       a,    [        USU 06=(       a    [        R                  R                  $ )Nc              3  Z   #    U  H!  n[        UR                  5       5      S :H  v   M#     g7fr  )rM   r  )r   r  s     rR   r   *use_triton_tma_template.<locals>.<genexpr>%  s      5HqC

"Hs   )+r  )rw   r<  rd   r  enable_persistent_tma_matmul)r  r;  s     rR   use_triton_tma_templaterA  #  s9    5H55 	79j9	7MM66rq   c                r   SSK Jn  UR                  R                  R	                  X-  U-  SS9nUS::  d  U[
        R                  R                  :  a  gSSKJ	n  [        R                  R                  (       a  g[        R                  [        R                  [        R                  /n[!        X5      =(       a9    [
        R"                  =(       d    [
        R$                  =(       a    ['        S5      nU(       a;  U" 5       (       d/  [(        R+                  S	[
        R                  R,                  5        gU$ )
Nr3   r\  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)r_  r]  r`  ra  	size_hintrd   rC   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsrE  rK   r  r  r   r:  rD  r  r  r  r  r   r  cutlass_dir)	r  r  r  r  r]  	gemm_sizerE  r  r   s	            rR   use_cutlass_templaterL  +  s      **1519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mf4 	-  <F$<$<	-!),  !##KK4 ''	 Jrq   c                    [         R                  R                  R                  5       nUS:X  a  gU R                  5       UR	                  S5       Vs/ s H  o"R                  5       PM     sn;   $ s  snf )z8Check if CUTLASS should be used for the given operation.ALLTr2  )rd   rC   cutlass_enabled_opsr  rP  rb  )op_nameenabled_opsrO   s      rR   _use_cutlass_for_oprR  L  sY    ++11779Ke==?+2C2CC2HI2HQwwy2HIIIIs   A0r   _IntLikec           
        SSK Jn  [        R                  R                  n[
        R                  R                  (       + =(       a    UR                  R                  R                  [        R                  " [        R                  " X$U -  5      [        R                  " X$U-  5      5      5      =(       a=    UR                  R                  (       + =(       a    UR                  R                  (       + $ )Nr   r\  )torch._inductor.virtualizedr]  rd   r  decompose_k_thresholdrK   r  r  r`  ra  statically_known_truert   AndGeaot_modecpp_wrapper)r  r  r  r]  rV  s        rR   use_decompose_k_choicer\  W  s    -"MM?? MM 	$GG22IIA56A56
	$    	$ ###
rq   c           
        [         R                  R                  nSSKJn  [        [        R                  R                  5      =(       a    UR                  R                  R                  [        R                  " [        R                  " X#U -  5      [        R                  " X#U-  5      5      5      =(       a=    UR                  R                  (       + =(       a    UR                  R                   (       + $ )z
Check if we should use the contiguous subgraph transform.
This transform makes the second matrix contiguous before the matmul.
r   r\  )rd   rocmcontiguous_thresholdrU  r]  r*  rK   r  r  r`  ra  rW  rt   rX  rY  rZ  r[  )r  r  r  r_  r]  s        rR   use_contiguousr`  j  s     ";;;; . 	U]] 	$GG22II145145
	$    	$ ###
rq   c                6   [         R                  R                  n/ SQn[        U[        R
                  5      (       a  UR                  (       d  U$ US:X  a  / $ [        U [        R
                  5      (       a  U R                  (       a0  [        U[        R
                  5      (       a  UR                  (       d  SnO[        X -  X!-  5      nSn[        R                  " U5      nU Vs/ s H  nX::  d  M
  X:  d  M  UPM     nn/ / / pn	U H`  nX,-  nUS:  a  M  XS-
  -  S:X  a  US:  a  U	R                  U5        M3  US-  S:X  a  U
R                  U5        MO  UR                  U5        Mb     [         R                  S:X  a  X-   U-   $ X-   U-   nUS U $ s  snf )	N)ri   r  rk   rj      r   rb  rg  rj   r3   r  
EXHAUSTIVE)rd   r  num_decompose_k_splitsrs   rt   r#  	is_numberr  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitrf  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rR   get_k_splitsrs    s    ]]99N .!UZZ  	1		1ejj!!!++1ejj!!!++!&!&)K~~a H  G! 	&-&< 	   =?B> 3; AI!#$$Q'RZ1_%%a( !!!$ " ,,< 5FF#8>IK''=s   (	F5F<Fc                T    [         R                  R                  U 5      R                  $ r   )rK   rC   r  gcnArchNamer   s    rR   _rocm_native_device_arch_namerw    s    ::++F3???rq   c                      SS K n SSKJnJn  SSKJn  [        R                  R                  U R                  5      nXAX#4$ ! [         a    SS jnSS jn " S S5      nS n N&f = f)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     / $ r   r   r   rq   rR   ry  *try_import_ck_lib.<locals>.gen_ops_library      Irq   c                     / $ r   r   r   rq   rR   rz  .try_import_ck_lib.<locals>.gen_ops_preselected  r~  rq   c                      \ rS rSrSrg)*try_import_ck_lib.<locals>.CKGemmOperationi  r   N)r   r   r   r   r   r   rq   rR   r{  r    s    rq   r{  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesry  rz  ck4inductor.universal_gemm.opr{  r   r  dirname__file__r   )r  ry  rz  r{  package_dirnames        rR   try_import_ck_libr    sh    	
	
 ''//+*>*>? -@QQ  			 	 s   ;A  A$#A$c                P   [         R                  (       d  [         R                  (       d  g[        R                  R
                  (       d  gU R                  R                  S:X  d  g[        U R                  5      n[         R                  R                   Vs0 s H  o"R                  S5      S   U_M     sn=(       d    UR                  S5      S   U0nUR                  5       [         R                  R                  -   Vs/ s H  nX2   PM	     nnU(       d  gU R                  [        R                  [        R                   [        R"                  4;  a  g[%        5       u  n    nU(       d  [&        R)                  S5        g[         R*                  " 5       (       a  U[         R                  l        [         R                  R,                  (       d  [&        R)                  S5        gU[         R                  R,                  :w  a  [&        R)                  S5        ggs  snf s  snf )	NFrC   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rd   r  r  rK   r  r  r   r  rw  r^  archrP  r9  ck_supported_archr   r   r:  r<  r  r   r  	is_fbcodeck_dir)r  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rR   use_ck_templater    s   6#;#;====' 0>K39;;3C3CD3Cawws|A)3CD #q!;IO
 !%%'&++*G*GG!GA 	G  ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   HH#c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr3   r\  CKr   rC  r   r_  r]  r  r  r`  ra  rG  r  r  r  r  r]  s        rR   use_ck_gemm_templater    sP     	d# 	CF#	CGG&&quqy2&>Brq   c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr3   r\  CKTILEr   rC  r   r  r  s        rR   use_ck_tile_gemm_templater    sP     	h' 	CF#	CGG&&quqy2&>Brq   c                <    [        S5      =(       a    [        U 5      $ )Nr  )r  r  r  s    rR   use_ck_conv_templater     s    %d+G0GGrq   c                    [         R                  =(       d    [         R                  =(       a    U R                  R                  S:H  $ r  )rd   r  r  r   r  r  s    rR   _use_template_for_cpur  $  s2    7v77&
--


%&rq   c                    SSK Jn  [        UR                  U5      (       d   e[	        XUSS9=(       a    UR                  R                  5       $ )Nr3   )r<   F)require_constant_mat2)r  r<   rs   r  use_cpp_gemm_templateis_contiguous)r  mat1mat2r<   s       rR   use_cpp_bmm_templater  *  sF     dkk6**** 	fDN 	(KK%%'rq   c                   SSK Jn  SSKJn  SSKJn	  SSKJn
  [        U 5      (       a  [        S5      (       d  g[        R                  R                  (       d  gUR                  5       [        R                  [        R                   4;   n[        R"                  [        R$                  [        R&                  [        R                  /nU
" UUU(       a  U R(                  OS UUS9u  ppp[+        X45      (       a  g[-        X'R.                  5      (       a  UR1                  5       nU	" UR                  5       5      u  nnU" S	UUUUR                  5       UR                  5       U[3        5       U(       + US
9
nSS jnU R(                  U;   =(       aT    US L=(       aI    U" U5      =(       a:    [-        X'R4                  5      =(       a    UR7                  5       =(       d    U(       + $ )Nr3   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    U R                  5         U R                  5       S   S:H  $ )Nr   r3   )freeze_layoutr   rO   s    rR   is_last_dim_stride12use_cpp_gemm_template.<locals>.is_last_dim_stride1j  s"    	||~b!Q&&rq   r9  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  rd   cppweight_prepackr!  rK   rK  r@  r<  r:  halfr   has_free_symbolsrs   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmr  r  r  r  r  r   r  r  s                       rR   r  r  7  s    9M) ((0Ee0L0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#A!T $$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C t]]+	C ""$A,A(Arq   c                 ~    [         R                  =(       d    [         R                  (       + =(       d    [        S5      $ )NATEN)rd   r  r  r  r   rq   rR   use_aten_gemm_kernelsr  w  s-    7v77 '	v	&'rq   c                  b    \ rS rSr% \R
                  " S5      rS\S'   S
S jrS
S jr	SS jr
Srg	)DebugDirManageri}  r   r   prev_debug_namec                @    [        [        R                  5      U l        g r   )r  r  counterr   rK  s    rR   r8  DebugDirManager.__init__  s    ../rq   c                    [         R                  R                  R                  U l        U R                   SU R
                   3U l        U R                  [         R                  R                  l        g )N_tmp_)rK   _dynamord   debug_dir_rootr  r   new_namerK  s    rR   	__enter__DebugDirManager.__enter__  sM    $}}33BB//0dggY?.2mm+rq   c                    [         R                  " U R                  5        U R                  [        R
                  R                  l        g r   )r  r  r  r  rK   r  rd   r  )r  rz   s     rR   __exit__DebugDirManager.__exit__  s*    dmm$.2.B.B+rq   )r   r  r  Nr  )rz   r   r   r  )r   r   r   r   r]  rB  r  r   r8  r  r  r   r   rq   rR   r  r  }  s&    ooa G0<
Crq   r  c                   ^ SSK Jn  / mSU4S jjn[        R                  R	                  USU5         [
        R                  R                  5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr3   r7   c                (   > TR                  U 5        g r   r  codesource_codess    rR   save_output_code*run_and_get_code.<locals>.save_output_code      D!rq   r  r  r   r   r  r`  r8   r   r  r  rK   r  reset)r   rz   r  r8   r  r  r  s         @rR   run_and_get_coder    su    
 % L" 
		=*<>N	OT$V$ 
P < 
P	O <s   'A&&
A7c                    [        U /UQ70 UD6u  p4/ nU H8  nUR                  [        R                  " SU[        R                  5      5        M:     X54$ )Nz	'''.*?''')r  r
  r   findallDOTALL)r   rz   r  r  r  kernelsr  s          rR   run_and_get_kernelsr    sO     ,B@@@FGrzz,bii@A ?rq   c                *   ^  SU 4S jjn[        U5      $ )Nc                 R   > T" 5       n U R                  5       R                  5         U $ r   )r   backward)r  r   s    rR   run_with_backward1run_fw_bw_and_get_code.<locals>.run_with_backward  s!    

rq   )r   r   )r  )r   r  s   ` rR   run_fw_bw_and_get_coder    s    
 -..rq   c                t  ^^ SSK Jn  / mSU4S jjmS	U4S jjn[        R                  R	                  USU5         [        R                  R	                  UST5         [
        R                  R                  5         U " U0 UD6nSSS5        SSS5        T$ ! , (       d  f       N= f! , (       d  f       T$ = f)
zLGet the inductor-generated code, but skip any actual compilation or running.r3   r7   c                (   > TR                  U 5        g r   r  r  s    rR   r  "get_code.<locals>.save_output_code  r  rq   c                   >  " S S5      nU R                   (       a  U R                  5       OU R                  5       u  p#T" UR                  5        U(       a  T" UR                  5        U" 5       $ )Nc                  ,    \ rS rSrSrSS jrSS jrSrg)	@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulei  z4This is empty to replace the generated triton modulec                    g r   r   rK  s    rR   r8  Iget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rq   c                    g r   r   r  s      rR   callEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rq   r   Nr  rz   r   r  r   r   r  )r   r   r   r   r   r8  r  r   r   rq   rR   DummyModuler    s    Frq   r  )r[  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rR   patched_compile_to_module+get_code.<locals>.patched_compile_to_module  s[    	 	 04/?/?D))+T\\^ 	" 	++,[../}rq   compile_to_moduler  Nr  )r  r8   r   r   r  )r   rz   r  r8   r  r   r  r  s         @@rR   get_coder	    s    $ L", 	

.0I	
 	

-);=MN	 	O	
  	ON	
 	
 s#   "B('BB(
B%	!B((
B7c                    [        U /UQ70 UD6nS[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ Nr3   rg  z%expected one or two code outputs got r   )r	  rM   )r   rz   r  r  s       rR   get_triton_coder    sQ    B000LL!&Q& 
/L0A/BC& ?rq   c                    [        U /UQ70 UD6u  p4S[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ r  )r  rM   )r   rz   r  r   r  s        rR   run_and_get_triton_coder    sU     'r;D;F;OAL!&Q& 
/L0A/BC& ?rq   c                   ^^^ SSK Jm  SSKJn  UR                  m/ mSUUU4S jjn[
        R                  R                  USU5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr   r7   r?   c                 h   > T" U 0 UD6  U S   n[        UT5      (       d   eTR                  U5        g )Nrg  )rs   r  )rz   r  r`  r8   graph_lowerings	real_inits      rR   	fake_init-run_and_get_graph_lowering.<locals>.fake_init  s:    4"6"Q%////u%rq   r8  r   )torch._inductor.graphr8   torch._inductor.output_coder@   r8  r   r  r  )	r   rz   r  r@   r  r  r8   r  r  s	         @@@rR   run_and_get_graph_loweringr    sv     4;((IO& & 
		?J		BT$V$ 
C ?"" 
C	B ?""s   		A
A/c              #     #    SSK Jn  UR                  U    n [        R                  " X5      UR                  U '   Sv   X2R                  U '   g! X2R                  U '   f = f7f)zs
Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
r   )loweringN)torch._inductorr  	loweringsr  partial)aten_opoverride_fnr  orig_fns       rR   override_loweringr   	  sY      )  )G.&/&7&7&M7#&-7#g7#s   A"'A  A"AA"c                   ^ ^^ SSK Jn  UR                  mSUUU 4S jjn[        R                  R
                  R                  USU5      $ )zf
Add hook functions to be called at the beginning and end of Scheduler.__init__.
Used for unit tests.
r   )	Schedulerc                F   > T" X5        T" X5      nT(       a  T" X5        U$ r   r   )r  r6  outr  post_fnpre_fns      rR   r  (add_scheduler_init_hook.<locals>.wrapper$	  s%    y i'I%
rq   r8  )r  r   r6  r   r   r   )torch._inductor.schedulerr"  r8  unittestr   r  r  )r&  r%  r"  r  r  s   ``  @rR   add_scheduler_init_hookr*  	  s>     4  G  ==%%iWEErq   c                    [         R                  (       a  [        R                  U 5        g[        R	                  U 5        g)z
Warnings that will be actionable for PyTorch developers, but not
end users.  Allows us to easily disable them in stable releases but
keep them on for nightly builds.
N)rd   developer_warningsr   r  info)msgs    rR   developer_warningr/  .	  s$       Crq   c                     [         R                  R                  S5      n U S-   [        [         R                  5      :  aV  [        [         R                  U S-      5      S:  a3  [         R                  U S-      S   S:w  a  [         R                  U S-      $ [         R                   H)  nUR                  S5      (       d  M  U[        S5      S s  $    g! [         a     NJf = f)a  
An experimental API used only when config.benchmark_kernel is true.

The benchmark name is only available at codegen time. So we can not
directly call it in benchmark_all_kernels which is run after codegen.

The function assumes the argument after --only is the benchmark name.
It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
scripts, this function may return None.

There are 2 flavors of --only argument we need handle:
1. --only model_name
2. --only=model_name
z--onlyr3   r   rq  z--only=N)r  argvr  rM   
ValueErrorr  )rB  r  s     rR   get_benchmark_namer3  :	  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$$s9~'((    s   BC 
C"!C"c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7fr3   Nr   r  s     rR   r   is_ones.<locals>.<genexpr>\	       %u!Avu   rw   r:  s    rR   is_onesr<  [	      %u%%%rq   c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7f)r   Nr   r  s     rR   r   is_zeros.<locals>.<genexpr>`	  r8  r9  r:  r;  s    rR   is_zerosrA  _	  r=  rq   c                &    [        S U  5       5      $ )Nc              3     #    U  HI  n[        U[        R                  5      (       d  M$  UR                  [        R                  " S 5      :H  v   MK     g7f)r  N)rs   rK   r  r   )r   r   s     rR   r    is_cpu_device.<locals>.<genexpr>d	  s9      DdELL) 	+u||E**s
   #A*Ar:  )inputss    rR   is_cpu_devicerF  c	  s       rq   c                    [        U [        R                  5      (       d   S5       eU R                  (       a  [        R
                  $ [        R                  $ )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rs   rt   r#  r   rK   rF  r>  )r`  s    rR   get_sympy_Expr_dtyperH  k	  s@    c5::&& B& ~~{{}}rq   c              /     #    U (       a.  [         R                  R                  " U0 UD6 nUv   S S S 5        g S v   g ! , (       d  f       g = f7fr   )rK   r   r   )should_profilerz   r  r   s       rR   maybe_profilerK  u	  s;     ^^##T4V4G 54 	 54s   (A=A
AAc                 p    [         R                  R                  n U S:  a  [        R                  " 5       n U $ Nr3   )rd   r  threadsrK   get_num_threads)rN  s    rR   r  r  ~	  s+    jj  G{'')Nrq   c                     SSK Jn   U " 5       nUR                  S[        R                  R
                  (       a  S5      $ S5      $ )Nr3   )get_backend_options
num_stagesrg     )runtime.triton_helpersrQ  r  rK   r  r  )rQ  optionss     rR   get_backend_num_stagesrV  	  s2    ;!#G;;|%--*;*;QCCCCrq   c                L   [        U [        R                  R                  R                  R
                  S9nUb  U$ SSKJnJn  [        R                  R                  5       =(       a!    [        R                  R                  5       S:  nU [        R                  [        R                  [        R                  4;   d   e[        R                  " U5      R                   R#                  S5      (       a  SSKJn  U" 5       nU [        R                  [        R                  4;   a  U(       a  U" X5      $ [        R                  R                  R                  R
                  (       a  U" [        R                  U5      $ U" [        R                  U5      $ U [        R                  [        R                  4;   a  U(       a  U" U 5      $ [        R                  R                  R                  R
                  (       a  U" [        R                  5      $ U" [        R                  5      $ )z
We don't want to throw errors in this function. First check to see if the device is in device_info.py,
then fall back to the inaccurate triton estimation.
)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rl   r   
clock_rate)max_clock_rate)r   rK   backendsrC   matmul
allow_tf32triton.testingrY  rZ  rL   get_device_capabilityr   r:  r<  inspect	signature
parametersr  torch._utils_internalr\  )r   ds_topsrY  rZ  SM80OrLaterr\  sm_clocks          rR   get_device_tflopsri  	  sl    UENN,?,?,F,F,Q,QRGM**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\JJ8!#U]]ENN33,U==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rq   c                     SSK Jn   U " 5       $ )Nr   get_dram_gbps)r`  rl  rk  s    rR   get_gpu_dram_gbpsrm  	  s    ,?rq   c                 x    SSK Jn   U R                  R                  R	                  S5      R                  SS5      $ )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  ro  s    rR   get_gpu_shared_memoryrr  	  s.    %==44Q7;;<LaPPrq   c                $    U R                  S5      $ )Nwelford)r  reduction_types    rR   is_welford_reductionrw  	  s    $$Y//rq   c                4    [        U 5      (       a  gU S:X  a  gg)NrS  online_softmax_reducerg  r3   )rw  ru  s    rR   reduction_num_outputsrz  	  s    N++	2	2rq   c                 2    [         R                  " 5       S:H  $ )NLinux)platformsystemr   rq   rR   is_linuxr  	  s    ??''rq   c                 (    [         R                  S:H  $ )Nrf   )r  r}  r   rq   rR   r  r  	  s    <<7""rq   c                &    [        S U  5       5      $ )Nc              3     #    U  H7  n[        U[        R                  5      =(       a    UR                  (       + v   M9     g 7fr   )rs   rt   r#  re  r  s     rR   r   #has_free_symbols.<locals>.<genexpr>	  s)     Jcz!UZZ(<_<cs   ?Ar  )itrs    rR   r  r  	  s    JcJJJrq   c            	        SSK Jn  U  H  n[        X!R                  UR                  UR
                  UR                  UR                  45      (       aR  [        UR                  5       =(       d    S5      (       d'  [        UR                  5       =(       d    S5      (       a    gM  [        X!R                  5      (       d  M  [        S[        U5       35      e   g)Nr3   r  r   Tzunexpected type for is_dynamic F)r  r  rs   r  r  r  rW  r9   r  maybe_get_sizemaybe_get_strider;   	TypeErrorr  )rz   r  ts      rR   
is_dynamicr  	  s    bmmR[[":K:KRYYW
 
   0 0 2 8b99=M""$*> > > Ayy))=d1gYGHH  rq   c                      \ rS rSrSrSrSrg)Placeholderi	  KERNEL_NAMEDESCRIPTIVE_NAMEr   N)r   r   r   r   r  r  r   r   rq   rR   r  r  	  s      K *rq   r  c                x   SSK Jn  [        R                  " SSSS9 n[        R
                  " 5       n[        R
                  " 5       n[        U[        U5      S9R                  " U6   [        SUR                   3US	9  [        UR                  US	9  [        R                  " 5       n[        X5         U " UR                  5        S S S 5        [        R                  " 5       U-
  n	U" UR                  5        UR                  R                  5         UR                  5         [        S
UR                   3US	9  [        UR                  US	9  UR!                  5       UR!                  5       :H  n
["        R%                  SUUR&                  U
U	5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr3   )stable_topological_sortrL  zutf-8F)modeencodingr
  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   rY   rU   	propagater  r`  r
   nowrX   lint	recompilerC  r   r-  r   )rx  r  inpr.  r  r  	before_ioafter_io
start_timetime_elapsedr  s              rR   pass_execution_and_saver  	  sH    9		$	$
 
KKM	;;=R#3C#89CCSI	"(($1-bhhY'\\^
#B,N -||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
 -,
 
s%   BF+3FCF+
F(	$F++
F9c                    SSK Jn  [        XR                  5      =(       a     [        U R                  UR
                  5      $ )z:
Check if input buffer is a multi-outputs template buffer
r3   r  )r  r  rs   CppTemplateBufferr  MultiOutputLayout	input_bufr  s     rR   is_multi_outputs_templater  
  s7     i!5!56 :"..< rq   c                    SSK Jn  [        XR                  5      =(       a7    [	        U R
                  5      S:H  =(       a    [        U R
                  S   5      $ )zD
Check if input buffer is a output of multi-outputs template buffer
r3   r  r   )r  r  rs   MultiOutputrM   rE  r  r  s     rR   #is_output_of_multi_outputs_templater  )
  sJ      	9nn- 	;	  !Q&	;%i&6&6q&9:rq   c                   U c  gSSK Jn  [        XR                  5      =(       a:    [        XR                  5      (       + =(       a    US L =(       d    U R
                  UL =(       Gd`    [        U 5      UR                  :H  =(       Ga@    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       d    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       df    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  $ )NFr3   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rs   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr   rK   r   torchrecr  defaultr  r  r  rp  r  s      rR   is_collectiver  8
  sM    | 	4--. 	3400	34Z14++r1  	T
b''' 	
 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/rq   c                >    SSK Jn  [        U 5      UR                  :H  $ Nr3   r  )r  r  r  r  r  r  s     rR   is_waitr  ^
  s    :''rq   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_collectiver  s     rR   r   &contains_collective.<locals>.<genexpr>h
  s     @<a&q))<r   )r(  r  rs   rl  snodesr  r  snoder  s     rR   r  r  d
  s4    >%..@5<<@@@$$rq   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   r  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_waitr  s     rR   r    contains_wait.<locals>.<genexpr>q
  s     :\=##\r   )r(  r  rs   rl  r  r  r  r  s     rR   r  r  m
  s4    >%..:U\\:::uzz""rq   c                    SSK Jn  [        U[        R                  R
                  5      (       a  U/n[        XR                  5      =(       a    U R                  U;   $ r  )r  r  rs   rK   r}  r~  r  r  r  s      rR   is_fallback_opr  v
  sF     "ejj++,,Td--.I43C3Cr3IIrq   c                @    X!U    R                   R                  5          $ r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rR   buf_name_to_fused_snoder  
  s!     (3??HHJKKrq   c                    grs  r   r  s    rR   rt  rt  
      urq   c           	         U" U 5      (       a  g UR                  U 5        U R                   H-  n[        UR                  X#5      nXa;   a  M   [	        UUUUUS9  M/     g )Ncriteria_cb)r;  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rR   r  r  
  sf     55!''5HHk
 4##	
 (rq   c                    grs  r   r  s    rR   rt  rt  
  r  rq   c           
        U" U 5      (       a  g UR                  U 5        U R                  5        H  nUR                   H  nUR                  c   eUR                  R	                  5       S:X  a  M2  UR                  R	                  5       U;  a  MR  X6R                  R	                  5          nXq;   a  Mu  [        UUUUUS9  M     M     g )NOUTPUTr  )r;  get_outputsr  r  r  find_recursive_users_of_node)r  r  r  r  r  orS  user_ops           rR   r  r  
  s     55! GGD99(((yy!!#x/yy!!#+==(););)=>G,(""'  !rq   c                j    [         R                  R                  R                  (       a  SOSnX-
  U-
  $ )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rg  r   )rK   
_functorchrd   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rR   num_fw_fixed_argumentsr  
  s3     $$::   69SSSrq   c                   SS jnSn/ nU R                   R                   H8  nUR                  S:X  d  M  U" U5      (       a  UR                  U5        US-  nM:     U[	        [        [        U5      5      5      :X  d   e[        U5      $ )z6
Infers which inputs are static for a backwards graph
c                    SU R                   ;  =(       a;    SU R                   ;  =(       a%    SU R                   ;  =(       a    SU R                   ;  $ )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rR   is_saved_tensor'count_tangents.<locals>.is_saved_tensor
  sH    aff$ .!&&(.!/.  qvv-		
rq   r   r  r3   )rO   r2   r   r*  )r`  r6  rp  r  rR  r   rM   )fx_gr  	arg_countstatic_arg_idxsr  s        rR   count_tangentsr  
  s    

 IOZZ44= q!!&&y1NI	  d5_)=#>????rq   c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
	BoxedBooli
  r*  r   c                    U R                   $ r   )r   rK  s    rR   rV  BoxedBool.__bool__
  s    zzrq   c                @    [        U [        5      (       a	  SU l        U $ grs  )rs   r  r   r  s    rR   disableBoxedBool.disable
  s    c9%%CIJrq   r   Nr  )r  r   r   zUnion[BoxedBool, bool])	r   r   r   r   r   rV  r  r  r   r   rq   rR   r  r  
  s     K  rq   r  c              #     ^ ^#    SSK Jn  UR                  m   S             SU U4S jjjn[        R                  R                  USU5         S v   S S S 5        g ! , (       d  f       g = f7f)Nr3   r5   c                :   > TR                  U5        T" XX#XE5      $ r   r  )r  kernel_namer  rE  gpucpp_definitionkernel_listorig_define_kernels         rR   define_kernel.collect_defined_kernels.<locals>.define_kernel
  s'     	;'!{c
 	
rq   r  )NTN)r  r6   r  r   r  r   rE  Optional[str]r  r*  r   r  r   r   )codegen.wrapperr6   r  r   r  r  )r  r6   r  r  s   `  @rR   collect_defined_kernelsr  
  s     5-;; #'(,
"

 
  	

 
 &
 

 
 
		/-	P 
Q	P	Ps   AA2A!	A2!
A/+A2c                    U S-   $ )N__original__r   r  s    rR    get_cloned_parameter_buffer_namer
    s    .  rq   c                    U [         ;   $ r   )rI   rv  s    rR   r  r    s    Yrq   c                0    U S:g  =(       a    [        U 5      $ )NrD   )r  rv  s    rR   device_need_guardr    s    U?-vf~-rq   c                   [         R                  " 5       (       ao  U [        R                  :X  a[  [        R                  R                  5       (       a8  [        R                  R                  5       S:  a  [         R                  (       a  gU [        [        R                  [        R                  [        R                  /5      ;   $ )N)r  r   F)rd   r  rK   r:  rC   rL   ra  bfloat16_atomic_adds_enabledr!   rF  r*  r'  s    rR   ,needs_fallback_due_to_atomic_add_limitationsr    sv    
 	U^^#JJ##%%JJ,,.&8//
EKKU^^#LMMMrq   c                   U R                   [        R                  R                  R                  [        R                  R                  R
                  4;   a  Uc  gU R                   [        R                  R                  R                  :X  a  SOSnUS U4;  =(       Gd&    U=(       a    [        U5      =(       a    [        U5      =(       d    U R                   [        R                  R                  R                  :H  =(       ap    US:H  =(       ad    U=(       a[    US:H  =(       aO    [        R                  R                  =(       a.    [        R                  R                  =(       d    [        5       S:g  =(       dJ    X:H  =(       a#    U[        R                  [        R                  4;   =(       d    [        R                   " 5       $ )NFr;  r   r  r3   )overloadpacketrK   r   atenscatter_reduce_scatter_reducescatter_r  r  rd   r  fallback_scatter_reduce_sumdynamic_threadsr  r*  rF  r  )r  rv  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rR   use_scatter_fallbackr  #  s]    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 'SJ5::u{{:S,S	8 557!rq   c                   SSK JnJn  SSKJn  [        S[        U 5       S35        [        U 5       GH.  u  pE[        SUS S35        XRL a  [        S	5        M'  XQL a  [        S
5        M8  [        XS5      (       a  UR                  5       n[        U(       a  SOS S35        U(       a;  UR                  c   e[        SUR                  R                  R                   35        [        S5        UR                  R                   H  n[        U5        M     [        S5        UR                  R                   H  n[        U5        M     GM  [!        S[#        U5       35      e   g)z
An API that can be used in pdb to dump a node_schedule.
Right mainly dump the read/write dependencies but can add more as needed.
r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr   r!  r(  r"  r  rM   r   rs   is_reductionr  r  reduction_hintr-  r.  r/  r   r  )r  r   r!  r"  rB  r  is_redr  s           rR   dump_node_scheduler*  J  s&   
 O7	M 236
:;}-	#al"$%%%&,,&&(FfU$/?@yy,,,01N1N0OPQ*''--c
 .+''..c
 / !9$t*FGG' .rq   c                z    SSK Jn  U" U R                  5       [        U R                  5      -  [
        -  S:H  5      $ )Nr   )rW  )r  rW  storage_offsetr)  r   GPU_ALIGN_BYTES)r   rW  s     rR   tensor_is_alignedr.  i  s:     L 				 >&,,#?	??RVWW rq   c                    [        U R                  R                  5      (       d  g[        R                  =(       d    [        U 5      $ rs  )r  r   r  rd   assume_aligned_inputsr.  )example_inputs    rR   should_assume_input_alignedr2  w  s5     -&&++,,''K+<]+KKrq   c                 X   [         R                  R                  R                  5       n U (       d  [        R
                  " 5       $ U R                  (       a  U R                  R                  (       d  [        R
                  " 5       $ U R                  R                  nUR                  5       $ r   )	rK   _guardsTracingContexttry_getrp  nullcontextr  rb  suppress_guards)tracing_contextrb  s     rR   #maybe_get_suppress_shape_guards_ctxr:    sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&rq   c                "   [         R                  R                  R                  [        SS5         [
        R                  R                  5         SS KnSS K	nUR                  " 5       nUR                  " U5      nSSKJn  UR                  U5        UR                  nUR!                  UR"                  5        U " U0 UD6n	UR%                  5       n
UR!                  U5        UR'                  U5        S S S 5        X4$ ! , (       d  f       W	W
4$ = f)Nr   Tr   )output_code_log)r)  r   r  r  rd   rK   r  r  r  loggingr   StreamHandlertorch._inductor.codecacher<  
addHandlerlevelsetLevelDEBUGrC  removeHandler)r   rz   r  r  r=  log_capture_stringchr<  
prev_levelr  r   s              rR   run_and_get_cpp_coderH    s     
			#	#FGT	:[[]""#56=""2&$**
  /T$V$'')  ,%%b) 
;  9! 
;	:  19s   CC==
Dc                    [        U 5      nUb  UR                  $ U  H:  n[        U[        R                  5      (       d  M$  UR
                  R                  s  $    g r   )rU   rb  rs   rK   r/   r  )rE  r  inputs      rR   shape_env_from_inputsrK    sR     (I """ eU\\**::''' 
 rq   c                B   ^ ^^ [        T5      S:X  a  T $ SUU U4S jjnU$ )Nr   c                   > [        U TT5      u  pT" U 5      n[        U5      (       a  [        R                  " X5        U$ r   )copy_misaligned_inputsrM   rK   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr$  inputs_to_checkr  mutated_input_idxss       rR   r  )align_inputs_from_check_idxs.<locals>.run  sD    #9);$
  J {  :
rq   )rP  list[InputType]r   r   )rM   )r  rS  rT  r  s   ``` rR   align_inputs_from_check_idxsrW    s(    
 ?q   Jrq   c                X   SU R                  5       ;   a  SnO;[        S [        U R                  5       U R                  5       5       5       5      S-   n[        R
                  " X4S5      R                  5       n[        R
                  " X R                  5       U R                  5       5      $ )Nr   c              3  6   #    U  H  u  pUS -
  U-  v   M     g7fr6  r   )r   r"  r'  s      rR   r   )clone_preserve_strides.<locals>.<genexpr>  s     T:Sf$:Ss   r3   r   )r  r   r   r'  rK   
as_stridedclone)rO   needed_sizer  s      rR   clone_preserve_stridesr^    s    AFFH} T#affh
:STTWXX 	 a6<<>FFFFHahhj99rq   c                T   / n/ nUSLnU H  nX   n[        U[        R                  5      (       d   S[        U5       35       eUR	                  5       [
        -  (       d  MW  [        U5      X'   U(       d  Mm  Xb;   d  Mt  UR                  U5        UR                  X   5        M     X44$ )z
Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
cloned tensor which is in `return_pair_idxs`.
Nz Expected tensors only, but got: )rs   rK   r  r  data_ptr	ALIGNMENTr^  r  )rP  check_inputs_idxsreturn_pair_idxsrQ  rR  ret_pair_definedr   _inps           rR   rN  rN    s     ')K&(K (t3}$-- 	
.tDzl;	
- ==?Y&&248JMA$9""4("":=1  ##rq   c                    / nU HV  nX   n[        U[        R                  5      (       d  M(  UR                  5       [        -  S:X  d  ME  UR                  U5        MX     [        U5      [        U5      :w  a  U$ U$ )zO
We require all inputs to be aligned, so introduce a copy for any
that aren't.
r   )rs   rK   r  r`  ra  r  rM   )rE  static_input_idxsaligned_static_input_idxsrB  rJ  s        rR   remove_unaligned_input_idxsri    sp     !# eU\\**0@90LQR/R%,,S1 ! $%->)??((rq   c                
   SSK Jn  [        R                  " [        R                  5      R
                  nUR                  R                  R                  nUR                  R                  R                  R                  nUR                  R                  R                  X:*  5      (       a  gUR                  (       a.  UR                  R                  R                  U S:  5      (       a  gU" U 5      =(       a    U" U 5      U:*  $ )Nr3   r\  Tg@xDF)r_  r]  rK   iinforD  r   r`  ra  rG  rb  has_hintrW  aot_compilation)r   r]  int_maxrG  rl  s        rR   expr_fits_within_32bitro    s    kk%++&**G  **Iww))22H 	ww--al;; 	 7711!d(;;  A;29Q<722rq   c                6  ^^^ [         R                  R                  R                  5       nUb  UR                  b  [        UR                  5      S:X  d   e[        U 5      mUR                  c   eUR                   H  nUc  UR                  R                  S 5        M#  Sm[         R                  R                  R                  5       =n(       a  UR                  mSUU4S jjmUR                  R                  [        U4S jU 5       5      5        M     g g g )Nr   Fc                r   > Tc  [        U 5      $ T(       a  TR                  U 5      $ TR                  U 5      $ r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrb  s    rR   map_expr4set_tracing_context_output_strides.<locals>.map_exprE  s7     ("1v((<<Q??$55a88rq   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r   ru  s     rR   r   5set_tracing_context_output_strides.<locals>.<genexpr>M  s     5u!(1++us   )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rK   r4  r5  r6  output_stridesrM   rK  r  rt  r  )r  compiled_graphr,  r$  r  rt  ru  rb  s        @@@rR   "set_tracing_context_output_stridesr{  4  s     mm**224Gw55A7))*a///).9	,,888#22E}&&--d3$)!--66>>@@3@(+(=(=%9 9 &&--5u55 3	  Brq   c                 4   [         R                  b  [         R                  $ [         R                  " 5       (       d  g[        R                  R                  5       (       a  g SSKJn   U [        R                  R                  S5      :  $ ! [         a     gf = f)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rd   fx_graph_remote_cacher  rK   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher~  ModuleNotFoundErrorjustknobs_getval_intr}  s    rR    should_use_remote_fx_graph_cacher  Q  s    ##/+++,,..H  5#8#8#M#M8$    s   "B
 

BBc                2    [         R                  " SSU 5      $ )Nz[^a-zA-Z0-9_]r   )r   subr  s    rR   normalize_namer  d  s    66"C..rq   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                j    [         R                  S[        U 5      5      n[        R	                  X5      $ )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r   _triton_type_mappingr  )r   triton_type_names     rR   triton_typer  z  s+    &**5#e*=##$4GGrq   c                    [         R                  X 5      nUR                  SS5      n[        [        U5      n[        U[        R                  5      (       d   eU$ )Nr  r  )_torch_triton_mappingr  r  rJ   rK   rs   r   )r   adjusted_type	type_namer  s       rR   triton_type_to_torchr    sM    )--e;M%%eR0Iy)Ii----rq   c                   U R                   (       + =(       a    U R                  5       UR                  5       :H  =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ae    U R                  5       R                  5       UR                  5       R                  5       :H  =(       a!    U R                  5       UR                  5       :H  $ r   )	is_mkldnnr  r'  r   r   untyped_storager`  r,  r  r   s     rR   is_same_tensorr    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rq   c                   U R                   =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       as    U R                  UR                  :H  =(       aS    [        R
                  R                  R                  U 5      [        R
                  R                  R                  U5      :H  $ r   )r  r  r   r   rK   r   mkldnnr`  r  s     rR   is_same_mkldnn_tensorr    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrq   c                     g)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rq   rR   boolean_opsr    s    rq   c                  *    \ rS rSr% S\S'   S\S'   Srg)OpDtypeRulei  r0   type_promotion_kindOptional[torch.dtype]override_return_dtyper   Nr-  r   rq   rR   r  r    s    8800rq   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                (    [        X5      [        U '   g r   )r  r  )r   r  r  s      rR   #register_op_dtype_propagation_rulesr    s    
 (3(t$rq   zOrderedSet[str]op_requires_libdevice_fp64c                .    [         R                  U 5        g r   )r  r;  r  s    rR   #register_op_requires_libdevice_fp64r    s    ""4(rq   c                     SSK Jn   U R                  R                  5       R                  nUS:X  a  [
        R                  $ US:X  a  g[
        R                  $ )Nr   r\  r  rD   )rU  r]  r`  get_current_device_or_throwr  rd   cpu_backendcuda_backend)r]  
device_strs     rR   get_current_backendr    sH    -446;;JU!!!	u	"""rq   c                    U [         R                  [         R                  4;   a=  [        R                  R
                  (       a  [        5       S:X  a  [         R                  $ U $ )z"Maybe upcast [b]float16 to float32r  )rK   r   r:  rd   r  codegen_upcast_to_fp32r  r<  r'  s    rR   upcast_compute_typer    s@     	%--00MM00!X-}}Lrq   KeyTypeValTypec                  v    \ rS rSrSrSS jrSS jrSS jrSS jrSSS jjr	SS	 jr
SS
 jrSS jrSS jrSrg)
ScopedDicti  z
A dictionary-like object that allows for scoped updates. It maintains
an original dictionary and a set of new items that can override
the original items within the scope.  The original dictionary is
unmodified.
c                    Xl         0 U l        g r   original_dict	new_items)r  r  s     rR   r8  ScopedDict.__init__  s    *13rq   c                \    XR                   ;   a  U R                   U   $ U R                  U   $ r   r  r  r  s     rR   r  ScopedDict.__getitem__  s,    .. >>#&&!!#&&rq   c                     X R                   U'   g r   )r  )r  rS  r   s      rR   __setitem__ScopedDict.__setitem__  s    #srq   c                H    XR                   ;   =(       d    XR                  ;   $ r   r  r  s     rR   __contains__ScopedDict.__contains__  s    nn$A/A/A(AArq   Nc                t    XR                   ;   a  U R                   U   $ U R                  R                  X5      $ r   )r  r  r  )r  rS  r  s      rR   r  ScopedDict.get  s2    .. >>#&&!!%%c33rq   c                    [        U R                  5      nU R                   H  nX R                  ;  d  M  US-  nM     U$ rM  )rM   r  r  )r  r  r  s      rR   r  ScopedDict.__len__  s<    ""#A***Q   rq   c              #     #    U R                    S h  vN   U R                   H  nXR                   ;  d  M  Uv   M     g  N-7fr   r  )r  r  s     rR   __iter__ScopedDict.__iter__
  s8     %%%%A***   	&s   AA  A
Ac                R    [        U R                  =(       d    U R                  5      $ r   )r*  r  r  rK  s    rR   rV  ScopedDict.__bool__  s    D&&8$..99rq   c                    [         er   r  r  s     rR   __delitem__ScopedDict.__delitem__  s    !!rq   r  )r  Mapping[KeyType, ValType])rS  r  r   r  )rS  r  r   r  r   r  )rS  r  r   r*  r   )rS  r  r  Optional[ValType]r   r  r  )r   zIterator[KeyType]r  )rS  r  r   r  )r   r   r   r   r   r8  r  r  r  r  r  r  rV  r  r   r   rq   rR   r  r    s5    4'
$B4
:"rq   r  )frozen_defaultc              .   ^ SU4S jjnU c  U$ U" U 5      $ )Nc                   > [         R                  S:  a  [        R                  " U STS9$ [        R                  " U TS9$ )N)rS  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   s    rR   wrapir_dataclass.<locals>.wrap  s;    w&((d6JJ ((V<<rq   )r   rh   r   rh   r   )r   r   r  s    ` rR   ir_dataclassr    s    = {9rq   c                     [         R                  R                  R                  5       n U b'  U R                  (       a  U R                  R
                  $ g r   )rK   r4  r5  r6  fw_metadatabw_donated_idxs)r9  s    rR   get_donated_idxsr  &  s=    mm22::<O"'B'B**:::rq   c                  (    \ rS rSrSrSrSrSrSrSr	g)	TritonAttrsDescriptorVersioni-  r   r3   rg  rS  r3  r   N)
r   r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r   rq   rR   r  r  -  s     LKK	  Grq   r  c                 f   [         R                  R                  S5      c  [        R                  $ SS Kn SS Kn [        U R                  R                  S5      (       a  [        R                  $ [        U R                  R                  S5      (       a  [        R                  $ [        R                  $ )Nr  r   AttrsDescriptor)	importlibutil	find_specr  r  triton.backends.compilertriton.compiler.compilerr   r]  compilerr  r  r  )r  s    rR   #get_triton_attrs_descriptor_versionr  7  s    ~~)1+888##v''):;; ,777	))+<	=	=+777 ,333rq   c                 8    [        5       [        R                  :H  $ r   )r  r  r  r   rq   rR   triton_version_uses_attrs_dictr  Q  s    .04P4X4XXXrq   c                &   SSK Jn  [        XR                  5      (       d  g[        U R                  [
        R                  R                  5      (       a=  [
        R                  R                  R                  U R                  R                  ;   a  gg)ze
Returns True if the node is an op that is not cudagraphable.
Usually only custom ops have this tag.
r3   r  FT)r  r  rs   r  r  rK   r}  r~  r   r  r  r  r  s     rR   is_cudagraph_unsafe_opr  U  sb    
 d--.. 	4##UZZ%:%:;;HHLL))T-=-=-B-BBrq   c                 6   [         R                  R                  SS5      n [        R                  " 5       (       a^  SSKJn  U" 5       nU(       aJ  [         R                  R                  USS5      nU (       a   [         R                  R                  X0/5      OUn U $ )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r   r  r  rd   r  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_paths       rR   get_ld_library_pathr  h  sh    ::>>+R0D5')ww||L)UCH8<2::??H#34(DKrq   c                N    SSK Jn  [        X5      =(       a    U R                  S L$ )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr
  rs   partition_signatures)r  r
  s     rR   #is_codegen_graph_partition_subgraphr  u  s'    L 	79 	5((4rq   c                     [         R                  R                  R                  R                  =(       d    [
        R                  S L=(       a$    [         R                  R                  R                  $ r   )rK   r  rd   r  
cudagraphs&_unstable_customized_partition_wrapperr  r  r   rq   rR   is_using_cudagraph_partitionr  ~  sN    %%00 	F199E1 //
 
 
0
01rq   c                    SSK Jn  UR                  R                  R	                  U S5      (       a;  UR                  R                  R                  U S5      (       a  [        R                  $ [        R                  $ )Nr3   r\  l        i   )	r_  r]  r`  ra  statically_known_ltr  rK   rD  rF  )r  r]  s     rR   dtype_from_sizer    sX    ww++e 
''


/
/h
?
?{{{{rq   )r  rE   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN BF16.
r  rE   TF)rK   r   r  _is_mkldnn_bf16_supportedr   s    rR   is_mkldnn_bf16_supportedr    3     eyy99;;	+	rq   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN FP16.
r  rE   TF)rK   r   r  _is_mkldnn_fp16_supportedr  s    rR   is_mkldnn_fp16_supportedr    r  rq   c           
     x   U Vs/ s H  n[        [        U5      5      PM     nnU  HS  n[        U5      [        U5      :X  d   e[        U5       H'  u  pR[        X5   [        [        U5      5      5      X5'   M)     MU     / nUR	                  SR                  S [        X5       5       5      5        [        U5      [        U5      S-  -   [        U5      S-
  -   nUR	                  SU-  5        U  H3  nUR	                  SR                  S [        XC5       5       5      5        M5     SR                  U5      $ s  snf )N|c              3  4   #    U  H  u  pS X  S 3v   M     g7fr  Nr   )r   hrL  s      rR   r   tabulate_2d.<locals>.<genexpr>  s     H3G41AaWA,3G   rg  r3   rq  c              3  4   #    U  H  u  pS X  S 3v   M     g7fr   r   )r   r   rL  s      rR   r   r"    s     H7Gtq!Cl7Gr#  r3  )rM   r   r   r   r  r  r   r   )elementsheadersr   widthsrowr   rf  total_widths           rR   tabulate_2dr*    s    #*+7ac#a&k7F+3x3w<'''cNDAFIs3q6{3FI #  E	LLH3w3GHHIf+Vq1S[1_EK	LL{"#SXXHs37GHHI 99U ,s   D7c              #     #    [        U R                  5       5      [        UR                  5       5      -  nU H6  nU R                  U5      nUR                  U5      nUUb  UOUUb  UOU4v   M8     g7f)a  
Zip two dictionaries together, replacing missing keys with default values.

Args:
    dict1 (dict): The first dictionary.
    dict2 (dict): The second dictionary.
    d1_default (Any): the default value for the first dictionary
    d2_default (Any): the default value for the second dictionary

Yields:
    tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
           and the value from dict2 (or d2_default if missing).
N)r!   r9  r  )dict1dict2
d1_default
d2_defaultall_keysrS  value1value2s           rR   	zip_dictsr3    sp     ( %**,'*UZZ\*BBH 33 (Fj(Fj
 	
 s   A1A3c                ,           S	S jnU R                  S[        R                  R                  5      nU R	                  5       n U(       aE  U" U SS5        U" U SS5        U" U S[
        R                  R                  (       + 5        U" U SS5        U $ )
a  
Ensures the configuration is internally consistent for standalone AOTInductor.

If `aot_inductor.compile_standalone` is set to True in the provided
`config_patches` (or falls back to the global config), this function ensures
that the following configs are also enabled:
    - `aot_inductor.package_cpp_only`

Args:
    config_patches (dict[str, Any]): A dictionary of user-provided config
        overrides for AOTInductor compilation.

Returns:
    dict[str, Any]: The possibly-updated `config_patches` dictionary.
c                    U R                  U[        [        U5      5      nUc  X U'   g U(       d  X2:w  a  [        SU SU S35      eg g )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rJ   rd   r   )config_patchesconfig_nameconfig_valuer   s       rR   patch_config2maybe_aoti_standalone_config.<locals>.patch_config  sY     "";0LM=*6;'50";-q>lm  1rq   zaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_model)r7  dict[str, Any]r8  r   r9  r   r   r  )r  rd   aot_inductorcompile_standalonecopyrK   r  r  )r7  r:  r?  s      rR   maybe_aoti_standalone_configrA    s    "	&	58	HK			 (++)6+>+>+Q+Q $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 rq   c                     SSK Jn   U R                  R                  nUc  g[	        U[
        5      (       d  [        S5      eUS:X  a  g[        R                  " SU5      (       d  [        S5      eg)zD
Validates if a model name is suitable for use in code generation.

r   rc   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rd   r>  model_name_for_generated_filesrs   r   r2  r   r   )rd   
model_names     rR   is_valid_aoti_model_namerE    sn    
 '$$CCJj#&&OPPR 88/<<d
 	
 rq   c                <    U(       a  [        U 5      $ [        U 5      $ r   )r'   r&   )rO   unbacked_onlys     rR   get_free_symbolsrH  *  s    $Q''Arq   c                0   [         R                  R                  (       d  gU U  3nU(       aT  UR                  =n(       aA  UR	                  5       =n(       a*  UR
                  R                  SS5      =n(       a  U SU 3n[        R                  U5        g)zz
Cudagraph partition may lead to extra memory overhead so we
log partition reasons to help users understand the overhead.
Nstack_tracez. Found from : 
 )	rd   r  r  r  get_origin_noder  r  perf_hint_logr  )r.  r  r  warning_msgir_noder  rJ  s          rR   maybe_log_cudagraph_partitionrO  1  s     ==##HSE"K 			!W!//11W1#LL,,]DAA[A$%7}E+&rq   c                 *   0 [         R                  ES[         R                  R                  S[         R                  R	                  [
        R                  5      5      0En [        R                  " 5       (       a  [        R                  " S5      U S'   U $ )z9
Get a base environment for running Python subprocesses.

PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r   r  r  r  r  r  r  rd   r  	sysconfigget_path)envs    rR   python_subprocess_envrW  J  so    

** 	bjjnn%rzzsxx'@
	C  %..v6LJrq   c                  .    \ rS rSr% SrS\S'   S\S'   Srg)CUDAGraphWrapperMetadataie  z
Metadata for Customized CUDAGraphWrapper.

Currently assumes there is 1 dynamo graph and will extend to
multiple graphs in the future.
r   num_partitionspartition_indexr   Nr   r   rq   rR   rY  rY  e  s      rq   rY  .c                  $    \ rS rSr% SrS\S'   Srg)CUDAGraphWrapperi|  NzOptional[CUDAGraphWrapperType]r  r   )r   r   r   r   r  r   r   r   rq   rR   r]  r]  |  s    .2G+2rq   r]  c                    U [         l        g r   )r  r  )r  s    rR   !set_customized_partition_wrappersr_    s    5<*2rq   c                H  ^ U R                   R                  nU R                   R                  / UQU R                   R                  QU R                   R                  5      nU R                   R                  n[
        R                  " X45      u  p4SS jnU Vs/ s H:  nU" U5      (       a(  [        R                  R                  R                  USS9OUPM<     nnSS jmSU4S jjnU Vs/ s H
  og" U5      PM     nn[
        R                  " X45      u  pX4$ s  snf s  snf )	Nc                    [        U [        R                  R                  R                  5      =(       a3    [        U [        R                  R                  R
                  5      (       + $ r   )rs   rK   r  r  r;   GeneratorStater  s    rR   _is_tensor_ir(snode_args_kwargs.<locals>._is_tensor_ir  sH    !U__//667 

u!!00A
 =
 	
rq   F)guard_shapec                ,    [         R                  " XUS9$ )Nr   )rK   r   )r  r   r   s      rR   _tensor"snode_args_kwargs.<locals>._tensor  s    {{4V<<rq   c                   > [        U [        R                  5      (       d  U $ T" U R                  5       U R                  U R
                  5      nU$ r   )rs   rK   r  r  r   r   )r   r$  rg  s     rR   to_real_tensor)snode_args_kwargs.<locals>.to_real_tensor  s:    !U\\**Haffh2
rq   r  )r   r  )r   r   r   r   )r  rE  fill_non_provided_argsconstant_argsr  pytreer"   rK   r  r  ir_node_to_tensortree_unflatten)	r  rz   r  	flat_argsflat_args_pytree_specrc  r  rj  rg  s	           @rR   snode_args_kwargsrs    s   ::D::,,*$*))*

D ZZF'-':':D>'J$I
 	 A  	,,QE,B	 	  = -66Iq"II6((JLD<%  7s   AD,Dr  )ro   r   r   r   )r}   r   r   r*  )   d   )r   zCallable[[], Any]r   r   r   r   r   r~  r  )r   z"Union[Optional[torch.device], str]r   torch.device)r  zIterable[sympy.Expr]r   r   )r  Sequence[sympy.Expr]r  rw  r   r   )r  zIterable[_T]r   zValuesView[_T])r&  r8  r'  r8  r   r8  )rS  r  r   r   )rY  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])r   r8  r   zUnion[int, torch.SymInt])rY  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])rp  torch._ops.OpOverloadr   r*  )r  r2   rz  z'Callable[[torch._ops.OpOverload], bool]r   r*  )r|  r   rz   r  r  r=  r   z&tuple[GraphModule, list[torch.Tensor]])rC   )r   r   r   r  )r3   rC   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r   r   r~  )r   r  r  g      ?rC   )r  ry  r  rz  r   r   r  r   r  r~  r   r   r   r~  )r  r   r  r   r   r  )r  r   r  r   r   r  )r  r   r  r   r   r   )rO   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rO   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   CachedMethod[P, RV])r   zCallable[P, RV]r   r|  )r  r   r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r   )r  r}  r  r6   r   ztuple[str, str]r   )rP  zIterable[torch.fx.Node]rQ  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])rz   zSequence[IRNode]r  zdict[str, IRNode]r   r  r{  )r  r   r   zValueRanges[Any])r  r   r   r*  )r  r`   rB  r   r   r  )r  r*  r   r*  )r   r   r   r  )ri  r   r  zdict[sympy.Expr, Any]r   r   )r  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rz   r   r   r*  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r  r   r2   )r  r  r   zOrderedSet[torch.device]r  )r  r   r   r   )NNT)r	  zOptional[dict[str, Any]]r  r  r
  r*  r   ro  )r  rz  r   	list[int])rb  r)   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r   r  )r   torch.dtyper   r   rn  r  )r  zUnion[int, torch.device]r   r*  r  )r  r   r   rv  r  Optional[int]r   r4   )r  r<   r  zlist[torch.dtype]r   r*  )r  r   r   r*  )
r  r<   r  r*  r  r*  r  r*  r   r*  )r;  r;   r  r*  r   r*  )
r  r<   r  r   r  r   r  r   r   r*  )rP  r   r   r*  )r  rS  r  rS  r  rS  r   r*  )r  rS  r  rS  r  rS  r   r  )r   r   r   r   )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r  r<   r   r*  )r  r<   r  zUnion[ReinterpretView, Buffer]r  r;   r   r*  )FTFN)r  r<   r  r;   r  r;   r  r*  r  r*  r  r*  r  r  r   r*  )r   Callable[P, _T]rz   r  r  r  r   ztuple[_T, list[str]])r   ry  r   ztuple[Any, list[str]])r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   ztuple[Any, list[GraphLowering]])r  ry  r  ry  r   ro  )r&  ry  r%  zOptional[Callable[..., Any]]r   r   )r.  r   r   r  )r   r  )r:  rz  r   r*  )rE  zSequence[torch.Tensor]r   r*  )r`  r   r   r  )rJ  r*  rz   r   r  r   r   zIterator[Any])r   r  r   r~  )rv  r   r   r*  )rv  r   r   r   )r  zIterable[Any]r   r*  )
rx  ry  r  r1   r  rz  r.  r   r   r  )r  z"Optional[Union[Buffer, Operation]]r   r*  )r  z Optional[Union[Node, Operation]]rp  z!Optional[torch._ops.OperatorBase]r   r*  )r  z"Optional[Union[IRNode, Operation]]r   r*  )r  rA   r   r*  )r  zOptional[Operation]rp  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r*  )r  r   r  r=  r  r=  r   r   )r  rA   r  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r   r  )r  r   r  r   r   r   )r  r  r   r   )r  r   r   ro  )r   r   r   r   )r   r  r   r*  )r   r   r   r*  )r   r  r   r*  )r  rx  rv  r  r  r  r  r  r  r   r  r*  r   r*  )r  r~  r   r  )r   r  r   r*  )r1  r  r   r*  )r   r  )r   r  rz   r  r  r  r   ztuple[_T, str])rE  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]rS  r{  rT  zOrderedSet[int]r   r  )rO   r  r   r  )rP  rV  rb  r{  rc  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])rE  r  rg  r{  r   r{  )r   r   r   r*  )r  rz  rz  r@   r   r  )r   r  r   r   )r   r   r   r  )r  r  r   r  r   r*  )r   ztuple[str, ...])r   r   r  r0   r  r  r   r  )r   r   r   r  )r   r  r   r  )r   zOptional[type[Any]]r   r*  r   r   )r   zOptional[list[int]])r   r  )r  r=   r   r*  )r  r6   r   r*  )r  r   r   r  )r   r   r   r*  )r%  zSequence[Sequence[T]]r&  zSequence[T]r   r   )NN)
r,  r  r-  r  r.  ValType | Noner/  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r7  r=  r   r=  )rO   r(   rG  r*  r   zOrderedSet[sympy.Symbol])zcudagraph partition due to N)r.  r   r  r  r  zOptional[BaseSchedulerNode]r   r  )r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r  rA   r   z tuple[list[Any], dict[str, Any]](s  
__future__r   r4  rp  r  enumr  r  rb  r  r]  r=  r  r  r   r}  r   r  r   r  rT  r  r  r  r)  collections.abcr   r   r   r   r   r	   r
   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   rt   rK   torch.utils._pytreer  _pytreern  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRADr  r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   torch._prims_commonr0   torch.fxr1   torch.fx.noder2   r  r4   r  r6   r`  r8   r  r9   r:   r;   r<   r=   r>   output_coder@   r  rA   rB   rI   rG   r   rS   torch._dynamo.device_interfacerT   torch._dynamo.utilsrU   torch.autogradrV   torch.autograd.profiler_utilrW   (torch.fx.passes.graph_transform_observerrX   torch.fx.passes.shape_proprY   torch.utils._sympy.functionsrZ   r[   r\   r]   r^   torch.utils._sympy.symbolr_   r`   torch.utils._sympy.value_rangesra   rb   r  rd   runtime.runtime_utilsre   r%  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerrL  rh   r  r#  	VarRangesr  r   	InputTypeGPU_KERNEL_BIN_EXTSr-  ra  r  r  rn   rp   ry   Functionr{   r  r   r   r   r  r  r  r  r   rV  rZ  rd  rf  rq  rx  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  rN  rT  rd  ry  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rq  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r%  	lru_cacher)  r+  r/  r2  r  r  r@  r  r  r  r  r  r  r  r  r  r  r<  rA  rL  rR  rS  r\  r`  rs  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r   r*  r/  r3  r<  rA  rF  rH  rK  r  rV  ri  rm  rr  rw  rz  r  r  r  r  Enumr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r*  r.  r2  r:  rH  rK  rW  r^  rN  ri  ro  r{  r  r  r  r:  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r*  r3  rA  rE  rH  rO  rW  rY  PartitionFnTyper  r]  r  r_  rs  )r  r}   s   00rR   <module>r     s5   "        	     	  	   
                $ $ ? : 0 / ; ($ 
  >>//C$",5$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%!00<H T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  d#  $"G GV 9<SS#&S25S
Sl  ;@
+*"*+A**#AL+	+++"/	)/#/G @OI	I<I 
I0 *8+0' ' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:++/+\C*!).!)O!) 	!)HN2CN2!N2 N2f 48*0 (G
G$5GG:,^%	DU	>2-888v'& 
: !# I "	 .29+9	9 9 	9 9z !5 $ " A!!L!!H Q7 7*  , , ,
R' R'j
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
: 7< d dN CH  BJ CO,) ,  $  . 5( 5(p @ @ R R:+\H&

8
@F
	
" ""&"&==
= = 	=
  = =  = 
=@'C C"      	 $ &2:/(V &2:## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D %6 %6P  Q0(#K(*$)) *!

!
"-!
4A!
HK!
	!
H1	" -1#
*#)# 
#L(%#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.N $&$!$ $ 	$
 $ $ 
$NH>L'  &2:2(*" ( %	0	: 37$$$$ 0$ 3	$<$ $ 3F!3B	:&/ '#)* $%
  +?*D*D*FG*F$!*FG  **Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)	# )

)
-" 01 -"` D)t   *499  4 42Y&
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
  
F.b6 :(,'	'' &' 
	' '26 d#  $ 38$./@ 3 3 *:); &=   } Hs   9o