
    9i'                     B   S r SSKrSSKJr  SSKJr  SSKJrJrJ	r	  SSK
r
SSKr
SSKJr  SSKJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJrJrJrJrJr  SSKJ r   SSK!J"r"  S\
RF                  RH                  S\%\&   4S jr'S\
RF                  RP                  S\)\
RT                  \
RF                  RV                  4   4S jr,S\
RF                  RP                  S\&S\	\-   4S jr.S\
RF                  RP                  S\&S\	\-   4S jr/S\
RF                  RP                  S\&4S jr0S\
RF                  RP                  S\1\	\-      4S jr2S\
RF                  RP                  S\\   S\4S jr3 " S S5      r4\"" S\4" 5       S9    S&S \S!\4   S"\\   S#\5S$\5S\S!\\   4   4
S% jjr6g)'a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Sequence)AnyCallableOptional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc           	         S[         [        [        4   S[        4S jn[        [        5      nSn[	        5       nU R
                   GH  nUR                  S:X  aq  [        U" UR                  5      [        R                  5      (       a;  U[        U" UR                  5      R                  5       5         R                  U5        US-  nM  UR                  S:X  d  M  [        UR                  S5      (       d  M  UR                  R                   n[#        UR$                  5       H  u  pxU['        UR(                  5      :  a  UR(                  U   n	O5UR*                  UR,                  ;  a  MJ  UR,                  UR*                     n	S	n
UR.                  (       a  UR.                  R0                  (       a  S
n
U
(       d  M  XB[        U" U	R                  5      R                  5       5         -  nM     GM     U$ )Nmetar   c                 "    SU ;   a  U S   $ U S   $ )Nvalfake_result )r   s    a/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk%find_input_mutations.<locals>.meta_fk7   s    #tmtE{Dm1DD    r   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr'   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r"   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r!   find_input_mutationsrE   6   sz   Ed38n E E FIUNWW44= '!&&/5<<88~gaffo&D&D&FGHLLYWNITT_$188Y//XX%%F#F$4$45s166{? vvayHxxqxx/  xx1H>>~~.."&7 #&wx}}'='L'L'NO' N 6 : r$   gmc                     0 nU R                   R                   H`  nUR                  R                  SS 5      n[	        U[
        R                  5      (       d  M@  UR                  U;  d  MR  X!UR                  '   Mb     U$ )Nr   )graphr+   r   getr-   r.   r/   device)rF   device_node_mappingr?   ts       r!   get_device_node_mappingrM   ]   sb     >@XX^^FFJJud#a&&188;N+N,-)  r$   	aot_model	num_fixedc                     [        U R                  5      [        [        U5      5      -
  nU(       d  g [	        U R                  5      n[        X25      $ N)rE   rH   r*   ranger   r   )rN   rO   mutation_indicesplaceholderss       r!   3check_for_mutation_ignore_cuda_graph_managed_tensorrU   h   sA     ,IOO<s5CS?TT'	8L#LCCr$   c                     [         R                  (       d  [        X5      =n(       a  U$ [        [	        U 5      5      =n(       a  U$ [        U 5      =n(       a  [        SUR                   S35      $ g )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrU   r   rM   r   r   r8   )rN   rO   mut_skipskipnodes        r!   check_for_skipr\   s   sx    ::J
 
8 
 O6	* t  4Y??t?*->tyyk+KLLr$   c                 x    [        [        [        U 5      5      5      nUR                  S:X  d   eUR                  $ )Ncuda)nextiterrM   typeindex)rF   rJ   s     r!   get_device_indexrc      s3    $.r234F;;&   <<r$   c                 @   [        U 5      n[        UR                  5      S:X  d   eUR                  S   n[        US5      (       d  / $ U Vs/ s HD  n[	        U[
        R                  R                  R                  5      (       a  UR                  OS PMF     sn$ s  snf )Nr   r   __iter__)
r   r6   r7   r2   r-   r.   fxr[   Nodestack_trace)rF   outputr7   rB   s       r!   get_stack_tracesrj      s    _Fv{{q   ;;q>D4$$	 C 'sEHHMM,>,>??T	I  s   ABdynamo_modeldynamo_inputsc           	        ^^^^ SSK Jm  [        S5      m[        S 5      m SS[        R
                  R                  S[        [           S[        S[        4UUUU4S jjjnS[        R
                  R                  S[        [           S[        4UUU4S	 jjn[        UU[        R                  " USS
9[        R                  R                  R                  S9nU" U T5      $ )Nr   )cudagraphify_implTrN   
aot_inputsis_inferencer   c                   > [        X5      n[        [        T
5      [        U5      5      n[        X5      =n(       a&  [        R
                  " T	5        [        SU 35        U$ TR                  [        U 5      5        T" UU[        U5      TR                  SS[        U 5      [        U R                  5      [        U R                  5      S9	nSUl        U$ )Nskipping cudagraphs due to Fdevice_indexis_backwardrp   stack_tracesrT   mutated_input_idxsT)r
   r   r6   r\   r   disabler   r*   rc   rR   valuerj   r   rH   rE   _boxed_call)rN   ro   rp   interpfixedskip_msgoutboxed_device_indexrn   do_cudagraphsrl   s          r!   forward_cudagraphs&cudagraphs.<locals>.forward_cudagraphs   s    
 91&s='93z?K%i7787m,/-hZ8 M/	:;%L+11))4-ioo>3IOOD

 
r$   c                    >^ ^ [        T U5      nT(       d  T $ [        T 5      n[        T U5      =n(       aq  [        SU 35        T	R                  nUc  Sn[
        R                  R                  R                  USS9mTc   eS[        [           S[        4U U4S jjnSUl        U$ T
" UU[        U5      [        T 5      SS[        T 5      [        T R                   5      [#        T R                   5      S	9	nSUl        U$ )
Nrr   r   F)create_if_none_existsr<   r   c                 4   > TR                  5         T" U 5      $ rQ   )set_to_running_backward)r<   rN   managers    r!   fn3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s    //1 ((r$   Trs   )r
   r   r\   r   ry   r.   	_inductorcudagraph_treesget_managerlistr   rz   rR   rc   rj   r   rH   rE   )rN   ro   r{   r|   r}   
device_idxr   r~   r   r   rn   r   s   `       @r!   backward_cudagraphs'cudagraphs.<locals>.backward_cudagraphs   s    9j1y)%i7787/-hZ8
 ,11J!
oo55AA% B G &&&)49 ) ) ) "BNI%L))4))4-ioo>3IOOD

 
r$   )rp   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrn   r   r   r.   rf   GraphModuler   r   boolr	   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rk   rl   r   r   aot_cudagraphsr   rn   r   s    `   @@@r!   
cudagraphsr      s    AdOM)$/
 #88''I  
	 :)88'')59#Y)	) )V "&'$,,-?dS',}}';';'a'a	N ,66r$   c                   z    \ rS rSrSr\S
S j5       r\S\R                  R                  S\
\   S\4S j5       rS	rg)CudagraphsBackend   r   r   Nc                      SSK Jn   U " 5         g )Nr   reset_cudagraph_trees)r   r   r   s    r!   resetCudagraphsBackend.reset   s    Ir$   modelr<   c                     [        X5      $ rQ   )r   )r   r<   s     r!   __call__CudagraphsBackend.__call__   s    %((r$   r    )r   N)__name__
__module____qualname____firstlineno__compiler_namestaticmethodr   r.   rf   r   r   r   r   __static_attributes__r    r$   r!   r   r      sP     M   
 ),, )hsm ) ) )r$   r   r   )r8   compiler_fnr   .r<   copy_outputscopy_inputsc                   ^^^^^	 [        U[        [        45      (       d   eT(       a(  U Vs/ s H  n[        R                  " U5      PM     snmO[        U5      m[        R
                  R                  5         [        R
                  R                  5       nUR                  [        R
                  R                  5       5        [        R
                  R                  U5         U " U6   SSS5        UR                  5         [        R
                  R                  5       R                  U5        [        R
                  R                  5         [        R
                  R                  5       m[        R
                  R                  TUS9   U " T6 m	SSS5        [        T	[        [        45      (       d  T	4m	S[        S[        [           4UUUUU	4S jjnU$ s  snf ! , (       d  f       N= f! , (       d  f       Nc= f)zBThis isn't registered as a backend, but is used in some benchmarksN)stream
new_inputsr   c                    > [        T5      [        U 5      :X  d   eT(       a&  [        TU 5       H  u  pUR                  U5        M     TR                  5         T(       a   T Vs/ s H  o3R	                  5       PM     sn$ T$ s  snf rQ   )r6   zipcopy_replayclone)	r   dstsrcxr   r   rH   static_inputsstatic_outputss	       r!   runcudagraphs_inner.<locals>.run  sk    =!S_444z:		# ;'56~!GGI~66!! 7s   $B)r-   r   tupler.   
zeros_liker^   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrH   r   r   )
r   r<   r   r   r   r   r   rH   r   r   s
     ``   @@@r!   cudagraphs_innerr     si    ftUm,,,,6<=f))!,f=V 
JJZZ F
uzz0023			6	"v 
#
	JJ++F3	JJ JJ  "E			%		/. 
0ntUm44(*	" 	"# 	" 	" JA > 
#	" 
0	/s    G%G
G-
G*-
G;)TT)7__doc__r   collectionsr   collections.abcr   typingr   r   r   r.   torch.fxtorch._dynamor   torch._dynamo.backends.commonr	    torch._dynamo.backends.debuggingr
   torch._inductor.cudagraph_utilsr   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   rf   Graphr*   intrE   r   r(   rJ   rg   rM   r)   rU   r\   rc   r   rj   r   r   r   r   r    r$   r!   <module>r      s  .  # $ * *     6 6   < &$EHHNN $s3x $N	%,,
%&Dxx##D03Dc]Dehh22 s xPS} $-- # 	-- 	$x}2E 	T7UXX11 T7(3- T7TW T7n) )  l0A0C D 	)CH)SM) ) 	)
 c8C= !)r$   