
    i)                         S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJrJ	r	J
r
  S SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  \\" S5       " S S\\5      5       5       rg)    N)	resources)AnyDictfinalListOptional)AotiBackend)MoveCondPredicateToCpuPass)ReplaceEdgeOpWithTritonOpPass)experimental)BackendDetails)CompileSpec)conv1d_to_conv2d)
SDPBackendzHThis API and all of cuda backend related functionality are experimental.c                   R   \ rS rSrSr\S\4S j5       r\S\S\	\   4S j5       r
\S\4S j5       r\S\4S j5       r\S\\\4   4S	 j5       r\S\\\4   4S
 j5       r\S\\   S\\R                     4S j5       r\S\\   S\\\R                  4   4S j5       r\S 5       rSrg)CudaBackend   a  
CudaBackend is a backend that compiles a model to run on CUDA devices. It uses the AOTInductor compiler to generate
optimized CUDA kernels for the model's operators with libtorch-free. The compiled model can be executed on CUDA devices
using the Executorch runtime.
returnc                     g)Ncuda clss    d/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/executorch/backends/cuda/cuda_backend.pyget_device_nameCudaBackend.get_device_name)   s        cuda_versionc                 n  ^	 SU  S3m	S[         S[        4U	4S jjn SSKJn  U(       a0  [        R
                  R                  USS	5      nU" U5      (       a  U$ S
 H]  n[        R                  R                  U5      nU(       d  M+  [        R
                  R                  USS	5      nU" U5      (       d  M[  Us  $    SU  S3n[        R
                  R                  U5      (       a  U$ [        R                  " S	5      nU(       a  U" U5      (       a  U$ SnU" U5      (       a  U$ g! [         a     Nf = f)z
Find ptxas binary that matches the expected CUDA version.
Returns the path to ptxas if found and version matches, None otherwise.
z/cuda-/pathr   c                    > [         R                  R                  U 5      (       d  g[         R                  R                  U 5      nTU;   $ )z;Check if ptxas at given path matches expected CUDA version.F)osr!   existsrealpath)r!   resolvedexpected_version_markers     r   _validate_ptxas_versionDCudaBackend._find_ptxas_for_version.<locals>._validate_ptxas_version5   s8    77>>$''ww''-H*h66r   r   )	CUDA_HOMEbinptxas)r*   	CUDA_PATH	CUDA_ROOTz/usr/local/cuda-z
/bin/ptxasz/usr/local/cuda/bin/ptxasN)strbooltorch.utils.cpp_extensionr*   r#   r!   joinImportErrorenvirongetr$   shutilwhich)
r   r(   r*   
ptxas_pathenv_var	cuda_homeversioned_pathptxas_in_pathdefault_pathr'   s
            @r   _find_ptxas_for_version#CudaBackend._find_ptxas_for_version-   s    %+<.":	7# 	7$ 	7	;WW\\)UGD
*:66%%
 ?G

w/IyWW\\)UGD
*:66%% ? ,L>D77>>.))!! W-4]CC   3"<003  		s   <D' '
D43D4c                      [         R                  R                  n U c  g[        [        U R                  S5      SS 5      u  pUS:  d  US:X  a  US:  a  gUS:X  a2  US:  a,  [        R                  U 5      nUc  gU[        R                  S	'   [        R                  R                  S
5      b8  [        R                  " S[        R                  R                  S
5       S35        g[         R                  R                  5       n[         R                  R                  U5      nUS    SUS    3[        R                  S
'   g! [         a     gf = f)a  
Configure CUDA environment variables based on detected CUDA version and GPU architecture.
These are needed to compile fatbin kernels for more portable binaries on older CUDA versions.
Returns True if setup succeeded or if setup was skipped (CUDA >= 12.9), false otherwise.
NF.      	   T   TRITON_PTXAS_PATHTORCH_CUDA_ARCH_LISTzTORCH_CUDA_ARCH_LIST is set to z,, skipping automatic architecture detection.r      )torchversionr   mapintsplitr   r>   r#   r4   r5   loggingwarningcurrent_deviceget_device_capability	Exception)r   majorminorr8   device
capabilitys         r   "_setup_cuda_environment_for_fatbin.CudaBackend._setup_cuda_environment_for_fatbin`   s7   	 !==--L#sL$6$6s$;BQ$?@LE rzerkeqj {uz(@@N
% 2<

./zz~~45A5bjjnnE[6\5]  ^J  K  ZZ..0F99&AJ4>qM?!JqM?1SBJJ-. 		s*   E 6E $E <A*E 'AE 
EEc                     g)z
CUDA backend saves SO blob and weights blob to an external .ptd file.
This file must be provided at runtime via --data_path argument.
Tr   r   s    r   save_data_externally CudaBackend.save_data_externally   s     r   c                 
    SS 0$ )Nz#at::_ops::_weight_int4pack_mm::callr   r   s    r   get_supported_fallback_kernels*CudaBackend.get_supported_fallback_kernels   s     24
 	
r   c                 j    [         R                  R                  R                  R                  [
        0$ )N)rI   opsatenconv1ddefaultr   r   s    r   get_decomposition_table#CudaBackend.get_decomposition_table   s(     IINN!!))+;
 	
r   compile_specsc                    SnU HU  nUR                   S:X  d  M  UR                  R                  S5      R                  5       nUS;  a  [	        SU S35      eUnMW     [        5       /nUS:X  a  UR                  [        5       5        U$ )a<  
Return CUDA-specific passes: ReplaceEdgeOpWithTritonOpPass.

The Triton kernel replacement behavior can be controlled via compile_specs:
- triton_kernel_mode="ON": Always use Triton kernels
- triton_kernel_mode="OFF": Never use Triton kernels and fallback to other implementations like cuda or decomposed operator.
ONtriton_kernel_modeutf-8)rh   OFFzInvalid triton_kernel_mode: z. Expected 'ON' or 'OFF'.)keyvaluedecodeupper
ValueErrorr
   appendr   )r   rf   ri   specmodepassess         r   get_custom_passesCudaBackend.get_custom_passes   s     "!Dxx//zz((1779},$6tf =2 3  &*" " -./%MM79:r   c                    [         R                  5       nSSSSSSSSSSUS.nSnSnU H[  nUR                  S:X  a  UR                  R	                  S	5      nUR                  S
:X  d  M@  UR                  R	                  S	5      nM]     US:X  aK  Uc0  [
        R                  " S5      R                  S5      n[        U5      nUR                  SSUSS.5        U$ Ub   S5       eU$ )zc
Get AOTI compile options for CUDA backend.
Options may vary based on platform (Linux vs Windows).
FTbinary_blobTRITON)loop_ordering_after_fusionemulate_precision_castsz aot_inductor.embed_kernel_binaryzaot_inductor.link_libtorchzaot_inductor.packagez$aot_inductor.package_constants_in_soz-aot_inductor.package_constants_on_disk_formatmax_autotunemax_autotune_gemm_backendsmax_autotune_conv_backendsz#aot_inductor.emit_multi_arch_kernellinuxNplatformrj   shim_library_pathwindows
executorchzdata/libaoti_cuda_shims)z"aot_inductor.cross_target_platformzaot_inductor.aoti_shim_libraryz#aot_inductor.aoti_shim_library_pathzaot_inductor.precompile_headersz-shim_library_path should not be set for Linux)
r   rW   rl   rm   rn   r   filesjoinpathr/   update)r   rf   emit_multi_arch_kerneloptionsr   r   rr   lib_dirs           r   get_aoti_compile_options$CudaBackend.get_aoti_compile_options   s    "-!O!O!Q +0'+04*/$(49=J *2*23I+*
4  !Dxx:%::,,W5xx..$(JJ$5$5g$>!	 " y  !(#//,7@@L$'L!NN:C6G;L7<	  ")?>?)r   c                 r    [         R                  R                  R                  [        R
                  /5      $ )ah  
Return SDPA MATH backend context manager for CUDA compilation.

This context manager plays as a fallback solution for any remaining PyTorch SDPA
operations to use the MATH backend (decomposed SDPA) during AOTInductor compilation.

Note:
- If SDPA ops are replaced with Triton kernels by ReplaceEdgeOpWithTritonOpPass,
  this context manager will have no effect on those ops (they are no longer
  PyTorch SDPA ops).
- If SDPA ops are NOT replaced (e.g., when triton_kernel_mode="OFF"), this
  context manager will force them to use the MATH backend, causing them to
  be automatically decomposed during compilation.
)rI   nn	attentionsdpa_kernelr   MATHr   s    r   &get_extra_aoti_compile_context_manager2CudaBackend.get_extra_aoti_compile_context_manager   s&      xx!!--z.?@@r   r   N)__name__
__module____qualname____firstlineno____doc__classmethodr/   r   staticmethodr   r>   r0   rW   rZ   r   r   r]   rd   r   r   typingru   r   r   __static_attributes__r   r   r   r   r      sG   
    0c 0hsm 0 0d % % %N T   
tCH~ 
 

 
S#X 
 

 d;.? DDT  2 D -D	c6::o	D DL A Ar   r   )rN   r#   r6   r   	importlibr   r   r   r   r   r   rI   %executorch.backends.aoti.aoti_backendr	   :executorch.backends.cuda.passes.move_cond_predicate_to_cpur
   0executorch.backends.cuda.triton.replacement_passr   executorch.exir._warningsr   'executorch.exir.backend.backend_detailsr   +executorch.exir.backend.compile_spec_schemar   torch._inductor.decompositionr   torch.nn.attentionr   r   r   r   r   <module>r      sp     	    3 3  = 3 B C : ) NkA+~ kA kAr   