
    #KiJ                     
   S SK r S SKrS SKJrJrJr  S SKrS SKJr	  S\ R                  4S jrS\ R                  4S jrS\ R                  4S jrS\SS4S	 jrS\ R                  4S
 jrS\ R                  4S jrS\ R                  4S jrS\\   4S jr    S S\S\S\\   S\\   S\\   S\S\\\4   4S jjr " S S5      r " S S5      r S!S\\\4   S\\\      S\\\\S4   4   4S jjr S"S\S\S\S\4S jjrg)#    N)AnyOptionalUnion)_get_device_indexreturnc                  p   [         R                  S:X  a6  [        R                  " S[        R
                  R                  S    S35      n O[        R                  " S5      n U R                  U l        U R                  U l
        U R                  U l        U R                  U l        U R                  U l        U $ )Nwin32	amdhip64_r   .dllzlibamdhip64.so)sysplatformctypesCDLLtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)libs    S/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/torch/cuda/_utils.py_get_hip_runtime_libraryr      s    
||wkkIemm&7&7&:%;4@Akk*+00C00C!66C22C 44CJ    c                      [         R                  S:X  a  [        R                  " S5      $ [        R                  " S5      $ )Nr	   z
nvcuda.dllzlibcuda.so.1)r   r   r   r    r    r   _get_cuda_libraryr#      s,    
||w{{<(({{>**r    c                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ N)r   r   r   r   r#   r"   r    r   _get_gpu_runtime_libraryr&       s!    }}')) ""r    resultc                    U S:X  a  g [         R                  " 5       n[        5       nUR                  U [         R                  " U5      5        UR
                  b  UR
                  R                  5       OSn[        SU 35      e)Nr   Unknown CUDA errorCUDA error: )r   c_char_pr&   r   byrefvaluedecodeRuntimeError)r'   err_strlibcudaerror_messages       r   _check_cudar3   (   sn    {ooG&(GVV\\'%:;")--";AU  m_5
66r    c                  z   [         R                  S:X  af  SR                  S[        R                  R
                  S   S[        R                  R
                  S   /5      n [        R                  " SU  S35      nO[        R                  " S5      nUR                  Ul	        UR                  Ul        UR                  Ul        UR                  Ul        UR                   Ul        UR$                  Ul        UR(                  Ul        UR,                  Ul        UR0                  Ul        UR4                  Ul        U$ )	Nr	    0r      hiprtcr   zlibhiprtc.so)r   r   joinr   r   r   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetPTXSizehiprtcGetCodenvrtcGetPTXhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)version_strr   s     r   _get_hiprtc_libraryrO   4   s    
||wggsEMM$5$5a$8#u}}?P?PQR?STUkkF;-t45kk.) "66C 44C!66C!66C//C''CO!$!<!<C 44C!$!<!<C!66CJr    c                  *   [        [        R                  R                  R	                  S5      S   5      n [
        R                  S:X  a  SU  S3/nOSU  3S/nU H  n [        R                  " U5      s  $    [        S5      e! [         a     M6  f = f)	N.r   r	   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_names      r   _get_nvrtc_libraryrZ   I   s    **005a89M
||w}oW-


 =/*

 	;;x(( 
 4
55  		s   B
BBc                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ r%   )r   r   r   rO   rZ   r"   r    r   _get_gpu_rtc_libraryr\   \   s#     }}"$$!##r    c                      SSK Jn Jn  S1nU Vs/ s H  o3U;  d  M
  UPM     nn[        R                  R
                  (       a  UR                  U 5        U$ s  snf )z
Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

Returns:
    List of HIPCC/NVCC flags that can be safely used with NVRTC.
r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexpr)torch.utils.cpp_extensionr^   r_   r   r   r   extend)r^   r_   nvrtc_unsupported_flagsflagcompatible_flagss        r   _get_gpu_rtc_compatible_flagsre   e   sc     P 	# +*:Q.Q*   }} 23s
   	AAkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc           
      ~	  ^^ SSK n[        5       mSmS[        SS4UU4S jjnU R                  S5      nUcv  UR                  R                  UR                  R                  5       5      n	UR                  R                  (       a  U	R                   nOU	R                   U	R                   3n/ n
UR                  R                  (       a#  U
R                  SU 3R                  5       5        O"U
R                  SU 3R                  5       5        SS	KJn  U" S
5      nU H%  nU
R                  SU 3R                  5       5        M'     U(       a+  U H%  nU
R                  SU 3R                  5       5        M'     U(       a@  [        UR                  R                  5      S:  d   S5       eUc  / nUR                  S5        U(       a)  U H#  nU
R                  UR                  S5      5        M%     [!        5       nU
R#                  U Vs/ s H  nUR                  S5      PM     sn5        [%        U
5      n[&        R(                  U-  " U
6 n[&        R*                  " 5       nU" TR-                  [&        R.                  " U5      UU S3R                  5       SSS5      5        UR                  S5      nU" TR1                  UU5      5        TR3                  UUU5      nUT:w  a  [&        R4                  " 5       nTR7                  U[&        R.                  " U5      5        [&        R8                  " UR:                  5      nTR=                  UU5        [?        SUR:                  RA                  5        35      e[&        R4                  " 5       nU" TRC                  U[&        R.                  " U5      5      5        [&        R8                  " UR:                  5      nU" TRE                  UU5      5        [&        R(                  " 5       nU" TRG                  UU[&        R.                  " U5      5      5        UR:                  b  UR:                  RA                  5       nOSnTRI                  [&        R.                  " U5      5        UR                  R                  (       a  URJ                  OUR:                  nUU4$ s  snf )a  
Compiles a CUDA kernel using NVRTC and returns the PTX code.

Args:
    kernel_source (str): The CUDA kernel source code as a string
    kernel_name (str): The name of the kernel function to compile
    compute_capability (str, None): The compute capability to target (e.g., "86").
                                       If None, will detect from current device.
    cuda_include_dirs (list, None): List of directories containing CUDA headers
    nvcc_options (list, None): Additional options to pass to NVRTC
    auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

Returns:
    Tuple[bytes, str]: The compiled PTX code and mangled kernel name
r   Nr'   r   c                    > U T:w  ar  [         R                  " 5       nTR                  U [         R                  " U5      5        UR                  b  UR                  R                  5       OSn[        SU 35      eg )Nr)   r*   )r   r+   r;   r,   r-   r.   r/   )r'   r0   r2   NVRTC_SUCCESSlibnvrtcs      r   check_nvrtc#_nvrtc_compile.<locals>.check_nvrtc   so    ]"oo'G((g1FG ==, $$&) 
 m_=>> #r    utf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrT   z-Iz12.8zPCH requires CUDA 12.8+z--pchz.cuzKernel compilation failed:
r5   )&
torch.cudar\   rS   encoderT   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendr`   rs   strre   ra   lenr   r+   c_void_pr=   r,   rK   rA   c_size_trG   create_string_bufferr-   rI   r/   r.   rC   rE   rM   r?   raw) rf   rg   rh   ri   rj   rk   r   rp   source_bytespropsoptionsrs   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsrc   num_optionsoptions_arrayprogc_kernel_namereslog_sizelogptx_sizeptxc_mangled_namemangled_name	ptx_bytesrn   ro   s                                  @@r   _nvrtc_compiler   }   s   0  $%H M	?C 	?D 	? 	? !''0L !

001J1J1LM==$)$5$5#6$)KK=!> G}});(<=DDFG/0B/CDKKMN 8&v.'	I;'..01 ( *INNR	{+2245 + 5==%%&&0K2KK0LG$ "FNN6==12 # ;<NN5KL5KTDKK(5KLM g,K__{2W=M ??D##LLm3&&(	
	  &&w/M//mDE 
&
&t[-
HC m??$''fll8.DE))(..9##D#.9#)):J:J:L9MNOO  H((v||H/EFG

%
%hnn
5C$$T3/0 __&N$$T=&,,~:VW '%++224  d!34
 !==,,#))Il""s Ms   R:c                   L    \ rS rSrS\R
                  SS4S jrS\SS4S jrS	r	g)
_CudaModulei  moduler   Nc                     Xl         0 U l        g r%   )_module_kernels)selfr   s     r   __init___CudaModule.__init__  s    02r    name_CudaKernelc           	         XR                   ;   a  U R                   U   $ SSKJn  U" 5       n[        R                  " 5       n [        UR                  [        R                  " U5      U R                  UR                  S5      5      5        [        X@R                  5      nXPR                   U'   U$ ! [         a  n[        SU S35      UeS nAff = f)Nr   )r&   rr   zNo kernel named 'z' in this module)r   torch.cuda._utilsr&   r   r~   r3   r   r,   r   ru   r   r/   AttributeError)r   r   r&   r1   funckernelerrs          r   __getattr___CudaModule.__getattr__  s    == ==&& 	?*, 	V++LL&dkk'6J
 !||4F"(MM$M 	V #4TF:J!KLRUU	Vs   A-B0 0
C:C

C)r   r   )
__name__
__module____qualname____firstlineno__r   r~   r   r|   r   __static_attributes__r"   r    r   r   r     s/    3v 34 3V V Vr    r   c                       \ rS rSrSrS\R                  S\R                  SS4S jr     SS\\	\	\	4   S	\\	\	\	4   S
\
\   S\	S\
\   SS4S jjrS\	SS4S jrSrg)r   i.  zL
Represents a compiled CUDA kernel that can be called with PyTorch tensors.
r   r   r   Nc                 *    Xl         X l        SU l        g )Nr   )r   r   _max_shared_mem_bytes)r   r   r   s      r   r   _CudaKernel.__init__3  s    	%&"r    gridblockargs
shared_memstreamc                    SSK nUR                  R                  R                  5       nU(       d  / n/ n/ n	U GHv  n
[	        XR
                  5      (       a  U
R                  (       d1  U
R                  (       a  U
R                  5       (       d  [        S5      e[        R                  " U
R                  5       5      nUR                  U5        U	R                  [        R                  " U5      5        M  [	        U
[        5      (       a>  [        R                   " U
5      nU	R                  [        R                  " U5      5        GM  [	        U
["        5      (       a>  [        R$                  " U
5      nU	R                  [        R                  " U5      5        GMb  ['        S[)        U
5       35      e   [        R                  [+        U	5      -  " 5       n[-        U	5       H,  u  p[        R.                  " U
[        R                  5      X'   M.     Uc  SSKnUR                  R3                  5       nUS:  aS  U R4                  S:X  d  X@R4                  :  a4  U R4                  S:X  a  SOSU R4                   S3n[7        S	U S
U S35      e[9        UR;                  U R<                  US   US   US   US   US   US   UUR>                  US5      5        g)a  
Call the compiled CUDA kernel

Args:
    grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
    block (tuple): Block dimensions (block_x, block_y, block_z)
    args (list): List of arguments to pass to the kernel.
                 PyTorch tensor arguments will be automatically converted to pointers.
    shared_mem (int): Shared memory size in bytes
    stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.   r7   ) r   rT   _utilsr&   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr   r~   data_ptrr{   r,   rS   c_intfloatc_double	TypeErrortyper}   	enumeratecastrt   current_streamr   r/   r3   r   r   _as_parameter_)r   r   r   r   r   r   r   r1   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgs                    r   __call___CudaKernel.__call__8  sT   & 	**##<<>D 13C#||,,{{CJJ3==??$Y  ooclln5%%c*fll3/0C%%S)fll512C''!??3/fll845"=d3i[ IJJ+ 0 #f+58'FA$kk#v?LO ( >ZZ..0F "&&!+z<V<V/V --2 !T7788IJ 
 ":, /%& '33  	""		QQQaaa%%	
r    shared_mem_bytesc                 z   US:  a  Xl         g [        5       n[        R                  R	                  5       n[        R
                  R                  (       a  UR                  S:w  a  SOSnO[        USS5      nX:  a  [        SU SU S35      eS	n[        UR                  U R                  UU5      5        Xl         g )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r&   r   rT   rv   r   r   rx   getattrr/   r3   r   r   )r   r   r1   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizes         r   set_shared_memory_config$_CudaKernel.set_shared_memory_config  s    i')9&*, zz779== &11X=:  %=uN ,+,<+= >!!/ 0 1GG  783&&		; 	
 &6"r    )r   r   r   )r   r   r   r   Nr   N)r   r   r   r   __doc__r   r~   r   tuplerS   r   listr   r   r   r   r"   r    r   r   r   .  s    'V__ 'foo '$ ' &/&/# $_
CcM"_
 S#s]#_
 tn	_

 _
 _
 
_
B(6 (6 (6r    r   r   kernel_namesc           
      \   SSK n[        5       n[        U [        5      (       a  U R	                  S5      n [
        R                  " 5       nUR                  R                  5       nU   [        UR                  [
        R                  " U5      U 5      5        SSS5        U(       d  [        U5      $ 0 nU Hc  n[
        R                  " 5       n[        UR                  [
        R                  " U5      XGR	                  S5      5      5        [        X5      Xg'   Me     U$ ! , (       d  f       N= f)a  
Loads a CUDA module from PTX code and returns a module object that can access kernels.

Args:
    ptx (bytes or str): The PTX code to load
    kernel_names (list, optional): List of kernel names to extract from the module.
                                  If None, will return a module object with __getattr__.

Returns:
    object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
           If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
r   Nrr   )rt   r&   r   r|   ru   r   r~   rT   r   r3   r   r,   r   r   r   )	r   r   r   r1   r   r   kernelsr   r   s	            r   _cuda_load_moduler     s       '(G #sjj! __FZZ&&(F	G,,V\\&-A3GH 
 6"" G ''T"FKK,@	

 $D1  N! 
s   &0D
D+deviceoptional	allow_cpuc                    [        U [        5      (       a  U $ [        U [        5      (       a  [        R                  " U 5      n [        U [        R                  5      (       aD  U(       a  U R
                  S;  a  [        SU  35      eOU R
                  S:w  a  [        SU  35      e[        R                  R                  5       (       d5  [        U [        R                  R                  5      (       a  U R                  $ [        XU5      $ )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

If :attr:`device` is a torch.device object, returns the device index if it
is a CUDA device. Note that for a CUDA device without a specified index,
i.e., ``torch.device('cuda')``, this will return the current default CUDA
device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
CPU devices will be accepted and ``-1`` will be returned in this case.

If :attr:`device` is a Python integer, it is returned as is.

If :attr:`device` is ``None``, this will return the current default CUDA
device if :attr:`optional` is ``True``.
)rT   cpuz(Expected a cuda or cpu device, but got: rT   z!Expected a cuda device, but got: )r   rS   r|   r   r   r   r   jitis_scriptingrT   idx_torch_get_device_index)r   r   r   s      r   r   r     s      &#&#f%&%,,''{{/1 #KF8!TUU 2[[F"@IJJ99!!##fejj//00::"6Y??r    )NNNFr%   )FF)r   r   typingr   r   r   r   torch._utilsr   r   r   r   r#   r&   rS   r3   rO   rZ   r\   r   r|   re   boolr   bytesr   r   r   dictr   r"   r    r   <module>r      s    
 ' '  F
&++ 
+6;; +#&++ #	7 	7 	7V[[ *6FKK 6&$fkk $tCy 6 )-(,#'P#P#P# !P#  ~	P#
 4.P# P# 5#:P#fV V<S6 S6n AE-	sEz	-*249*=-
;S-/001-b <A@@@48@@r    