
    9id                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  \R"                  R%                  \R"                  R'                  \5      5      r\R"                  R+                  \S5      /r\R"                  R+                  \S	5      rS
/rSq\ R4                  " 5       S 5       r\ R4                  " 5       S 5       r " S S\5      rS rSSSSSS.r SSSSSS.r!Sr"\#" \"5      r$S r%\&" S \'" S5       5       5      r(S\(S'   S\(S'   S\(S'   S r)S  r* " S! S"\5      r+ " S# S$\5      r,g)%    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludelibcudac            	      (   [         R                  R                  =n (       a  U /$ [        R                  " SS/5      R                  SS9nUR                  5        Vs/ s H  nSU;   d  M  UR                  5       S   PM      nnU Vs/ s H"  n[        R                  R                  U5      PM$     nn[        R                  " S5      nU(       al  U(       de  UR                  S5       Vs/ s HI  n[        R                  R                  [        R                  R                  US5      5      (       d  MG  UPMK     nnS	nU(       a  US
[        U5      -  -  nUS-  nO
US-  nUS-  n[        S U 5       5      (       d   U5       eU$ s  snf s  snf s  snf )Nz/sbin/ldconfigz-pignore)errorslibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      #    U  HB  n[         R                  R                  [         R                  R                  US 5      5      v   MD     g7f)r   N)ospathexistsjoin).0r   s     ]/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/triton/backends/nvidia/driver.py	<genexpr>libcuda_dirs.<locals>.<genexpr>)   s/     Sddrww~~bggll4@AAds   A
A)r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinessplitr   r   dirnamegetenvr   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr0      sd    <<4444 !!""$4d#;<CC8CTD *.):U):nPT>TDJJL):DU,01DSBGGOOC DD1))$564288=s=PRPWPWP\P\]`bpPqAr=s
&C2SY>>KKMM>>SdSSSXUXXSK V1 ts   
F*F)F
)AF3Fc                  $    [         /[        5       Q$ N)libdevice_dirr0        r   library_dirsr6   -   s    +LN++r5   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	CudaUtils7   c                 n   > [        U S5      (       d  [        [        U ]  U 5      U l        U R                  $ )Ninstance)hasattrsuperr8   __new__r;   )cls	__class__s    r   r>   CudaUtils.__new__9   s-    sJ'' C8=CL||r5   c                 r   [        [        [        R                  R	                  [
        S5      5      R                  5       S[        5       [        [        S9nUR                  q
UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        g )Nzdriver.c
cuda_utilssrcnamer6   include_dirs	libraries)r   r   r   r   r   r#   	read_textr6   rG   rH   PyCUtensorMapload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__CudaUtils.__init__>   s    %RWW\\':67AAC%%
 ))??%(%>%>",/,L,L)$'$<$<!#&#:#: r5   )rM   rO   rL   rK   rN   )__name__
__module____qualname____firstlineno__r>   rR   __static_attributes____classcell__r@   s   @r   r8   r8   7   s    
; ;r5   r8   c                     U S   S:X  a  gU R                  S5      (       a  g0 SS_SS_S	S
_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_U    $ )Nr   *CUdeviceptr
tensordescCUtensorMapi1int8_ti8i16int16_ti32int32_ti64int64_tu1uint8_tu8u16uint16_tu32uint32_tu64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cppr{   T   s    	!u|	}}\""hh 	y 	y	
 	y 	i 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r5   rm   ro   rq   )rr   rt   ru   rv   rw   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOOc                 |  ^^^^ U4S jnU4S jmU4S jmU4S jmU" UR                  5       5      n[        U5       VVs0 s H  u  pVXV_M	     nnnSR                  UR                  5        Vs/ s H  nT" U5      PM     sn5      n[        U-   n	/ n
UR                  5        H  nT" X5        M     [        U
5       VVs0 s H  u  pVXV_M	     nnn[	        U5      S:  a)  SSR                  S UR                  5        5       5      -   OSn/ nUR                  5        HU  u  pWUS	:X  a  M  U[        ;   a  UR                  [        U    S
U 35        M6  UR                  [        U5       S
U 35        MW     SR                  U5      n/ nUR                  5        H~  u  pWUS   S:X  a  UR                  SU S35        M%  U[        ;   a  UR                  SU S35        MF  US:X  a  UR                  SU 35        Mb  US	:w  d  Mj  UR                  SU 35        M     [        [	        U5      5      nSnUR                  5        VVs/ s H  u  pWUS   S:X  d  M  SU SU SU SU S3	PM!     nnnUR                  5        VVs/ s H  u  pWUS:X  d  M  SU SU SU S3PM     nnnUR                  5        VVs/ s H-  u  pWU[        ;   d  M  [        U    SU S[        U    SU S3PM/     nnnUR                  5        VVs/ s H  u  pWUS	:w  d  M  SU 3PM     nnnUR                  S 5        UR                  S!5        S"[	        U5      S:  a  SU-   OS S#SR                  U5       S$UR                  UR                  5        VVs/ s H  u  pWT" U5       SU S%3PM     snn5       S&U	 S'U S(UR                  U5       SUR                  U5       SUR                  U5       S)[	        U5      S:  a  SSR                  U5      -   OS S*3nU$ s  snnf s  snf s  snnf s  snnf s  snnf s  snnf s  snnf s  snnf )+Nc                   > / nSnU  GHK  n[        U[        5      (       Ga   UR                  S5      (       Ga	  T
(       a  T
U   OS nUS-  n[        R                  " SU5      nUR                  S5      nUR                  S5      nUR                  S5      S-   nUcL  UR                  SU-   5        [        SU-  5       H  n	UR                  S5        M     UR                  S	5        OUR                  S
5        [        U5       H  n	UR                  S5        M     [        U5       H  n	UR                  S5        M     GM:  UR                  U5        GMN     T
(       a  U[        T
5      :X  d   eU$ )Nr   r^      ztensordesc<([^[>]*)\[([^]]*)\]   ,r\   rg   r`   rx   re   )

isinstancer%   ry   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signature(make_launcher.<locals>._expand_signature   s5    C#s##|(D(D:I~6t!#!CSIAA{{3'!+<MM#+. #1t8_e, -MM$'MM+.tAMM%( %tAMM%( % c"9 < #nO8L&LLLr5   c                 v   > [        U [        5      (       a  U  H  nT" X!5        M     g UR                  U 5        g r2   )r   tupler   )r   r   x_flatten_signatures      r   r   )make_launcher.<locals>._flatten_signature   s0    c5!!"1-  MM#r5   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  g[	        U 5      $ )Nr   []r   r\   z	PyObject*	constexprrx   )r   r   r   mapr{   )rz   val_extracted_types     r   r   &make_launcher.<locals>._extracted_type   sW    b%  ((334Cse1:a5C<++}r5   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  gU R	                  S5      (       a  gS	S
SSSSSSSSS.
[        U 5         $ )N ()r   r\   Or   r^   dlbhiLBHIK)
rs   longra   rd   rf   rh   rj   rm   ro   rq   )r   r   r   r   ry   r{   )rz   r   	format_ofs     r   r    make_launcher.<locals>.format_of   s    b%  ''#i,-Cse1:a5C<++==&&
 B- 	r5   r   r   z, c              3   0   #    U  H  u  pS U 3v   M     g7f)z&_argNr4   )r   r   rz   s      r   r    make_launcher.<locals>.<genexpr>   s      L:K5:Ks   r   z argr\   ptr_infoz.dev_ptr_arg_storagerx   z*tma_ptrz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z _argz_storage = z(_argz);z&argz&global_scratchz&profile_scratcha  
#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#include <stdlib.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratchz) {
  void *params[] = { a   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maximum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    if (num_ctas != 1) {
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    config.numAttrs = num_attrs;

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;
static PyObject* py_tensor_map_type = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }
  if (!PyLong_Check(ret)) {
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }
  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
  if(!ptr_info.dev_ptr)
    return ptr_info;
  uint64_t dev_ptr;
  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) {
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
  } else if (status != CUDA_SUCCESS) {
      CUDA_CHECK(status);  // Catch any other cuda API errors
      ptr_info.valid = false;
  }
  ptr_info.dev_ptr = dev_ptr;
cleanup:
  Py_XDECREF(ret);
  return ptr_info;

}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {
    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
    return NULL;
}

  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
  if (align_128 != 0) {
    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
    return NULL;
  }
  return map;
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  PyObject *profile_scratch_obj = NULL;
  ;z
  if(!PyArg_ParseTuple(args, "aM  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hookaT  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  CUdeviceptr profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {
      return NULL;
    }
    profile_scratch = profile_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratchap  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {
    return NULL;
  }
  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
  if (driver_mod == NULL) {
    return NULL;
  }
  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
  if (py_tensor_map_type == NULL) {
    return NULL;
  }

  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)values	enumerater   _BASE_ARGS_FORMATr   itemsFLOAT_STORAGE_TYPEr   r{   r   FLOAT_PACK_FUNCTION)	constantsr   r   r   expand_signaturer   srz   args_formatformatflat_signaturer   	args_listarg_decl_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsrE   r   r   r   s     `                   @@@r   make_launcherr      s   %N. ))9)9);<"+,<"=>"=$!"=I>''93C3C3EF3ER9R=3EFGK,FN!3/ ""+N";<";$!";I<PST]P^abPbtyy L)//:K LLLhjI M"##  $6r$:#;4s!CD  IbM?$qc!:; # 		-(I"a5C<%%8&<=%%%%QCx&89;%%n5;%%QCj1 # 3y>"F G __&&EAa5C< 	f
 #5aS1#=MaSPde&   foetetevev\a\] 	X
qc!3A3oaSWev   __&&EA## 	Zb!
"%s+6I"6M5NeTUSVVXY&  
 '0oo&7M&7UQ2;LjQCj&7FM
MM#$
MM$%5cj x{  |E  xF  IJ  xJ  dh  kt  dt  PR  cS Syy() y*r <<	@QR@QuqOB'(aS2@QRST U  &x (Q R[P[ %\J <<	 
<<	 
<<#$% &Y BE  FX  BY  \]  B]  Z^  ae  aj  aj  k}  a~  Z~  ce  Yf 2fyNC^
 JI ?F =8

 Nd SsH   P/P PP P  P&P&6P,
 P, P2	P2P8c              #   (   #    U  H  oU4v   M
     g 7fr2   r4   )r   r   s     r   r   r   [  s     :	1A	s      
      	   c           
      R   UcL  U R                   /U R                  QU R                  QU R                  S:H  PU R                  QU R                  Q$ US   nUS   nUS   nUS   nUS   nU R                  nU R                  nUS   S:X  d   eU R                  S:X  a  SOS	n	U(       a  [	        U5      nUS==   S
-  ss'   [
        R                  R                  R                  R                  R                  U R                   R                  5       UU[        U   UUUU	5      n
U
/UQUQ$ )Nnanswizzle	elem_size	elem_type
block_size
fp4_paddedr   r   r   r   )baser   stridespaddinglisttritonruntimedriveractiveutilsrO   data_ptrTMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   cu_tensor_maps              r   make_tensordesc_argr   a  s2    c399cs{{cCKK54Hc399cWZWbWbccy!G%I%I,'J,'JIIEkkG2;!;;%'aQGUb	Q	NN))0066JJ +	M ,E,G,,r5   c           
        ^ ^^ [        S UR                  5        5       5      nU(       d  T $ [        [        UR                  5       5       VVs/ s H6  u  pE[	        U[
        5      (       d  M  UR                  S5      (       d  M4  UPM8     snn5      mT(       a  [        T5      [        T5      :X  d   eT(       d  S /[        T5      -  mU UU4S jnU$ s  snnf )Nc              3   r   #    U  H-  n[        U[        5      =(       a    UR                  S 5      v   M/     g7f)r^   N)r   r%   ry   )r   r   s     r   r   )wrap_handle_tensordesc.<locals>.<genexpr>  s+     r_qX[jc2Ss~~l7SS_qs   57r^   c                     > [        U S [         5      nSn[        U [        S  5       HA  u  p4UT;   a%  UR                  [	        UTU   5      5        US-  nM0  UR                  U5        MC     T" U6 $ )Nr   r   )r   _BASE_ARGS_FORMAT_LENr   extendr   r   )args
final_argsr   r   r   launchertensordesc_indicesr   s        r   inner%wrap_handle_tensordesc.<locals>.inner  s~    $5 567
%:%; <=FA&&!!"5c?>;Z"[\!#!!#& > $$r5   )r&   r   setr   r   r%   ry   r   )r   r   r   has_tensor_desc_argr   r   r   r   s   ` `    @r   wrap_handle_tensordescr     s    r_h_o_o_qrr"9#3#3#56p6vq*S#:NSVSaSabnSo6pr#o"6#>P:Q"QQQ&3'9#::	% L! 	qs   C
-C
C
c                        \ rS rSrS rS rSrg)CudaLauncheri  c                   ^ [        TS5      (       a  TR                  O	[        5       nU4S jnUR                  5        VVs0 s H  u  pVU" U5      U_M     nnnTR                  R                  5        VVs0 s H  u  pVXV_M	     nnn[        USS 5      n[        X7U5      m[        TS[        5       [        [        S9n	[        R                  " [        R                  UR                  S5      U l        [#        U	R$                  Xx5      U l        UR&                  U l        UR(                  U l        UR*                  U l        UR,                  U l        UR.                  U l        UR0                  U l        g s  snnf s  snnf )Nr   c                 ~   > [        U [        5      (       a&  TR                  R                  R	                  U 5      4$ U $ r2   )r   r%   fn	arg_namesindex)r   rE   s    r   <lambda>'CudaLauncher.__init__.<locals>.<lambda>  s2    Z3=O=OSVV--33A69VUVVr5   r   __triton_launcherrD   r   )r<   r   dictr   r   getattrr   r   r6   rG   rH   	functoolsreduceoperatormulcluster_dimsnum_ctasr   launchglobal_scratch_sizeglobal_scratch_alignprofile_scratch_sizeprofile_scratch_alignlaunch_cooperative_grid
launch_pdl)
rP   rE   r   r   arg_idxidxvaluer   r   rQ   s
    `        r   rR   CudaLauncher.__init__  s6   %,S+%>%>CMMDF	V;D??;LM;LZSWS\5(;L	M25--2E2E2GH2GJCSZ2G	H!(,=tDI/B%$%%
 "((x7L7LaP,SZZT#+#?#? $,$A$A!$,$A$A!%-%C%C"'/'G'G$"--' NHs   E57E;c                 .  ^ ^^^^ UUUU U4S jnU" T R                   T R                  [        R                  5      nU" T R                  T R
                  [        R                  5      n	T R                  " TTTTUT R                  T R                  X/	UQ76   g )Nc                 x   > U S:  a3  TT-  T-  nUT	R                   -  U -  nUR                  5       nU" XAT
5      $ g Nr   )r  get)sizealign	allocator	grid_size
alloc_sizealloc_fngridXgridYgridZrP   streams         r   allocate_scratch/CudaLauncher.__call__.<locals>.allocate_scratch  sF    ax!EME1	&6=
$==?
6::r5   )
r  r  r   
_allocatorr  r  _profile_allocatorr  r  r  )
rP   r'  r(  r)  r*  functionr   r+  global_scratchprofile_scratchs
   `````     r   __call__CudaLauncher.__call__  s    	 	 *$*B*BDD]D]_j_u_uv*4+D+DdF`F`+6+I+IKE5%4;W;WY]YhYh"	<6:	<r5   )r  r  r  r  r  r  r  r  N)rT   rU   rV   rW   rR   r2  rX   r4   r5   r   r  r    s    .0<r5   r  c                   n   ^  \ rS rSrU 4S jrS rS rS r\S 5       r	S\
S\
4S	 jrS
 rS rS rSrU =r$ )
CudaDriveri  c                 V   > [        5       U l        [        U l        [        TU ]  5         g r2   )r8   r   r  launcher_clsr=   rR   )rP   r@   s    r   rR   CudaDriver.__init__  s    [
(r5   c                 |    U R                  5       nU R                  U5      nUS   S-  US   -   nSn[        SX#5      $ )Nr   r   r       r   )get_current_deviceget_device_capabilityr   )rP   device
capability	warp_sizes       r   get_current_targetCudaDriver.get_current_target  sI    ((*//7
]R'*Q-7
	77r5   c                 J    SS K nUR                  SU R                  5       5      $ )Nr   r   )torchr=  r;  rP   rC  s     r   get_active_torch_device"CudaDriver.get_active_torch_device  s    ||FD$;$;$=>>r5   c                 "    SS K nUR                  $ r  )rC  r   rD  s     r   get_device_interfaceCudaDriver.get_device_interface  s    zzr5   c                       SS K n U R                  R                  5       =(       a    U R                  R                  S L $ ! [
         a     gf = f)Nr   F)rC  r   is_availableversionhipImportError)rC  s    r   	is_activeCudaDriver.is_active  sC    	::**,L%--2C2Ct2KL 		s   <? 
AArz   returnc                     [        U5      $ r2   )r{   )rP   rz   s     r   map_python_to_cpp_type!CudaDriver.map_python_to_cpp_type  s    }r5   c                     SSK Jn  U$ )Nr   )do_bench)triton.testingrV  )rP   rV  s     r   get_benchmarkerCudaDriver.get_benchmarker  s
    +r5   c                 \    SS K nSnUR                  [        US-  5      UR                  SS9$ )Nr   i      r   )r   r=  )rC  emptyint)rP   rC  
cache_sizes      r   get_empty_cache_for_benchmark(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr5   c                 $    UR                  5         g r2   )zero_)rP   caches     r   clear_cacheCudaDriver.clear_cache  s    r5   )r7  r   )rT   rU   rV   rW   rR   r@  rE  rH  staticmethodrO  r%   rS  rX  r_  rd  rX   rY   rZ   s   @r   r5  r5    sS    
8?    Q r5   r5  )-r  r  r   r   r   r   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r#   realpath__file__r   rG   r3   rH   rJ   	lru_cacher0   r6   objectr8   r{   r   r   r   r   r   r   r
  r   r   r   r   r  r5  r4   r5   r   <module>rp     su     	   	   8 & . ,
''//"''**84
5Wi01We,H	  . , ,; ;:
4     $ -. Wv  :b	::         $-N2(<6 (<V/ /r5   