
    9iGi                        S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJr  S SKJr  \R$                  R'                  \R$                  R)                  \5      5      r\R$                  R-                  \S	5      /rS
 r\ R2                  " 5       S 5       r " S S\5      rS rSSSSSS.rSSSSSS.rSr S r!S r" " S S\5      r# " S S\5      r$g)    N)Path)knobs)	GPUTarget)	GPUDriver)_allocation)compile_module_from_src)TensorDescriptorincludec                   ^ ^^^^ SS K nUR                  5       S:w  a  g SS KmSSKJnJnJnJmJmJn   " UU4S jSTR                  5      nTR                  X5" U5      U" U5      U" U5      5      n TR                  S5      R                  nUT/Ul        X8l        SmTR!                  TS-   5      n	UU U4S	 jn
U" U" U
5      U	5      (       a%  ["        R$                  " TR'                  U	5      5      $ g ! [         a     g f = f)
Nr   Linux)c_charc_intc_size_tc_void_pc_char_pPOINTERc                   *   > \ rS rSrS Y4S Y 4/rSrg)8_find_already_mmapped_dylib_on_linux.<locals>.DlPhdrInfo   	dlpi_addr	dlpi_name N)__name__
__module____qualname____firstlineno___fields___static_attributes__)r   r   s   Z/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/triton/backends/amd/driver.py
DlPhdrInfor      s    (#(#
    r    z	libc.so.6i      c           
         > U R                   R                  n[        [        R                  " U5      5      nTUR
                  ;   a&  TR                  X#[        T[        U5      5      5        gg)Nr"   r   )	contentsr   r   osfsdecodenamememmoveminlen)infosizedatar   pctypeslib_namemax_path_lengths        r   callback6_find_already_mmapped_dylib_on_linux.<locals>.callback3   sP    MM++	Y'(qvvNN4CY,PQr!   )platformsystemr/   r   r   r   r   r   r   	Structure	CFUNCTYPECDLLdl_iterate_phdr	Exceptionargtypesrestypecreate_string_bufferr%   r&   	string_at)r0   r4   r   r   r   r   r    
callback_tr9   pathr2   r   r   r/   r1   s   `          @@@@r   $_find_already_mmapped_dylib_on_linuxrA      s    G#
 KK
 
V%% 
 !!%)<gh>OQXY_Q`aJ ++k2BB !+H5O#O&&':;D z(+T22{{6++D122+  s   2C4 4
D Dc                  V
   Sn [         R                  R                  =n(       aM  UR                  U 5      (       a&  [        R
                  R                  U5      (       a  U$ [        SU SU  35      e[        U 5      nU(       a7  [        R
                  R                  U5      (       a  U$ [        SU SU  35      e/ n[        R
                  R                  [        R
                  R                  [        5      SU 5      n[        R
                  R                  U5      (       a  U$ UR                  U5        SS KnUR                  5       nUR                  5       nUR                   (       a  U/U-   nU H^  n[        R
                  R                  USSU 5      n[        R
                  R                  U5      (       a  Us  $ UR                  U5        M`     [        R"                  " S	5      n	U	(       ap  U	R%                  S
5       H[  n
[        R
                  R                  X5      n[        R
                  R                  U5      (       a  Us  $ UR                  U5        M]     [        R"                  " S5      nU(       aX  [        R
                  R                  USU 5      n[        R
                  R                  U5      (       a  U$ UR                  U5         [&        R(                  " SS/5      R+                  5       R-                  5       nU(       aX  [        R
                  R                  USU 5      n[        R
                  R                  U5      (       a  U$ UR                  U5        [        R"                  " S5      nU(       aX  [        R
                  R                  USU 5      n[        R
                  R                  U5      (       a  U$ UR                  U5        [&        R(                  " SS/5      R+                  SS9nUR3                  5        Vs/ s H<  nUR-                  5       R                  U 5      (       d  M)  UR%                  5       S   PM>     nnU H<  n[        R
                  R                  U5      (       a  Us  $ UR                  U5        M>     [        R
                  R                  SU 5      n[        R
                  R                  U5      (       a  U$ UR                  U5        [        SU  SU 35      e! [&        R.                  [0        4 a     GNf = fs  snf )Nzlibamdhip64.sozTRITON_LIBHIP_PATH 'z' does not point to a valid zmemory mapped 'z'' in process does not point to a valid libr   torchLD_LIBRARY_PATH:HIP_PATH	hipconfigz--path	ROCM_PATHz/sbin/ldconfigz-pignore)errorsz/opt/rocm/lib/zcannot locate z after attempted paths )r   amdlibhip_pathendswithr%   r@   existsRuntimeErrorrA   joindirname__file__appendsitegetsitepackagesgetusersitepackagesENABLE_USER_SITEgetenvsplit
subprocesscheck_outputdecodestripCalledProcessErrorFileNotFoundError
splitlines)r0   env_libhip_pathmmapped_pathpaths	local_librV   site_packages	user_siter@   env_ld_library_pathdfenv_hip_pathhip_lib_pathhip_rootenv_rocm_pathrocm_lib_pathlibslinelocsloccommon_install_paths                         r   _get_path_to_hip_runtime_dylibrv   A   s   H  ))////##H--"''..2Q2Q""1/1BB^_g^hijj 8AL77>>,''_\N:abjaklmmE RWW__X6xHI	ww~~i  	LL ((*M((*I"m3ww||D'5(;77>>$KT	  ))$56$**3/AQ)Aww~~a  LLO	 0 99Z(Lww||L%B77>>,''\"	**K+BCJJLRRT77<<%BLww~~l++##LL& IIk*M]E8D77>>-((  ]# ""$4d#;<CC8CTD *.):^):djjl>S>ST\>]DJJL):D^77>>#JS  '',,'7B	ww~~)**""	LL$%
z1HP
QQ; ))+<= " _s%   BT T (T&T&T#"T#c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )HIPUtils   c                 n   > [        U S5      (       d  [        [        U ]  U 5      U l        U R                  $ )Ninstance)hasattrsuperrx   __new__r{   )cls	__class__s    r   r~   HIPUtils.__new__   s-    sJ'' 37<CL||r!   c                    [        5       n[        [        R                  R	                  [
        S5      5      R                  5       nUR                  SUS5      n[        US[        S9nUR                  U l
        UR                  U l        g )Nzdriver.cz/*py_libhip_search_path*/r"   	hip_utilssrcr'   include_dirs)rv   r   r%   r@   rR   rS   	read_textreplacer   r   load_binaryget_device_properties)selfrN   r   mods       r   __init__HIPUtils.__init__   si    46277<<45??A kk5{AF%#Kl[??%(%>%>"r!   )r   r   )r   r   r   r   r~   r   r   __classcell__r   s   @r   rx   rx      s    
	? 	?r!   rx   c                 @    U S   S:X  a  gSSSSSSSS	S
SSSSSSS.U    $ )Nr   *hipDeviceptr_tint8_tint16_tint32_tint64_tuint8_tuint16_tuint32_tuint64_tdouble)i1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64r   )tys    r   	ty_to_cppr      sQ    	!u|  	!
 
r!   r   r   r   )r   r   r   r   r   	pack_fp16	pack_bf16	pack_fp32	pack_fp64piiiKKOOOOOc           "      .  ^^^ S nU4S jmU4S jmU4S jm[        U" UR                  5       5      5       VVs0 s H  u  pEXE_M	     nnnSR                  UR                  5        Vs/ s H  nT" U5      PM     sn5      n[        U-   nSR                  [	        TUR                  5       5      5      n[        [        [        UR                  S5      5      5      n[        U5       V	Vs0 s H  u  pX_M	     nn	n[        U5      S:  a)  SSR                  S	 UR                  5        5       5      -   OSn
/ nUR                  5        HU  u  pUS
:X  a  M  U[        ;   a  UR                  [        U    SU	 35        M6  UR                  [        U5       SU	 35        MW     SR                  U5      n/ nUR                  5        Hb  u  pUS   S:X  a  UR                  SU	 S35        M%  U[        ;   a  UR                  SU	 S35        MF  US
:w  d  MN  UR                  SU	 35        Md     UR                  5        V	Vs/ s H-  u  pU[        ;   d  M  [        U    SU	 S[        U    SU	 S3PM/     nn	n[        5       n[        [!        [        U5      5      5      nUR                  5        V	Vs/ s H  u  pUS
:w  d  M  SU	 3PM     nn	nUR                  S5        UR                  S5        SU S[        U5      S:  a  SU-   OS SSR                  U5       SU SU SSR                  UR                  5        V	Vs/ s H  u  pT" U5       SU	 S3PM     snn	5       S U S!U
 S"SR                  U5       S#SR                  UR                  5        V	Vs/ s H  u  pUS   S:X  a  S$U	 S%U	 SU	 S&U	 S'3	OSPM!     snn	5       S([        U5      S:  a  SSR                  U5      -   OS S)3nU$ s  snnf s  snf s  snn	f s  snn	f s  snn	f s  snn	f s  snn	f )*Nc                 0   / nU  GH  n[        U[        5      (       a  UR                  S5      (       a  UR                  S5      S-   n[        R
                  " SU5      R                  5       nUR                  SU-   5        [        SU-  5       H  nUR                  S5        M     UR                  S5        [        U5       H  nUR                  S	5        M     [        U5       H  nUR                  S5        M     M  UR                  U5        GM     U$ )
N
tensordesc,r"   ztensordesc<([^[>]*)r      r   r   r   )	
isinstancestr
startswithcountrematchgrouprU   range)	signatureoutputsigndimdtype_s         r   _expand_signature(make_launcher.<locals>._expand_signature   s     C#s##|(D(Dyy~)!6<BBDcEk*q4xAMM%( )d# tAMM%( %tAMM%( % c"' * r!   c                 h   > [        U [        5      (       a  SR                  [        TU 5      5      $ U $ )Nr   )r   tuplerR   map)r   _serialize_signatures    r   r   +make_launcher.<locals>._serialize_signature   s,    c5!!88C 4c:;;
r!   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S:X  a  g[	        U 5      $ )Nr   []r   r   z	PyObject*	constexprr   r   rR   r   r   )r   val_extracted_types     r   r   &make_launcher.<locals>._extracted_type   sV    b%  ((334Cse1:a5C<}r!   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S:X  a  gSS	S
SSSSSSSS.
[	        U 5         $ )N ()r   r   Or   rj   lbhiLBHIK)
r   longr   r   r   r   r   r   r   r   r   )r   r   	format_ofs     r   r    make_launcher.<locals>.format_of	  s    b%  ''#i,-Cse1:a5C<
 B- 	r!   r   r   r   z, c              3   0   #    U  H  u  pS U 3v   M     g7f)z&_argNr   ).0r   r   s      r   	<genexpr> make_launcher.<locals>.<genexpr>%  s      L:K5:Ks   r   z argr   ptr_infoz.dev_ptr_arg_storagez _argz_storage = z(_argz);z&argz&global_scratchz&profile_scratcha\  
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <Python.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <dlfcn.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {"a  "};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \
  FOR_EACH_STR_FN(hipGetLastError)                                            \
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                     \
  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f,                     \
                  unsigned int gridDimX, unsigned int gridDimY,               \
                  unsigned int gridDimZ, unsigned int blockDimX,              \
                  unsigned int blockDimY, unsigned int blockDimZ,             \
                  unsigned int sharedMemBytes, hipStream_t stream,            \
                  void **kernelParams, void **extra)                          \
  FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, hipFunction_t f,          \
                  unsigned int gridDimX, unsigned int gridDimY,               \
                  unsigned int gridDimZ, unsigned int blockDimX,              \
                  unsigned int blockDimY, unsigned int blockDimZ,             \
                  unsigned int sharedMemBytes, hipStream_t stream,            \
                  void **kernelParams, void **extra)                          \
  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data,                         \
                  hipPointer_attribute attribute, hipDeviceptr_t ptr)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                             \
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                             \
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
};

static struct HIPSymbolTable hipSymbolTable;

bool initSymbolTable() {
  // Use the HIP runtime library loaded into the existing process if it exits.
  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);

  // Otherwise, go through the list of search paths to dlopen the first HIP
  // driver library.
  if (!lib) {
    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
    for (int i = 0; i < n; ++i) {
      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
      if (handle) {
        lib = handle;
      }
    }
  }
  if (!lib) {
    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
    return false;
  }

  typedef hipError_t (*hipGetProcAddress_fn)(
      const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags,
      hipDriverProcAddressQueryResult *symbolStatus);
  hipGetProcAddress_fn hipGetProcAddress;
  dlerror(); // Clear existing errors
  const char *error = NULL;
  *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress");
  error = dlerror();
  if (error) {
    PyErr_SetString(PyExc_RuntimeError,
                    "cannot query 'hipGetProcAddress' from libamdhip64.so");
    dlclose(lib);
    return false;
  }

  // Resolve all symbols we are interested in.
  int hipVersion = HIP_VERSION;
  uint64_t hipFlags = 0;
  hipDriverProcAddressQueryResult symbolStatus;
  hipError_t status = hipSuccess;
#define QUERY_EACH_FN(hipSymbolName, ...)                                        status = hipGetProcAddress(#hipSymbolName,                                                                (void **)&hipSymbolTable.hipSymbolName,                                        hipVersion, hipFlags, &symbolStatus);               if (status != hipSuccess) {                                                     PyErr_SetString(PyExc_RuntimeError,                                                            "cannot get address for '" #hipSymbolName                                      "' from libamdhip64.so");                                      dlclose(lib);                                                                  return false;                                                                }

  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)

  return true;
}

static inline void gpuAssert(hipError_t code, const char *file, int line)
{
   if (code != HIP_SUCCESS)
   {
      const char* prefix = "Triton Error [HIP]: ";
       const char* str = hipSymbolTable.hipGetErrorString(code);
      char err[1024] = {0};
      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
      PyErr_SetString(PyExc_RuntimeError, err);
   }
}

#define HIP_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function, hipDeviceptr_t profile_scratchz>) {
  hipDeviceptr_t global_scratch = 0;
  void *params[] = { z };
  if (gridX*gridY*gridZ > 0 && launch_cooperative_grid) {
    HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, z*num_warps, 1, 1, shared_memory, stream, params, 0));
    return;
  }
  if (gridX*gridY*gridZ > 0) {
    HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, ae
  *num_warps, 1, 1, shared_memory, stream, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    hipDeviceptr_t dev_ptr;
    bool valid;
} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  hipError_t status = hipSuccess;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }
  if (!PyLong_Check(ret)) {
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }
  ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
  if (!ptr_info.dev_ptr)
    goto cleanup;
  uint64_t dev_ptr;
  status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == hipErrorInvalidValue) {
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
      // Clear and ignore HIP error
      (void)hipSymbolTable.hipGetLastError();
  }
  ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
cleanup:
  Py_DECREF(ret);
  return ptr_info;
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat/blob/5e317108f872c904eb726cb8d560dcadbdf88a72/pythoncapi_compat.h#L482-L492
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  PyObject *profile_scratch_obj = NULL;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   z; z
  if(!PyArg_ParseTuple(args, "a,  ", &launch_cooperative_grid,
                                           &gridX, &gridY, &gridZ, &_stream, &_function, &profile_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook z)) {
    return NULL;
  }

  a  

  // extract kernel metadata
  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    return NULL;
  }
  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  hipDeviceptr_t profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {
      return NULL;
    }
    profile_scratch = profile_scratch_info.dev_ptr;
  }

  // raise exception asap
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;z;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, (hipDeviceptr_t)profile_scratcha  );

  if(launch_exit_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  if(PyErr_Occurred()) {
    return NULL;
  }
  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  if (!initSymbolTable()) {
    return NULL;
  }
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)	enumeratevaluesrR   _BASE_ARGS_FORMATr   listfilterboolr[   r*   itemsFLOAT_STORAGE_TYPErU   r   FLOAT_PACK_FUNCTIONrv   r   )	constantsr   	warp_sizer   idxsr   args_formatformatr   	args_listarg_decl_list	arg_declsinternal_args_listfloat_storage_declsrN   paramsr   r   r   r   s                     @@@r   make_launcherr     s   6
* '00A)BRBRBT0U&VW&VFC&VIW''93C3C3EF3ER9R=3EFGK,F193C3C3EFGIVD)//#"678I"+I"67"6$!"6I7PST]P^abPbtyy L)//:K LLLhjI M"##  $6r$:#;4s!CD  IbM?$qc!:; # 		-(I"a5C<%%8&<=%%%%QCx&89;%%QCj1 # __&&EA## 	Zb!
"%s+6I"6M5NeTUSVVXY&   12K %I'(F&/oo&7M&7UQ2;LjQCj&7FM
MM#$
MM$%- .9M i:@R UX  Yb  Uc  fg  Ug  AE  HQ  AQ  mo  @p pyy() *^^g]h iS T]R] Y^r 88Y__=NO=NEA#$E!B/=NOPQ R  &x (R S\Q\ ] 88 ! "2 99  R[  Ra  Ra  Rc  d  Rc  IN  IJoqrsotx{o{&qc);A3bCSTUSVVjk  BD  D  Rc  d  e  f fS |  @R  |S  VW  |W  TX  [_  [d  [d  ew  [x  Tx  ]_  S` +`obCF	 JY
 XF 8, Np PB ds5   O.)O4*O9<O? O?-P=	P<P &P0c                    ^  U 4S jnU$ )zF
Replace all tensor descriptors with the base ptr, shape, and strides
c                  |  > U S [        [        5       nU [        [        5      S  n/ nU H  n[        U[        5      (       a]  UR	                  UR
                  /UR                  QUR                  QUR                  S:H  PUR                  QUR                  Q5        Mu  UR                  U5        M     T" / UQUQ76 $ )Nnan)
r*   r   r   r	   extendbaseshapestridespaddingrU   )args	meta_argsraw_kernel_args
final_argsarglaunchers        r   inner,wrap_handle_tensor_descriptor.<locals>.innerr  s    0#/01	s#4567
"C#/00 !!388"vcii"v#++"vs{{V[G["v^a^g^g"vjmjuju"vw!!#& # 00Z00r!   r   )r  r  s   ` r   wrap_handle_tensor_descriptorr  m  s    
1" Lr!   c                        \ rS rSrS rS rSrg)HIPLauncheri  c                 t  ^ [        TS5      (       a  TR                  O	[        5       nU4S jnUR                  5        VVs0 s H  u  pVU" U5      U_M     nnnTR                  R                  5        VVs0 s H  u  pVXV_M	     nnn[        X7UR                  5      m[        TS[        S9n[        S UR                  5        5       5      n	U	(       a  [        UR                  5      OUR                  U l        UR                  U l        UR                  U l        UR                  U l        g s  snnf s  snnf )Nr   c                 ~   > [        U [        5      (       a&  TR                  R                  R	                  U 5      4$ U $ N)r   r   fn	arg_namesindex)xr   s    r   <lambda>&HIPLauncher.__init__.<locals>.<lambda>  s2    Z3=O=OSVV--33A69VUVVr!   __triton_launcherr   c              3   r   #    U  H-  n[        U[        5      =(       a    UR                  S 5      v   M/     g7f)r   N)r   r   r   )r   r   s     r   r   'HIPLauncher.__init__.<locals>.<genexpr>  s+     !vcu\_*S#"6"W3>>,;W"Wcus   57)r|   r   dictr   r   r  r   r   r   anyr   r  launchlaunch_cooperative_gridprofile_scratch_sizeprofile_scratch_align)
r   r   metadatar   arg_idxr   valuer   r   has_tensor_desc_args
    `        r   r   HIPLauncher.__init__  s    %,S+%>%>CMMDF	V;D??;LM;LZSWS\5(;L	M25--2E2E2GH2GJCSZ2G	HI(2D2DE%#4GVbc!!vclcscscu!vvCV3CJJ?\_\f\f'/'G'G$$,$A$A!%-%C%C" NHs   D.7D4c           	         ^^^^ UUUU4S jnU" U R                   U R                  [        R                  5      nU R                  " U R
                  TTTTXX/UQ76   g )Nc                 \   > U S:  a%  TT-  T-  nX0-  nUR                  5       nU" XAT	5      $ g Nr   )get)
r,   align	allocator	grid_size
alloc_sizealloc_fngridXgridYgridZstreams
         r   allocate_scratch.HIPLauncher.__call__.<locals>.allocate_scratch  s;    ax!EME1	&-
$==?
6::r!   )r'  r(  r   _profile_allocatorr%  r&  )	r   r7  r8  r9  r:  functionr  r;  profile_scratchs	    ````    r   __call__HIPLauncher.__call__  sW    	 	 +4+D+DdF`F`+6+I+IK 	D00%vxpkopr!   )r%  r&  r(  r'  N)r   r   r   r   r   r@  r   r   r!   r   r  r    s    Dqr!   r  c                   n   ^  \ rS rSrU 4S jrS r\S 5       rS\S\4S jr	S r
S	 rS
 rS rS rSrU =r$ )	HIPDriveri  c                 V   > [         TU ]  5         [        5       U l        [        U l        g r  )r}   r   rx   utilsr  launcher_cls)r   r   s    r   r   HIPDriver.__init__  s    Z
'r!   c                 "    SS K nUR                  $ r0  )rD   cudar   rD   s     r   get_device_interfaceHIPDriver.get_device_interface  s    zzr!   c                       SS K n U R                  R                  5       =(       a    U R                  R                  S L$ ! [
         a     gf = f)Nr   F)rD   rI  is_availableversionhipImportError)rD   s    r   	is_activeHIPDriver.is_active  sC    	::**,P%--2C2C42OP 		s   <? 
AAr   returnc                     [        U5      $ r  )r   )r   r   s     r   map_python_to_cpp_type HIPDriver.map_python_to_cpp_type  s    }r!   c                     U R                  5       nU R                  R                  U5      n[        R                  R
                  =(       d    US   nUS   n[        SUR                  S5      S   U5      $ )NarchwarpSizerP  rF   r   )get_current_devicerE  r   r   runtimeoverride_archr   r[   )r   devicedevice_propertiesrY  r   s        r   get_current_targetHIPDriver.get_current_target  se    ((* JJ<<VD}}**G.?.G%j1	

3 2I>>r!   c                 J    SS K nUR                  SU R                  5       5      $ )Nr   rI  )rD   r^  r[  rJ  s     r   get_active_torch_device!HIPDriver.get_active_torch_device  s    ||FD$;$;$=>>r!   c                     SSK Jn  U$ )Nr   )do_bench)triton.testingrf  )r   rf  s     r   get_benchmarkerHIPDriver.get_benchmarker  s
    +r!   c                 \    SS K nSnUR                  [        US-  5      UR                  SS9$ )Nr   i      rI  )r   r^  )rD   emptyint)r   rD   
cache_sizes      r   get_empty_cache_for_benchmark'HIPDriver.get_empty_cache_for_benchmark  s.     '
{{3zQ/uyy{PPr!   c                 $    UR                  5         g r  )zero_)r   caches     r   clear_cacheHIPDriver.clear_cache  s    r!   )rF  rE  )r   r   r   r   r   rK  staticmethodrR  r   rV  r`  rc  rh  ro  rt  r   r   r   s   @r   rC  rC    sS    (
    ??
Q r!   rC  )%	functoolsr%   r\   r   pathlibr   tritonr   triton.backends.compilerr   triton.backends.driverr   triton.runtimer   triton.runtime.buildr   triton.tools.tensor_descriptorr	   r@   rS   realpathrT   rR   r   rA   	lru_cacherv   objectrx   r   r   r   r   r  r  r  rC  r   r!   r   <module>r     s     	  	   . , & 8 ;
''//"''**84
5Wi01-` \R \R~?v ?(
.     " M`2q& q@.	 .r!   