
    9i_                         S SK r S SKJr  S SKJr  SS jr\ R                  S 5       r\ R                  SS\R                  4S jj5       r
\ R                  SS\R                  4S jj5       rg)	    N)TensorDescriptorc                    [        U5      n[        U R                  5      n[        U5      nUS:  a  X$-  nSUs=::  a  US-
  :  d   S5       e   S5       eUS::  d   S5       e[        U5      U:X  d   S5       eSnSnX2   U::  d   S	5       eXcU'   U R                  U5      nS
U-
  U/[	        U5       Vs/ s H  oR                  U5      PM     sn-   n	XU/U-   n
SS/U-   n[        X
X5      $ s  snf )a  
Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
which behaves like a concatenation (along the first axis) of subarrays
of potentially unequal size.

The load_ragged and store_ragged device functions can be used to read
and write from subarrays T[batch_offset : batch_offset + batch_size]
with hardware bounds-checking preventing any sort of leakage outside
the subarray.
r      zlast dimension cannot be ragged   z<read-write ragged descriptors must have at most 3 dimensionsz1block shape must have same length as tensor shapei     @z#number of rows may not exceed 2**30l        )listshapelenstrideranger   )Tblock_shape
ragged_dimtensor_shaperankmax_intbillionragged_stridei
tma_stride	tma_shape	box_shapes               W/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/triton/tools/ragged_tma.pycreate_ragged_descriptorr      s!    {#K=L|DA~

%TAX%H'HH%H'HH%19TTT9{t#X%XX#GG#w.U0UU.&HHZ(M -'7PUVZP[:\P[188A;P[:\\J#l2IQ+%IA*@@	 ;]s   3C+c                 $    SnX1-
  U-   nX-   nX5U4$ )z3
Helper function for load_ragged and store_ragged.
r    )batch_offset
batch_sizerowr   xys         r   to_ragged_indicesr"   0   s(     Gs"A!Aq=    r   c                 (   [         R                  " [        U R                  5      [        U5      S-   :H  S5        [	        XX4   5      u  pVnU R                  XV/USU -   U/-   X4S-   S -   5      n[         R                  " XR                  SS 5      nU$ )z
Read from a subarray T[batch_offset : batch_offset + batch_size] with
hardware bounds-checking, where reading outside the subarray gives zeros.

Coords should be an appropriately-sized list of integers, just like in
TMA.load().
   z*TMA must be a read-write ragged descriptorNr   )tlstatic_assertr
   r	   r"   loadreshape)	TMAr   r   coordsr   c0c1c2datas	            r   load_raggedr0   =   s     S^s6{Q68de"<V=OPJBB88RHvkz22bT9FPQ>?<SSTD::dJJqrN+DKr#   c                     [        XX5   5      u  pgn[        R                  " USS/UR                  -   5      nU R	                  Xg/USU -   U/-   X5S-   S -   U5        g)z
Write to a subarray T[batch_offset : batch_offset + batch_size] with
hardware bounds-checking, where writes outside the subarray are masked
correctly.

Coords should be an appropriately-sized list of integers, just like in
TMA.store().
r   N)r"   r&   r)   r	   store)	r*   r   r   r+   r/   r   r,   r-   r.   s	            r   store_raggedr3   O   se     #<V=OPJBB::dQFTZZ/0DIIrh,,t3f!^_6MMtTr#   )r   )tritontriton.languagelanguager&   triton.tools.tensor_descriptorr   r   jitr"   	constexprr0   r3   r   r#   r   <module>r:      sv      ;
%AP 	 	 2<<  " U",, U Ur#   