
    9iJ                     x   S SK Jr  \R                  SS j5       r\R                  SS j5       r\R
                  SS j5       r\R
                  SS j5       r\R
                  SS j5       r\R
                  SS j5       r	\R
                  SS	 j5       r
\R
                  SS
 j5       r\R
                  SS j5       rg)    )coreNc           
      R    [         R                  " SS/ [         R                  SSU S9$ )Nzmov.u64 $0, %globaltimer;z=lF   dtypeis_purepack	_semantic)r   inline_asm_elementwiseint64r
   s    `/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/triton/language/extra/cuda/utils.pyglobaltimerr      s.    &&'BD"TXT^T^hmtu1:< <    c           
      R    [         R                  " SS/ [         R                  SSU S9$ )Nzmov.u32 $0, %smid;z=rTr   r   )r   r   int32r   s    r   smidr   
   s+    &&';T2TZZaelm1:< <r   c                 p    [         R                  " U R                  R                  R                  S-  5      $ )N    r   	constexprbuilderoptions	num_warpsr   s    r   num_threadsr      s(    >>)++33==BCCr   c                 j    [         R                  " U R                  R                  R                  5      $ Nr   r   s    r   r   r      s#    >>)++33==>>r   c           
      T    [         R                  " SSU /[         R                  SSUS9$ )Na  {                                      
.reg .b32 a<2>, b<2>;                  
prmt.b32 a0, 0, $2, 0x5746;            
and.b32 b0, a0, 0x7f007f00;            
and.b32 b1, a0, 0x00ff00ff;            
and.b32 a1, a0, 0x00800080;            
shr.b32  b0, b0, 1;                    
add.u32 b1, b1, a1;                    
lop3.b32 $0, b0, 0x80008000, a0, 0xf8; 
shl.b32 $1, b1, 7;                     
}                                      
z=r,=r,rT   r   )r   r   float16)argr
   s     r   convert_fp8e4b15_to_float16r"   !   s2    &&
	4 6?T\\cgno r   c           
          SnU(       a  US-  nOUS-  nUS-  n[         R                  " USU /[         R                  SSUS9$ )	NaN  {
            .reg .pred p<4>;
            .reg .b32 a<2>, b<2>;
            .reg .b16 c<4>;
            .reg .b16 max_val_f16;
            .reg .b32 max_val_f16x2;
            mov.b16 max_val_f16,   0x3F00;
            mov.b32 max_val_f16x2, 0x3F003F00;
            and.b32 a0, $1, 0x7fff7fff;
            and.b32 a1, $2, 0x7fff7fff;zSmin.f16x2 a0, a0, max_val_f16x2;
                  min.f16x2 a1, a1, max_val_f16x2;a  setp.lt.f16x2  p0|p1, a0, max_val_f16x2;
                  setp.lt.f16x2  p2|p3, a1, max_val_f16x2;
                  mov.b32 {c0, c1}, a0;
                  mov.b32 {c2, c3}, a1;
                  selp.b16  c0, c0, max_val_f16, p0;
                  selp.b16  c1, c1, max_val_f16, p1;
                  selp.b16  c2, c2, max_val_f16, p2;
                  selp.b16  c3, c3, max_val_f16, p3;
                  mov.b32 a0, {c0, c1};
                  mov.b32 a1, {c2, c3};zmad.lo.u32 a0, a0, 2, 0x00800080;
              mad.lo.u32 a1, a1, 2, 0x00800080;
              lop3.b32 b0, $1, 0x80008000, a0, 0xea;
              lop3.b32 b1, $2, 0x80008000, a1, 0xea;
              prmt.b32 $0, b0, b1, 0x7531;
              }z=r,r,rTr   r   )r   r   float8e4b15)r!   	has_minx2r
   asms       r   convert_float16_to_fp8e4b15r'   2   sm    	+C  6 	6 	 	+ 		+   C &&sHse4CSCS]ahi1:< <r   c                 8   U R                   R                  R                  5       (       aH  [        XS9nUR                  R	                  5       (       a  UR                  [        R                  US9nU$ U R                   R                  R                  5       (       d+  U R                   R                  R	                  5       (       d   eU nU R                   R                  R	                  5       (       a  UR                  [        R                  SUS9n[        XcUS9nU$ )Nr   rtz)fp_downcast_roundingr
   r%   r
   )typescalaris_fp8e4b15r"   is_fp32tor   float32is_fp16r    r'   )r!   dst_tyr*   r%   r
   
upcast_valdowncast_vals          r   convert_custom_float8r6   V   s    
xx""$$0J
==  ""#t||yIJ88??""$$(?(?(A(AAAL
xx  #t||%[de.|\efLr   c                     [        XUSUS9$ )NTr+   r6   r!   r3   r*   r
   s       r   convert_custom_float8_sm80r:   f   s     .Bd^ghhr   c                     [        XUSUS9$ )NFr+   r8   r9   s       r   convert_custom_float8_sm70r<   k   s     .Be_hiir   r   )NN)triton.languager   externr   r   builtinr   r   r"   r'   r6   r:   r<    r   r   <module>rA      s      < <
 < <
 D D ? ?     <  <F   i i j jr   