
    pi	                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dl m!Z! d dl"m#Z#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZImJZJmKZKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZSmTZT ddlmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZemfZfmgZgmhZh ddlimjZj ddlkmlZl ddlmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z| dd l}m~Z~mZmZmZmZmZ dd!lmZmZmZmZmZ dd"lmZ er,d d#lmZ d d$lmZ d d%lmZ dd&l9mZ dd'lmmZ dd(lmZ  ed)      Z ej,                  e      Zej2                  j5                  ed*      Zej2                  j5                  ed+      Zej2                  j5                  ed,      Z e<       Z; G d- d.      Z ed      d^d/       Z ed      d^d0       Z G d1 d2      ZejD                   G d3 d4             ZejD                   G d5 d6             ZejD                   G d7 d8e             ZejD                   G d9 d:e             Z	 	 	 	 	 	 	 	 d_d;Z G d< d=ew      Z e       jR                  Zd`d>Zd`d?Zdad@Zd`dAZdbdBZdcdCZ G dD dEer      ZdddFZdedfdGZ G dH dIev      Zejk                  dJ        G dK dLe      Z G dM dN      ZejD                   G dO dP             Z G dQ dR      ZejD                   G dS dT             Z G dU dVeqeeeeeef   f   f         ZejD                   G dW dX             Z G dY dZee         Z G d[ d\e      Zdgd]Zy)h    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_packagehas_triton_stable_tma_api   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)BlockShapeType)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  @    e Zd ZU dZi Zded<   i Zded<   edd       Zy)	OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    |j                   }t        t        j                  t        j                  g      | j
                  |<   || j                  |<   y N)__name__r   torchfloat32float64rq   rr   )clsfuncconvert_outputop_names       `/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pyregister_upcastzOpDtypeSupport.register_upcast   s=    --(2EMM5==3Q(RW%'5G$    N)rz   zCallable[..., str]r{   boolreturnNone)	ru   
__module____qualname____doc__rq   __annotations__rr   classmethodr~    r   r}   rp   rp   x   s1    
 <>8=')O_)6 6r   rp   c                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r}   gen_attr_descriptor_importr      s-     # v''):;Er   c                     t               } | j                  d       t               x}r| j                  |       | j                  d       | j	                         S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rO   splicer   	writelinegetvalue)imports	attr_descs     r}   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r   c                     e Zd ZdZ eej                  ej                  g      Z eej                  ej                  ej                  ge      ZeD  ci c]%  }|t        j                  t        |    ddd      ' c}}}} ZeD  ci c]3  }|t        j                  t        |   j#                          ddd      5 c}}}} Zed
d       Zed
d       Zy	c c}}}} w c c}}}} w )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    offsetTintegernonnegativeBLOCKr   positivec                4    | j                   |j                     S rt   )block_sizessymtry   trees     r}   get_block_sizezTritonSymbols.get_block_size   s    tyy))r   c                4    | j                   |j                     S rt   )block_offsetsr   r   s     r}   get_block_offsetzTritonSymbols.get_block_offset   s      ++r   N)r   rY   r   zsympy.Symbol)ru   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typessympySymbolr   r   upperr   r   r   r   ).0r   r   r   s   0000r}   r   r      s    !$--!?@Odkk4;;VoVWK    	ellj./v6RVWWM  	   	ell$%%'(.t
 	
K * * , ,#
s   *C
8C'
r   c                      e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   ddZddZddZddZddZe	dd       Z
y)IndexingOptionsstr	index_strOrderedSet[str]	mask_varszOptional[str]
expand_strr   _has_rindex
sympy.Exprindexz#Optional[Sequence[Union[int, str]]]expand_shapec                ,    t        | j                        S rt   )r   r   selfs    r}   has_maskzIndexingOptions.has_mask   s    DNN##r   c                J    t        | j                  t        j                        S rt   )r   r   r   TMPr   s    r}   has_indirectzIndexingOptions.has_indirect   s    "4::txx88r   c                    | j                   S rt   )r   r   s    r}   
has_rindexzIndexingOptions.has_rindex   s    r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)tmpNr   
startswithr   masks     r}   	<genexpr>z.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s     J43t9''.J   $&anyr   r   s    r}   has_tmpmaskzIndexingOptions.has_tmpmask   s    J4>>JJJr   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)rNr   r   s     r}   r   z,IndexingOptions.has_rmask.<locals>.<genexpr>   s     H3t9'',Hr   r   r   s    r}   	has_rmaskzIndexingOptions.has_rmask   s    HHHHr   c                    | j                   r2dj                  t        t        t        | j                                     S dS )N & r   )r   joinsortedmapr   r   s    r}   mask_strzIndexingOptions.mask_str   s4     =ANNEJJvc#t~~678	
PV	
r   Nr   r   r   r   )ru   r   r   r   r   r   r   r   r   propertyr   r   r   r}   r   r      sT    N55$9 KI 
 
r   r   c                  X   e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   d
ed<   dZded<   ed!d       Zed!d       Zed!d       Z	ed!d       Z
e	 	 	 	 	 	 	 	 	 	 	 	 d"d       Z	 	 	 	 	 	 	 	 d#dZd$dZ	 	 	 	 	 	 d%dZd&dZd'dZd'dZd'dZd'dZd'dZ	 	 	 	 	 	 	 	 	 	 d(d Zy))BlockDescriptorOptionsz
    This is a base class that describes a block descriptor used in Triton kernels.
    It can be used to create either a tensor descriptor (with TensorDescriptorOptions)
    or a block pointer (with BlockPtrOptions).
    BlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkc                .    | j                   j                  S rt   )r   shaper   s    r}   r   zBlockDescriptorOptions.shape  s    {{   r   c                .    | j                   j                  S rt   )r   block_shaper   s    r}   r   z"BlockDescriptorOptions.block_shape  s    {{&&&r   c                .    | j                   j                  S rt   )r   stridesr   s    r}   r   zBlockDescriptorOptions.strides	      {{"""r   c                .    | j                   j                  S rt   )r   offsetsr   s    r}   r   zBlockDescriptorOptions.offsets  r   r   c               F   t         j                  j                  d	fd} ||j                        |_         ||j                        |_        |j                  D cg c]  }j                  |d       }}|j                  D 	cg c]  }	j                  |	d       }
}	t        |
      rd|
d<   t        |j                  |
      D 	cg c]	  \  }	}|s|	 }}	}t        |
|      D cg c]  }t        |       c}t        ||
      D 	cg c]	  \  }	}|s|	 }}	}fd}t        d
i t        j                  |      j                         D ci c]  \  }}| ||       c}}}|D cg c]  }t        j                  |       }}t         j                   j"                  r%|d   j$                  dk(  sJ |j'                  d       t         j                   j(                  }t         j                   j*                  st-        |j                        t-        t         j                   j.                        |z
  k(  rIt         j                   j0                  j3                         r!|t4        j6                  j8                  g|z  z  } | |t         j                  j                  j;                  |      t=        t?        tA        t-        |j                                          ||||      }|jC                  ||       |S c c}w c c}	w c c}}	w c c}w c c}}	w c c}}w c c}w )z2Helper to create a BlockDescriptorOptions instancec                L    | D cg c]  }j                  |       c}S c c}w rt   )lookup_precomputed_size)exprsexprsizevarss     r}   lookup_sizez2BlockDescriptorOptions.create.<locals>.lookup_size  s"    GLMtH44T:MMMs   !r   rG   Fc                R    t        |       D cg c]	  \  }}|s| c}}S c c}}w )z@Removes any broadcasting or singleton dims from a given sequence)zip)ititemis_removableremovable_dimss      r}   remove_dimsz2BlockDescriptorOptions.create.<locals>.remove_dimsI  s3     +.b.*A&D,#   s   #x)r   r   r   r   r   r   r   )r   zIterable[sympy.Expr]r   list[sympy.Expr]r   )"rE   graphr   r   r   statically_known_equalsr   allr   r   r   dataclassesasdictitemsr   r   kernelno_x_dimprefixpopnum_reduction_dimsinside_reductionlennumelsfeaturesis_reductionr   SOner   listreversedrangecompute_boundary_check)ry   r   r   range_treesr   get_max_blockr   strider   dimsingleton_dimsis_singletonr   dimsr  keyvalr   r   reduction_ndimresultr  r   s                        @@r}   createzBlockDescriptorOptions.create  s    77##	N #6<<0$V^^4
 GMnn
<BH,,VQ7
 
 AG@R@R
9<H,,S!4
 
 ~!&N2 &)););^%L
!\ 
 
 14NDU0VW#d)W &)):N%K
!\ 
 
	 ! 
5@5G5G5O5U5U5WXcsK$$X

 GRRd}33D9RR88q>((C///OOA44))FNN#s188??';n'LL!!..0 EGGKK=>99KGG,,DD_Uxc&,,&7 89:#+/
 	%%m[AO


 X
  Y Ss*   K=LL+LLL/Lc                D    t         j                  |   }t        |||i      S zN
        Replaces instances of {symt}_offset with the new expression.
        r   r   r>   r   r   replacementr   roffsets        r}   replace_offsetz%BlockDescriptorOptions.replace_offsetq  &      --d3$+ 677r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S Nr   r   r   r0  r   Integerr   r   r   s      r}   remove_roffsetsz&BlockDescriptorOptions.remove_roffsetsz  ;    !11 	ED&&tU]]1-=tDD	Er   c           
        t         j                  j                  }|D ci c]7  }t        j                  |j
                      |t        |j
                           9 }}t        t        t         j                  j                  |            }t        t        | j                              D cg c]%  }|j                  | j                  |   t         j"                  j$                        s|r:t        j                  t&        j(                     | j*                  |   j,                  v sb|j/                  | j                  |   | j*                  |         s|j/                  | j                  |   t1        | j*                  |   |            sMt         j                  j2                  r1| j*                  |   t        j                  t&        j4                     k(  s|( c}| _        yc c}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)N)rE   r  r   r   r   r   r   r   r   r  needs_yz_grid_overflowr  r  r   r	  r   r   r  Zeror   r   r   free_symbolsstatically_known_multiple_ofr>   r  r   r   )r   r  r  r   tblock_to_maxneeds_overflow_grididxs           r}   r  z-BlockDescriptorOptions.compute_boundary_check  s    77## !/
 %%aff-}Z=O/PP/
 /
 "#ahh&E&E{"ST S_- 
44T\\#5FU ,)55dkkB++C0==> %AA JJsOT-=-=c-B !) E E JJsO&t'7'7'<lK! HH%%((-1J1J4;;1WW-  
/
 
s   <G*D+G#c                6    | j                   J | j                   S rt   )r   r   s    r}   boundary_checkz%BlockDescriptorOptions.boundary_check  s     ##///###r   c                     yNFr   r   s    r}   r   z#BlockDescriptorOptions.has_indirect      r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wrt   )r   r   r   )r   r   s     r}   r   z4BlockDescriptorOptions.has_rindex.<locals>.<genexpr>  s%      
  m&C&CD
r   )r   r   r   s    r}   r   z!BlockDescriptorOptions.has_rindex  s"     
((
 
 	
r   c                "    | j                         S rt   )r   r   s    r}   r   z BlockDescriptorOptions.has_rmask  s      r   c                     yrE  r   r   s    r}   r   z"BlockDescriptorOptions.has_tmpmask  rF  r   c                4    t        | j                               S rt   )r   rC  r   s    r}   r   zBlockDescriptorOptions.has_mask  s    D'')**r   c                0  	 t        | j                  | j                        D cg c]#  \  }}|rt        j                  j
                  n|% }}}t        |||      }t        j                  j                  	|xr7 t        |      t        |      k(  xr t        	fdt        ||      D              }t        | j                        r2|s0d| dt        j                  j                  | j                         d}t        || j                  |      }|S c c}}w )z
        Generate a broadcast and a reshape for the block descriptor.
        This restores stride-0 dimensions which were removed from the block descriptor.
        c              3  p   K   | ]-  \  }}j                  |d       xs j                  ||       / ywrG   N)r	  )r   pre_dimpost_dimr   s      r}   r   zGBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>  sH       &GX 00!< G33GXFGs   36tl.broadcast_to(, ))r   r   r   r   r  r  triton_reshaperE   r  r   r  r
  r   r  index_to_str)
r   valueinitial_shaper   allow_implicitr!  is_broadcastingpre_broadcast_shapesupports_implicit_broadcastr   s
            @r}   codegen_broadcast_and_reshapez4BlockDescriptorOptions.codegen_broadcast_and_reshape  s    ),$$d&<&<)
$_ +EGGKK3
 
 um5HI 77##&4 '
#$K(88   *--@+)N  	$ t%%&/J&ugR0E0EdFZFZ0[/\\]^E ud&:&:KH9
s   (Dr   r  )r   r   r   r   r  list[IterationRangesRoot]r   r   r  Callable[[str], int]r   r   r   r   r.  r   r   r   r   r   r   r   r   r   )r  r_  r  r^  r   r   )r   r   r   )
rV  r   rW  r   r   r   rX  r   r   r   )ru   r   r   r   r   r   r   r   r   r   r   r   r)  r0  r7  r  rC  r   r   r   r   r   r\  r   r   r}   r   r      s    ))!!%%+/O(/! ! ' ' # # # # ]  ] $	]
 /] #] ,] 
 ] ]~88-78?C8	8
2
+2
 /2
 
	2
h$
!+)) ,) *	)
 ) 
)r   r   c                      e Zd ZdddZy)TensorDescriptorOptionsc                ,   t         j                  j                  }| j                  dk7  r| d || j                         dn|d || j                         d || j
                         d || j                         g}ddj                  |       dS )	a  
        Codegen a call to tl.make_tensor_descriptor()

        Args:
            name: variable name for pointer
            roffset: unused, but kept for compatibility with BlockPtrOptions.format()

        Returns:
            "tl.make_tensor_descriptor(...)"
        r    + (rS  shape=strides=block_shape=ztl.make_tensor_descriptor(rR  )rE   r  rU  r   r   r   r   r   )r   namer/  fargss        r}   formatzTensorDescriptorOptions.format  s     HH!! ''1, &Qt3345Q7Qtzz]O$q'(1T--./0	
 ,DIIdO+<A>>r   NTri  r   r   r   )ru   r   r   rl  r   r   r}   rc  rc    s    ?r   rc  c                  >    e Zd Z	 	 	 	 	 	 	 	 ddZddZdd	dZd
dZy)BlockPtrOptionsc                D    t         j                  |   }t        |||i      S r+  r,  r-  s        r}   r0  zBlockPtrOptions.replace_offset  r1  r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S r3  r4  r6  s      r}   r7  zBlockPtrOptions.remove_roffsets  r8  r   c           	        t         j                  j                  }g | j                  }|s|D cg c]  }| j	                  |       }}| j
                  dk7  r| d || j
                         dn|d || j                         d || j                         d || j                         d || j                         d ||       g}d	d
j                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r   re  rS  rf  rg  rh  zorder=zoffsets=ztl.make_block_ptr(rR  )rE   r  rU  r   r7  r   r   r   r   r   r   )r   ri  r/  rj  r   r   rk  s          r}   rl  zBlockPtrOptions.format"  s     HH!!!DLL/BIJt++F3JGJ ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Ks   C"c           	         t         j                  |   }| j                  D cg c]A  }| j                  |||      | j                  |t        j
                  j                  |      z
  C }}|S c c}w )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        )r   r   r   r0  r   r  r;  )r   r   rblockr   advances        r}   advance_roffsetzBlockPtrOptions.advance_roffset?  st     **40 ,,

  ##FFD9%%feggllDAB
 
 
s   AA,Nr`  ra  rm  rn  )r   r   r   r   )ru   r   r   r0  r7  rl  rw  r   r   r}   rp  rp    s6    88-78?C8	8
7:r   rp  c                r   t        |t              rt        |t              sJ |D cg c]!  }t        j                  j	                  |      # }}|D cg c]!  }t        j                  j	                  |      # }}||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }	|t        |      k  r|	||   k(  r|j                  d       |dz  }0|	dk(  sJ |j                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w c c}w c c}w )z<Workaround https://github.com/triton-lang/triton/issues/28361ztl.reshape(z, [rR  z])r   :rG   r   [])
isinstancer  rE   r  rU  r   r  append)
rV  	old_shape	new_shaper   old_shape_strnew_shape_strsrA  expandsizes
             r}   rT  rT  S  sE    i&:i+FFF?HIeQXX**51IMI?HIeQXX**51IMI% -aAH->UG3tyy'?&@CC
CF "]##c0B(BMM#1HC3;;MM&!" #m$$$$WAdii'(**% JI .s   &D*&D/
D4D4c                      e Zd Zd dZd dZd dZd dZd dZd dZd dZ	d dZ
d d	Zd d
Zd dZd dZd dZd dZd!dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZ y)"TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrG   libdevice.trunc(r   ).to(rS  r  rk  _printrE   r  index_dtyper   r   s     r}   _print_TruncToIntzTritonPrinter._print_TruncToInts  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r   c                x    t        j                         rt        j                  j                  r| }|S d| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcoderv   versionhip)r   r   rets      r}   _print_FloatzTritonPrinter._print_Floaty  s=    %--"3"3FC 
 !m4C
r   c                    t        |j                        dk(  sJ | j                  |j                  d   t        d   dz
        }| dS )NrG   r   Atom      ?z.to(tl.float64))r  rk  parenthesizer   )r   r   r  s      r}   _print_ToFloatzTritonPrinter._print_ToFloat  sI    499~"""diilJv,>,DEO$$r   c                    |j                   \  }}|j                  r3|j                  r'| j                  |j                   dt        d   dz
        S | j	                  |      }| j	                  |      }d| d| dS )N % r  r  z!triton_helpers.remainder_integer(rR  rS  )rk  is_nonnegative	stringifyr   r  r   r   quotdivquot_sdiv_ss         r}   _print_PythonModzTritonPrinter._print_PythonMod  sr    II	c3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr   c                   |j                   sJ |j                  \  }}|j                  r3|j                  r'| j                  |j                  dt        d   dz
        S | j                  |      }| j                  |      }d| d| dS )N // r  r  z!triton_helpers.div_floor_integer(z,  rS  )
is_integerrk  r  r  r   r  r  s         r}   _print_FloorDivzTritonPrinter._print_FloorDiv  s~    II	c3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr   c                P    | j                  |j                  dt        d   dz
        S )N / r  r  )r  rk  r   r  s     r}   _print_IntTrueDivzTritonPrinter._print_IntTrueDiv  s#    ~~dii
60BS0HIIr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrG   libdevice.floor(r   r  rS  r  r  s     r}   _print_floorzTritonPrinter._print_floor  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  r  r  s     r}   _print_FloorToIntzTritonPrinter._print_FloorToInt  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrG   libdevice.ceil(r   r  rS  r  r  s     r}   _print_ceilingzTritonPrinter._print_ceiling  K    499~""" TYYq\!: ;5AUAU@VVWXXr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  r  r  s     r}   _print_CeilToIntzTritonPrinter._print_CeilToInt  r  r   c                ,    d| j                  |       dS )Nzlibdevice.sqrt(().to(tl.float32)))r  r  s     r}   _helper_sqrtzTritonPrinter._helper_sqrt  s    !$++d"3!44EFFr   c                    d| j                  |j                  d          d| j                  |j                  d          dS )Nlibdevice.pow(r   rR  rG   rS  )r  rk  r  s     r}   _print_FloatPowzTritonPrinter._print_FloatPow  s?    T[[167r$++diiPQl:S9TTUV	
r   c                ,   |j                   d   j                  r;dt        |j                   d          d| j                  |j                   d          dS d| j                  |j                   d          d| j                  |j                   d          dS )Nr   r  rR  rG   rS  )rk  
is_Integerfloatr  r  s     r}   _print_PowByNaturalz!TritonPrinter._print_PowByNatural  s    99Q<""#E$))A,$7#84;;tyyQR|;T:UUVWWT[[167r$++diiPQl:S9TTUV	
r   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   rG   r   	tl.where(rR  rS  )doprintrk  )r   r   cpqs        r}   _print_WherezTritonPrinter._print_Where  s_    LL1&LL1&LL1&1#Rs"QCq))r   c                   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zI
        Helper for max/min code generation.
        cmp: > or <
        rG   r   r   Nc              3  (   K   | ]
  }d | d  yw)(rS  Nr   r   r  s     r}   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + )))r  rk  r  typetuple)r   r   cmpmidry   abs          r}   _print_min_max_helperz#TritonPrinter._print_min_max_helper  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr   c                &    | j                  |d      S )Nr  r  r  s     r}   
_print_MinzTritonPrinter._print_Min      ))$44r   c                &    | j                  |d      S )Nr  r  r  s     r}   
_print_MaxzTritonPrinter._print_Max  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   tl_math.abs(r   rS  r  rk  r  r  s     r}   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.cos((r   r  r  r  s     r}   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.cosh((r   r  r  r  s     r}   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.acos((r   r  r  r  s     r}   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.sin((r   r  r  r  s     r}   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.sinh((r   r  r  r  s     r}   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.asin((r   r  r  r  s     r}   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.tan((r   r  r  r  s     r}   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.tanh((r   r  r  r  s     r}   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.atan((r   r  r  r  s     r}   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrG   zlibdevice.log2((r   r  r  r  s     r}   _print_OpaqueUnaryFn_log2z'TritonPrinter._print_OpaqueUnaryFn_log2  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrG   zlibdevice.llrint(r   r  rS  r  r  s     r}   _print_RoundToIntzTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r  rk  r  
ValueErrorr  r   )r   r   numberndigits
number_strs        r}   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr   N)r   r   r   r   )r   r   r  r   r   r   )!ru   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r}   r  r  r  s    
%
FGJ


YYG


*C&55;NOONOONOOO
Qr   r  c                *    t        t        |             S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r?   rA   dtypes    r}   triton_compute_typer	  "  s    *5122r   c                ^    | t         j                  k(  rt         j                  } t        |       S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rv   r   int8r?   r  s    r}   triton_store_typer  '  s"    



ur   c                    t        |       r+| j                  r| j                  dk  rt        j                  S t        |       S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizerv   int32rA   r  s    r}   upcast_acc_dtyper  .  s0    5??u~~7J{{u%%r   c                *    t        t        |             S )z:Convert torch.dtype to triton type, with reduction upcasts)r	  r  r  s    r}   triton_acc_typer  5  s    /677r   c                <    | j                   dk  xr | j                  S )Nr   )r  is_floating_pointr  s    r}   low_precision_fpr  :  s    >>Q:5#:#::r   c                    t        | t              sy| j                  }t        |t        j                        rt	        |      S dS rE  )r}  rM   r  rv   r  )varr  s     r}   low_precision_fp_varr  >  s6    c;'IIE&0&DE"O%Or   c                  <     e Zd Z	 d	 	 	 	 	 	 	 	 	 d fdZd Z xZS )TritonCSEVariablec                \    t         |   ||||       t               | _        |J d       y )Nr   z!TritonCSEVariable must have dtype)super__init__r   r   )r   ri  boundsr  r   	__class__s        r}   r   zTritonCSEVariable.__init__G  s6     	vuE:*4, E"EE r   c                F   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sTt        j                  D ]6  }t        ||      s| j                  j                  t        |    dg          y )Nr   )
r}  r  r   updater   r   r   r   r   r   )r   ri  rk  kwargsargr   s         r}   update_on_argsz TritonCSEVariable.update_on_argsU  s     
	C#01%%cmm4C. *55 D%c40--*T2B1C4/H.IJ
	r   rt   )
ri  r   r!  zValueRanges[Any]r  torch.dtyper   ri   r   r   )ru   r   r   r   r'  __classcell__r"  s   @r}   r  r  F  sH     !%
F
F !
F 	
F
 
F 

Fr   r  c                     ddl m}   |        S )Nr   rf   )!torch._inductor.dtype_propagationrg   rf   s    r}   get_dtype_handlerr-  c  s    L%''r   c                0     dddfdd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    c                    t         j                  j                   xr> t        | t              xr, | j
                  t        j                  t        j                  fv S rt   )	r   r   codegen_upcast_to_fp32r}  rM   r  rv   float16bfloat16)r  s    r}   needs_upcastz*maybe_upcast_float32.<locals>.needs_upcasto  sD    444 =3,=		emmU^^<<	
r   c                (     |       rdnd}|  | S )N.to(tl.float32)r   r   )r  upcast_stringr3  s     r}   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_argv  s!    -9#->)B}o&&r   c                H     t         j                          d fd}|S )Nc                    | D cg c]
  } |       }}|j                         D ci c]  \  }}| |       }}} |i |}xr6 t        fdt        j                  | |j	                               D              }|sd n# t        t               j                        | i |}	|	t        j                  d fv}
|
r|	dt        |	       dnd}| | S c c}w c c}}w )Nc              3  .   K   | ]  } |        y wrt   r   )r   r  r3  s     r}   r   zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>  s      6&)S!6s   .to(rS  r   )r  r   	itertoolschainvaluesgetattrr-  ru   rv   rw   r?   )rk  r%  r&  upcast_argsr%  r&  upcast_kwargsr(  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringr{   rz   r7  r3  s               r}   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped~  s   <@AS+C0AKAHNWHCS"23"77WMW ;8-8F-  # 6-6__T6==?-S6 3
 ( @W.0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   CCr   )rp   r~   )rz   rF  r{   r7  r3  s   ` r}   	decoratorz'maybe_upcast_float32.<locals>.decoratorz  s$    &&t^<	0 	0. r   r   r   )rz   Callable[..., Any]r   rH  r   )r{   rG  r7  r3  s   ` @@r}   maybe_upcast_float32rI  i  s    
': r   c                     e Zd ZdZ ej
                  ej                        Ze	 	 dN	 	 	 dOd       Z	edPd       Z
ed        Zed        Ze e       d               Zed	        Zed
        Ze e       d               Ze e       d               Ze e       d               Ze e       d               Zed        Zed        Zed        Zed        Zedej6                  dddd       Ze e       d               Ze e       d               Zed        Zed        Z e e       d               Z!e e       d               Z"e e       d               Z#e e       d               Z$e e       d               Z%e e       d               Z&e e       d                Z'e e       d!               Z(e e       d"               Z)e e       d#               Z*e e       d$               Z+e e       d%               Z,e e       d&               Z-e e       d'               Z.e e       d(               Z/e e       d)               Z0e e       d*               Ze e       d+               Z1ed,        Z2ed-        Z3ed.        Z4ed/        Z5ed0        Z6ed1        Z7ed2        Z8ed3        Z9ed4        Z:ed5        Z;ed6        Z<ed7        Z=ed8        Z>ed9        Z?e e       d:               Z@e e       d;               ZAe e       d<               ZBe e       d=               ZCe e       d>               ZDed?        ZEe e       d@               ZFe e       dA               ZGe e       dB               ZHe edCD      dE               ZIe edCD      dF               ZJe e       dG               ZKe e       dH               ZLedI        ZMedJ        ZNe e       dK               ZOedL        ZPe e       dM               ZQy)QTritonOverrideszMap element-wise ops to TritonNTc                Z   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r||j                  s||  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r   )rv   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r}   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr   r  z != 0)z.to(tl.int16).to(tl.uint8)r;  rS  )rP  r(  rQ  r(  r   int)
maxrE   r  min_elem_per_threadrv   r   uint8r  r	  r  )r  r  rP  use_compute_typesrS  	out_dtypes         r}   to_dtypezTritonOverrides.to_dtype  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk!!i&A&AYEV S233+E2I)%0ID1%%r   c                    |j                   |j                   k(  sJ | j                  |k7  r|  dt        |       d} |  dt        |       d}t        |      |k7  r| dt        t        |             d}|S )Nr;  rS  z, bitcast=True))r  r  r?   rA   )r  r  rP  outs       r}   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r   c           	         t         j                  j                  |      }t         ||             }t	        |      }|dk(  r|S | dk  r#|j
                  sd|dd   }d| d| d| d| d	S d| d| d| dS )	Nz
tl.float32r   ztl.r  tl.full(rR  r  rS  )rv   _prims_commondtype_to_typerX   r	  r  )rV  r  r   type_
triton_valr?   triton_signed_types          r}   _shaped_constantz TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0,& 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr   c                *    | j                  ||g       S )Nr  )re  )ry   rV  r  s      r}   constantzTritonOverrides.constant  s    ##E5#;;r   c                    d|  dS )Nr  rS  r   r  s    r}   abszTritonOverrides.abs       aS""r   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  rS  r;  )r  r-  truedivrv   r1  rw   r?   r  yr\  rY  s       r}   rm  zTritonOverrides.truediv  sl    !Cs!n"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  rS  r;  )r  r-  modrv   r1  rw   r?   rn  s       r}   rq  zTritonOverrides.mod  sl    !Cs!n"&:1&=)+//15IU]]EMM::T+i"8!9;
r   c                :    t         j                  rd|  dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        ztl_math.exp(rS  zlibdevice.exp()r   use_fast_mathri  s    r}   expzTritonOverrides.exp  s+     !!A&&#A3a((r   c                    d|  dS )Nzlibdevice.exp2(rS  r   ri  s    r}   exp2zTritonOverrides.exp2-       !1%%r   c                    d|  dS )Nzlibdevice.expm1(rS  r   ri  s    r}   expm1zTritonOverrides.expm12       "!A&&r   c                    d|  dS )Nzlibdevice.sqrt(rS  r   ri  s    r}   sqrtzTritonOverrides.sqrt7  rw  r   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", rS  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumrg  rv   r  AssertionError)r  bugs     r}   reluzTritonOverrides.relu<  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r   c                    d|  d| dS )Nztriton_helpers.minimum(rR  rS  r   r  r  s     r}   minimumzTritonOverrides.minimumN      (2aS22r   c                    d|  d| dS )Nztriton_helpers.maximum(rR  rS  r   r  s     r}   r  zTritonOverrides.maximumR  r  r   c                    d|  d| d| dS )Nr  rR  rS  r   )r  r  r  s      r}   wherezTritonOverrides.whereV  s    1#Rs"QCq))r   rG   )constraintsr  is_purepackc                    t        |      }dj                  |D cg c]  }t        |       c}      }|#dj                  dg|D 	cg c]  }	d c}	z         }d|  d| d| d| d| d	| d
S c c}w c c}	w )NrR  z=rr   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=rS  )r	  r   r   )
asmr  r  r  r  inputsr?   i
input_refs_s
             r}   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwiseZ  s     *%0YY71A78
))TF6-Bac-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A.	A3
c                    d|  dS )Nztl_math.cos(rS  r   ri  s    r}   coszTritonOverrides.cosd  rk  r   c                    d|  dS )Nztl_math.sin(rS  r   ri  s    r}   sinzTritonOverrides.sini  rk  r   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)ry   r   r  s      r}   
index_exprzTritonOverrides.index_exprn  s    !"STTr   c                    t        d      )Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      r}   maskedzTritonOverrides.maskedr  s    !"OPPr   c                    d|  dS )Nzlibdevice.lgamma(rS  r   ri  s    r}   lgammazTritonOverrides.lgammav       #1#Q''r   c                    d|  dS )Nzlibdevice.erf(rS  r   ri  s    r}   erfzTritonOverrides.erf{        s!$$r   c                    d|  dS )Nzlibdevice.cosh(rS  r   ri  s    r}   coshzTritonOverrides.cosh  rw  r   c                    d|  dS )Nzlibdevice.sinh(rS  r   ri  s    r}   sinhzTritonOverrides.sinh  rw  r   c                    d|  dS )Nzlibdevice.acos(rS  r   ri  s    r}   acoszTritonOverrides.acos  rw  r   c                    d|  dS )Nzlibdevice.acosh(rS  r   ri  s    r}   acoshzTritonOverrides.acosh  rz  r   c                    d|  dS )Nzlibdevice.asin(rS  r   ri  s    r}   asinzTritonOverrides.asin  rw  r   c                    d|  dS )Nzlibdevice.asinh(rS  r   ri  s    r}   asinhzTritonOverrides.asinh  rz  r   c                    d|  d| dS )Nzlibdevice.atan2(rR  rS  r   r  ro  s     r}   atan2zTritonOverrides.atan2       "!Bqc++r   c                    d|  dS )Nzlibdevice.atan(rS  r   ri  s    r}   atanzTritonOverrides.atan  rw  r   c                    d|  dS )Nzlibdevice.atanh(rS  r   ri  s    r}   atanhzTritonOverrides.atanh  rz  r   c                    d|  d| dS )Nzlibdevice.copysign(rR  rS  r   r  s     r}   copysignzTritonOverrides.copysign  s     %QCr!A..r   c                    d|  dS )Nzlibdevice.erfc(rS  r   ri  s    r}   erfczTritonOverrides.erfc  rw  r   c                    d|  dS )Nzlibdevice.erfinv(rS  r   ri  s    r}   erfinvzTritonOverrides.erfinv  r  r   c                    d|  d| dS )Nzlibdevice.hypot(rR  rS  r   r  s     r}   hypotzTritonOverrides.hypot  r  r   c                    d|  dS )Nzlibdevice.log10(rS  r   ri  s    r}   log10zTritonOverrides.log10  rz  r   c                    d|  dS )Nzlibdevice.log2(rS  r   ri  s    r}   log2zTritonOverrides.log2  rw  r   c                    d|  d| dS )Nzlibdevice.nextafter(rR  rS  r   r  s     r}   	nextafterzTritonOverrides.nextafter  s     &aS1#Q//r   c                    |  d| S Nr   r   r  s     r}   logical_andzTritonOverrides.logical_and      Cs|r   c                    |  dS )Nz == 0r   r  s    r}   logical_notzTritonOverrides.logical_not  s    E{r   c                    |  d| S Nz | r   r  s     r}   
logical_orzTritonOverrides.logical_or  r  r   c                    d|  d| dS )Nr   ^ rS  r   r  s     r}   logical_xorzTritonOverrides.logical_xor  s    1#S1~r   c                    |  d| S r  r   r  s     r}   bitwise_andzTritonOverrides.bitwise_and  r  r   c                    d|  S )N~r   r  s    r}   bitwise_notzTritonOverrides.bitwise_not  s    1#wr   c                    |  d| S r  r   r  s     r}   
bitwise_orzTritonOverrides.bitwise_or  r  r   c                    |  d| S )Nr  r   r  s     r}   bitwise_xorzTritonOverrides.bitwise_xor  r  r   c                    |  d| S )Nz << r   r  s     r}   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      D}r   c                    |  d| S )Nz >> r   r  s     r}   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  r  r   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(rR  rS  r   seedr   s     r}   randzTritonOverrides.rand  s%    VHO,$r&++r   c                     d| d}d|  d| dS )Nr  r  z	tl.randn(rR  rS  r   r  s     r}   randnzTritonOverrides.randn  s%    VHO,4&6(!,,r   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(rR  rS  r   )r  r   lowhighs       r}   	randint64zTritonOverrides.randint64  s1    VHO,*4&6("SED6KKr   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr  )ri  r   s     r}   	load_seedzTritonOverrides.load_seed  s    !"RSSr   c                    d|  dS )Nzlibdevice.rsqrt(rS  r   ri  s    r}   rsqrtzTritonOverrides.rsqrt  rz  r   c                    d|  dS )Nzlibdevice.log1p(rS  r   ri  s    r}   log1pzTritonOverrides.log1p  rz  r   c                    d|  dS )Nzlibdevice.tan(rS  r   ri  s    r}   tanzTritonOverrides.tan  r  r   c                    d|  dS )Nzlibdevice.tanh(rS  r   ri  s    r}   tanhzTritonOverrides.tanh  rw  r   c                    d|  dS )Nztl.sigmoid(rS  r   ri  s    r}   sigmoidzTritonOverrides.sigmoid  s     QCq!!r   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   ri  s    r}   signbitzTritonOverrides.signbit$  s#     "!L3NqcQUV	
r   c                    d|  d| dS )Nzlibdevice.fmod(rR  rS  r   r  s     r}   fmodzTritonOverrides.fmod+  s     !2aS**r   c                    d|  d| dS )Nr  rR  rS  r   r  s     r}   powzTritonOverrides.pow0  s      s"QCq))r   c                    d|  dS )Nztl_math.log(rS  r   ri  s    r}   logzTritonOverrides.log5  rk  r   F)r{   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   ri  s    r}   isinfzTritonOverrides.isinf:       "!M22r   c                    d|  dS )Nzlibdevice.isnan(r  r   ri  s    r}   isnanzTritonOverrides.isnan?  r	  r   c                    d|  dS )Nzlibdevice.nearbyint(rS  r   ri  s    r}   roundzTritonOverrides.roundD  s     &aS**r   c                    d|  dS )Nr  rS  r   ri  s    r}   floorzTritonOverrides.floorI  rz  r   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), rS  r   )r  r  r  rems       r}   floordivzTritonOverrides.floordivN  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r;  .dtype))r  rg  rv   r  rZ  ltr  sub)r  zleftrightr  s        r}   signzTritonOverrides.signW  su    LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r   c                    d|  dS )Nr  rS  r   ri  s    r}   trunczTritonOverrides.trunc_  rz  r   c                    |  d| S )Nr  r   r  s     r}   truncdivzTritonOverrides.truncdivd  s     D}r   c                    d|  dS )Nr  rS  r   ri  s    r}   ceilzTritonOverrides.ceilj  rw  r   )NT)r  r(  rP  Optional[torch.dtype])r  r(  rP  r(  )Rru   r   r   r   mathr  e_LOG_2_EstaticmethodrZ  r]  re  r   rg  rI  rj  rm  rq  rt  rv  ry  r|  r  r  r  r  rv   rw   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r   r   r}   rK  rK    sw   (tyy H ,0	:&:& ):& :&x    D D" < < #  #     )  ) &  & '  ' &  &  " 3 3 3 3 * * "&emmTPQK K #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % &  & "  " 
 
 +  + *  * #  # /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r   rK  r   c                       e Zd ZdZ fdZeej                  d               Zed        Z	ed        Z
ed        Zed        Zed        Zed	        Z xZS )
TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                D    t        |   |i | | j                          y rt   )r  r   _setup_libdevice_routing)r   rk  r%  r"  s      r}   r   zTritonKernelOverrides.__init__{  s#    $)&) 	%%'r   c                   ddl m t        j                  j                  j
                  D ]  }t        | |      sJ t        | |      }fd}|dk(  rDt        d      sJ t        j                  |||      }||_
        t        | |t        |             kd }t        j                  |||      }||_
        t        | |t        |              y)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                    | j                   t        j                  k7  r ||       S  t        |      |       j                  S rt   )r  rv   rx   r?  rV  )r  _original_impl_fn_namer,  s      r}   decomposition_routerzLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_router  s9    77emm+)!,,>7#3X>qAGGGr   r  )r.  r/  c                ^    | j                   t        j                  k(  r	d| d|  dS  ||       S )Nz
libdevice.r  rS  )r  rv   rx   )r  r.  r/  s      r}   dtype_routerzDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_router  s2    77emm+'z1#Q77)!,,r   N)torch._inductor.codegen.commonr,  rv   	_inductorutilsop_requires_libdevice_fp64r   r?  	functoolspartialru   setattrr&  )ry   fn_nameoriginal_implr0  fnr2  r,  s         @r}   r*  z.TritonKernelOverrides._setup_libdevice_routing  s    
 	D,,GG 	4G3(((#C1MH )#/;;;&&(QX &Wl2&67- ""]WB "BKC,r"23;	4r   c                r    t         j                  j                         }dg|z  }| j                  |||      S )NrG   r  )rE   r  triton_tensor_ndimre  )ry   rV  r  ndimr   s        r}   rg  zTritonKernelOverrides.constant  s9    
 xx**,d
##E5#>>r   c                t   t         j                  j                  |dd       }t        |t              sJ t         j                  j                         }|t        j                  t        j                  fvr|n|}t        j                  j                  }	 dt        j                  _        t         j                  j                  j                  t         j                  j                  |j                  t!        |      ||j"                        }|t        j                  _        |t        j                  t        j                  fvrit         j                  j                  j                  t         j                  j                  | j%                  ||      t'        |      |j(                        }n|}|j*                  D ]l  }t-        |t.        j0                        st        j2                  |t         j                  j                  j4                  |j6                     j8                        }n ||k7  r_t         j                  j                  j                  t         j                  j                  | j%                  ||      ||j(                        }|j:                  |_        |S # |t        j                  _        w xY w)NF	block_ptrtma_compatibility_checkerr!  r  r   r  r   )rE   r  indexingr}  r   get_index_dtype_as_torch_dtyperv   r  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r6   r   rZ  rA   r   r<  r   r   r   promote_typesvarname_mapri  r  r   )ry   r   r  rF  r  origr  	index_vars           r}   r  z TritonKernelOverrides.index_expr  s   88$$ET % 
 (O444 hh==?u{{EKK&@@k "">>
	C>CF;((,,''  "",T2++ ( C ?CF;ekk22((,,''  S%()%0ii	 ( C  E!.. 	!)TXX6!//qxx||77	GMME #hhll++HH$$LLk2%))	 ,  !**
= ?CF;s   A8J   J7c           
        | zt         j                  j                  `t        j                  j
                  j                  t        j                  j                  |  dt         j                  | j                        } |j                  j                  d      }|sJ d       d}|D ]>  }|j                  D ]-  }|j                  dk7  st        |j                  d         s+d	} > @ |rd n|}t        j                  j                  | |
      5 } |       }	d d d        |r	j                   j"                  rt        |      }t        j                  j
                  j                  t        j                  j                  d|	 dt%        |       d|	 dt'        j(                  |      |	j*                  |	j                        }t-        j.                  |	|      }
n	}
|
j0                  j3                         |
S # 1 sw Y   xY w)N.to(tl.int1)rE  output)opz)graph for body does not contain an outputFloadrG   TrV  r_  z.shape, rR  r  rD  )rv   r  r  rE   r  rK  rL  rM  r   r   r  
find_nodesrk  targetra   
mask_loadsr!  is_boolrX   r   wrapr  r  r  r   discard)r   r  r  nodes
need_wherenoder&  rV  new_maskr(  r  s              r}   r  zTritonKernelOverrides.masked  s    1 1 =88<<((  &%jjjj	 ) D 

%%%2AAAu
  	Dyy ::'+CCHHQK+P!%J	 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''.llll * E ))Hfe4CCh'
)	 	s   G88Hc                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load( + load_seed_offsetrS  )rE   r  rk  inputseed_offset)ri  r   r  s      r}   r  zTritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r   c                0   d|  d}t         j                  j                  j                  |      x}r|S t         j                  j                  j	                  | j
                  | j                        }t         j                  j                  j	                  t        j                  | j                        }t         j                  j                  j                  | d| d|  d       t         j                  j                  j                  |||f       ||fS )Nzfrexp(rS  rE  rR  z = triton_helpers.frexp()rE   r  rK  try_getnewvarr  r   rv   r  rM  r   put)r  	cache_keycse_valmantissaexponents        r}   frexpzTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWWAGG&D88<<&&U[[&H	""j8*$<QCqA	
 	
Xx$89(##r   c                &    d|  dt        |       dS )Nztl.device_assert(rR  rS  )repr)condmsgs     r}   device_assert_asyncz)TritonKernelOverrides.device_assert_async-  s    "4&49+Q77r   )ru   r   r   r   r   r   r7  cacher*  rg  r  r&  r  r  rp  ru  r)  r*  s   @r}   r(  r(  s  s    ( __"4  "4H ? ? 5 5n , ,\ 
 
 $ $ 8 8r   r(  c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     i | _         g | _        y rt   )ry  r{  r   s    r}   r   zHelperFunctions.__init__8  s    !!#r   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )ri  )ry  getr  r{  r~  rl  )r   template_coder  existing_nameri  s        r}   addzHelperFunctions.add<  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr   c                ,    t        | j                        S rt   )iterr{  r   s    r}   __iter__zHelperFunctions.__iter__R  s    D**++r   c                     | j                   |   S rt   )r{  )r   rA  s     r}   __getitem__zHelperFunctions.__getitem__U  s    %%c**r   Nr   r   )r  r   r   r   )	ru   r   r   r   r   r   r  r  r  r   r   r}   rx  rx  2  s+    -##  $ 4G ,,+r   rx  c                      e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   d
dZy	)r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr  r   r   r   r   c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wrt   )r  r  r  s     r}   r   z*BlockParameters.__add__.<locals>.<genexpr>i  s     Bq[''*Bs   !r   )r  r  )r   r  ry   r  r  r%  s         r}   __add__zBlockParameters.__add__d  sR     4jBT5MBB19a8sc1S6AcF?*8998s   AN)r  r   r   r   )ru   r   r   r   r  fieldr  r   r   r   r   r   r  r   r   r}   r   r   Y  sn     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG:r   r   c                  *    e Zd ZdZd ZddZd Zd Zy)"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t        j                  t        j
                        | _        d| _        d| _        y r3  )	rk  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   rk  s     r}   r   z+CooperativeReductionWorkspaceCache.__init__s  s@    	*66{7H7HIr   c                    | j                   j                  |      }|r|j                         S | j                  j	                  |d      \  }}| j
                  j                  |||f       ||fS rE  )r  r  popleftrk  	workspacer  r~  )r   nbytescachedws_name	ws_offsets        r}   allocatez+CooperativeReductionWorkspaceCache.allocate{  si    %%))&1>>##!YY00?  &'9!=>##r   c                    | j                   D ]&  \  }}}| j                  |   j                  ||f       ( | j                  | _         g | _        | xj                  dz  c_        y NrG   )r  r  r~  r  r  )r   r  r  r  s       r}   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end  s_    *.// 	F&FGY  (//)0DE	F++1r   c                H    | j                   }| xj                   dz  c_         |S r  )r  )r   priors     r}   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count  s#      Ar   N)r  r   )ru   r   r   r   r   r  r  r  r   r   r}   r  r  m  s    
$r   r  c                  $    e Zd ZU ded<   d Zd Zy)FixedTritonConfigzdict[str, int]r   c                     | j                   |   S rt   r   r   r  s     r}   r  zFixedTritonConfig.__getitem__  s    {{4  r   c                    || j                   v S rt   r  r  s     r}   __contains__zFixedTritonConfig.__contains__  s    t{{""r   N)ru   r   r   r   r  r  r   r   r}   r  r    s    !#r   r  c                      e Zd ZdZddZy)	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    c                Z    t         j                  j                  x}r||j                  fS |S rt   )rE   r  
_load_maskri  )r   rl  r   s      r}   augment_keyzTritonCSE.augment_key  s,    88&&&4&tyy))r   N)rl  r   r   zUnion[str, tuple[str, str]])ru   r   r   r   r  r   r   r}   r  r    s    
r   r  c                  R    e Zd ZU dZded<   ded<   ded<   d Z	 	 dd	Z	 	 	 	 dd
Zy)TMACompatibilityCheckerzO
    Checks if the TMA API can be used for load / store triton operations.
    TritonKernelr  r(  r  r   	for_storec                    d| _         y )Nz2Cannot use TMA descriptor for load / store since: )failed_debug_prefixr   s    r}   __post_init__z%TMACompatibilityChecker.__post_init__  s
    #W r   c                   t         j                  j                         j                  dk(  rXt        j
                  j                         d   dk\  r4t        j                  j                  rt        j                  r
t               s!t        j                  d| j                         y| j                  r7| j                   j"                  r!t        j                  d| j                         yy)Ncudar   	   z}%s Requires triton>=3.4.0, a CUDA device with cc>=9.0 and `use_tensor_descriptor` and `assume_aligned_inputs` options enabledFz/%s stores with `no_x_dim` cannot load 16 bytes.T)rE   r  get_current_device_or_throwr  rv   r  get_device_capabilityr   r   use_tensor_descriptorassume_aligned_inputsr   r  debugr  r  r  r  r   s    r}   can_use_tmaz#TMACompatibilityChecker.can_use_tma  s     GG//166&@

00215:33,,)+ II[ ((  >>dkk22IIA(( r   c           
     p   t         j                  j                  j                  |j                  d   t        j                  d            s!t        j                  d| j                         y| j                  j                  }|j                  dd D ]  }t         j                  j                  j                  t        ||z  dt        j                  d            t        j                  d            rbt        j                  d| j                          y |j                  d   }d}d}|j                  D ])  }t        j                   D ]  }t#        ||      s|}|} ) + |r|sJ | d	t        j                           | j$                  j&                  r| j(                  st*        |   }	d}
| j$                  j,                  D ],  }|j.                  s|j0                  |	k(  s |j2                  }
 n |
J | j$                  j5                  |
      }|j7                  ||i      |z  }t         j                  j                  j9                  |t        j                  d            s!t        j                  d
| j                         yy	 t;        t=        t        j>                  ||z  dz
  |d                  }t         j$                  jA                  |      }| j$                  jB                  rW|| j$                  jB                  |   kD  rt        j                  d| j                  || j$                  jB                  |   |       ytE        || j$                  jF                  jI                  |d            | j$                  jF                  |<   y# tJ        $ r# t        j                  d| j                         Y yw xY w)zB
        Check if the block parameters are valid for TMA.
        r   rG   z-%s TMA API requires innermost stride to be 1.FN   r   z8%s TMA API requires outer strides to be 16 byte aligned.z, expr must contain a single block type from zC%s persistent reduction innermost block shape cannot load 16 bytes.zT%s For block %s, fixed config block size %d is smaller than the minimum required: %dz.%s innermost block shape cannot load 16 bytes.T)&rE   r  r   r	  r   r   r5  r  r  r  r  r  r   r   r<  r   r   r   r  persistent_reductionr  r   r  r  r  numel_get_persistent_RBLOCKsubsstatically_known_geqr/   rT  nsolverU  fixed_configrU  tma_min_block_sizesr  r  )r   block_paramselement_sizer   innermost_block_shapeinnermost_block_typeinnermost_block_symtblock_type_str
block_symtinnermost_tree_prefix
tree_numelr>  persistent_rblockinnermost_block_bytesmin_block_sizes                  r}   are_block_parameters_compatiblez7TMACompatibilityChecker.are_block_parameters_compatible  su    ww77  $emmA&6
 II?(( zz**"**3B/ 		F77##;; 5q%--:KLa  		N,, 		 !- 8 8 <##3@@ 	N+77 
!.*=+9(+5(		 $(< 	
$%%QR_RkRkQlm	
<
 ;;++DNN %//C$D!J[[,, >>xx#88%&WW
	
 ))) $ B B: N%**,@BS+TU " 77##88%u}}R'8 		Y,, Z O%!01L@2E0" "#!6!67K!L;;++%(@(@(PP		< 44* KK44^D*  % GJ&77;;NANGDKK33NC   		D,, s   B:N	 AN	 	)N54N5Nr   )r  r   r   r   )ru   r   r   r   r   r  r  r  r   r   r}   r  r    sH     OX	Bw%w 
wr   r  c                  n    e Zd ZU dZeZded<   eZded<   dZ	e
Z	 	 	 	 dL	 	 	 	 	 	 	 dM fdZdNd	ZdOd
Zd Zd Zd Zd ZdOdZd ZedPd       Zdddddd	 	 	 dQdZ	 dR	 	 	 	 	 	 	 dSdZdRdZ	 	 	 	 	 	 	 	 dTdZd ZdUdZ	 dV	 	 	 	 	 	 	 	 	 dWdZd ZdXdZ	 	 dY	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dZdZ dPdZ!d[dZ"	 	 	 	 	 	 d\d Z#	 	 	 	 	 	 	 	 	 	 d]d!Z$	 	 d^d"Z%d^d#Z&d$ Z'd% Z(d& Z)d' Z*	 	 d_d(Z+	 	 	 	 	 	 d`d)Z,	 	 	 	 	 	 dad*Z-	 	 	 	 	 	 	 	 dbd+Z.	 	 	 	 	 	 	 	 	 	 dcd,Z/d- Z0ddd.Z1d/ Z2d0 Z3d1 Z4e5d2        Z6dVdPd3Z7e5d4        Z8e5d5        Z9d6 Z:ded7Z;d8 Z<dVdfd9Z=dgd:Z>dhd;Z?did<Z@djd=ZA	 	 	 	 	 	 dkd>ZBdjd?ZCdld@ZDdmdAZEdndBZFdOdCZGdodDZHeIdddE       ZJdpdFZKdqdGZLeIdrdH       ZMdsdIZNdpdJZO	 	 	 	 	 	 dtdKZP xZQS )ur  zdA class to represent a triton kernel and helpers to generate
    triton kernel programmatically
    rx  helper_functionszCallable[[sympy.Expr], str]kexprTNc                X   || _         || _        t        |   |fi | t	        | j
                  | j                        | _        t               | _	        t               | _
        t        t                  | _        || _        t        j                          | _        t%        t&        t&        f          | _        t+               | _        t/        j0                  t$              | _        t%        t&        t4        f          | _        || _        t/        j:                         | _        t        t>                  | _         d | _!        | jD                  r| jG                  | jH                         | jJ                  r| jM                          | jO                          | jJ                  r| jQ                          y y rt   ))optimize_maskr  r  r   r  newvar_prefixsuffixrK  rO   post_loop_combinepost_loop_storer   r   outside_loop_varsrV  r<  countblock_ptr_iddictr   block_ptr_to_bufferrx  r  r  r  pointer_advancementsrT  r  hint_overrideCounter_load_countsr*   autotune_hintstriton_metar  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   tilingrV  r  r  r  r%  r"  s          r}   r   zTritonKernel.__init__]  sM    $1(*6*T//=1?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! $(S>#3 *6A6I6I6K )6859  ))$))4%%++-!%%002 &r   c                    t        |      S rt   )r?   )r   r  s     r}   dtype_to_strzTritonKernel.dtype_to_str  s    5!!r   c                p    | j                   xr) t        j                  j                  | j                        S rt   )r  rE   choices should_use_cooperative_reductionr  r   s    r}   r  z-TritonKernel.should_use_cooperative_reduction  s-    $$ 
)S)SMM*
 	
r   c                     j                   sJ  j                  D ]$  }|j                  |xj                  dz  c_        &  j                  d   } j                  rt        | j                  d         } j                  j                  |       _        t         j                         _
         j                  j                  d       t         fd j                  D              r j                  j                  d       yy)z/One time setup code for cooperative reductions.NrG   r  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  Z   K   | ]"  }|j                   rj                  |        $ y wrt   )r  _has_constant_mask)r   r   r   s     r}   r   z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>  s0      
   ''--
s   (+z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  r  grid_dimr  r  r   rk  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  r}   r  z'TritonKernel.init_cooperative_reduction  s    )))) $$ 	#D}}("	# KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 

 IIP
r   c                   d}| j                   s| d}| j                  j                  d|        | j                         r| j                  j	                  d       y | j                   rJ | j                  j                  d       y )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r}   r  z,TritonKernel.init_cooperative_reduction_mask  s{    >}},oY7M		.}o>?##%II }}$$IIer   c                2   | j                   D ]q  }|j                  s| j                  || j                         ,| j                  s9| j                  j                  |j                   d| j                  |              s | j                  rt        d | j                   D              rS| j                  ddd      }| j                  |      }| j                  j                  d| j                  |              y | j                  | j                         y y )Nzbase = c              3  4   K   | ]  }|j                     y wrt   )is_loopr   r   s     r}   r   z2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =D4<<=s   baseTr   zrbase = )r  r	  iteration_ranges_codegen_headerr  r  r   r  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   rU  codegen_reduction_indices)r   r   rn_basesrbases       r}   r  zTritonKernel.codegen_range_tree  s    $$ 		D<<44T499E&& 		##{{m74+L+LT+R*ST		   =D,<,<==66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r   s    r}   need_numel_argszTritonKernel.need_numel_args  s     r   c                    | j                   xr4 t        j                  j                  | j                  | j
                        S rt   )r  rE   r  should_use_persistent_reductionr  r  r   s    r}   r  z,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r   c                    | j                   xrG t        | j                        | j                  dz   k(  xr  | j                  xr | j                  d   dk(  S )NrG   r   )r  r  r  r  r  r   s    r}   want_no_x_dimzTritonKernel.want_no_x_dim  sY    %% 1DKK D$;$;a$??1!!1 !!(+q0		
r   c                     y)Nztl.device_assertr   r   s    r}   assert_functionzTritonKernel.assert_function  s    !r   F)
copy_shapedense_indexingoverride_maskrB  rC  c          
     h	     j                        j                  }d}t               t        |t	        j
                  d            D ]j  }	t        |	t        j                        sJ |xs t        |	t        j                        }|rAt        |	t        j                        r? j                  j                  |	j                      }
j#                  |
j$                         t        |	t        j&                  t        j(                  t        j*                  t        j,                  t        j.                  t        j0                  f      rt        j2                  D cg c]  }t        |	|      r	t4        |    }}t7        |      dk(  sJ d|	j                           j9                  |d    d       m t:        j<                  j>                  xs |xs  j@                  duxr dk7  }d	}d}t               } jC                         D ]@  }|jE                  |jF                        rd	}nd}|j9                  |jH                   d       B |r& jJ                  rt:        j<                  jL                  srjO                         ry|sw j@                  skt7        |z
        dk(  rZ jQ                        sI|rG jR                  d
k(  r8	 	 	 	 	 	 dd	 	 	 	 	 	 d fd	 	 	 	 	 	 dfdd fd} |       }||S d}d} jU                        }t        t        jV                        r|r| dn jY                         }|rdnt[         j]                               }d| d| d} j^                  r ja                         st        dg      n
t                j@                  rj9                   j@                         tc        ||||      S |rB|s@|r| dn jY                         }|rdnt[         j]                               }d| d| d}|n|s|rd| d| d}||$|s|r|rdnt[         j]                               }nd}|rt        |g       j@                  rj9                   j@                          je                         tc        ||||      S c c}w )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fri  r%  rG   zAmbiguous type: r   r   NTtl.int32c                    t        j                  | |j                               }|yt        |j                  gt
        j                  |      g|gt
        j                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rH   match_affine_block_exprsymbolr   r  r   r   r   )r   
range_treer   s      r}   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_blockQ  sl     -DD:,,. >&%++,!.!=!=j!I J#H*;;JGH	 r   c                   |j                         }t        j                  dt        j                  t        j
                  |g            \  }}t        dt        j                        | j                  t        ||            | j                  t        |||            z         }t        j                  | ||j                  |      }|y|\  }}}	t        j                  |      }
t         j"                  j$                  j'                  |j(                        t+        fd|
D              ryt,        j/                  |      }t1        ||
d         gt3        |
dd |dd       D cg c]%  \  }}t        j4                  t1        ||      |      ' c}}z   }|	D cg c]#  }t7        ||t,        j9                  |      i      % }}t;        ||||	      S c c}}w c c}w )
a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)ry   r   Nc              3  l   K   | ]+  }j                  |       xr j                  |        - y wrt   )r=  statically_known_power_of_2)r   r  	max_blockr   s     r}   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>  sH        !==eYOO H$@@GGHs   14r   rG   r"  )r$  r   symbolsr7  r8  WildrU  r  range_tree_nodesr  r   r   rH   match_mod_div_block_exprr  get_slice_numelsrE   r  r   r+  r  r   r   r   r   r   Minr>   r   r   )r   r%  rQ  denommodulonum_dimsmatch_resultr$  r   block_index_exprsslice_numelslinear_block_sizer  r!  r   r   r   r+  r   r   s                    @@r}   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_blockg  s   ( '--/	 !&"!))%**ykJ!v --.HY$>?++oi&OPQ	  3KK9j&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ". 
   %2$@$@$L!-|A?1 '*,qr*:DH&E"s IIg&7?E1 !2	3  y-*H*H*TU3 3 ' +#)	 3s   ,*G (Gc                6    fD ]  } || |      }||c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r   r%  
match_funcmatchr&  r9  s       r}   match_block_subexprz2TritonKernel.indexing.<locals>.match_block_subexpr  s:     ''# %J 'tZ8E($% r   c            	     .   t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                         }|D cg c]&  }t        j                  ||j                               ( }}t        d |D              }t               }t        ||      D ]@  \  }}t        |j                  |j                              dkD  r y  ||      }	|	 y ||	z  }B |t        |      z
  }
j                         t         j"                  j$                  rt&        nt(        }|j+                  ||
|j,                        }|t(        k(  r,t/        t0              j3                  |j4                        sy |S c c}} w c c}w )Nc              3  <   K   | ]  }|j                           y wrt   )r$  r
  s     r}   r   zBTritonKernel.indexing.<locals>.match_block_expr.<locals>.<genexpr>  s     *QT4;;=*Q   rG   )r   r   r  r   r  )r>   r.  r  r   active_range_treesrH   get_subexpr_involving_symbolr$  r   r   r   r  intersectionr<  sumfilter_masksr   r   use_block_ptrrp  rc  r)  r+  r	   r  r  r   )vr>  index_relative_to_xyr_indexr  r   index_subexprsrange_symbolsr  subexprr   r   options_classoptionsr   r   r=  r   rC  s                r}   match_block_exprz/TritonKernel.indexing.<locals>.match_block_expr  s   .8$2G2G2M2M2OP$!QAqvvIP/+ #557 !,	"  (DD3T[[]" " !+*Q[*Q Q.0%(n%E 
+MD' =55g6J6JKLqP# 1$?F~# F*L
+ 5s>7JJ !!), }}22 $0 
 (..'$* +'"&.. /  !$;;04/1J1- 5TT  $u Q"s   F+Fz.shaper_  rR  z, tl.int32)xmask)r   rQ  rS  .shape)r   )r   r   r%  r[   r   Optional[BlockParameters])r   r   r%  r[   r   rQ  )r   z Optional[BlockDescriptorOptions])3prepare_indexingr<  r   r   operator
attrgetterr}  r   r   r   r   r   r   r   rK  rO  ri  r$  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  r  r   r   r  r  rA  rC  var_listr  allow_block_ptrrF  r  is_indirect_indexingr  rU  r5  dense_size_strr  dense_size_listr  r  r   rE  )r   r   r  r  r  rB  rC  
index_varsr   r  cse_varr   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   rN  rM  r   r   r   r   r&  r=  r9  s   ``    `                @@@@r}   rF  zTritonKernel.indexing  s    %%e,''

%/\	*(*=*=f*EF 	:Cc5<<000# ~]22(J TXX.((..sxx8  !2!23%%II))JJJJ''
 
 !. 9 9"%c40 t$" "
 >*a/N3CCHH:1NN/!2 3489?	:D MM(( ++d* qj	 	 
+5<++- 	6D&&t}}5!%"
4;;-t 45	6 t338S8S-1==? "OOI/0A5--e4  J.!/B*,`!`/B`*`D .A* < <~ '(G"
'+%%e,	eU]]+2<J<v.$BUBUBWJ#-459M9M9O3PL":,b;GI  )A)A)C&y1	&L	doo.")  j2<J<v.$BUBUBWJ#-459M9M9O3PL*9+R
|1EI'IJ*9+R
|7KI'IZ'1tuT=Q=Q=S7T!"M?3I??MM$//*)$%
 	
E	"s   %R/c                   |j                         }t        |t              r|r&|r$|dk(  sJ d}n|sd}n|r|dk(  sJ d|d}nd|}| j                  r| j                  d   j
                  r|j                         rt        | j                        }t        |t              rd| }nd| }| j                  j                  t        || d|j                  |d	
                    t        |t              rn|| j                  |<   t        j                   D ]L  }|j#                  |      }	t%        d |	D              r'| j&                  |   }
||
vsJ d| d| d       |	|
|<   N ||fS |j                  |      }||fS )N, other=0.0r   , boundary_check=z, padding_option='zero'r   rB  tma_descriptor = F)r/  c              3     K   | ]A  }t         j                  j                  j                  |t	        j
                  d              C ywr   N)rE   r  r   r	  r   r5  )r   r   s     r}   r   z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>	  s=       # ((@@"EMM!$4s   AA	z#duplicate advancement for pointer 'z' at type 'r  )rC  r}  rc  r  r  r	  r   nextr  rp  r  r   rN   rl  r  r   r   rw  r
  r  )r   ri  r  rF  r  checkblock_descriptor_idblock_descriptorr   advance_offsetsadvancementss              r}   codegen_block_ptrzTritonKernel.codegen_block_ptrR	  s    '')h 78 ------+E94KL+E95 !!  $,,##%"&t'8'8"9(O4%./B.C#D %34G3H#I II-.c(//#u/2U1VW (O4 >B(()9: *99 ED&.&>&>t&DO   '6	  !#'#<#<T#BL+<? =>N=O{[_Z``ab? 6EL!12!E&  &&  (s3&&r   c                J   d| d|j                    d}t        t        |j                   |j                              D ]B  \  }\  }}t        j
                  j                  j                  ||      s4d|j                  |<   D |j                  ||j                   |j                  d      }| dt        t        j
                  j                  |             d}t        |t              rd| d| | dS | dt        j                  j!                  |j"                         d| dS )NrQ  rR  rS  Fr;  	tl.store(z.store()r   	enumerater   r   rE   r  r   r	  r   r\  r   r  	get_dtyper}  rp  r  rU  r   )	r   ri  rF  rB  rV  r  rA  r!  broadcast_dims	            r}   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line	  s%    #5'H,@,@+AC *3$$h&>&>?*
 	8%C%#} ww77]K27**3/		8 668'')=)=u

 '/0A0A$0GHIKh0ykE75';;GAHH$9$9(:J:J$K#LBugUVWWr   c                   |s|sy t        |t        j                        sJ | j                  |dd       }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      }
| j                  j                  |
|	dt        j                         y )NFrA  0)
assignmentr  )r}  r   ExprrF  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferrK  rL  rv   r  )r   r   r  lowerr   rF  r   r   size_strlinebuffers              r}   check_boundszTritonKernel.check_bounds	  s     $

+++==RV=W(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 %%h/&$5Lr   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )Nr   )	r   r   rM  r  r  r	  r   r  loads)r   rF  s     r}   r  zTritonKernel.get_load_buffer	  sb      "h&:&:&<<<!!  $,,'') 99::r   c           
     
   | j                   j                        }| j                  xx   dz  cc<   t        }| j	                  |      |}t
        j                  j                        }| j                  |d| j                  | |d            }|j                         |j                         }t        d | j                  |      j                         D              }	| j                  |      rd}
nX|	sd}
nS| j                   rE| j"                  d   j$                  r,fd	}   d
}
t'        j(                  t*        d|      }nd}
|sr8|j-                         r(| j.                  rdt1        | j.                         }nd}nd}	 d}t2        j4                  j6                  r"| j8                  j;                         }|   dkD  }	 | j                  |       xr | j                    xr | xr |	}d}|rd}d}d}t=              r8|}|t>        j@                  t>        jB                  fv rt>        jD                  }d}ntG        |tH        tJ        f      r| jM                  |||      \  }}tG        |tH              rd| | |
 | d}n/| dt
        jN                  jQ                  |jR                         d}|jU                  ||jV                  |jX                  d      }|jX                  }ndtG        |tZ        j\                        rd| d| d}|j^                  }d}n2d| d|j`                   d|jb                   |
 | | d
}|jd                  }|t>        j@                  t>        jB                  fv r/t2        j4                  jf                  r|dz  }t>        jD                  }|t>        jh                  k(  r/t>        jj                  jl                  |dz  }t>        jh                  }| jo                  |      }| jp                  js                  | ||      ||      }|jt                  dkD  rxx   dz  cc<   tG        |tv              sJ |jx                  |_<        |rd| d| d}| jp                  js                  ||||jd                        }|jx                  r|jz                  rd}n|t>        jh                  k(  rd}nd}| j.                  rt1        | j.                        n|}d |jb                   d| d| d}| jp                  js                  ||||j|                        }| j                   r|j                         ss| j                  j                  |       |S )!zc
        Load from the memory location 'name', offset by some indexing expression 'index'.
        rG   TFr  rA  c              3  &   K   | ]	  }|d k(    ywrN  r   )r   r  s     r}   r   z$TritonKernel.load.<locals>.<genexpr>	  s      
AF
   z, eviction_policy='evict_last'r   c                          kD  rsryy)N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsri  s   r}   decide_laterz'TritonKernel.load.<locals>.decide_later	  s    t$~5"3'$r   z, eviction_policy='<EP>'z<EP>r   z, other=rh  z, cache_modifier='.cg'Nr   rc  rS  z.load(re  r  r  r5  rS  rE  rQ  rR  z0.0Truer|  r  )Brk  rf  r  r   r]  rE   r  rx  rF  tma_compatibility_checker_clsr   r   r   get_strides_of_loadr>  is_broadcastedr  r  r	  r7  r8  r5   r   _load_otherrX   r   r   skip_l1_cacher  buffer_read_countsra   rv   r1  r2  rw   r}  rp  rc  rt  r  rU  r   r\  r   r   r   r5  r   r   r   r   r0  r   r  r  r  rK  rL  	use_countr  r   r  r   r   r  r  )r   ri  r   r  	make_lineoriginal_indexr  rF  r   is_coalescedepr  r  has_read_depsr  r  cachemodappend_broadcastr   r  rq  load_buffer
result_varzero	other_valr  r   r  r  s    `                       @@@@r}   rV  zTritonKernel.load	  sj    iiood#''DQCK	 55e<!!$'==&*&H&Heu 'I ' ! 
 ((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H $#D)D 77E (_6M$NO*.*@*@#x+' % h8%&6%7wrd8*AND./vahh6K6KHL\L\6]5^^_`D==(..0D0Dd !,,NEMM:!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk -- %--88MM88))

"u}}'8'8'@ &

**84XX&&4U ' 

 !#"*&7888'11
%j\4D3EQGD**Th6K6K + J !!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..U*:J:J / 
 $$X-?-?-A*""&&z2r   c           	     "   | j                   j                  |      }|}t        j                  j	                  |      }d }|| j                  | |d      }| j                  |d|d u |      }	|| j                   j                  v }
| j                  |      }|
r'|r%| j                  j                  t        |d             t        |	t        t        f      r,| j                  |||	      \  }}| j!                  ||	|||      }n]|$d| d|	j"                   d| d|	j$                   d		}n7|d
k(  r$d| d|	j"                   d| d|	j$                   d	}nt'        d|       t)        j*                         }| j,                  s7| j.                  r+|j1                  | j3                  || j                               | j                  j                  t        ||             | j,                  s| j4                  j7                  |       |j9                          y )NTr  )r  rB  rC  ztl.debug_barrier()rv  re  r  rR  rS  
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)rk  rT  rE   r  rx  r  rF  inplace_buffersr  storesr   rN   r}  rp  rc  rt  rz  r   r   r  
contextlib	ExitStackr  r  enter_contextguard_cooperative_storer  r  close)r   ri  r   rV  moder  r  r  rC  rF  
is_inplacer  rq  r  r  
exit_stacks                   r}   storezTritonKernel.storew
  s	    iit$!!$'$(!<(,(J(Jet )K )% ==dl&?	 ! 
 TYY666
,,^<.KK!!,t5I"JKh2I JK&*&<&<T3&Q#e44h 0%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r   c                    | j                   j                         }|j                  t        |d| d             |j	                         S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  r  r   rN   indent)r   ri  r  rA  s       r}   r  z$TritonKernel.guard_cooperative_store
  sC    
 88NNPd.?uK,PQR}}r   c                t    d }|D ]0  }|t        |d      s||j                  }"||j                  z  }2 |S )Nr   )r   r   )r   	variablesmaskselems       r}   _combine_maskszTritonKernel._combine_masks
  sK     	3D|t[)= NNE!DNN2E	3 r   c                   | j                   j                  t        j                         | j                  j                  |d         }| j                  |d         }	| j                  |d         }
| j                  |d         }|r| j                  j                  |d         nd}|r| j                  |d         nd}|t        j                  k(  rd}n!|t        j                  k(  rd}nt        d      | j                  j                  | j                  d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d||j                        }| j                  |||      }||_        |S )z3
        See [Note: Inductor bucketize op]
        r   rG   r   r   r   r   ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(rR  z, )rE  )r  r  r*   ONE_ELEMENT_PER_THREADrk  rf  rU  rv   r  rH  r  rK  rL  rM  r   r  r   )r   r>  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyper(  r  s                    r}   	bucketizezTritonKernel.bucketize
  s   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  !,, # 
 ##F,<nM r   c                    | j                         }|dk(  rd| dS | j                  }dg||z
  z  dg|z  z   }| ddj                  |       dS 	NrG   z!triton_helpers.promote_to_tensor(rS  rz  r   r{  rR  r|  r>  r  r   )r   rV  ndimsnreducesizess        r}   reduction_resizezTritonKernel.reduction_resize
  sj    '')A:6ugQ??)))VHw,>>$))E*+1--r   c                    | j                         }|dk(  rd| d|fS | j                  }dg||z
  z  dg|z  z   }|g |d ||z
   dg|z  nd }| ddj                  |       d|fS r  r  )r   rV  r   r  r  r  r  s          r}   reduction_resize_and_shapez'TritonKernel.reduction_resize_and_shape  s    '')A:6ugQ?FF)))VHw,>>=B=N9e'uw(9A3=9TX 	 $))E*+1-y88r   c                   | j                   dk(  r|S | j                         | j                   z
  }| j                         }|d| dgz   }| j                  j	                  |t        t        |      ||      |t        |            S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rG   NRBLOCKrE  )r  r>  r_  rK  rL  rT  r   r  )r   r  rV  r  target_ndimrW  target_shapes          r}   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dims  s     ""a'L--/$2I2II,,.$\k2hZ?xx  3u:}lC%	 ! 
 	
r   c                ~   56789: d8d}t        j                  |      D cg c]  }|j                   }}t        j                  ||      }t	        d |D              rHt        j                  |t
        j                        }t        j                  t
        j                         j                  sJ t        d  j                  D              } j                  |       t        |      } j                  r|j                   j                          j                  d   j                  d   }	 j!                         6 j#                  6 fd|      } j%                          j&                  z
  7	 	 	 	 	 	 d97 fd8	 	 	 	 	 	 	 	 d:8fd}
79 fd	}||f}| j(                  j*                  v r j(                  j*                  |   S t-        |      }t/        |      }t1         j3                               }d
|7<    j(                  j5                  |t7        |            }t        d |D              |_        dj;                  |      55fd: j<                  r&t>        j@                  jC                  |      } j#                  tD        |      }d; :fd}dk(  rn?tG        |t6              r&tI        ||      D cg c]  \  }} |||       }}}n	 |||      }dv rtG        tJ              sJ tL        jN                  jQ                         }tS         j(                  jU                   jV                  d|	 d| d||jX                              }ddd   9 | jV                  |||       ||_        ndk(  r8 jZ                  r j]                  ||:|      }n j_                  |      }ndk(  rMtG        t`              sJ |\  }}}t7         fd jc                   jV                  |||7      D              }n.dk(  r je                  |      }ntG        tJ              sJ  8 jV                  ||j                        \  }}} j(                  jU                   jV                  |||      }n j(                  jg                  d| |t7         j3                                     }t>        j@                  ji                  |      } j#                  tD        |      }tG        |t6              s5 jj                  jm                  | d j!                          d| d| d       dv rd| d} jn                  jq                         } jj                  jm                  | d j!                          dt        jr                  |      jt                   d jw                  |       d       ddd   9 jV                  jy                  d| d | d!9 d"| d| d| d|	 d#| d$ :| d%|       d&| d$ :| d%|       d&        | jz                  |||       nt}              r j]                  ||:|      }ndk(  r3d| d'} d| d(}! jj                  jm                  |  d j!                          d)| d        jj                  jm                  |! d* j!                          d| d        jV                  jy                  d+|  d |! d,|  d|! d| dt~        j                   d-        jV                  jy                  d+|  d$ :|  d%|        d+|! d$ :|! d%|!       d+	       |}" j(                  j5                  |"jX                        }# j                   jz                  |"|#| |!7      }nt?        j                  |      }$ |$||      }% jV                  jm                  | d$ :|%|              |t
        j                  k(  rD j(                  jU                   jz                  | d.t
        j                  |jX                        } |
 jz                  ||d         jZ                  rt>        j@                  ji                  |      }t        j                         }& jz                   j                  fD ]2  }'|'jm                  d/       |&j                  |'j                                4 dv r jz                  jm                  | d0 j                  | d1               j                  | d2||      }( jn                  jq                         } j                  ||t        jr                  |      jt                        }) | j                  ||(|)       nFt}              rdk(  sJ |\  }*}+}, j                  |*t/        |      |d         }- j                  |+t/        |      |d3         }. j                  |,t/        |      |d4         }/ j                   j                  |*|+|,|-|.|/7	       ndk(  rw|\  }"}#tG        |t`              sJ  j                  |"t/        |      |d         }0 j                  |#t/        |      |d3         }1 j                   j                  |"|#|0|17       n1 j                  |t/        |      |      }2 |
 j                  ||2d        |&j                          | j(                  j*                  |<   tG        |t6              rt        d5 |D              sJ  j                  j                  |       d6v rt        |      d3k(  sJ t        |      |z  }t        |      t        |      k(  sJ tI        ||      D ]F  \  }3}4|4J |3j                  |4k7  s jz                  jm                  |3 d$|3 d7t        |4       d       H |S tG        |t              sJ  j                  j                  |       |j                  |d   k7  r7|d   J  jz                  jm                  | d$| d7t        |d          d       |S c c}w c c}}w )<Nc                    | j                   t        j                  t        j                  fv r$t	        j
                  | t        j                        S | S rt   )r  rv   r1  r2  r  rZ  rw   rW  s    r}   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcast)  sF     ;;MMNN UEMM2 r   c              3  `   K   | ]&  }|t         j                  t         j                  fv  ( y wrt   )rv   r1  r2  r  s     r}   r   z)TritonKernel.reduction.<locals>.<genexpr>9  s"     MqU]]ENN33Ms   ,.c              3  :   K   | ]  }|j                    d   ywr   Nr  r
  s     r}   r   z)TritonKernel.reduction.<locals>.<genexpr>?       MDdkk]$/M   r   r   c           	         j                   j                  j                  d|  d d| j                  t	        j                                     S )NrQ  rR  rS  rE  )rK  rL  rM  r  r  r_  )rG  r^  r   s    r}   <lambda>z(TritonKernel.reduction.<locals>.<lambda>N  sM    dhh''"1#R'7q9ggD0023	 (  r   c           
     J   	dv }|rdnd}
j                  | |      }	dv r,
j                  | d	 d| d d|j                        \  }}n+
j                  | d	 d	| d d|j                        \  }}|| d

j                  |       d}n|j                  }|||fS )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   rU  minprodtriton_helperstl)rU  r  r  z2(rR  rS  r  r;  )r  r  r   r  r  )r  rV  result_type
use_helpermoduler(  r   r!  r  reduction_typer   s          r}   final_reductionz/TritonKernel.reduction.<locals>.final_reductionZ  s     (+HHJ)3%F00FE/ $ ? ?ha/r%3%qA5;;! !% ? ?ha/qr#a@%++! &"84(9(9+(F'GqI#kk;--r   c                N     | ||      \  }}}| j                  | d|        y)zU
            Generate a reduction and assign it to an existing variable.
            rk  N)r   )r  r  rV  r  r  r  s        r}   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_definev  s0     *&%EKE1aMMZLE734r   c                    j                  | |      }j                  | |      }| j                  d| d| d d| d| d d| dj                  | d       d	       y )
N                z_val, z_idx = triton_helpers.z_with_index(rR  )
                rk  _idx
                )r  r   r  )r  r  rV  r   r!  r  root_opr   s       r}   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr   ry  rE  c              3  >   K   | ]  }t        |d          r|  ywrm  )r;   )r   r  s     r}   r   z)TritonKernel.reduction.<locals>.<genexpr>  s!      *
(;CF(CC*
s   r   c                :    s| S t         j                  | |      S rt   )r(  r  )tvalfvalrs  s     r}   
where_condz*TritonKernel.reduction.<locals>.where_cond  s     (..tT4@@r   c                    j                   j                  j                   | |      | j                  | j                  | j                        S |j                        S )NrE  )rK  rL  rM  r  r   )rV  defaultr   r  s     r}   _mask_valuez+TritonKernel.reduction.<locals>._mask_value  sa    xx((LLug.++).)@%++	 )   GNmm	 )  r   online_softmax_reduce)argmaxargminrQ  zindex, rP  rU  r  welford_reducewelford_combinec              3  v   K   | ]0  \  }}j                   j                  j                  ||        2 yw)rE  N)rK  rL  rM  )r   rV  r   r  r   s      r}   r   z)TritonKernel.reduction.<locals>.<genexpr>  s9      #$u HH%%dllEe%T#s   69r   = tl.full(rR  rS  _indexr  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                rk  _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrG   r   c              3  <   K   | ]  }t        |t                y wrt   )r}  r  r  s     r}   r   z)TritonKernel.reduction.<locals>.<genexpr>  s     LAz!%67Lr@  )r  r  r;  )rV  rM   r   rM   )rV  rM   r  r"  r   z1tuple[str, Optional[torch.dtype], BlockShapeType])r  rM   rV  rM   r  r"  r   r   r   rM   )Upytreetree_leavesr  tree_mapr   rv   rN  rw   r  r   r  rE  r   r  r~  r  r^  _map_tuple_or_scalarr>  r  rK  reduction_cacher  r  r  r_  rj  r  r   r   r  r    	Reductiondefault_valuerX   r}  r   rM   rE   r  rG  r   rL  rM  r   r  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbacknamedvardefault_accumulatorr  r   r  select_index_dtypeiinforU  r  r   r  r9   r   rs  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   r  r  r  r  r  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r
  r  r$  r  r	  r  r  );r   r  rP  r  rV  r  r&  original_dtypesr  reduction_range_prefixr  r  rl  acc_typetorch_acc_typeresult_shaper  r  r  rG  dmasked_valueaccumulator_dtypeaccumulator_indexmeanm2weight_result_dtype_shapeaccumulatorr  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedr  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr  
orig_dtypers  r^  r!  r  r  r  s;   `` `                                                 @@@@@@r}   	reductionzTritonKernel.reduction"  s   	 170B0B50IJ399JJe4M_MM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!? ,,.)) 
 %%'$*A*AA	.	. /	. ?		. 	.8
	5#
	5 
	5 /	
	5
 
	5	 6	00088++I66"9-))4D0023S((// l(; * 

  * *
 *
  

 zz% 	A
 $$ll00KG//wGG !88 E5)>A%>QRdaAq 1RR*5':!55!,<<<$%HH$K$K$M!$'HH%%*+A*B',W^_/*00	 & %! &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,999%1"r6" #(,dBU)# 
  #:: "BB5%P
!,<<<*9LL,0B0B+' "XX..LL'v / 
 ((++J< $D0023 , K
 ll66~yQG//wGGgu-		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n5!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5
@P@P_Q
!GG**##
  88S
$[%8&&"m3z';'G&HI 

* #'(("3"3..&-|4#jj)//	 #4 #K '**JT %%ll66~yQG#--/J..0D0DE 7./((67
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n5%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
J!'8444JJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W 't';';ZPTU.8  +j%(LLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z?#C Z!---99
***44%s3%t,?
,K+LAN"  j*;<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` G Kx  Ss   p4p9c                   | j                  |||      }| j                  |||      }t        d      D cg c]'  }t        | j                  j	                  |            ) c}\  }}|j                  d| d| d| d| d| dt        j                   d| d| j                  |        d| d| j                  |        d       ||fS c c}w )Nr   r  
            rR  9 = triton_helpers.online_softmax_reduce(
                )
            rk  )	r  r  r   rK  rj  r   r   rs  r  )	r   r  r2  r3  r!  r  r  r4  r5  s	            r}   _online_softmax_reducez#TritonKernel._online_softmax_reduce  s     66vPUV66vPUVMRSTX!V#dhhooEo&B"C!V
JL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%% "Ws   ,Cc           
          fd|||fD        \  }}}d| d| d| d d	}fd}|||fD 	cg c]/  }	 j                   j                   ||	j                              1 }
}	j                  dj	                  |
D cg c]  }t        |       c}       d|        t         fd|
D              S c c}	w c c}w )	z;
        Helper to codegen triton_helpers.welford.
        c              3  D   K   | ]  }j                  |        y wrt   )r  )r   rV  r  r  r   s     r}   r   z(TritonKernel._welford.<locals>.<genexpr>  s(      
 ((>
s    ztriton_helpers.welford(rR  rS  c                2    t        | d | dz   d  z         S )Nr   rG   )r  )r   r!  s    r}   reduced_shapez,TritonKernel._welford.<locals>.reduced_shape  s$    qcAgi(8899r   rE  rk  c              3  V   K   | ]   }j                  ||j                         " y wrt   )r  r   )r   rV  r   s     r}   r   z(TritonKernel._welford.<locals>.<genexpr>  s*      
 ++E5;;?
s   &))rK  rj  r   r   r   r   r  )r   r  r+  r,  r-  r!  r  welfordrN  rV  welford_resultsr   s   ``   ``     r}   r  zTritonKernel._welford  s    
F+
b& ,D6B4r&C5J	:
 F+
 HHOO%}U[[/IOJ
 
 	DII&G!s1v&GHIWIVW 
(
 
 	

 'Hs   4B:B?c                   | j                         | j                  z
  }t        | dt        | j	                               |t        j                               }t        | dt        | j	                               |t        j                               }	t        | dt        | j	                               |t        j                               }
| j                  j                  | d| j                          d| d       | j                  j                  |	 d| j                          d| d       | j                  j                  |
 d| j                          d| d       |dk(  r>|\  }}}| j                  j                  d	| d
|	 d
|
 d| d|	 d|
 d| d| d| d       n8|dk(  sJ | j                  j                  d	| d
|	 d
|
 d| d| d|	 d|
 d       | j                  j                  d| d || d|       d|	 d ||	 d|	       d|
 d ||
 d|
       d       |}| j                  | j                  |dd||	|
||	      S )z%Helper to codegen a welford reduction_meanr   r  r!  _m2_weightr  rR  rS  r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            rk  r  rG  N)r>  r  r  r  r_  r   unknownr  r   r^  rM  r   r!  r  )r   r  r  rV  r  r$  r  r!  r1  accumulator_m2accumulator_weightr+  r,  r-  r;  s                  r}   r  zTritonKernel.welford_reduce  s     %%'$*A*AA'l% ,,./&&(	
 +l#,,./&&(	
 /l'",,./&&(	
 			m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$D"fLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !22""

 
	
r   c
           
        t        | j                  ||||||	            }
|||g}t        t        ||
            D ]E  \  }\  }\  }}|"| j                  j                  |	|      }|||<   |j                  | d|        G t        |      S )z0Helper to codegen call to triton_helpers.welfordrE  rk  )r  r  rw  r   rK  rj  r   r  )r   r  r;  r<  r=  r+  r,  r-  r!  r  r>  result_exprsr  result_exprrV  r   s                   r}   r!  z+TritonKernel.welford_reduce_final_reduction-  s     dmmFD"fc5IJ#Y>09#lF:S0T 	6,A,^eU""hhooEoG"-QMM[MUG45		6 \""r   c                   | j                  |||      }| j                  |||      }	|j                  d| d| d| d|	 d| dt        j                   d| d| j	                  |        d| d| j	                  |        d       ||fS )NrG  rR  rH  rI  rk  )r  r   r   rs  r  )
r   r  r4  r5  rA  rB  r!  r  r2  r3  s
             r}   r  z2TritonKernel.online_softmax_reduce_final_reductionE  s     66vxO66vxOL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%%r   c                D    | j                   r| j                   d   S t        S )NRSPLIT)r  r-   r   s    r}   
max_rsplitzTritonKernel.max_rsplitT  s"    $$X..  r   c                   | j                   d   }| j                         sdnd}||j                  z  | j                         z  }| j                  j                  |      \  }}| j                  j                  d| d| d| j                  |       dt        |       d| d	| d
| dd       | j                  | dddg|t        j                               }	| j                  j                  |	 d| dt        |       d       |	S )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r  zxindex < xnumelNr  z_ws = (rd  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), rR  rI  Tstrip_peersr   r_  rT  z = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  )r  r  r  r`  r  r  r  r   rU  r?   create_cse_varr   rW  r  r   rX   )
r   r  r  default_valxnumelr   r  r  r  rC  s
             r}   r   z7TritonKernel.codegen_cooperative_reduction_peer_combineY  sS    S!(,(@(@(B %..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 ##l&!X&&&(	 $ 
 	&&g[ -eers~e  eA  ACD	
 r   c                @   | j                   sJ d| _         t        j                  j                  |      }| j	                  |d| j                  | |d            }d| _         | j                  j                  |      }t        j                         }| j                  r+|j                  | j                  || j                               t        |t        t         f      rY| j                  j#                  t%        || j'                  |||j)                  |      |d|j+                                            nXt        |t,              sJ | j                  j#                  t%        |d| d|j.                   d| d	|j0                   d
	             |j3                          y )NFT)r  r  r  rA  ri  rv  re  r  rR  rS  )r  rE   r  rx  rF  r  rk  rT  r  r  r  r  r  r  r}  rp  rc  r   rN   rz  rl  rC  r   r   r   r  )r   ri  r   rV  r  rF  r  r  s           r}   store_reductionzTritonKernel.store_reductionz  s    $$$$ %!!$'==&*&H&H5D 'I ' ! 
 !%iit$))+
%%$$,,T43G3GH h2I JK  **55  ,+H,C,C,E+HI	 h888  **uD););(<CwbIZIZH[[\] 	r   c                N   t               j                  d       t               t        d      D cg c]*  t	        fdt        t        ||            D              , }}dj                  d t        j                  j                  |      D              }j                  d| d       t               dd	d
lm} d	dlm}  |        |        G fddt               }	j#                         5  t%        j&                   |	             5   || }
dj                  d |
D              }
j                  d|
        d d d        d d d        | j(                  j+                  j-                               S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr   c              3  r   K   | ].  \  }\  }}j                  d  d| ||j                         0 yw)r&  r  rE  N)r  r   )r   nrV  r  rK  r  s       r}   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s@      %A~u s1#Qqc]%u{{Ks   47rR  c              3  2   K   | ]  }t        |        y wrt   r   r  s     r}   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     Rc!fR   zdef {name}():r}  r   rf   )ShapePropagationOpsHandlerc                  4    e Zd Z	 	 	 	 	 	 	 	 d fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    	d| z  	 t        |      |i |} t        |      |i |}j                   t        
|      |i |||      S )Nr  rE  )r?  rL  )r   ri  rk  r%  output_dtypeoutput_shaperK  dtype_handlerhelperhelper_name	overridesshape_handlers         r}   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._default  s     4&z) w!   # " #
 w!   # " #
 ||,GIt,d=f=&&	 $  r   N)ri  r   rk  ztuple[Any, ...]r%  dict[str, Any]r   r   )ru   r   r   r|  )rK  rw  rx  ry  rz  r{  s   r}   CSEProxyrs    s-    '6@N r   r~  c              3  2   K   | ]  }t        |        y wrt   rn  )r   rT  s     r}   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     BFBro  return r~  )rO   r   rL   r  r  rw  r   r   r<  r=  from_iterablerK  r,  rg   !torch._inductor.shape_propagationrq  r'   r  rE   set_ops_handlerr  r  r   )r   r<  r>  dtypesr  rk  	signaturerg   rq  r~  outputsrK  rw  rx  ry  rz  r{  s       `      @@@@@@r}   _lift_helperzTritonKernel._lift_helper  sy   
  !'e 1X

 	  )23vv3F)G 
 
 IIRioo.K.KD.QRR	=267#%	 *PP2424	 	~ 	0 ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RRk
`	2 	2 	2 	2s)   /F
F2FFF	FF$c                B     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d       g }g }t        d |D              }t        j                   j                  j                   j                        } j                  |||      } j                          j                  z
  }	t!        ||      D ]U  \  }
} j                  j                   j                  |
 dt#        |       d||
j$                        } j                  j                   j                  d| d	 j'                          d|t         j)                                     }
|j+                  |
       t-        |      } j.                  rǉ j)                         }d
|d<    j                  j1                  ||      }dd	j3                  |       d}|j4                  rdnd} j6                  j9                  | d| d	| d	| d       |j+                  |       X d  fd} |d |       d|	 d	| d|||      } j.                  sd }|D cg c]*  } |d| dt;        |j<                         ||            , }} |t        |      t        |            } |t        |      |      }t!        ||      D cg c]+  \  }} |d| d	| d|j<                  |j$                        - }}}t!        |||      D ]*  \  }}} j                  j9                  | d| d	| d       , n|}|D ]$  }t?        |t@              sJ t        |      |_!        & t        |      S c c}w c c}}w )z:
        Perform an associative scan on 'values'.
        TODOc              3  :   K   | ]  }|j                    d   ywr  r  r
  s     r}   r   z$TritonKernel.scan.<locals>.<genexpr>  r  r  z(ops.scan not supported inside ops.maskedc              3  2   K   | ]  }t        |        y wrt   rA   r   r  s     r}   r   z$TritonKernel.scan.<locals>.<genexpr>        Fe*51Fro  r;  rS  rE  rQ  rR  ry  r   r{  r|  zfloat('nan')z-1r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   yw,Nr   r   rV  s     r}   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>$       <EugQK<r  r   r>  s    r}   csvzTritonKernel.scan.<locals>.csv#      88<V<<<r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )NrR  c              3  T   K   | ]  }j                   j                  |       ! y wrt   rK  containsr   rl  r   s     r}   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>)        LI488$$Y/L   %(rE  rk  r  r  r
  rK  r  r   rj  r   rM  r   r   rk  r  )r  r>  r  r  rl  r  
cache_keysrl  r  rV  result_varsr  r  r   s               r}   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple&  s   FA;@8DaTF"QCr%1DJDLLLAKLIY/LL '*&&&9"UE e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% EL   D"D91D!ztl.associative_scan((r  c                T    | j                   y t        | j                         }d|d<   |S )Nry  r   )r   r  )r  r   s     r}   _partial_scan_shapez.TritonKernel.scan.<locals>._partial_scan_shapeC  s*    99$ OE #E"I Lr   ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"r  r  r   r  rE  r   r  r  r7  r8  rK  rL  rM  r  r>  r  r   r	  r   r^  r_  r~  r  r  rj  r   r  r  r   rA   r  r}  r  r   ) r   r  r6  r>  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr!  rV  r  value_dtyper$  reduced_sizer1  reduced_size_strr  r  partial_scan_varsr  partial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr  acc_nextpartial_reducer  r  s    `                              @r}   scanzTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j&&I%%'$*A*AA/ 	1LE5((++'1%89;kk	 , K HH%%";-r$2E2E2G1HJD0023	 & E %%e,&u-H,,#335#&R "hhooEoN%&tyy'>&?q#A ,1,C,C.		##"m;/?.@7)2hZWXY ##K09	1<	=	&$ )#C(:$;#<CuBGXFYYZ[	
 ((! ):# % 12B1CCtu-.>.D.DE-.>?# # #5#6>Q8RSI'l(;=NON 03>CT/U ,I| ,YKr,qI&,,&,,K  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	5Jj*;<<<#-e#4J 	5 [!!?#s   /N20Nc                ,     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d        j                  sJ d       t        j                   j                  j                   j                        } j                          j                  z
  }t        d |D              }t!        |      t!        |      k(  sJ t#        |      D 	cg c]?  \  }}	 |d|	 d j%                          d||   t         j'                               	      A }
}}	d
  fd} j                  d   j(                  sJ  j+                   j                  d         rdnd}t!        |      dk(  r'd|
d    d|
d    d| d| d| d| d} |||
||      }nt-        d      t/        ||      D ]  \  }}||_        |j2                  |_         t        |      S c c}	}w )Nr  c              3  :   K   | ]  }|j                    d   ywr  r  r
  s     r}   r   z$TritonKernel.sort.<locals>.<genexpr>u  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  2   K   | ]  }t        |        y wrt   r  r  s     r}   r   z$TritonKernel.sort.<locals>.<genexpr>  r  ro  rQ  rR  rS  rE  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   ywr  r   r  s     r}   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>  r  r  r  r  s    r}   r  zTritonKernel.sort.<locals>.csv  r  r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )NrR  c              3  T   K   | ]  }j                   j                  |       ! y wrt   r  r  s     r}   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  r  r  rE  rk  r  )r  r  r  r  rl  r  r  rl  r  rV  r  r  r  r   s               r}   r  z'TritonKernel.sort.<locals>.cse_multiple  s   &'A;@8DaTF"QCr%1DJDLLLAKLIY/LL %(0B$C E5 e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELr  r   r   rnumelr   ztriton_helpers.sort_with_index(r   rG   z	, stable=z, descending=zUnhandled sort)r  r  r   r  rE  r   r  r  r7  r8  rK  rL  rM  r>  r  r  r  rw  r^  r_  r  r  r  r   r   r!  )r   r  r>  stable
descendingr  r  r!  r  rV  r  r  r  r  r  r  	input_varr  s   `                @r}   sortzTritonKernel.sortl  s@    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k))) &f-
 5 "5'D,?,?,A+B!DQiD0023
 
	=	&$ #00002243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  't-?OK !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!]
s   AHc                   | j                   s=| j                  s1| j                  s%| j                  s| j                  s| j
                  sy| j                  D cg c]  }|j                  s| }}| j                  rWt        |      dkD  rHt        |      D ]  \  }}| j                  j                  |      5  |j                  }| j                  rdnd}| j                  rdn| d}| j                  j                  d| d	| d
| d
|j!                          d	       ddd       | j                  j                  |dz         5  | j#                  || j                         ddd        | j                  j                  t        |            5  | j%                  | j                         | j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         ddd       t)        g t        |            D ]o  \  }}| j                  j                  |dz         5  | j*                  |j,                     j/                         D ]  \  }}|t        |      dz
  k  rs||dz      }	| j*                  |	j,                     |   }
t0        j3                  |	      }t5        |	j6                  |      }t9        ||
      D cg c]  \  }}|||z  z
   }}}| j                  j                  t;        | j<                  |   | d| d
t>        j@                  jC                  |       d              	 ddd       | jD                  jG                  | jH                         |jK                          r n| j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  rb| j                  s| j
                  rJ| jL                   d}| j                  j'                  d| dd       | jN                  jQ                          | j                  j'                  | j
                         | j                   jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j
                  jS                          yc c}w # 1 sw Y   xY w# 1 sw Y   BxY w# 1 sw Y   xY wc c}}w # 1 sw Y   ^xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nr   )r   rsplit_startr|  
rsplit_endr  zfor zoffset in range(rR  zBLOCK):rG   z = tl.advance(rS  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Trb  )*indexing_coder  r  rM  r  r  r  r	  r  r  rw  r  r  r  r  r   r   r  r  r   r  r  r   r  r   r   r   r  r   rN   r  rE   r  rU  rK  
invalidater  cache_clearr  r  r  clear)r   r   
loop_treeslevelr  
loop_startloop_endrB  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                   r}   codegen_bodyzTritonKernel.codegen_body  s	    zz{{||%%##'+'7'7Ht4<<dH
H  S_q%8(4 JtYY%%U%3 ![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh YY%%UQY%7 J88tyyIJ JJ !!Z!9 ...tyy9		  !3!34		  ,		  .		  -.  ((@)J*?(@A #tYY%%UQY%7 262K2K		3eg.	; !3z?Q#66(2519(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[BR1S+$-C !$d]&: :+K +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!4 ##D$:$:;  "9#< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD		--.  "

$$&""$] I J J. .,+ sR   V=*V=9A&WWB0WBW/#W)5AW/W	W	W&)W//W9	c                V   g }| j                         rg }| j                  d|g        |D ]  }t        |t              r|j	                  t        |             .t        |t              rL|j	                  t        t        j                  j                  j                  |j                                     t        |t        j                        rB|j	                  t        t        j                  j                  j                  |                   t        dt        |              |S )Nr   z!Unsupported numel argument type: )r  add_numel_to_call_argsr}  rT  r~  r   rc   rE   r  r   	size_hint
inner_exprr   r~  r  r  )r   rk  
numel_argsr&  s       r}   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args  s    !+-J''J;! Vc3'KKC)_5KKAGG$4$4$>$>s~~$N OPUZZ0KKAGG$4$4$>$>s$C DE$'Hc%TUUV r   c                z   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]  \  }	}
dt        |       }t        j                  j                  |	      }|r|j                  | dt        j                  j                  j                  |j                         | j                          dt        j                  j                  j                  |j#                         | j                          d|j%                          d|j'                          d
       n|	t        j                  j(                  v rt        j                  j(                  |	   }|j                  | dt        j                  j                  j                  |j+                         | j                          dt        j                  j                  j                  |j-                         | j                          d|j.                   d|j0                   d
       nt3        |
t4              rZt        j                  j                  j7                  |
j8                        }d	|
j:                  v rd
}|j                  | d|        nt3        |
t<              ryt        j                  j?                         }t        j                  j                  j7                  |
j                        }|j                  | d| d| d|
j0                   d       ntA        d|	       |jC                  |        |jE                  | jG                                |j                  ddjI                  |       d       d d d        |j                  g d       t        j                  j?                         }|jJ                  }|j	                         5  |j                  dt        j                  jL                  jO                  |       d       |j	                         5  |j                  t        j                  jL                  jQ                  |             d| }|j                  | d| d       |j                  tS        tT        jV                         d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  dt        j                  jL                  jO                  |       d       |j	                         5  |j                  t        j                  jL                  jQ                  |             |j                  dtS        tT        jV                         d       d d d        d d d        |j                  g d       |j	                         5  |j                  d       |j                  d       |j                  d       |j                  d       |j                  d|        |j                  d       |j                  d       d d d        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w) N)r   r   zdef get_args():arg_z = rand_strided()r  rR  z
, device='z	', dtype=rS  rg  r   rk  z = torch.zeros(z*Don't find the buffer or const tensor for r  r  )
r  zdef call(args):zwith rz  streamz = get_raw_stream(z.run(*args, stream=)r  r  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r  r  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s")),rO   rk  python_argdefs
writelinesr  r<  r  r   rn  rE   r  try_get_bufferr   r   
size_hintsget_sizer  
get_stride
get_devicerx  	constantsr  r   devicer  r}  rT   r  r   ri  rV   r  KeyErrorr~  extendr  r   r   
device_opsdevice_guard
set_devicer   r:   KERNEL_NAME)r   num_gbr(  _argdefs	call_argsr  r  name_cnt	var_namesarg_namearg_sigvar_namer8  const_tensorsymval_hintr  r  current_devicer   stream_names                       r}   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark&  s   !,0II,D,D,F))Y56]]_ %	@ (HI%(I%>  +!'!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_os  pB  pB5P  6C  5D  DF  GH  GN  GN  GW  GW  Gb  Gb  cf  cq  cq  cs  CG  CU  CU  Gb  GV  FW  Wa  be  bp  bp  br  as  s|  }@  }J  }J  }L  |M  MN  O !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQdtx  uG  uG5P  6H  5I  IK  LM  LS  LS  L\  L\  Lg  Lg  ht  h{  h{  h}  MQ  M_  M_  Lg  L`  Ka  ak  lx  l  l  k@  @I  JV  J\  J\  I]  ]^  _  1"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC6WW@@BFGG,,66w}}EE$$#*OE7*VHIV]VcVcUddef #DXJO    *A +B T==?@wtyy';&<A>?K%	@N 	9:<<>$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;XY		 	DE]]_ 	N R 01L y12=>N	  g%	@ %	@X 
	 
	  	 		  sf   M"Y1AZBY>ZAZ$A%Z9Z$,A;Z01Y;>Z	ZZZ!	Z$$Z-0Z:c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentrl  rE   r  r  import_get_raw_stream_asr   s    r}   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernel  s:     F177%%>>?OPQ
 	
r   c                    | j                   ry| j                  ry| j                  r| j                  sJ y| j                  ryy)Nr  r  r  rE  	pointwise)r  r  r  r  r   s    r}   _get_heuristiczTritonKernel._get_heuristic  sD    !''*&&(((()""r   c                    t         j                  j                  j                         t        j                         t
        j                  t
        j                  t
        j                  j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  j                  t
        j                  j                   t
        j                  j"                  d} t         j$                  j&                  d| d<   t        j(                         rd| d<   t
        j*                  rLt
        j*                  | d<   t
        j,                  | d<   t
        j.                  | d<   t
        j0                  | d<   t
        j2                  r9t
        j2                  | d	<   t
        j4                  | d
<   t
        j6                  | d<   | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rv   r5  _tritontriton_hash_with_backendr	  r   r
  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )inductor_metas    r}   inductor_meta_commonz!TritonKernel.inductor_meta_common  su    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r   c                  +, t               }i }| j                  j                         D ]  \  }}t        |      r| j                  st
        j                  j                  j                  |      }t        |t        t        j                  f      sd}nt        t        |            }|||<    ||j                  t                      t
        j                  j!                         j"                  }|dk(  r|j                  d       n|j                  d       t$        j&                  r|j                  | j)                                | j*                  j-                         \  +}	,}	t/        ,      D ]  \  }
}t        |t0              st3        t        j4                  |j6                        }|t
        j                  j                  j8                  v sbt1        |j:                  t
        j                  j                  j8                  |         ,|
<    t=               }| j>                  D ]  }|| j*                  j@                  v r(|jC                  | j*                  j@                  |          || j*                  jD                  v rj|t
        j                  jF                  vrN|| jF                  vr@|jC                  t3        tH        | j*                  jD                  |         jJ                         || j*                  jL                  v s| j*                  jL                  |   }t        |tN              rJ |jC                  |        tQ        +,      D ]O  \  }}t        |tR              s|jT                  tV        jX                  k(  s5|jC                  |j:                         Q t[        |      }| j]                         D ]Z  }t1        |j^                   d|j`                        },jc                  |       +jc                  te        |j:                               \ +,fd}| jf                  D ]K  }|jh                  r| jj                  r|jl                  ) ||j^                  jo                          d       M | jp                  r |d	       ts        ,| jt                  +
      }|tw        jx                  t
        j                  j!                               i d}t
        j                  jz                  xs t
        j                  j|                  }| j                         j                  t        | j                        t        t        j                        ||| j                  | j                  | j                  d| j                         }t
        j                  j                  ji                         xr | jj                   }| j                  }t        | j                        dk(  xr
 |duxr d|v }|r|r|J | j                  j                  | j                        }|j                  j                  j                  d   }|j                  }|d   t        |d   d      z  }|j                  j                  j                  }|j                  j                  j                  } t
        j                  j                  j                  |t$        j                        t        t
        j                  j                  j                  | t$        j                        d      z  }!|!dk\  r|dk\  rt
        j                  j                  j                  | j                  j                  d      rHt
        j                  j                  j                  | j                  j                  d      r
|dk  rd|d<   | j                  r| j                  |d<   | j                  r| j                  |d<   | jp                  r| jj                  |d<   d}"t$        j&                  st$        j                  r| j                         dz  }"|"|"|d<   t$        j&                  r| j                         }#|#|#|d<   t        ,      g|d <   t        ,      D ]  }$d|d!   ,|$   j:                  <    || _b        | j                          | j                  D ]$  }%|j                  d"       |j                  |%       & | j                  r2d#| j                          d$| j                  j$                  d%|d&|d'	}&n| j                  r;| j                  j                         }'d#| j                          d(|d)|' d%|d&|d'}&nYd"}(t        |      dk(  rt        t        ,            d*k(  rd+}(nd,}(d#| j                          d(|d-|( d.|d&|d/| j                   d'}&|j                  |&       |j                  d0|xs t        t        j                         d1d-j                  d2 +D               d3       |j                         5  | j                  |       | j*                  j                         D ]  \  })}*|j                  |) d4|*         |j                  | j                         ddd       t$        j&                  r |j                  | j                  |"             |j                         S # 1 sw Y   IxY w)5z
        Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
        metadata, and benchmarking infra.
        i    Ncpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                    t               rj                  t        |              j                  t        | d             y )NT)is_constexpr)r@   r~  rK   rI   )r  argdefsr  s    r}   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg#  s/    -/  h!78NN78$?@r   r   r_  )
size_dtyper%  )r  r  r  )	grid_typer  kernel_namemutated_arg_namesoptimize_memr  num_loadnum_reductionr   r  r   r0_rG   )fallbackg?g       @i   i   
   Tadd_persistent_rblocktiling_scoresr  r  g    eAkernel_num_gbkernel_flopconfigsr  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,rR  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  <   K   | ]  }|j                           y wrt   )	full_namer  s     r}   r   z.TritonKernel.codegen_kernel.<locals>.<genexpr>  s     CcVWAKKMCcr@  rp  rk  )srO   r  r  r;   r  rE   r  r   symbolic_hintr}  rT  r   r5  r/   r   r   r  r  r   benchmark_kernelr  rk  r  rw  rT   r	   r   r   inv_precomputed_replacementsri  r   	mutationsinput_buffersr  r  removed_buffersrP   
inner_nameoutput_buffersrS   r   rV   	zero_moderW   ZERO_ON_CALLr   rA  r  r  r~  rI   r  r  r  
tensor_dimr   r  rb   r  r+   r)  is_inferenceis_backward_get_grid_typeru   setr  r   r:   DESCRIPTIVE_NAMEr  r,  r-  r   r  r  r2  r  r  memory_stats
persistentmemoryr!  count_per_threadrU  loopedbytesr  unbacked_symint_fallbackstatically_known_leqreduction_numelstatically_known_gtr  r  estimate_kernel_num_bytesestimate_flopsr^   r_   r  r  r  r   r  r  get_reduction_hintr`   rV  r  r   r  codegen_static_numelsaliasesr  r  r   )-r   ri  coder  r  r  
numel_hintr  device_typer  r  r&  r$  mutated_argsmutationmutation_argargnamer   sizeargr&  triton_meta_signaturer  r+  r  
looped_redr2  	two_d_redrH  	dim_statsmem_ops_per_threadr_coalesce_ratio
looped_mempersistent_memsaved_bytes_ratior  flopsarg_numrx  heuristics_linereduction_hint	tile_hintoldnewr%  r  s-                                              @@r}   codegen_kernelzTritonKernel.codegen_kernel  s	    
![[..0 	+MFE"6*43H3H))77>Jj3*>? !	+C
O<	!*Jv)	+, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 )3 	/H499222  !8!8!BCDII555AGG$;$;;D$8$88  )B)B8)LMXX 499333#yy77A%lJ???  .	/6  3 	/LGS3-MM%6%C%CC  .	/ l+++- 	2DU3TZZ@GW%NN77<<01	2	A $$ 	=D  T%>%>&!2!2!4 5U;<	= %%h' 1$"2"2G!
 /&--agg.Q.Q.ST'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-(!//
 '')
 XX&&335Wd>W>W:W
**!Xm4&?XC=DX 	 ) ,,,==55dkkBL$//66::1=I!*!;!;  -U3c-:La6PP%,,3399J)44;;AAN ! 0 0 : :V%D%D !; !  **"V-L-L +  	! "S( %+GG$$99MM115 GG$$88MM114 '",9=56-1-?-?M/*##373K3KM/0%%484M4MM01""f&>&>335;F!17o.""'')E /4m,"+I"6!7I +95 	BG@AK$Yw%7%<%<=	B '++ 	 FNN2KK	  #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[] 	#&&t, II--/ 1S#c#/01KK		"		# ""KK55f=>}}	# 	#s   1A%kk'c                   t         j                  j                  j                  |       } t	        | t
        j                  t        f      rt        |       }t        |      }|S d}t         j                  j                  j                  | |      sC|dkD  rt        d|        |dz  }t         j                  j                  j                  | |      sC|S )N   i @  z!Failed to find static RBLOCK for r   )rE   r  r   simplifyr}  r   r5  rT  r/   rO  r  )r  r&  s     r}   r  z#TritonKernel._get_persistent_RBLOCK  s    !!**62fu}}c23f+C!#&C 
 Cgg&&;;FCH?$'H%QRRq gg&&;;FCH 
r   c                N    	 t         j                  |        y# t        $ r Y yw xY w)NTF)r  r  r  )r  s    r}   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCK  s*    	//7 		s    	$$c                   d	d}| j                   D ]M  }|j                  r| j                  rdt        j                  j
                  j                  |j                        } ||      r)|j                  |j                   dt        |              |j                  r| j                  r| j                  r1| j                  | j                  |j                              }d| d}n| j                  |j                        }|j                  |j                  j!                          d|        |j                  dk(  s/| j"                  s=|j                  d       P y)
a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        c                B    t        | t        j                  t        f      S rt   )r}  r   r5  rT  )r   s    r}   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r  zXBLOCK: tl.constexpr = 1N)r   r   r   r   )r  r  r  rE   r  r   rr  r  r   r  rT  r  r  r  r  r  r   r  )r   rW  rw  r   simplified_tree_numelr  r&  s          r}   rU  z"TritonKernel.codegen_static_numels  s   $	: $$ 	;D$$(=(=()(8(8(A(A$**(M%$%:;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC$++"3"3"5!66LSERS{{c!dmm9:	;r   c                   t        | j                  D cg c]  }t        |j                          c}      }| j                  r|dk(  sJ t
        j                  S |dk(  rt
        j                  S |dk(  rIt        t        | j                  | j                              rt
        j                  S t
        j                  S |dk(  rt
        j                  S t        d|       c c}w )NrG   r   r   z"Unsupported number of dimensions: )rD  r  rT  r  r  r(   CooperativeReductionGridGrid1Dr   r   r:  Grid2DWithYZOverflowGrid2DGrid3Dr  )r   r   rl  s      r}   rE  zTritonKernel._get_grid_type  s    8H8HI***+IJ%%6M6$===!V$+++!V3t22D4D4DEF(===$+++!V$+++=aSABB Js   C!c                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  r| j                  s|j                  |       |j                  t        |              y rt   )r  r}  r  r   r5  r   rE   r  wrapper_codegenerate_numel_exprr  r  r~  r  )r   ri  r  	arg_typesr   r   s         r}   r  z#TritonKernel.add_numel_to_call_args&  s    $$ 	-D$**u}}ell&CDzzww++??dK$$(=(=  &  d,	-r   c                   t         j                  j                  }|j                          | j                  j                         \  }}}}| j                  |||       | j                  j                  D ]  }|j                  |        |j                  ||d|| j                         t        | j                  j                        D ]  }|j                  |        y )NT)r   r  r  )rE   r  r  write_triton_header_oncerk  r  r  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  r  generate_workspace_deallocation)r   ri  r`  wrapperr  r  r  wss           r}   call_kernelzTritonKernel.call_kernel2  s    ''&&((*%)YY%=%=%?"9a##D)Y?))** 	6B11"5	6 	$$(( 	% 	
 499334 	8B33B7	8r   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]w  \  }}t        |t              st         j                  j                  r|j                  d| d| d       Jd| d}|j                  |       d| d}|j                  |       y y )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rE   r  r  rk  r  r   r}  rU   cpp_wrapperr   )r   r  r  r  arg_signaturesr&  arg_signaturer  s           r}   codegen_nan_checkzTritonKernel.codegen_nan_checkF  s    ''&&*.))*B*B*D'9na"%i"@ 
	,C-377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+
	,r   c                    t        |i |S rt   )r  )r   rk  r%  s      r}   re  zTritonKernel.create_cse_varU  s     $1&11r   c                   |j                    d| j                  | j                  |j                               }|j                  j
                  r| j                  j                  |       y | j                  j                  |       y )Nrk  )	ri  r  r  r   rootr	  r  r   r  )r   entryr  s      r}   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entryX  sd    **SD,@,@,L!M NO::((. II%r   c                   |j                   J | j                  |j                         }| j                  }|dk7  rd| dnd}| j                  r| j                  r|j
                  r| d}d|j                  j                          d| | S )Nr   r;  rS  r   z + rsplit_startztl.arange(0, zBLOCK))rB  indexing_size_strr  r  r  r  r  r   )r   r  r  r  r  s        r}   r  z)TritonKernel.iteration_ranges_ranges_code`  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )NrG   r_  rR  rS  )r  r>  )r   r  rV  r  r?  r  s         r}   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_codem  sC     &&&&(sTz$r%;-q99r   c                $   |j                   J d|j                    d}| j                  |      r#d| d|j                   dz    d|j                    d}|j                  j                  ||      }| j                  dk7  r| d	| j                   dS |S )
Nztl.program_id(rS  r  z + tl.program_id(rG   z) * tl.num_programs(r  r   r;  )r  r:  	pid_cacher  r  )r   r  r%  pids       r}   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pidu  s    ~~)))u~~.a0 &&u- cU+ENNQ,>+??STYTbTbSccefCoo!!#s+z)U$t//022
r   c                    |j                   dk(  xr[ |j                   xrL | j                   xr= t        j                  j
                  j                  |j                  t                      S r  )	r  has_zdimr  rE   r  r   rO  r  r.   )r   r  s     r}   r:  z#TritonKernel.needs_yz_grid_overflow  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r   c                    | j                   r | j                   |j                          d   S t        |j                            S )Nr   )r  r   r,   )r   r  s     r}   r+  zTritonKernel.max_block  s;    $$'7u%=>>//r   c                   | j                   sy| j                  rW|j                  j                          d| j                  v r.| j                  |j                  j                          d   dk(  r6yt        j
                  j                  j                  |j                  d      ry|j                  r(| j                  r| j                  |j                        }n9|j                  dk(  r| j                  rd}n| j                  |j                        }|j                  r| j                  r|| j                         z  }t        j
                  j                  j!                  |j                  |      r[|j"                  dk7  xsJ |j$                  xs< t        j
                  j                  j'                  |j                  t)                     S y)NFr   rG   Tr  )r  r  r  r   rE   r  r   r	  r  r  r  r  r  r+  r  r`  r=  r  r  rO  r.   )r   r   r+  s      r}   r  zTritonKernel._has_constant_mask  sa   !!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1Dww77

AF !:!:33DJJ?I[[CDMMIt{{3I!;!;!DOO$55I 7788YO" W==W77##88^EUV r   c                d    | j                   d   }|j                  dk(  sJ | j                  |      S )Nr   r  )r  r  r  )r   xtrees     r}   r  z TritonKernel._has_constant_xmask  s5      #||s"""&&u--r   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 |j                  d       y )Nr   r   )r  r  r]  r  )r   r   r   s      r}   rE  zTritonKernel.filter_masks  sN    $$ 	8D&&t,!!T[[M"67	8
 	&!r   c                    t        t        j                        d | j                   D cg c]  }t        |    c}S c c}w rt   )r  r   r   r  r   )r   r   s     r}   get_reduction_prefixesz#TritonKernel.get_reduction_prefixes  sB     ]::;<Ud>U>UV
 t
 	
 
s   <c                   | j                   D cg c]  }|j                  s| }}dj                  t        d |D                    }|j	                  d| j                  |              | j                   D cg c]+  }|j                  rt        j                  |j                     - }}t        |      }|j	                  d| j                  |              yc c}w c c}w )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        r   c              3  :   K   | ]  }|j                    d   yw)r  Nr  r
  s     r}   r   z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>  s     "UTdkk]%#8"Ur  z	rnumel = zRBLOCK: tl.constexpr = N)
r  r  r   r   r   r  r   r   r   r=   )r   r  r   reduction_treesr  	rn_blocksru  s          r}   r  z%TritonKernel.codegen_reduction_numels  s    
 -1,<,<RD@Q@Q4RRF"U_"UUV	$**V"4!567
 ((
   %%dii0
	 

 y)/

60B/CDE S

s   CC90Cc                |    | j                         }|D cg c]  }t        j                  | | fi | c}S c c}w )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        )r  r   r   )r   r  r%  rn_prefixesr  s        r}   r  z#TritonKernel._get_reduction_symbols  s=     113JUVxx0;F;VVVs   !9c                    | j                         }| j                  ddd      }t        t        |      dz
        D cg c]  }t	        ||dz   d        c}t        j                  d      gz   S c c}w )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   rG   N)r  r  r  r  r=   r   r5  )r   r  	rn_numelsrA  s       r}   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffs  s{     113//PT/U	;@[AQTUAU;V
47M)C!GI./
]]1 	 
s   A0c                :    | j                         }t        ||      S )zK
        Compute linear reduction indices from N dimensional ones.
        )r  r<   )r   
multi_indscoeffss      r}   r  z'TritonKernel._flatten_reduction_indices  s     113,,r   c                $   | j                  ddd      }| j                  ddd      }| j                  |      }|j                  d| j                  |              | j                  |      }|j                  d| j                  |              y)zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r  r  r   rU  )r   r  
rn_offsetsrn_indsr/  rindexs         r}   r  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }|j                  | d| j                  |       d|j                          d|j                   d| g       | j                  |      r(| j                         }|j                  | d	| d
       y |j                  | d|j                   d| d       y )Nrk  z	offset + r  z
offset = 0r   z	offset = r   r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r  r	  r   ri  r  r  rB  r  r  r  r   r  r^  )r   r  rW  r  r  r  s         r}   r  z,TritonKernel.iteration_ranges_codegen_header  sb    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5)'')ENNaSw6FGHNNaS

|3qc?@r   )r   TNN)r  zdict[str, sympy.Expr]r  zOptional[FixedTritonConfig]r  zOptional[int]r   r   r  r(  r   r   r   r   )r   r   rC  z!Optional[TMACompatibilityChecker])r   )ri  r   r  r   rF  z/Union[BlockPtrOptions, TensorDescriptorOptions]r   ztuple[str, str])r   r   r  r   r  r   r   r   )ri  r   r   r   rt   )
ri  r   r   r   rV  rM   r  rD   r   r   )r  Optional[CSEVariable]NN)r>  rM   r  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  rM   r  r(  r  r   r  z Optional[tuple[str, sympy.Expr]]r  r  r   rM   )r   ztuple[str, BlockShapeType])rV  rM   r  r(  r   rM   )
r  r(  rP  r(  r  rC   rV  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r  )r  r(  r  )ri  r   r   r   rV  r  )r>  tuple[CSEVariable, ...]r  tuple[torch.dtype, ...]r   r   )r  r  r6  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r>  r  r   r  )
r  r  r>  r  r  r   r  r   r   r  )r   rz  )r   z type[triton_heuristics.GridExpr])ri  r   r`  zOptional[IRNode]r  )r   r  )r  rZ   )r  r[   r   r   )r  r[   rV  r   r   r   )r  r[   r   r   )r  r   r   rT  )r   r[   r   r   )r   r   r   r   )r  rO   r   r   )r  r   r   zlist[sympy.Symbol]r]  )r  r  r   r   )r  r[   rW  rO   r   r   )Rru   r   r   r   r(  rz  r   r  r  r\  r  r  r   r  r  r  r  r  r  r  r  r   r  rF  rt  rz  r  r  rV  r  r  r  r  r  r  r  rE  rJ  r  r  r!  r  r`  r   ri  r  r  r  r  r  r  r  r  r&  r   ro  r  rt  rU  rE  r  r  r  re  r  r  r  r  r:  r+  r  r  rE  r4   r  r  r  r  r  r  r  r)  r*  s   @r}   r  r  R  sU    &I%%).E&.O$;!
 48'+(3%(3
 2(3 %(3 
(3T"

#J*:0


 " " GKT
T
 $ET
v
 A'A' A' B	A' 
A'FX2MM M 	M
 M4^B SW44 *43>4FO4	4l
& 480455 C5 &	5
 $5 5 15 .5 
5n.
9
(
1<
	
(XX X &	X
 ;X 
5Xt&DO&"
0F
P#0&!
	B// / ;	/b>S1>S;R>S	>S@"'"
" (" 
!"BD"'D" (D" 	D"
 D" 
!D"Lb%H Xt

 % %N\|    $;LC
-8(,2&J:(:14:	:
0
)V.
" 
 
F$W 
 
-? A(A0>A	Ar   r  c            
      `    e Zd ZU eZded<    eej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g      Zd fdZedd       Zd Zd ZdddZ	 d	 	 	 ddZ	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 	 	 dd
Zd Z xZS )TritonSchedulingz	type[Any]kernel_typec                    t         |   |       |t        |d      sy |j                  D ]$  }t	        |t
        t        f      st        |_        & y )Nr^  )	r  r   r   r^  r}  r3   r1   debug_triton_codedebug_device_str)r   	schedulerr`  r"  s      r}   r   zTritonScheduling.__init__9  sM    #GIw$?OO 	:D$0B CD(9%	:r   c                    t         j                  j                  st         j                  j                  r't	        g | j
                  t        j                        S | j
                  S rt   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrJ   REDUCE_TO_SINGLE_ELEMENT)ry   r  s     r}   get_backend_featuresz%TritonScheduling.get_backend_featuresA  sR     MM00}}99P#&&P(O(OP  ###r   c                   t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rvddlm}m	 t        fd|D              sY|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              y y y c c}w )Nr   )r0   ForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wrt   )r}  )r   rl  r  s     r}   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>X  s      >?
189s   z Fused node name list: rR  )rE   r  r  r8   make_commentr   debug_fusiontorch._inductor.schedulerr0   r  r   r}  get_namecommentr   )	r   node_scheduler  origins_detailed_originsr0   rl  
node_namesr  s	           @r}   codegen_commentz TritonScheduling.codegen_commentL  s    ''&&%8%P""  )
  CP  +!!%67 JJL
 
 $$''>tyy?T>UV s   .#Cc                L   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }t        j                  j                  rt        j                  j                   d| }||j                  |<   t        j
                  j                  r|nd}|j                  t        t         j"                        |      }|j                  t        t         j$                        |      }|j                  dd      }t'        t)        |j+                               d      \  }	}
}t-               }t.        j1                         rt.        j                  ||       |j3                  d	|d
       |j5                  |d       t         j                  j7                         }|j3                  d|j8                   d       d| }t;        ||      \  }}|d|z   dz   |z   z  }|j=                  ||j?                         |       tA        jB                  d      rtA        jD                  |||       |S )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Trb  z''', device_str='z')z# kernel path: r  kernel_metadata)#rE   r  r  src_to_kernelr   r   descriptive_namesr7   rF   r   next_kernel_suffixaot_inductormodel_name_for_generated_filesunique_kernel_namesreplacer   r:   rG  r  r$   r#   rc  rO   async_compileuse_process_poolr   r   r  r  r8   define_kernelr   r!   is_metric_table_enabledlog_kernel_metadata)r   src_coder  r  r  r)  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr  metadata_commentr  detailed_originss                    r}   r  zTritonScheduling.define_kernelf  sg   ''&&w,,,!//9Kt m ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK ""AA "(!4!4!S!S TTUVaUbc /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%Iq+,.O--/ $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M7(S%G%w 58H HH!!_5579I ../@A++KhOr   c                    | j                  |d      }t        j                  |      }| j                  ||t	        d |D                    S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r9  c              3  <   K   | ]  }|j                           y wrt   r  r   rl  s     r}   r   z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s     :WA1::<:Wr@  )r  )generate_kernel_code_from_nodesr%   rV  benchmark_codegened_moduler   )r   r^  n_spills_thresholdr  rq  s        r}   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes  sV    
 77PT7Ux(..#
:WQV:W0W / 
 	
r   c                  	
 t        t        j                  j                        }t	               5  |j                  t        j                  j                               5  dfd

fd}
fd}||nt        dg      }t        j                  d|j                          |        j                  fcddd       cddd       S j                         	j                  j                  	   j                  	 d          j(                  }t+        |      d
k(  sJ |d   j,                  |kD  rt'        d	      nNt/        j0                  	fd      t+        j2                        dkD  rt/        j0                  	fd      z
  t        j                  d|        |        j                  fcddd       cddd       S # t        $ rn}t         j"                  j$                  r t        j                  d||       t'        d	       |        j                  fcY d}~cddd       cddd       S d}~ww xY w# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z$Benchmark an already compiled moduleNc                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextrq  s   r}   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s6    ||///ww''5a8>IIr   c                 >            } t        | t                     y rt   r&   r   )r   r  mss    r}   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s    &(T3r7+r   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wrt   )r  r   existsopenr  readr   fdr  s     r}   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  sM    &(77>>$'d 0r$RWWY/0 00s   AA rW  %kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrG   c                 4      j                     d         S r3  
clone_argsrk  callwrapped_jit_functions   r}   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>       D!@!5!@!@$!G!JK r   c                 "     j                     S rt   r  rk  r  s   r}   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  s     ? 4 ? ? F r   z+The fused kernel for %s took %.3f ms to run)r   rE   r  rY  r   r  r  r   r  r  r  get_argsr  r  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr  n_spillsr)   benchmark_gpur*  )r   rq  r  r  device_interfacer  r  r$  r  rk  r  r  r  r  s    `       @@@@@r}   r  z+TritonScheduling.benchmark_codegened_module  sJ    4AGG4G4GH O	$##AGG$G$G$IJO	$ BJ, )4
*i[:Q  II7
 B~3<<'?O	$ O	$ O	$B <<>D88D#&;; (4)44d;A>? -66Iy>Q&&& |$$'995\ !..K +==>Bk77F B II=
 Ms||#_O	$ O	$ O	$N  
(==OO		@
 5\3<<''cO	$ O	$ O	$N
(OO	$ O	$ O	$ O	$ O	$sh   .I%"AI<	I%(I8GB3I	I%	IAI0I1I5	I%III	I%%I.c                   |j                  d      }|xr  t        d |j                         D              }| j                  }|rddlm} |}|rd|d<   |j                  d      r
d|d	<   d|d<   t        j                  |j                        s|j                  d	      rJ d|d	<   t        j                  j                  ||||      } ||i |}| j                  |||      S )
Nr  c              3  <   K   | ]  }|j                           y wrt   )is_split_scan)r   r`  s     r}   r   z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
%)D (
r@  rG   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)contains_opr   scheduler_nodesr  triton_split_scanr$  r  rt  rP  r  rE   r  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr#  r  r$  r  s	            r}   create_kernel_choicesz&TritonScheduling.create_kernel_choices  s     "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v.=AM9:>CM:;11/2Q2QR$(()HIII=BM9:		66+}
 k;];,,V[-PPr   c           	        |g}t         j                  j                  s|S |j                  xr |j	                  d       }|j
                  xr |j	                  d       }|r%|j                   | j                  |i |ddi       |r|j                  j                  }t        j                  j                  j                  |d      r[|j                   | j                  |i |ddix}       |r2|j                  r&|j                   | j                  |i |ddd       t        |      dkD  r.|dd  D ]  }	|j                  |	_         |j!                  d        |S )	Nr&  r%  Fi   )r%  r&  rG   c                    | j                   S rt   )r  )ks    r}   r  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>]  s    q'='= r   r  )r   r   multi_kernelr  r  r  r~  r  r  rP  rE   r  r   rO  r  must_keep_buffersr  )
r   r  r-  r.  kernelsoptional_persistentoptional_cooperativer  r  kernel2s
             r}   r+  z)TritonScheduling.add_multi_kernel_choices+  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN    # 38  __44Fww44VUC-T--$' 8= E '5+E+ENN((((+ <A:?	 w<!"12; E,2,D,D)E LL=L>r   c                   fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }t        |      t         j                  _        t
        j                  dkD  }	t
        j                  dkD  }
| j                  |d|	|
d      }|D ]  \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}|j                  t        t        j                        d      }t        j                   |      t"        j%                  d	|j&                          |       \  &|z  }|z  }|j)                  j&                         ܉j+                         j,                  j.                    j0                   d          j2                  }t5        |      d
k(  sJ |d   j6                  dkD  rt9        d      xn3t;        j<                  fd      t;        j<                  fd      t"        j%                  dt        d |D                      |        |z  }|z  }|j)                  j&                          |t         j                  _        |t         j                  _        |||fS c c}w c c}}w )Nc                 ~     j                   J t        j                  j                   j                         d   dz   S r  r  r  s   r}   r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathe  s6    <<+++77##CLL1!4~EEr   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wrt   )r  )r   r$  s     r}   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>m  s      Eaq Ero  r  )r  r   r	  r
  r  r  splitr  s     r}   r  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cachei  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                 \            } t        | t              dz   t              z          y )Nr  r  )r   r  r  ms_clones    r}   r  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cachep  s&    "$Ds2w}s8}<=r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coder  r  rG   r  c                 4      j                     d         S r3  r  r  s   r}   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  r  r   c                 (     j                     d   S r3  r  r  s   r}   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  s    ;0;;TB1E r   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  <   K   | ]  }|j                           y wrt   r  r  s     r}   r   z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>  s     <A1::<<r@  )rE   r  r=  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr  r  r   r:   r  r%   rV  r  r  r  r~  r  r  r  r  r  r  r  r  r)   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origrC  rD  kernel_code_listr  r  
node_groupr`  fused_node_listsr^  rl  namesr  rk  r  r  rq  r  r@  r  s                        @@@@@@@r}   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel`  s   
	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8 2	+#Ha=GHT 0HH/?OeOAQZZ\O\OEO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV<<<	 MNHh&NS\\*e2	+f #7%<"22i  IOs   K7K#)r  zOptional[Scheduler]r   r   )r  ztorch.device)   )r   tuple[float, str])rY  N)r  zOptional[OrderedSet[str]]r   rZ  )r,  rj   r-  	list[Any]r.  r}  r   list[TritonKernel])r  r  r-  r[  r.  r}  r   r\  )ru   r   r   r  r  r   r   rJ   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONr  r   r   r  r  r  r  r  r0  r+  rX  r)  r*  s   @r}   r  r  *  s   )K)!""$$**44++**		
: $ $4=~	
 RVT$5NT$	T$l#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jY3r   r  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} | j                         }|J | j                  j                  |      }t        |t        |f      sJ dt        |              t        j                  j!                  |      5  t"        j$                  }|j'                  | j)                               j+                         }|t"        _        d d d        |j                  | j                          d       |j                  t-        j.                  d             |S # 1 sw Y   RxY w)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder}  r    MultiTemplateBuffermake_kernel_renderr~  r  0torch._inductor.codegen.cuda_combined_schedulingrf  r  r  get_backendr]   r  rE   r  set_current_devicer!   generated_kernel_countr  rM  rc  r   r  )r`  linesmulti_templaterf  r  backendold_generated_kernel_counttriton_codes           r}   r  r    sd   E++-N!Z@V@V%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KL 	
klpqxlykz{	
L WW''/ 	H *1)G)G&!AA eg  .HG*	H 	(67X__[&9:L	H 	Hs   A	E66E?r   )rV  r   r  r   r  r   r   r   r  )r  r(  r   r(  )r  r(  r   r   )r  zUnion[CSEVariable, Any]r   r   )r   rg   rm  )r{   r   r   zCallable[[_T], _T])r`  r0   r   rz  )
__future__r   r  r  r  r7  r<  loggingr#  rS  r  r   collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   rv   torch._loggingtorch.utils._pytreer5  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   r   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r    r!   r  r"   	codecacher#   r$   r%   r&   ops_handlerr'   runtimer(   runtime.benchmarkingr)   runtime.hintsr*   r+   r,   r-   runtime.runtime_utilsr.   r/   r  r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   virtualizedrB   r  rC   rD   rE   wrapper_benchmarkrF   block_analysisrH   commonrI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   simdrX   rY   rZ   r[   r\   r]   triton_utilsr^   r_   r`   ra   rb   r  rc   typesrd   re   r,  rg   rh   ri   simd_kernel_featuresrj   rk   	getLoggerru   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrp   r   r   r   	dataclassr   r   rc  rp  rT  r  r  r  r	  r  r  r  r  r  r  r-  rI  rK  _initialize_pointwise_overridesr(  rx  r   r  r  r   r  r  r  r  r  r  r   r   r}   <module>r     s   "         	  .  F F  0   $ $ C < 0 / K K M X X 4 " " ( F F ( ' .  D W W     C B B /    "   %  L&8	Bg!00<H~~//*E^^--hA
6 6  4 $ 4 *, ,: 
 
 
B A A AH ?4 ? ?8 =, = =@++/+<P++>jQM jQZ 	3
&8
;P :(.bS&k S&l  / / 9|8O |8~$+ $+N : : :&! !H # # #
%uS%S/-A'BBC 
 e e ePU+A:/0 U+ApVO3~ O3dr   