
    pi                   x   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9 erddl0m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@mAZA ddl1mBZBmCZCmDZD ddlEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z`maZambZb erd d lcmdZdmeZemfZf d d!lmgZg  ej                  ei      Zjej                  j                  eid"      Zmej                  j                  eid#      Znej                  j                  eid$      Zo eZ       j                  Zq e"g d%      Zrd:d;d&Zsej                   G d' d(             Zu G d) d*eu      Zv G d+ d,eu      Zwd<d-Zx ed.eWeW/      Zy G d0 d1eYey   eey         Zz G d2 d3eC      Z{ ej                  d45       G d6 d7             Z| G d8 d9e}      Z~y)=    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNode)'set_kernel_post_grad_provenance_tracing)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                j    t         j                  j                  j                  j                  }||S | S N)torch	_inductorr   triton	max_tiles)defaultrV   s     ^/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesrY   Z   s-    &&--77I!-9:7:    c                       e Zd ZdZej
                  j                  ej
                  j                  d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZee	e
dd                     Zd	dZee	e
d
d                     Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y rR   )super__init__namevar_list
var_rangesnumelprefixr]   r^   kernelroot)selfrb   rc   rd   re   rf   rg   r]   r^   rh   	__class__s             rX   ra   zIterationRanges.__init__o   sO     		 $
	rZ   c                ,    t        | j                        S rR   )r1   rf   ri   s    rX   is_reductionzIterationRanges.is_reduction   s     #4;;//rZ   c                ,    t        | j                        S rR   )r2   rb   rl   s    rX   symbolzIterationRanges.symbol   s    !$)),,rZ   c                z    t        j                         D ci c]  \  }}||
 }}}|| j                     S c c}}w rR   )r   itemsrf   )ri   symtrf   prefix_to_symts       rX   rr   zIterationRanges.symt   s>     <F;K;K;MN<4&$,NNdkk** Os   7)rb   strrc   list[sympy.Symbol]rd   dict[sympy.Symbol, sympy.Expr]re   
sympy.Exprrf   rt   rg   
SIMDKernelrh   IterationRangesRootreturnNonerz   boolrz   zsympy.Symbol)rz   r   )__name__
__module____qualname____doc__sympySOnera   propertyr,   r   rm   ro   rr   __classcell__rj   s   @rX   r\   r\   _   s    . ww{{ % 3	
    " 
0 0   0- +   +rZ   r\   c                       e Zd ZdZ	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZddZddZ	 	 	 	 ddZ	ddZ
	 	 	 	 dd	Z xZS )ry   z
    Root of a iteration range tree that represents a single
    tiled dimension in the output kernel. It contains multiple
    sets of iteration represented with IterationRangesEntry.
    c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r| j
                  r|	J || _        || _        |	| _        |
| _	        y )N)rb   rc   rd   re   rf   rg   rh   )
r`   ra   indexnodes	pid_cacherm   is_loop
tensor_dimgrid_dimhas_zdim)ri   rb   re   rf   r   rg   r   r   r   r   r   rj   s              rX   ra   zIterationRangesRoot.__init__   s     I 	 	
 
=?
 *3 t00X5EFF$  rZ   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))rb   re   rl   s    rX   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGrZ   c                b    | j                   j                         D ]  }|j                           y rR   )r   valuescache_clear)ri   nodes     rX   r   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	rZ   c                2    t        | j                   d      S )Nr   )r2   rf   rl   s    rX   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788rZ   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r8   graphsizevarsstatically_known_equalsre   r   r   r   r   IterationRangesEntryrf   nextrg   iter_vars_countrange_tree_nodesro   rc   appendrd   )ri   r]   r^   exprr   s        rX   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$rZ   c                    t         j                  j                  }g }t        |      D ](  }|j	                  | j                  ||             ||z  }* g t        |      S rR   )r   r   r   reversedr   r   )ri   lengthsr]   itervarsr^   s        rX   construct_entriesz%IterationRangesRoot.construct_entries   s]     ''++w' 	'FOODKK89&G	' %(#$$rZ   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rR   )r   ro   )ri   r   es      rX   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     \  	
 dd|j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  fd       t        j                  j                  g 	g 
	
fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                         s, || j                  t        | j                                      g t#        	      g t#        
      fS c c}w c c}w )z,Figure out vars from this tree used in indexc                   t         j                  j                  j                  | j                  t
        j                        }t         j                  j                  j                  | j                  t
        j                        dk(  }|| fS )a:  
            Gets the key for sorting nodes. When two nodes have the
            same divisor, the node with length as 1 should be handled
            first so the current divisor is not changed after multiplied
            node.length. Returns `not length_is_one_hint` for ascending
            sort.
            fallbackr9   )r8   r   r   	size_hintr]   r   unbacked_symint_fallbackr^   )rN   divisor_hintlength_is_one_hints      rX   get_sort_keyz8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   s     77++55		F$C$C 6 L   **HHv'F'F +    !&8"899rZ   c                     |       S rR    )rN   r   s    rX   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    a rZ   keyc                    j                  | j                                j                  | j                         | j                  z  y rR   )r   ro   r^   )r   r]   
index_varssizess    rX   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+GrZ   )rN   r   rz   ztuple[int, bool])free_symbolsr8   rg   r   getrf   sortr   r   r   r   r   r   r]   r   r   re   r   )ri   r   sr   nr   r   r]   r   r   r   s          @@@@rX   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sK   
	:& <A;M;MNa**..q1NN!CqQ188t{{+BCC

0
1''++
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCD&*%&(:(5/(:::/ OCs   0F$F)F)/F)rR   )rb   rt   re   rw   rf   rt   r   intrg   rx   r   Optional[dict[str, str]]r   r}   r   Optional[int]r   r   r   r}   rz   r{   rz   rt   rz   r{   r~   )r]   rw   r^   rw   rz   r   )r   list[sympy.Expr]rz   zlist[IterationRangesEntry])r   r   rz   ru   )r   rw   rz   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   ra   r   r   r   r   r   r   r   r   r   s   @rX   ry   ry      s     /3(!(! (! 	(!
 (! (! ,(! (! "(!  (! (! 
(!TH9 .%'%	#%E/;/;	4/;rZ   ry   c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZddZddZd
dZddZddZ	ddZ
 xZS )r   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	rb   re   rc   rd   rf   r]   r^   rg   rh   )r`   ra   re   rc   rd   rf   rg   rh   parent	functools	lru_cache_codegencodegenr   )ri   rb   r]   r^   r   r   rj   s         rX   ra   zIterationRangesEntry.__init__*  s~     	,,'__((==== 	 
	
 0y**40?	rZ   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(r   ))rb   r]   r^   r   rd   rl   s    rX   r   zIterationRangesEntry.__repr__A  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrrZ   c                L    fd| _         d | j                   _        | _        y )Nc                      S rR   r   )rb   s   rX   r   z/IterationRangesEntry.set_name.<locals>.<lambda>E  s    t rZ   c                      y rR   r   r   rZ   rX   r   z/IterationRangesEntry.set_name.<locals>.<lambda>F      rZ   )r   r   rb   )ri   rb   s    `rX   set_namezIterationRangesEntry.set_nameD  s    ##/ 	rZ   c                8    | j                   j                          y rR   )r   r   rl   s    rX   r   z IterationRangesEntry.cache_clearI  s      "rZ   c                X    t         j                  j                  |        | j                  S rR   )r8   rg   codegen_iteration_ranges_entryrb   rl   s    rX   r   zIterationRangesEntry._codegenL  s    	//5yyrZ   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr9   r   c              3  P   K   | ]  }t        |t        j                           y wrR   )r   r   SIZE.0r   s     rX   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>Y  s       ,56N1dii0,   $&)
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )ri   precomputed_argsargsymbolss       rX   r   z%IterationRangesEntry.precomputed_argsP  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  rZ   c                ,    t        | j                        S rR   )hashrb   rl   s    rX   __hash__zIterationRangesEntry.__hash___  s    DIIrZ   c                X    t        |t              sJ | j                  |j                  k(  S rR   )r   r   rb   )ri   others     rX   __eq__zIterationRangesEntry.__eq__b  s&    %!5666yyEJJ&&rZ   )rb   rt   r]   rw   r^   rw   r   rw   r   r\   rz   r{   r   )rb   rt   rz   r{   r   )rz   r   rz   r   )r   objectrz   r}   )r   r   r   ra   r   r   r   r   r   r   r   r   r   s   @rX   r   r   )  sf      	
    
.s
# 'rZ   r   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    rX   constant_reprr   g  s9    e	%-		E	;rZ   CSEVariableType)boundrW   c                  &    e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 	 d8	 	 	 	 	 	 	 	 	 	 	 	 	 d9 fd
Ze	e
ed:d                     Zd;dZd<dZe	d=d       Zd>dZ	 	 	 	 	 	 	 	 	 	 	 	 d?dZd@dZdAdZdBdZd>dZd>dZdCdZd:dZdDdZdEdZd=dZdFdZ	 	 	 	 	 	 dGdZ	 	 	 	 	 	 dGdZdHdZdIdZ e!	 	 	 	 	 	 dJd        Z"e#e$jJ                  jL                  f	 	 	 	 	 	 	 dKd!       Z'e#e$jJ                  jL                  f	 	 	 	 	 	 	 dLd"       Z(	 	 	 	 dMd#Z)e#	 	 	 	 	 	 dNd$       Z*dOd%Z+dOd&Z,dPd'Z-	 	 	 	 dFd(Z.dQd)Z/dRd*Z0dSd+Z1dTdUd,Z2e3jh                  	 	 	 	 	 	 dVd-       Z5dWd.Z6e!d/        Z7dXd0Z8d1 Z9d2 Z:d3 Z;d4 Z<d5 Z=d6 Z>dYd7Z? xZ@S )Zrx   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFr}   allow_block_ptrrt   kernel_namec                    |i }t         
           | _        |j                          _        t                _        t                _        |j                         D ci c]/  \  }}|t        j                  j                  j                  |      1 c}} _        g  _        i  _        t!        j"                          _        |j'                          _        ||n j+                          _        | _        | _        ||n j3                          _         j7                          _        d  _        t<        j>                  d fd       }	|	 _          jC                  |       y c c}}w )Nc                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rR   )r8   r   r   simplify_with_rangesrd   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treeri   s     rX   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing  sb    GG$$99%ARSE(( B44UDAB 66u==rZ   )r   rw   )"r`   ra   featuresget_mutations	mutationsr/   bodyindexing_coderq   r8   r   r   simplifynumelsr  r   	itertoolscountr   rm   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scorestilingshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr    r   cacher  initialize_range_tree)ri   r  r  r   override_persistent_reductionoverride_cooperative_reductionr  rf   valr  rj   s   `         rX   ra   zSIMDKernel.__init__~  sW    I !//1"$	+-FLlln
7BvsFAGG$$--c22
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L-3 -8 *557 	!
 **,(, 
	> 
	> "3""9-A
s   "4E)c                :    t        d | j                  D              S )Nc              3  2   K   | ]  }t        |        y wrR   )r1   )r   rf   s     rX   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I6&v.I   )sumr  rl   s    rX   num_reduction_dimszSIMDKernel.num_reduction_dims  s     IT[[IIIrZ   c                    t         rR   NotImplementedError)ri   dtypes     rX   dtype_to_strzSIMDKernel.dtype_to_str      !!rZ   c                6    | j                   j                         S rR   )r  select_index_dtyperl   s    rX   get_index_dtype_as_torch_dtypez)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11rZ   c                @    | j                  | j                               S rR   )r-  r1  rl   s    rX   index_dtypezSIMDKernel.index_dtype  s      !D!D!FGGrZ   c                     yNFr   rl   s    rX   r  zSIMDKernel.want_no_x_dim      rZ   c                   t        fdt        D              }| xs | }d	d}g d}	t        t        |	            }
ddg}|r|}n
|r|
}n|
|z   } |||      } ||	t              }g }t	        |      D ]s  \  }}t        |      }|j                  |      }|j                  |      }||n|}|j                  t        | d|   ||| ||xr | j                   ||dv 
             u |S )
Nc              3  ,   K   | ]  }|v s|  y wrR   r   )r   rf   r  s     rX   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
61AF%
   	c                `    t        fd| D              D ci c]  \  }}||
 c}}S c c}}w )Nc              3  ,   K   | ]  }|v s|  y wrR   r   )r   r#  masks     rX   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U3PT32Ur9  )	enumerate)seqr<  idxr#  s    `  rX   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s3    )22U#2U)U%S#S  s   *)rN   rM   rL   rO   rP   r   rL   )r   r   r   r   r   )rz   zdict[Any, int])
r   all_prefixeslistr   r=  r1   r   r   ry   r  )ri   r   r  rm   r  r  active_prefixesno_r_dimr@  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  irf   r   r   r   s       `                rX   construct_range_treesz SIMDKernel.construct_range_trees  s3    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/.@K ,KI))\B"?3 	IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F]	& rZ   c                    | j                  || j                  | j                  j                         | j                  | j
                        }| j                  j                  |       y rR   )rL  r  r  rm   r  r  r  extend)ri   r   r  s      rX   r   z SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,rZ   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )ri   indicess     rX   finalize_indexingzSIMDKernel.finalize_indexing  r   rZ   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wr5  )r  store)ri   rb   r   r   priors        rX   store_reductionzSIMDKernel.store_reduction
  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yr5  r   rl   s    rX   r  z+SIMDKernel.should_use_cooperative_reduction  r6  rZ   c                     yr5  r   rl   s    rX   r  z*SIMDKernel.should_use_persistent_reduction  r6  rZ   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrR   )rd   rq   r   r  s     rX   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *,0%%'*r   )dictr  chainfrom_iterabler  rl   s    rX   rd   zSIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
rZ   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrR   )r   r   rZ  s     rX   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>   s     Q3td23Qs   !#)r'  r  rl   s    rX   triton_tensor_ndimzSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQrZ   c                ^    dg| j                         z  }d||<   ddj                  |       dS )Nr{   :[r   ])r`  join)ri   rK  r   s      rX   indexing_size_strzSIMDKernel.indexing_size_str"  s9    42244a499U#$A&&rZ   c                    dg| j                         z  }| j                  D ]R  }|j                  |j                  r| j                  s)|j
                  j                          d||j                  <   T |S )N1BLOCK)r`  r  r   rm   r  rf   upper)ri   r   r  s      rX   dense_size_listzSIMDKernel.dense_size_list'  sv    //11$$ 	GD&$$(=(=,0KK,=,=,?+@)Fdoo&	G rZ   c                L    | j                         }ddj                  |       dS )Nrc  r   rd  )rk  re  ri   r   s     rX   dense_size_strzSIMDKernel.dense_size_str1  s)    $$&499U#$A&&rZ   c                   t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  j                   |j                  j"                        j%                         i      S Nr   )r   r   r   r   r   r4   r   r8   r   r   r
  rh   r   r   r   r   r   re   ro   )ri   r   rN   	tree_node	new_indexs        rX   r
  z)SIMDKernel.combine_modular_indexing_pairs5  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
rZ   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rR   )r8   r   r   expand_floor_divr   _combine_contiguous_dims)ri   r   r  
expand_resrr  denominators         rX   r	  z"SIMDKernel.combine_contiguous_dimsG  s[     ))::5AA:A%/"I{D99)TJKXX00==rZ   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r9   )r   r   r   r   r   r   r8   r   r   _simplify_loopsr<   r   r4   r[  zip)
ri   r   r  r   r   	new_sizesreindex_prunenew_index_varsrr  s
             rX   ru  z#SIMDKernel._combine_contiguous_dimsP  s     eemmU\\:;L //6
Eu:?L%&WW%5%5%E%E7US&
"	7F L	2ud3z7>;R+S&TU	rZ   c                      j                   d   j                  xs  j                  t        j                   fd       } |       S )Nc               3     K    j                   j                         s j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)NFT)r  rm   r  codegen_body)ri   should_flushs   rX   ctxz)SIMDKernel.disable_reduction.<locals>.ctxg  sl     ==--/0000 !!#$)D!-%%'(,%%s   AA5A) !A5)	A22A5)r  r   r  
contextlibcontextmanager)ri   r  r  s   ` @rX   disable_reductionzSIMDKernel.disable_reductiond  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ urZ   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rR   )r   r  rz  r   )ri   r   r^   rangess       rX   
set_rangeszSIMDKernel.set_ranges|  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                D   t        d |D              r| D cg c]  }g  c}g fS t        j                  j                  | D cg c]  }g  c}| D cg c]  }j	                  |       c}t        j                         d
fd}	 	 	 	 	 	 	 	 dd}g }d}|D ]K  }	g }
|	D ]/  }j                  |d      r|
j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)|dz   t              k  roj                  ||         rZj                  ||         st        |   }t        ||         }|
j                   || |||       ||dz   |                   |t              k  s|
j                  t        j                   |||                   2 |j                  |
       N t        d D              sJ d d	|        |fS c c}w c c}w c c}w )Nc              3  8   K   | ]  }t        |      d k(    ywr   N)r   )r   r^   s     rX   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6Fs6{a6s   c                    j                  |      }j                  |    |      st        t        |    |      | <   |    j	                  |       t              S rR   )r  statically_known_multiple_of	CantSplitr   r   r   )rK  r   
new_ranges	remainingsv	var_counts     rX   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sZ    ;;t$D229Q<F#IaL$7IaLqM  &	?"rZ   c                     d fd}|S )Nc                     |    z  |    z   S rR   r   )	flat_varsidx1idx2sizes    rX   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??rZ   )r  r   rz   rw   r   )r  r  r  r  s   ``` rX   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s    @ MrZ   r   r9   c                6    t         j                  j                  S rR   )r   r   Zero)_s    rX   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLL rZ   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r9   Nr8   r   r   r   r   s     rX   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s*     I!177##--a0A5Is   68zfailed to set ranges  )rK  r   r   rw   rz   r   )r  rw   r  r   r  r   rz   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r8   r   r   r  r  r  r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r  r  r  r  s                 @@@@rX   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  sj    6g66$*+5B+R//WW:@-AQb-A
-34R[[^4	OO%		# 	#		$'	/2	5	 !## %	9LN$ "--dA6"))*@A#c)n49S9Sm,:
 "Q&M $c)n49S9Sm,: !1$s9~5":P:P)M2; ::i6 (%m4E$T9]+CDE"))%!%mU;%ma&7? %s9~5&--$//	-0NOA"F "((8K%	9N IyII 	
#I;ay9	
I 000K , .B4s   	H	HHc                   t         j                  j                  }t        |d         dk(  r\|j	                  |t
        j                  j                        s2|j	                  t        |      t        |d         |z        r|d   |gfS |S )z1Fill in the reduction numel of lengths if missingr9   r   )	r8   r   r   r   r   r   r   r   r3   )clsr  r   reduction_numelr   s        rX   prepare_split_iteration_lengthsz*SIMDKernel.prepare_split_iteration_lengths  s{     77##wqz?a00%''++N00f%gaj)O;
 AJ 122rZ   c                n    | j                  |||      }	 | j                  ||       y# t        $ r Y yw xY wNTF)r  r  r  )r  r  r   r  s       rX   is_compatiblezSIMDKernel.is_compatible  sB     55fgW	''8 		s   ( 	44c                >   | j                   D ci c]  }|j                  |j                   }}| j                  s0|D ]+  }t	        |      st
        j                  j                  ||<   - g |j                         }| j                  ||| j                        S c c}w )a5  
        Split and set iteration ranges for the kernel based on the provided lengths.

        This method maps the kernel's tiling structure to the node's iteration space,
        handling both pointwise and reduction dimensions appropriately.

        Args:
            lengths: A sequence of sequences of symbolic expressions representing
                    the sizes of different dimensions for each node.

        Returns:
            A list of lists of symbolic expressions representing the mapped
            iteration variables for each dimension.
        )r  rf   re   r  r1   r   r   r   r   map_kernel_groups_to_node_sizesr  )ri   r   rtr  rf   r  s         rX   split_and_set_rangeszSIMDKernel.split_and_set_ranges  s    $ 150@0@A""))RXX%AA $$  1&v.%*WW[[F6N1
 $6==?# 33FGT__UU Bs   Bc           
     F   t        |      t        |      k(  r!t        d t        ||      D              r || S | j                  ||      \  }}g t        j
                  j                   ||       }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr  r8   r   r   r  r3   )r   rN   r  s      rX   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>*  s@      /
1 GG%%mA&6&:;q@/
s   AA)r   r   rz  r  r  r\  r]  )	r  r  r   r  r  r  r   fnsfns	            rX   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
GV,/
 ,
 w'',/,G,GPW,X)
)LY__22:z3JKL8MN,"H,NN,Ns   7	B BBBc                6    t        |t        j                        S rR   )r   r   TMPri   r   s     rX   is_indirect_indexingzSIMDKernel.is_indirect_indexing4  s    "5$((33rZ   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                  j!                               D              S )NFr9   c              3  F   K   | ]  \  }} |       |      k7    y wrR   r   )r   	idx_range
iter_ranger  s      rX   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>I  s,      
%	: Y8J#77
s   !)r  r   r  r   r   r   r   ry   r   r^   r8   r   r   r  anyrz  r   )ri   r   index_numelsro   entryr  s        @rX   is_broadcastedzSIMDKernel.is_broadcasted8  s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;;M;M;O)P
 
 	
rZ   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        rc  r   rd  )r   rB  re  mapindex_to_strr  rename_indexingr  s     rX   r  zSIMDKernel.index_to_strN  sN     eT"tyyT%6%6!>?@BBzz$..u566rZ   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrR   )r   r   r   PRECOMPUTED_SIZEr   s     rX   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>o  s.      , #1tyy$2G2G&HI,s   46)r  r4   r8   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)ri   r   ar   replacements
simp_indexs         rX   prepare_indexingzSIMDKernel.prepare_indexing[  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00rZ   c                r    | j                   D cg c]  }|j                  r| j                  s| c}S c c}w rR   )r  rm   r  )ri   ts     rX   active_range_treeszSIMDKernel.active_range_trees  s3    ''
q~~AVAVA
 	
 
s   44c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r8   r   r   r  rd   sortedr   rt   r   r   r  r   r4   r   r   )ri   r   symr  pss        rX   r  zSIMDKernel.codegen_indexing  s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c277$7D))#.3 %%c*224	5 rZ   c                    t        d      )NzNYI: codegen_nan_checkr*  rl   s    rX   codegen_nan_checkzSIMDKernel.codegen_nan_check  s    !":;;rZ   c                    t        d      )NzNYI: call_kernelr*  )ri   rb   r   s      rX   call_kernelzSIMDKernel.call_kernel  s    !"455rZ   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr6   logical_andr7   _unwrap)ri   r<  r   rT  	prior_vals        rX   
mask_loadszSIMDKernel.mask_loads  sy     
 $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r9   r   )r   rq   r   r4   r  r2   rb   )	ri   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            rX   get_strides_of_loadzSIMDKernel.get_strides_of_load  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rR   )r   tupler  )r  r   s     rX   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalar  s'    eU#R((%yrZ   c                    t        j                  | j                  j                        D cg c]  }|j	                          }}t        t        d |            S c c}w rR   )rC   
only_nodesr  node_scheduleestimate_flopsr'  filter)ri   r   flopss      rX   r  zSIMDKernel.estimate_flops  sX     +55dmm6Q6QR
 !
 
 6$&''	
s   Ac           	        g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}| j                  j                         }t        j                  j                  j                  t        | j                  j	                                     }t        |      D ]2  \  }}||vr|j                  d       t        j                  j!                  |      }	t        j                  j                  j                  |	      }
|
|kD  rwt#        t$                  }d}||   D ]M  }t'        |t(        t*        f      r|j-                  d|        |dz  }3|j-                  |j.                         O t        |      |z  }n|
}t        j                  j1                  |      }t3        |      }|j                  ||z  dt5        ||k        z   z         5 t7        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r9   )r   r5   r   inplace_buffersr   python_argdefsr  buf_accessesr8   r   r   r   r3   r  r=  r   	get_numelr   r   r   r"   r#   r   r   	get_dtyper.   r   r'  )ri   nbytesninplace_argsr  	call_argsr  	out_numelrK  r   	arg_numelbuf_sizerP  no_index_dep_countdepre   r,  
dtype_sizes                    rX   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY5579a}}113 GG$$..}T[[=O=O=Q/RS		* 	MFAs ,&a ))#.Iww''11)<H)# %S/+%&"', /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL9	M: 6{rZ   c           	     &   t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]F  }t        j                  j                  |      }|s&|j                         }	t        |	j                        dk(  sOt        |	j                  D 
cg c]
  }
|
dk(  s	|
 c}
      dk(  r|t        j                  |	j                        }||}||k7  st        d| dd| d	| z         }t        j!                  |       |D cg c]m  }t        j                  j                  |      rJt        j                  t        j                  j#                  |      j                         j                        ndo }}|D cg c]Z  }t        j                  j                  |      r7t        j                  j#                  |      j                         j                  nd\ }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}|D 
cg c]  }
|
j(                   }}
t        d| d| d| d| d| dz         }t        j!                  |        y t+        d| d      }t        j!                  |       yc c}
w c c}w c c}w c c}w c c}
w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r9   r   N   r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr  r  r8   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider(   logwarning
get_buffergraph_inputsname_to_bufferrb   r'   )ri   r  argdefsr  
_signaturer  uniform_stride_orderarg_namebuflayoutrN   stride_ordermsgrb   stride_order_list	size_listsource_listargdef_namess                     rX   warn_mix_layoutzSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#! 0	H''((2C^^%F6;;1$6;;9a!q&9:a?!226==A'/+7()\9%01E0FF^_l^<}EFC KK$ %.) ! 7711$7 ++GG..t4??AHH "	")% ) %.	! ! 7711$7 **40;;=BB!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # 5<#<qAFF#<L#<%(nYK|\m[no&ykk]"MNC KK$a0	b 3K=@TU
 	C[ :)!# $=s'   -
K:
8K:
"A2K?AL?AL	
Lc                   t        j                  ||d|      }d| _        t        j                  | j                  j
                  |      }t        j                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )Nr'  FT)r6   	reductionr  
index_exprr  r  truedivsubmulr7   r  )	ri   r,  r   sum_rnumelmeandxdx2m2s	            rX   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback\  s    }}UE5%8 % = =uE{{4( $WWUD!ggb"o]]5%4!!4V"455rZ   c                    t        j                  ||d|      }t        j                  ||      }t        j                  |      }t        j                  ||d|      }t	        j
                  ||f      S )Nmaxr'  )r6   r1  r4  expr7   r  )ri   r,  r   vmaxr4  r?  vsums          rX    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallbackh  s\    }}UE5%8ggeT"ggcl}}UE5#6!!4,//rZ   c                    t         rR   r*  rl   s    rX   codegen_kernelzSIMDKernel.codegen_kernelo  r.  rZ   c                     y rR   r   rl   s    rX   r  zSIMDKernel.codegen_bodyr      rZ   c                     y rR   r   )ri   r  s     rX   r   z)SIMDKernel.codegen_iteration_ranges_entryu  rF  rZ   )NNNN)r  dict[str, sympy.Expr]r  rD   r   r   r!  Optional[bool]r"  rI  r  Optional[dict[str, sympy.Expr]]rz   r{   r   )r,  torch.dtyperz   rt   )rz   rK  r   r|   )r   r   r  r}   rm   r}   r  rH  r  r}   rz   list[IterationRangesRoot])r   zdict[str, str]rz   r{   )rP  Sequence[sympy.Expr]rz   r{   )rb   rt   r   rw   r   r;   rz   r{   )rz   rv   )rK  r   rz   rt   )rz   z	list[str])r   rw   rz   rw   )r   rw   r  ry   rz   rw   )rz   z'contextlib.AbstractContextManager[None])r   rw   rz   ru   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]rz   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  rN  r   rO  r  rw   rz   rO  )r  rN  r   rO  r  rw   rz   r}   )r   rO  rz   list[list[sympy.Expr]])r  rM  r   rO  rz   rP  )r   rw   rz   r}   )r   rw   rz   rt   )rz   rL  )r   rw   rz   rw   r   rR   )rb   rt   r   zOptional[IRNode]rz   r{   )r<  zUnion[str, OpsWrapper]r   Union[int, float]rz   zIterator[str])r   rw   rz   rv   )rz   r   )r  r   )Ar   r   r   r   pexprr  __annotations__r  ra   r   r,   r   r(  r-  r1  r3  r  rL  r   rQ  rU  r  r  rd   r`  rf  rk  rn  r
  r	  ru  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r/  r<  rB  rD  r  r   r   r   s   @rX   rx   rx   t  sK    */E&.&&!OT! /38<9=9=0.%0. %0. ,	0.
 (60. )70. 70. 
0.d J   J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
'
$>>':>	>':	(0
 L1$L1/ML1
L1 L1\ 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
  V5 V	 VD O$O 0O
 
 O O84
,7$1$1 
$1L

"<6 )*)3D)	) )&0  
(=~EN
60"rZ   rx   c                     e Zd ZU dZeZded<   d Zd ZeZ	eZ
d Z	 	 d%dZe	 	 	 	 	 	 d&d       Zd'd	Z	 	 	 	 d(d
Zd ZdddZddd	 	 	 d)dZd Z	 d*	 	 	 	 	 	 	 	 	 	 	 d+dZd Ze ej2                  d      d,d              Ze	 	 	 	 	 	 d-d       Ze	 	 	 	 	 	 d.d       Ze	 	 	 	 	 	 	 	 d/d       Ze	 	 d0d       Ze	 	 	 	 	 	 	 	 	 	 d1d       Ze	 	 	 	 	 	 	 	 d2d       Z e	 	 	 	 	 	 	 	 d3d       Z!ee"jF                  jH                  df	 	 	 d4d       Z%ee"jF                  jH                  df	 	 	 d5d       Z&d  Z'd6d!Z(	 d7	 d8d"Z)d# Z*d$ Z+y)9SIMDSchedulingzo
    Single Instruction Multiple Data parent class used for fusion across
    multiple different backends.
    z	type[Any]kernel_typec                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrR   r  r   s     rX   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>  s*     PQQWW%%..}Q/?@Ps   <>)r  rm  s     rX   group_fnzSIMDScheduling.group_fn  s    P%PPPrZ   c                   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }}|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r,|j                         r|k(  xr |k(  }|s |d||       |S |j                         s|j                         s|k(  r|k(  s|j                         s |d||       y|j                         D ]`  }|j                         r nN|j                         |j                         z  s7|j                  \  }\  }	}
||	k(  r||
k(  rT |d||	||
        y ||fD ]  }|j                         s y | j                  |j                         ||      }| j                  |j                         ||      }| j                  |j                         |j                         z   ||      }t        j                  j                  rVd}t!        |      dkD  r%t!        |      dkD  r||cxk(  xr |k(  nc }n||k(  }nt!        |      dkD  r||k(  }|s |d|||       yy|j                         s|j                         r|d	k(  rd	k7  sJ |z  k(  rt#        fd
|j                         D              s	 |d       yt        j                  j$                  r\|j                         sLt'        | j                  |j                         |      j)                               |d	fd	ffv }|s |d       |S y|k7  r |d       |k(  S |j                         r|j                         rJ | j+                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r9   c              3  j   K   | ]*  }t         j                  f|j                                , y wrR   )rx   r  
get_ranges)r   r   numel2rnumel2s     rX   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r+   is_split_scanrm   is_template	get_nodesused_buffer_namesget_buffer_namesselect_tilingr   rU    tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)ri   node1node2r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr_  r`  s                    @@rX   rb  zSIMDScheduling.can_fuse  s    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G &%!!#E,>,>,@f$G);((*O ! !& 1 )++-!  $557%:P:P:RR$59ZZ22Iz &) 3:8M \ & ) ' * $)#)& U^  ==? 
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/05**5??+<fELLN1  !,1- 5:;4412V##!!#E,>,>,@@@''u55rZ   c           
     z   g t        t        j                            t               t               d fd}fd}fd}fd}t        j                  fd       }fd}	|D ]  }
|
v rj                  |
        ||
      r? |	|
      r |       5  	 d d d        r ||
      sxs t              nd  ||
       ` ||
      r" |       5  j                  |
       d d d        t        d d d	|
j                  d
           S # 1 sw Y   |xY w# 1 sw Y   xY w)Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S Nr9   r  r   r  
node_numelnode_rnumelre   r7  s       rX   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sH    +,77(A(
K%'AK6,A efn,A1ArZ   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S r{  r|  r}  s       rX   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s4    +,77(A(
K&K;!+;K!KrZ   c                \    | j                   j                  D ]  }|j                  v s y yr  )read_writesreadsrb   )r   readcurrent_loop_buffer_usages     rX   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s1    ++  99 99  rZ   c                   j                  |        j                  |        j                  | j                  j                  D cg c]  }|j
                   c}       | j                         rt        | t        j                        rrt        | j                  t        j                        rNt        | j                  j                  t        j                        s j                  | j                                y j                  | j                  j                   D cg c]  }|j
                   c}       y c c}w c c}w rR   )r   r   updater  r  rb   rm   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   rN   r  doner  not_ready_yet_nodess     rX   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop!  s    HHQK  #%,,amm>Q>Q-Raff-RS
  q)"9"9:qvvr'8'89"166;;8#''

5)00!--BVBV1WQ!&&1WX .S 2Xs   D; E c               3  L  K   rd   t         u rj                          nj                  t               r1j	                  t               j	                  dz   t                d d  j                  t                j                           j                          y w)Nr  r9   )rA   popr   r@   insertclear)r  maybe_split_indexr  r  s   rX   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop2  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B!B$c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr9   Fr  )	ancestorsr   rA   r@   r}   )r   r  r  r7  s     rX   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionB  sN    {&7 b!O5E#F*   +,,rZ   zunexpected group: (r   z) != r9   )
r   r   r)   r  r  r   r   r   r+  r  )ri   r   re   r7  r  r  r  r  r  r  r   r  r  r  r  r  s     ``       @@@@@rX   generate_node_schedulez%SIMDScheduling.generate_node_schedule  sW   #%)5568 0:|5?\!+/		L		Y" 
	"	"	. 
#	.	-  	Dt|HHTN &6t]K35  -5QRV5W(9(OS=O% )-%%d+'-/1 /!((./ / *)%6(%

1O -	4 ' / /s   (D%&D1%D.	1D:	c                b   |j                         }t        j                  j                  j                  j
                  rt        |      }nd}t        |d       j                  \  }\  }}| j                  |||      }t        j                  d|       | j                  t        ||||            S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        Nc                4    t        | j                               S rR   r   rm   rN   s    rX   r   z-SIMDScheduling.codegen_node.<locals>.<lambda>u  s    c!..:J6K rZ   r   zSchedule:
 %s)re  rS   rT   r   rU   coalesce_tiling_analysisr   r>  r  r  schedule_logdebugcodegen_node_schedulerD   )ri   r   r   coalesce_analysisr  re   r7  r  s           rX   codegen_nodezSIMDScheduling.codegen_nodeh  s     04~~/???!!((AA 9$ ? $ ,KLRR?E633E5&I+];))}eV=NO
 	
rZ   c                   t        j                  t         j                        j                  }t	        |       sy|D cg c]0  }|j                         r|j                         j                         2 }}|D ]}  }|j                         rt        |t        j                        s/|j                         }||D cg c]0  }|j                         r|j                         j                         2 c}z  } t        d |D              syt        j                  j                  j!                  | |       |D ],  }t        j                  j                  j!                  ||       . yc c}w c c}w )NFc              3  2   K   | ]  }t        |        y wrR   )r-   )r   r  s     rX   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>  s     FD)$/Fr&  T)rS   iinfoint32r>  r-   has_tensor_outputr  storage_sizer   r   MutationOutputget_mutation_buffersr   r8   r   r   	check_leq)re   buffersint_maxr'  	buf_sizesmutated_bufsr  s          rX   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing~  s=    ++ekk*..%e, 
$$& NN))+
	 
  	C((*z#r?P?P/Q"779+,,. NN$113 		 FIFF 	
""5'2 	6DGG&&tW5	6/
s   5E&;5E+c                   |j                   }| j                  ||j                  |j                  |j                        \  }}| j                  ||g||d      }|D ]  }| j                  ||        t        j                  |       g }|D ]  }t        j                  |      5  |j                         }d d d        | j                  ||      }	t        j                  j                  dk7  rt!        ||	      }
|j#                  |	|
f       t$        j'                  d|	       |	|_        t+        |      |_         ~t-        |      dkD  rt        |      }n|\  }t        j                  |      5  |j/                         D ]  }|j1                           	 d d d        | j3                  |       |D ]/  \  }	}
t        j4                  j6                  j9                  |	|
       1 |j;                  |j(                         t        j<                  r|j?                          t        j@                  r|jA                  |d   j(                         t        j4                  xjB                  |jB                  z  c_!        t        j4                  xjD                  |jD                  z  c_"        t        j4                  j6                  jF                  rt        jH                  r|d   jJ                  jM                         }|j/                         D ]  }|jO                         }||vr|jP                  J |jP                  jS                         }|CtT        d   dxx   dz  cc<   t        j4                  j6                  jW                  d|jX                  d| d	        | j[                          y # 1 sw Y   xY w# 1 sw Y   $xY w)
N)r  r  r   z+Generating kernel code with kernel_name: %sr9   inductorintermediate_hookszrun_intermediate_hooks(r   r   ).r  get_tiling_and_scoresre   r  r  create_kernel_choices!codegen_node_schedule_with_kernelr?   merge_workspaces_inplacer8   set_kernel_handlerrD  define_kernelr   traceprovenance_tracking_levelr%   r   r  r  r  r    r   scheduler_nodesmark_runcodegen_commentr   wrapper_codewrite_provenance_debug_handler  nan_assertsr  r/  removed_buffersinplaced_to_removesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinerb   free_buffers_in_scheduler)ri   kernel_featuresr  r  tiling_scorekernelsrg   debug_handlessrc_coder  debug_handlefinal_kernelr   	live_outsrb   origin_nodes                   rX   r  z$SIMDScheduling.codegen_node_schedule  sT   '55#99!!++--	 
 ,,H(<H

  	JF22=&I	J,,W59; 	3F%%f- 3!0023,,X}fMK||55:F!  $$k<%@AIIC[Q!,F(2F	3  w<!&w/L%O\!!,/ 	 '779   	  	]+)6 	%KGG  >>\	 	  !9!9:**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779 
}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO
 	&&(u3 3(	  	 s   #N*.&N7*N4	7Oc                (     | j                   |i |gS rR   )rX  )ri   r  kernel_argskernel_kwargss       rX   r  z$SIMDScheduling.create_kernel_choices  s)     D
 	
rZ   c           	     <   |5  t        j                         }i }|D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j                                       |j!                  |j#                                |D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          Dt%        |j                         |j                  |j                               }|j'                  |        	 d d d        y # 1 sw Y   y xY wrR   )r  	ExitStackr@   enter_contextr  rA   closedecide_inplace_updater  r^  r  r[  fromkeys_bodyindexing_from_argsr   rQ  keysr&   r   )ri   r  rg   stackall_indexingr   r   s          rX   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  sS    	-((*EL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 & 	-++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL,	--	- 	- 	-s   FFFFonly_gen_src_codec               	   i }|j                         }g }	|D ]  }
|
j                         }|	j                  |
       ||z  s*t        |      dk(  sJ |	|t	        t        |            <   |j                  j                  t	        t        |                   g }	 t        |	      dk(  sJ |5  |s|g|D ]  }|j                            |       }|j                  d      5  |D ]0  }|j                  |j                  |j                                      2 |j                  j                  t                      ddd       |j                   j#                         D ]+  \  }}d| d}|j%                  |j'                         g       x}	s0t)        d |	D              }t+        j,                  d|       5  |j                  |      5  |	D ]  }t        |j                               dk(  r<t        |	      dk(  r.t/        |      r#|xj0                  |j                         z  c_        |j                  |j                  |j                                       |j                  j                  t                      ddd       ddd       . 	 ddd       t3        t4              s`t6        j8                  j;                  |j<                  j>                        5  |jA                  d	       ddd       |jA                  d
d       tC        jD                  |      5  |j                   jG                         D ]  }d| d}|jA                  |d        |j                  d      5  t3        |t4              s|jA                  d       ddd       t3        |t4              r|}n|jI                         }g |||}t*        jJ                  rH|jM                         dz  }|jO                          d| d|jQ                  |      jS                          }|r|cddd       S | jU                  |||      |_+        t*        jX                  jZ                  dk7  rt]        ||jV                         |cddd       S # 1 sw Y   ;xY w# 1 sw Y   
xY w# 1 sw Y   ;xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   1xY w# 1 sw Y   yxY w)zK
        Helper method to codegen a single template kernel variant
        r9   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  <   K   | ]  }|j                           y wrR   )can_codegen_without_upcasts)r   p_ns     rX   r   z:SIMDScheduling._codegen_single_template.<locals>.<genexpr>P  s      5>A7795   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )/rf  rg  r   r   r   iterprologue_fused_inputsr   r  set_subgraph_bodyr   r  r^  cse
invalidater   named_input_nodesrq   r   r  r   r   patchr   #prologue_fused_inputs_preserve_zeror   rt   r   r$   current_originsr   originsfinalize_hookr8   r  r  finalize_remainingbenchmark_kernelr  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r  r%   )ri   rg   rendertemplate_nodeepilogue_nodesprologue_nodesr  buf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder  r  num_gbs                         rX   _codegen_single_templatez'SIMDScheduling._codegen_single_template   s    &("&88:& 	$H--/E!!(+~%5zQ&@N*4U+<=,,00d5k1BC!#	$ >"a''' ,	@$ +<^< $DMMO$ "8L))*:; 4* QDLL!<!<T__=N!OPQ

%%jl34
 '-&>&>&D&D&F @"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W @ $55mD @1? "$'(F(F(H$IQ$N(+N(;q(@'CM'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!"" #JJ11*,?!@@ @@,	@\ ,,**=+=+=+E+EF ;**>:;&&{5&A !!&) &	 %66;;= H
".zl! <**=*GH ))*:; A!,4 ../?@A ,,' (::<MnMmMnMM&&99;cA::<=Rj66v>GGIJL  !;&	 &	> "&!3!3HmV!TF||55:7!6#5#5 M&	 &	Y4 4&@ @@ @3,	@ ,	@`; ;A A&	 &	s   +5R AQ29AR)R4RB1Q?	7R?RR&A
S ""R3BS AS 2Q<	7R?R	RRRR#&R03R=	8S  S	Nr  hint_overridec          	        |j                   \  }\  }}|dk(  sJ t        |j                  t              r|j                  j                  r{g }	g }
|j                  j                  j                         D ]~  } ||j                  |      \  }}|r;| j                  |||||d      }t        |t              sJ |
j                  |       W| j                  |||||d      }|	j                  |        |rdj                  |
      S t        j                  |	       t        |	      }g |||}| j                  |       |j                  |j                         t        j                   xj"                  |j"                  z  c_        t        j                   xj$                  |j$                  z  c_        | j'                          y|j                  j)                  |j                  |      \  }}|r| j                  |||||d      S | j                  |||||d      }g |||}| j                  |       |j                  |j                  |j                         t        j                   xj"                  |j"                  z  c_        t        j                   xj$                  |j$                  z  c_        | j'                          y)z
        Codegen a triton template with multi-kernel dispatch support

        If `only_gen_src_code=True` the src code will be returned instead of being
        codegenned into the wrapper
        r9   )r  Tr  Fz

N)r  r   r   r   _make_kernel_rendersr   r  rt   r   re  r?   r  r  r  r  r8   r   r  r  r  make_kernel_render)ri   r   r  r  r  r  r  _numelr7  r  	src_codesr  rg   r  r  multi_kernelr  s                    rX   codegen_templatezSIMDScheduling.codegen_template  s     ,11FF{{ }))+>?""77GI&3&8&8&M&M&T&T&V +"!3!&&m" %#<<%&&*.  =  H &h444$$X.!::%&&*/ ; F NN6*3+6 !{{9--009&w/LMnMmMnMM  /$$\%=%=>GG##|'C'CC#GG&&,*I*II&**,*//BB""- C NFF !44!""&* 5   66!""&+ 7  !R. Q- Q. Q$$]3""6#5#5}7I7IJ''6+A+AA'**f.G.GG*..0rZ   c                    t         j                  j                  j                  t         j                  j                  j                                y rR   )r8   r   r  r  
device_opssynchronizerl   s    rX   codegen_synczSIMDScheduling.codegen_sync  s-    	&&qww'9'9'E'E'GHrZ   c           
        ddl m} |D cg c]  }|j                          }}i i }
}	t        ||      D ]u  \  }}t	        |d       j
                  \  }\  }}| j                  |||      }| j                  |||      }||||f|
|<   |j                  |t        |||      |       |	|<   w |j                  || ||	|
      }t        j                  dt        |      |D cg c]  }t        |       c}       g }|D ]>  }|D cg c]  }|j                          }} |||      }t        ||      D ]  \  }}| j                  |
|   d	   |j                  |	|                |	|   }|
|   d	   }|sIt!        j"                  |      5  t%        j&                  |      D ]  }|j)                           	 d d d        t         j*                  xj,                  |j,                  z  c_        t         j*                  xj.                  |j.                  z  c_         |j1                         }|j3                  |||f       A |S c c}w c c}w c c}w # 1 sw Y   xY w)
Nr9   )ComboKernelc                4    t        | j                               S rR   r  r  s    rX   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>      #ann>N:O rZ   r   )r  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelr  re  rz  r>  r  r  rh  create_triton_kernelrD   horizontal_partitionr  r  r   r  create_sub_kernelr8   r  rC   r  r  r   r  r  rD  r   )ri   subkernel_nodescustom_part_algorithmr&  r'  r  r  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  re   r7  r  r  
partitionspkernel_code_list
node_grouprg   	subkernelr  s                            rX   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code   s    	59HIDNN,II+-r(_.>? 		IB!$U0O!P!V!VAv 77ufMM''ufEF$165&$Hb! + @ @+M5&I"-o !A !M"		 !55!"2$+ 6 

 			? '(SV(	

 $ 	DJ=GHT 0HH  /'F
 !-=> K	E66%b)!,,,]2->? *"-	 1" 5a 8(--i8 ,$6$A$A-$P ,D MMO,, ''9+D+DD'**i.J.JJ*K ,,.H##Xvz$BC-	D.  c J. )  I, ,s   I II+IIc                <   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]  \  }}}	| j                  ||g|      }
t        j                  j                  dk7  rt        |j                  |
       | j                  |g       t        j                  d|
       |j                  t        j                   j"                  |
        | j%                          y )Nr9   r   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor&  r   combo_kernel_allow_mixed_sizesr7  r  r  r  r%   snodesr  r  r  r  r8   r   r  r  )ri   combo_kernel_noder,  r-  r&  r'  r4  r  rg   r  r  s              rX   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernel=  s   +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 		BHfa,,X8I7JFSK||55:7%,,k   "3!45II:KHqww33[A		B 	&&(rZ       c           
        
 dk(  }d 
fd}|j                         \  }
t        |      dk  rt        
      dk  st        |
z         rg S |j                         \  }
 |||r|n
|j                  |            }|D cg c]?  }t	         j                  |j                  |      |j                  |j                        A }	}|	S c c}w )Nr9   c                d   t        |j                        t        |      k(  sJ d|j                  d|       |j                  |j                  g}t	        d t
        j                  j                  |      D              sJ t
        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}t        |j                  D cg c]  }|j                   c}      }dd}t        j!                   ||      g|       dd      g}|D ]  }t        j                  j"                  j%                  |j&                  |j                        }	t        |	      t        |      k(  sJ 	 |	j'                  d      dz   }
|
t        |      k(  rt	        d	 |	|
d
 D              r	  ||d
|
        |||
d
       f}t        j                  j"                  j+                  t-        d t/        ||	      D                    }|j                  |v r|dz  }t        j1                  |d         r|dz  }t        j1                  |d         r|dz  }t        j                  j"                  j+                  |t-        t        j                  |            z
        dk\  s|j3                  t        j!                   ||d
|
        |||
d
       g      ||j                                |S c c}w c c}w # t(        $ r Y w xY w)zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  H   K   | ]  }t        |t        t        f        y wrR   )r   r!   r"   )r   r  s     rX   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>f  s$       3G 45s    "c                f    t         j                  j                  j                  t	        |             S rR   r  )r  s    rX   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesr  s"    ww''00v1FGGrZ   noner   )r  rb   scorer9   c              3  &   K   | ]	  }|d k(    ywr  r   r   s     rX   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s     ;a16;s   Nc              3  2   K   | ]  \  }}|d k7  s|  ywr  r   )r   r  r  s      rX   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s       "!-vST"s   r   r  rF  rb   )r  rM  rz   rw   )r   
range_varsr  r  r   r  r\  r]  rb   r8   r   r  r   r!   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r3   rz  is_good_sizer   )is_pointwiser  rwdep_sourcesr  depswrite_namesrD  tilingsr  splittiled_groupsrF  r  r  reduction_rangess                rX   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_ranges[  s    r}}%V4S8H	&6SS4 88RYY/K $??88E    %??88E88177#:#::sI. D  %"))%D3chh%DEKH
  44(01<  G  4''**77		2==Q7|s6{222
#MM!,q0EF+ ;756?;; ! < $F6EN3#F56N3  ((22! "14VW1E" 
 88{*QJE"//Q@QJE"//Q@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F56N$C!" !0$ #(!$
Q4l N[ &E: " s$   $?L8L"L"=L""	L/.L/rI  )rP  r}   rz   list[CandidateTiling])	r^  r   r   "pointwise_or_reduction_read_writesrK  complete_partial_tilingr  rF  rb   )r  r   re   r  rP  rY  pointwise_rangespartial_tilingsr  full_tilingsrX  s   `  `      @rX   candidate_tilingsz SIMDScheduling.candidate_tilingsV  s     '!+\	| .2__->** !Q&$%*$%58H%HII .2__->**% ,2B33LA
 *	
  22MM5/ ll[[	
 	
 	
s   ACc                    g dt        |       d }ddgdt        |       }t        g t        ||      t        ||            S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rL   rM   rN   NrO   rP   )r   r   rz  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        rX   create_tilingzSIMDScheduling.create_tiling  sY     &s9~o&78#U^,Cc2B.CDVc+y)VC0BDT,UV
 	
rZ   c                >    | j                  |r|ng |s|      S g       S rR   )rf  )r  r  rP  s      rX   rL  z$SIMDScheduling.create_partial_tiling  s0       "F&F
 	
,.
 	
rZ   c                    t        |j                               }d|v }||z  }|t        |      z  g}|r||fn||f} | j                  | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rN   )rB  r   r3   rf  )	r  r  re   r  splitsrP  total_numelmissing_tilingtiling_argss	            rX   r\  z&SIMDScheduling.complete_partial_tiling  sf     fmmo&f}o-%f(==> )5V^$>6:R 	 !s  +..rZ   c           
     z   |dk(  }t        t        t        t        j                  f             }t        j                  |      D ]  }t        |t        j                        s|j                         }|st        |d         dk(  rC||rdnd   }|g}	|j                  j                         D 
cg c],  }
t        |
t              rt        |
j                        dkD  r|
. }}
|D ]  }
g |
j                  j!                         }t        j"                  j$                  }t&        j(                  j*                  }t-        |      D ]!  \  }\  }}||z  }|j/                  ||      s! n |j1                  ||      sdz   }|r|d| n||d }g }|D ]  \  }}t3        j4                  |
j6                  |      }t9        d|j;                  t<              |j;                  t>              z   t        |            }t3        j@                  ||||      }||d   n|g}|jC                  |        |D cg c]F  }t&        j(                  j*                  j1                  |t        j"                  j$                        s|H }}t        |      dkD  s|	jE                  |        |	D ]z  }t9        dt        |      tG        d      z
        }|dz   }tI        |d|       }|ftK        ||d       z   }|jM                  | jO                  | jQ                  ||      ||             |  tS        |t        d      }|S c c}
w c c}w )z
        Creates N-dimensional tiling candidates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r9   r   Nr   T)r   reverse)*r   r   rt   r   ExprrA   r   r   r   r  r^  r   r  reads_and_writesr!   r  rq   r   r   r8   r   r   r=  statically_known_geqr   r:   get_subexpr_involving_symbolr   r>  r  r   r   match_mod_div_block_exprrN  r   rY   r3   r  r   r\  rL  r  ) r  r  pointwise_numelr  rP  rU  r   node_rangesranges_to_tilenode_tilingsr  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxvarre   reduction_start_idxrd   index_tilingr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                    rX   get_nd_tilingszSIMDScheduling.get_nd_tilings  sy    '!+^CO<=?#**=9 _	DdI$;$;< //+KCA$71$< )lBN*+L  ++<<>c9-#cjj/A2E K 
 # 96 "73::#3#3#5!6',ww{{$77++7@7P 3%|U(E1(44,o   77(/  '8!&;# $ ##7$78'(;(<=   "", .JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-%..  , 77++CCCU     |$q( ''5s96x  , #&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''e_	F  
 qn s   .1L3,AL8c                   j                   sdnj                   j                  j                  j                  j                  j                  j                  j
                  }D cg c]  }||   	 c}D cg c]  }||   	 c}t        j                  t              k(  fd       t        j                  t              k(  fd       i g }	 	 	 d	 	 	 	 	 	 	 df	d}|j                   |d       |d      f       r$|j                   |fdd       |d      f       j                  j                         z  }	|	D ]%  }|j                   ||fd       |d      f       ' t        d	
      d	k(  rBdk(  r=t        j                  |	d      D ]$  }
|j                   ||
d       |d      f       & g }|D ]b  \  \  }}\  }}t        | j!                  ||      t#        |      t#        |      z         }| j!                  ||      }|j                  ||f       d | j!                  gg      }ddfd}t%        ||      D ]  \  }}| j'                  |j(                        rt+        |j(                        dk(  rdndz
  }|t        d	
      kD  rDt,        j/                  d|t        j0                  j2                  j4                  j6                         |j(                  |fc S |j(                  |k(  s|j(                  |fc S  |dfS c c}w c c}w )zr
        Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
        Nc                      d d  S Nr   r   )r  rt  	pw_rangess   rX   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    ykO#4B}oF rZ   c                      d d  S r  r   )r  
red_rangesr  s   rX   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    zl"_$5RG rZ   Fc                B  	 |rn}|rn}|s|r|gg fS g g fS t        |       ||f}j                  |d      x}r|S |rn}g }g }	d}
d}t        ||      D ]  \  }}|| vr"|
|z  }
j                  j                  |d      }-|r|k(  rj                  }|J |j
                  }t        ||j
                        }|j                  |
|z         |	j                  |j                         |j                  |       |	j                  j                  j                  |d             d}
d}|
|z  }
|j                  |
       |	j                  j                  j                  |d             d}
 |
dk7  s|r0t        |      dk(  r"|j                  |
       |	j                  |       t        t        |            D ]S  }t        j                  j                  j                  ||   d      }t        |d      }t!        |	|   |z  dz        |	|<   U ||	f|<   ||	fS )z]
            Generate a tiling, and a tiling score, given vars to use as splits.
            Nr9   r   r?  r      )r   r   rz  coalesced_by_varsuggested_splittiling_factorr   r   rF  r   ranger8   r   r   r   minr   )vars_to_useuse_split_varrP  r  target_numelr   outsplitting_varsri  split_scoresprodprev_var_coalesced_scorer  v_range
var_tilingtile	remainderrK  r   all_iter_varsall_red_varsr  rt  r  r  r  scored_sub_split
tiling_vars                      rX   process_node_varszASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars  sZ    #/YJF.:?L)NB//8O$m\BC&**355s5
.:]NFLD'($ ".&9 
7K'GOD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (*2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ;> qy\c&kQ.>d###$<= 3v;' ?GG$$..vay2.F1I"%l1o&9A&=">Q?
 &,\$:S!L))rZ   T)rP  )r  rP  r   rW   r9   r   )rF  gffffff?gGz?c                    d}| d   j                   j                         D ]"  }t        j                  |      s|z  }|z  }$ | d   j                   |z  S )Ng      ?r   )r  r   rK  rO  rF  )r  score_factor	tile_size"bad_size_additional_tiling_penaltygood_size_tiling_penaltys      rX   	score_modz9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod+	  se    LqT[[//1 K	&33I>#/2T#TL#/2J#JL	K aDJJ;--rZ   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r  ztuple[sympy.Expr, ...]r  r}   rP  r}   rz   ztuple[list[int], list[int]])r  r|  norm_read_writesr   reduce_varsrd   rS   _checkr3   r   r  r  rY   r  combinationsrK  rf  r'  r  tiling_is_compatibler  r   perf_hint_loginforT   r   rU   rV   )r  r  rt  r  r  r  r  score_splitr  overlapping_iter_varsr  rU  pw_splitpw_score	red_split	red_score	candidater  default_tilingr  cand
tiling_lenr  r  r  r  r  r  r  r  s    ````                 @@@@@@@@rX   compute_tiling_strategyz&SIMDScheduling.compute_tiling_strategy~  s    %44 "2266 	 *::EE(99EE"33>>(561VAY6	)56AfQi6
)$7F	
 	*%8G	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 ' 	A%qd>%59	 #q(_-A(556KQO "")+DI)u= RT<G 	68 Xx"89i'!!(I6(mc)n4I ,,XyALNNI|45	6 **O+<>OP .3*#( 	. #)i"@ 	1D,'' !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00)	1, t##s 76s   7M
Mc                T    t        t              sJ t        fd|D              S )Nc              3     K   | ]R  }t        |t        j                        r6t        j	                  j                         |j                                 T yw))r  N)r   r   r  rx   r  r   r^  )r   r   r  r  s     rX   r   z6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>W	  sO      
 $	 7 78	 $$!2O % 
s   AA)r   r[  r   )r  r  re   r  r  s      ``rX   r  z#SIMDScheduling.tiling_is_compatibleN	  s1     &$''' 
 &	
 
 	
rZ   c                B    |D ]  }| j                  ||||      s|c S  y rR   )r  )r  r  re   r  r  r  s         rX   get_first_compatible_tilingz*SIMDScheduling.get_first_compatible_tiling_	  s1     % 	F''uovV	 rZ   c                0    | j                  ||||      d   S rp  )r  )r  r  re   r  r  s        rX   rh  zSIMDScheduling.select_tilingm	  s)     ((5/3D

 	rZ   c                   |dk(  }| j                  |g|g      }t        j                  j                  j                  j
                  r0|r.t        j                  j                  s| j                  ||||      S |st        j                  j                  rt        d      dk  rt        j                  t        j                  k  rt        j                  |      D ]i  }t        j                  j                  rt!        | j#                  |||            dkD  s>t        j%                  t'        j(                  d              |dfS  |dfS t+               }t-        j.                         }	t        j                  |      D ]g  }| j#                  |||      D ]O  }
|
j0                  |v r|
j0                  |j3                  |
j0                         |	|
xx   |
j4                  z  cc<   Q i |	j7                         D 
cg c]  \  }
}|
j8                   }}
}t        d      dk\  r?|r=	 	 	 	 	 	 d
d}t;        dt!        |            D ]  } ||d   ||         }||g|z   } n t!        |      dkD  rt        j%                  d	|       t        j                  j                  r| j=                  |||      |z   }| j?                  ||||      x}r|dfS |dfS c c}}
w )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r9   r   r  r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nr   c                8   | d   | j                  dd      }}|d   |j                  dd      }}t        ||g      s/t        j                  j                  j                  ||z
        dk(  ry t        j                  j                  j                  ||z
        dk  r||f||fc\  }}\  }}t        j                  j                  j                  ||z
        dkD  sJ t        j                  j                  j                  ||      sy |t        ||      || d   d}|S )NrN   rM   r9   r   rO   )rL   rM   rN   rO   )r   r   r8   r   r   r   r  r   )tiling0rt  a0a1b0b1
new_tilings          rX   convert_tiling_to_3dzBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d	  s    !w{{3':B w{{3':B *2r(3ww''11"r':a?77##--b2g6:*,bB8&HRhr2ww''11"r':Q>>>ww''DDRL !"b)"5>	
 "!rZ   zpossibly bad tiling: %s)r  rH  rt  rH  rz   rJ  ) rf  rS   rT   r   rU   r  prefer_nd_tilingr  tile_reductionsrY   r  levelloggingWARNINGrA   r   r   r`  r  textwrapdedentr   collectionsr   rb   r   rF  most_commonr  r  r  r  )r  r  re   r  r  rP  r  r   
seen_namescandidate_tilescandidate_tilingrF  r  r  rK  new_3d_tilingr  s                    rX   r  z$SIMDScheduling.get_tiling_and_scoresy	  s    " '!+ **E7_4EF OO""))BB!MM22..uo7H  V]]%B%B}H
H ""goo5+22=A D"MM99 5 5dE? STWXX%**$OO!$ !4'' "4''&0l
4?4G4G4I#**=9 	LD$'$9$9$$W L #((J6%**6NN#3#8#89 015E5K5KK1L	L ,;+F+F+H7
' % ##7
 7

 #q(\"."9N"0"8 1c.12  4"1%~a'8! !,&3_~%EN ~"8.I ==))""=%I ! 
 445/>
 
6 
 4<t##7
s   9Kc                     y rR   r   rl   s    rX   flushzSIMDScheduling.flush	  rF  rZ   c                     yr5  r   rl   s    rX   ready_to_flushzSIMDScheduling.ready_to_flush	  r6  rZ   c                   t        d |D              st        |d       j                  \  }\  }}| j                  |||      }| j	                  |||      }| j                  |t        |||            }	| j                  ||	       t        j                  d|      5  t        j                  |	      5  |	j                         }
d d d        d d d        nM|d   j                  |      \  }}}t        j                  d|      5  | j                  |||d|      }
d d d        
j                  t!        t"        j$                        d	      }
|
S # 1 sw Y   xY w# 1 sw Y   @xY w# 1 sw Y   LxY w)
Nc              3  <   K   | ]  }|j                           y wrR   )rd  )r   r   s     rX   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>
  s     2q1==?2r  c                4    t        | j                               S rR   r  r  s    rX   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>
  r   rZ   r   )r  r  r   Tr  triton_)r  r>  r  r  rh  rX  rD   r  r   r  r8   r  rD  get_prologue_template_epiloguer  replacert   r0   KERNEL_NAME)ri   r   r  r  r  re   r7  r  r  rg   r  r  templateepilogues                 rX   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodes 
  sx    2E22!$U0O!P!V!VAv 77ufMM''ufEF%%+M5&I & F 22=&I/1AB3$$V,3 "002	3 3 3 ,18+R+R,(Hh 02BC 00&*"/ 1  ##C(?(?$@)L'3 3 3 3 s0   E3EEE%E	EE"%E.c                     y rR   r   )ri   r  s     rX   r  zSIMDScheduling.codegen_comment!
  rF  rZ   c                    t         rR   r*  )ri   r  r  rg   s       rX   r  zSIMDScheduling.define_kernel$
  r.  rZ   )r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])re   rw   r  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]rz   r}   )r  rD   )r  rD   rz   zlist[SIMDKernel])r  r   rz   Optional[str])F)r,  zlist[BaseSchedulerNode]r-  r}   r&  r}   r'  r}   r  r}   rz   zlist[tuple[str, Any, Any]])rz   rZ  )rb  rM  rc  rM  rz   immutable_dict[str, sympy.Expr])r  rM  rP  r}   rz   r  )r  rH  re   rw   r  rw   rz   r  )rz   z%list[immutable_dict[str, sympy.Expr]])
r  list[NodeScheduleEntry]rt  rw   r  rw   r  rH   rz   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])r  r  re   rw   r  rw   r  rH  )r  r  re   rw   r  rw   r  zlist[dict[str, sympy.Expr]])r  Optional[CoalesceVarAnalysis]rz   rH  )r  r  rz   r  r|   )FN)r  r   ),r   r   r   r   rx   rX  rS  r[  rb  can_fuse_verticalrk  r  r  rT  r  r  r  r  r  r  r  r7  r>  rU  r   r   r`  rf  rL  r\  r  r  r  r  r   r   r   rh  r  r  r  r  r  r  r   rZ   rX   rW  rW  y  sZ   
 (K'Q6B !"^@
P
, $$
$
 
$ $LM)^
1
	
 -T  x@  '+a %a 
aFI #(; 0;   $;  	; 
 ;   ;  
$; z)2 Y}  }~ 

,

@T

	(

 

 
$
 
 
)	
 
 /%/ / $	/
 
)/ /( w
 
/w wr M$.M$ $M$ $	M$
 /M$ 
GM$ M$^ 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?~$
 9~$ 
G~$ ~$@ MQ<IB"rZ   rW  T)frozenc                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	rK  rH  r  r   rF  Nr  rb   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr?  r   r  )r   s    rX   rO  zCandidateTiling.is_good_size.
  s5     GG&&q)Bw(AFaK(rZ   )r   r   r   rS  rb   rT  rO  r   rZ   rX   rK  rK  (
  s)    !!JD-) )rZ   rK  c                      e Zd Zy)r  N)r   r   r   r   rZ   rX   r  r  5
  s    rZ   r  )r   )rW   r   rz   r   )r   rQ  rz   rt   )
__future__r   r  r  dataclassesr   r  r  r   r  r  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   r   rS   torch._loggingtorch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher    dependenciesr!   r"   r#   r$   r  r%   optimize_indexingr&   runtime.runtime_utilsr'   r(   r)   r*   r+   utilsr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   virtualizedr6   r7   r8   block_analysisr:   commonr;   r<   r=   r>   r  r?   simd_kernel_featuresr@   rA   rB   rC   rD   collections.abcrE   rF   rG   rH   	getLoggerr   r  _logginggetArtifactLoggerr  r  
fusion_logdoprintrR  rA  rY   	dataclassr\   ry   r   r   r   rx   rW  rK  	Exceptionr  r   rZ   rX   <module>r     s   "           X X X %    2 B G 9 / L L  & $ $ F ! 6 6  ; A ; D D   - , / P P %  <<@ g!00<H~~//*E^^--hA
 	78;
 5+ 5+ 5+pN;/ N;b;'? ;'| +;TB('/*B BJl"^ l"^- d#	) 	) $	)		 	rZ   