
    pi                    	   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ ddlKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZg ej                  dk(  Ziej                  d        Zkej                  j                  end      Zo eg d      Zpddddddd d!d"d"d#
Zq eg d$      Zrd%d&d'd(d)d*d+d,d-d.d/
Zsd0d1d2Ztej                  ej                  gZwej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                   gZeej                     ed3<   ej                  ej                  ej                  ej                  ej                  gZeej                     ed4<   d5 Zd6 Z	 	 	 dzd7eej                     fd8Zd9 Zd:eSd;ej                  d<ed=ej                  d>ej                  d?eMfd@ZdAeeeOf   dBedCedDej                  dEeeef   f
dFZdGeSdHedIefdJZdGeSfdKZej$                  d7ej                  dLej                  fdM       Zej$                  d7ej                  dLej                  dNefdO       Zej$                  	 d{d7ej                  dLej                  dNee   fdP       Zej,                   G dQ dR             Z G dS dTe6      Z G dU dV      ZdW Z G dX dYeV      Zej9                  dZ        G d[ d\e      Zej9                  d]       ej=                           G d^ d_e      Z G d` daeT      Z G db dce      Z G dd dee      Zdfe0d?eeej                     ezf   fdgZ G dh di      Z G dj dke      Z G dl dme      Z G dn doe      Z G dp dqe3      Z G dr ds      Z G dt du      Zej,                   G dv dw             Zej,                   G dx dy             Zy)|    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configcpp_buildercpu_vec_isairmetrics)'set_kernel_post_grad_provenance_tracing)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPPget_promote_dtype
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                      t         rdS dS )Nz__declspec(dllexport) _IS_WINDOWS     ]/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrY   Y   s    &1"9r9rW   schedule)+*^||minmaxr[   r\   r]   r_   r`   argminargmaxr^   welford)
sumprodxor_sumr_   r`   ra   rb   anywelford_reducewelford_combine)
r`   r_   rd   re   rf   rh   ri   ra   rb   rg   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrl   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                    |t         v rt        j                  }| dv ry| dk(  ry| dv r|t        |   }|t        j                  k(  r| dv rt        t        j
                     }t        |      rd| dnd	| d
}t        |      rd	| dnd	| d}| dv r|n|}| dv r|S d| d| dS t        |       rdt        |    dS t        |       )N)rf   rd   rg   r   re   r5   )r`   rb   r_   ra   ra   rb   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r`   rb   )r`   r_   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rH   rl   ro   r   r*   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rX   reduction_initr      s    22;;e$EJJ>5I#I!%++.F e$ $F8=9'xx8 	 e$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N+,u-.c22

((rW   c                     t         t        |      }t        |       rd| dS | dv r0|t        j                  k(  rt         t        j
                     }d| dS |S )Nr   >rz   r   )rH   r<   r*   r   rl   ro   )r   r   scalar_types      rX   reduction_acc_typer      sa    9%@AKN++a((--EJJ&u{{3K[M++rW   indexc           	      T   |t         j                  k(  }| dk(  r|r	d| d| dS |rdnd}| d| d| S | dk(  r| d	| S | d
k(  r| d| S | dk(  r| d| S | dv r|  d| d| dS | dk(  r	d| d| dS | dk(  r6t        |t              r|\  }}	}
nt	        | |      \  }}	}
d| d| d|	 d|
 d	S | dv rkt        |d      rD|j                  t         j                  k(  r'|j                  s||  d| d| d| dS |  d| d| dS ||  d| d| d| dS |  d| d| dS t        |       )Nrd   cascade_sum_combine(, &)|r[    re    * rf    ^ rg    || )r_   r`   z_propagate_nan(, rh   welford_combine(ri   , {})rz   r   z	_combine(z, static_cast<float>(), )))	r   rl   
isinstancetuplereduction_projecthasattrr   is_vecr   )r   var
next_value
helper_valr   	src_dtypeis_boolconjunctionmeanm2weights              rX   reduction_combiner      s    5::%G)*SAFF!(#cKU!K=*66c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%()D"f0LD"f!#d4&2$bDD--J(  EJJ.%% ()3%7LZLX[\a[bbcdd &&iu4I*UWX $%Yse2j\E7!LL$%Yse2j\CC

((rW   c                 J    t        |       r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightrz   z.index)r*   )r   accs     rX   r   r   	  sC    N+e}SkcU'?::	/	/f~JrW   codeiter_varnew_iter_var
loop_startloop_endreturnc                 b   t               }t        j                         5 }|j                  dt         d| dt        |       d| dt        |       d| dz          |j                  |j                                t        | j                        D ]  \  }}t        |t        t        f      sJ d}	t        |t              r|j                  }	|j                  }t        j                   d	| z   d	z   | |      }
|	rt        |	|
      }
|j                  |
        	 ddd       |S # 1 sw Y   |S xY w)
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r7   
contextlib	ExitStack	writelinerJ   rE   enter_contextindent	enumerate_linesr   rp   r;   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rX   move_code_under_inner_loopr     s?   , $~				 15""J<qc+j2I1J!Lc+h"7!8<.LM	
 	,3356 - 	1GAt    !M$- $		yyvve
3e;~PTUH'x@&&x0	11, -1, s   C:D$$D.acc_varacc_typer   r   lenc                     t               }t        j                         rd|  d| d| dn
| d|  d| d}|j                  |        |j	                  d| d	d
d|  d |||       ddg       |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performance.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r=   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rX   reduction_prefix_arrayr   A  s     !"K !!# y0
$se2Fz7)5R0 
 XJ("3%v.7):gne&D%EQG		
 rW   bufferr   new_namec                 D   t        | j                        D ]  \  }}t        |t        t        f      sJ t        |t              r/t        j                  d| z   dz   | |j                        |_        ]t        j                  d| z   dz   | |      | j                  |<    y )Nr   )r   r   r   rp   r;   r   r   r   )r   r   r   ir   s        rX   replace_acc_namer   b  s    V]]+ V4
 	
 
 dL)u$058XJSDI!vve&7%&?H:PTUFMM!VrW   c           
         d}t        | j                        D ]  \  }}t        |t        t        f      sJ t        |t              r|j
                  n|}t        j                  ||      }|sU|j                         \  }}t        j                  || d| d| d|      }t        |t              r||_        || j                  |<    y)zT
    Replaces `acc = cascade_sum_combine(value, ...)` with `acc = acc + value;`
    z/(.*?)\s*=\s*cascade_sum_combine\(([^,]+),.*?\);r    + r   N)
r   r   r   rp   r;   r   r   searchgroupsr   )	r   patternr   r   contentmatchr   valuenew_contents	            rX   replace_cascade_sum_with_addr   q  s    
 AGV]]+ /4
 	
 
  *$=$))4		'7+JC&&SESEUG1*EwOK$-'	#.a !/rW   r   c                     | j                  |      st        j                  j                  S ||dz   i}t	        | |      }t        j
                  || z
        S Nr5   )hassympySZeror0   simplify)r   r   replacement	new_indexs       rX   	stride_atr     sJ    99S> ww||a.K5+.I>>)e+,,rW   
vec_lengthc                   	 dd	fd}	fd}| }t        j                  dd      }| j                  t              r| j	                  t        |      |      } t        j                  dd      }| j                  t
              r| j	                  t        ||      |      } t        j                  |       } | |k7  rt        |       S | S )a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                     t        |       }t        j                  |       k(  rt        j                   d       }dz  |S )N_div_cr5   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rX   visit_indexing_divz7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrW   c                    t        | |      }t        j                  |       k(  r!t        j                   d       }dz  |S | dk(  r;t        j                  |      k(  r"t        j                   d       z   }dz  |S )N_mod_cr5   )r   r   r  r  )r  modulusr  mod_freevar_idr   r   s      rX   visit_modular_indexingz;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     gw799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrW   r  T)integerr	  )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r  r  original_indexdivmodr  r
  s
    ``     @@rX   r  r    s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!oc3<>TUNN5!E*5#zBBLrW   c                 8    |rt        | ||      } t        | |      S N)r  r   )r   r   r   s      rX   stride_at_vec_ranger    s"     +E3
CUC  rW   c                   &    e Zd ZU dZeed<   eed<   y)ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rn   __annotations__rV   rW   rX   r  r    s    
 rW   r  c                   d     e Zd Zededefd       Zdddeeee	f      f fdZ
d Zd	 Zd
 Z xZS )OuterLoopFusedSchedulerNodenode1node2c                    |j                   |j                   u sJ t        d ||fD              sJ t        d ||fD              rt | |j                   t        |      t        u rt        |j                               n|gt        |      t        u r t        |j                               z   |      S |gz   |      S  | |j                   ||g|      S )Nc              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )typer   r$   r"   .0nodes     rX   	<genexpr>z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  s1      
  J+"
   &(c              3   >   K   | ]  }t        |      t        u   y wr  r%  r   r&  s     rX   r)  z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>       TTtDz88T   )	schedulerallrg   r%  r   listget_outer_nodes)clsr!  r"  outer_loop_fusion_depths       rX   fusez OuterLoopFusedSchedulerNode.fuse  s     %//111 
 
 
 	
 
 TeU^TT E{&AA ..01  E{&AA ..01 (!   (! & u8OPPrW   r/  r#   outer_fused_nodesc                     || _         || _        g }| j                   D ]B  }t        |t        t        f      sJ |j                  t        |j                                      D t        | %  ||       y r  )
r6  r4  r   r$   r"   extendr1  	get_nodessuper__init__)selfr/  r6  r4  flatten_snodes_node	__class__s         rX   r;  z$OuterLoopFusedSchedulerNode.__init__  su      	 (?$++ 	;Eem5G%HIII!!$u'8"9:	; 	N3rW   c                     | j                   S r  )r6  r<  s    rX   r2  z+OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%%rW   c           
      8   dt         dt         dt        dt        dt        f
fdt        t	        |      dz
        D ]0  }||   j
                  }||dz      j
                  } |||d      r0 y	 |D ]  }t        j                  t        j                  |j                  d |       }t	        |j                        |kD  sMt        |t        j                        sht        |j                  |   t        j                        s|d
z  |j                  |   k  s y	 y)Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                 ^   | j                   sJ |j                   sJ | j                   |   |j                   |   g d}t        fd|D              sy|dk\  sJ |dz
  x}dkD  rE|dz   }|t        | j                         k  sJ |t        |j                         k  sJ  | |||      syy)N)r   sizeoffsetstepsc              3   P   K   | ]  }t        |      t        |      k(    y wr  )getattr)r'  attr_compareleft_loop_levelright_loop_levels     rX   r)  zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr><  s2       % O\:/>?s   #&Fr5   r   T)loopsr0  r   )rC  rD  rE  rF  outer_loops_attr_compare_listrN  rO  _inners        @@rX   rR  zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner*  s     "''''"((((,223IJO.445KL-)   )F  $)))%6%::!a?)?!)C&-N4H4H0IIII-O4I4I0JJJJ"#%*	 !rW   r5   r   F,  T)LoopNestrn   rl   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r<  cpp_kernel_proxy_listr4  idxrC  rD  cpp_kernel_proxyouter_rangesrR  s	           @rX   "check_outer_fusion_loop_level_attrz>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr!  s<   (	$(	%(	  #(	 %(	(	
 (	T 23a78 		C237AAN3C!G<FFO'	 		 !6 	$++ ''(@)@AL $++,/FF|U]];$++,CDMM !3&"))*ABC )	, rW   c                 D   |d   j                   }t        |      }|D cg c]'  }|j                  j                  | j                        ) c}|_        |d   }||j                  _        |j                  j                  d | j                   |j                  _        |S c c}w Nr   )kernel_groupOuterLoopFusedKernelrV  from_loop_levelr4  innerkernelrP  )r<  r]  rd  outer_loop_fused_kernelproxyouter_fused_proxys         rX   merge_outer_fusion_kernelsz6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelsw  s     -Q/<<"6|"D /)
 OO++D,H,HI)
% 2!4-D##*,=,G,G,M,M*d**-
##) ! )
s   ,B)r  r  r  classmethodr   r5  r1  r	   r"   r$   r;  r2  ra  rl  __classcell__r?  s   @rX   r   r     sb    !Q%!Q.?!Q !QF44  &8-&G HI4 &Tl!rW   r   c                   2    e Zd ZddefdZd Zd Zd Zd Zy)	RecordOptimizationContext	func_namec                 .    || _         d | _        d | _        y r  )rr  current_nodeopt_ctx)r<  rr  s     rX   r;  z"RecordOptimizationContext.__init__  s    "596:rW   c                    t         j                  sJ t         j                  j                  sJ t         j                  j                  | _        | j                  J t        j                  | j                  j
                  v r-| j                  j
                  t        j                     | _        nt               | _        | j                  J | j                  | j                  _        | S r  )	r4   interpreterrt  rA   keymetaru  rr  ops_namerA  s    rX   	__enter__z#RecordOptimizationContext.__enter__  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rW   c                     | j                   sJ | j                  sJ | j                  | j                   j                  t        j                  <   y r  )rt  ru  ry  rA   rx  r<  exc_typeexc_valexc_tbs       rX   __exit__z"RecordOptimizationContext.__exit__  s>        |||:>,,2667rW   c                     | j                   S r  )ru  rA  s    rX   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s    ||rW   c                 6    | j                   sJ | j                   S r  )rt  rA  s    rX   get_fx_nodez%RecordOptimizationContext.get_fx_node  s           rW   N)rS   )	r  r  r  rp   r;  r{  r  r  r  rV   rW   rX   rq  rq    s#    ;# ;
G
!rW   rq  c                  r    t        d | D              rJ d       t        |       x}	t        |   S d| d    dS )Nc              3   X   K   | ]"  }t        |t              xr |j                   $ y wr  )r   rG   r   )r'  args     rX   r)  z$decltype_promoted.<locals>.<genexpr>  s#     Rc:c>2AszzARs   (*z*Promotion of vector types is not supported	decltype(r   r   )rg   rI   rH   )argsdts     rX   decltype_promotedr    sP    RTRR 4R  %%2B47)1%%rW   c                   (   e Zd ZdZed        Zed        Zed        ZedMd       Zed        Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedC        ZEedD        ZFedEeGj                  dFeGj                  fdG       ZIedEeGj                  dFeGj                  fdH       ZJedEeGj                  dFeGj                  fdI       ZKedJ        ZLedK        ZMedL        ZNy)NCppOverrideszMap element-wise ops to C++c                 ,    t        | |       d|  d| dS )N(r   r   r  abs     rX   addzCppOverrides.add  #    #Aq)*!A3c!A66rW   c                 ,    t        | |       d|  d| dS )Nr   - r   r  r  s     rX   r   zCppOverrides.sub  r  rW   c                 ,    t        | |       d|  d| dS )Nr  r   r   r  r  s     rX   rZ  zCppOverrides.mul  r  rW   Nc                    t        | t              sJ || j                  }t        j                  j                  | ||      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        v r6|t        j                  k(  r#	 t        j                  j                  | |||       |S )Nto_dtyper   )r   rG   r   r4   rh  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   ro   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rX   r  zCppOverrides.to_dtype  s    !^,,,Ixx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>> HH((IvuErW   c                 T    |t         v sJ | dt         d       dt         |    d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rH   r  )r  r   r   s      rX   to_dtype_bitcastzCppOverrides.to_dtype_bitcast  s=    $U~hZ}&UU$U 34Bqc;;rW   c                     d|  dS )Nz	std::abs(r   rV   r  s    rX   abszCppOverrides.abs      1#QrW   c                     d|  dS )Nz	std::sin(r   rV   r  s    rX   sinzCppOverrides.sin  r  rW   c                     d|  dS )Nz	std::cos(r   rV   r  s    rX   coszCppOverrides.cos  r  rW   c                     d|  d|  dS )Nr  z)(-r   rV   r  s    rX   negzCppOverrides.neg       1#S1%%rW   c                     d|  dS )Nz	std::exp(r   rV   r  s    rX   expzCppOverrides.exp  s     1#QrW   c                     d|  dS )Nz
std::exp2(r   rV   r  s    rX   exp2zCppOverrides.exp2	      A3a  rW   c                     d|  dS )Nzstd::expm1(r   rV   r  s    rX   expm1zCppOverrides.expm1      QCq!!rW   c                     d|  dS )Nz	std::erf(r   rV   r  s    rX   erfzCppOverrides.erf  r  rW   c                     d|  dS )Nz
std::erfc(r   rV   r  s    rX   erfczCppOverrides.erfc  r  rW   c                     d|  dS )Nzcalc_erfinv(r   rV   r  s    rX   erfinvzCppOverrides.erfinv      aS""rW   c                     d|  dS )Nz
std::sqrt(r   rV   r  s    rX   sqrtzCppOverrides.sqrt  r  rW   c                     d|  dS )Nz1 / std::sqrt(r   rV   r  s    rX   rsqrtzCppOverrides.rsqrt!  s    s!$$rW   c                 |    t         j                  j                  }|dk(  r|  d|  dS |d|  dS t        d|      )Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rX   log1pzCppOverrides.log1p%  sW    jj66*SQCt,,[ 1%% J3'R rW   c                     d|  dS )Nz	std::tan(r   rV   r  s    rX   tanzCppOverrides.tan1  r  rW   c                     d|  dS )Nz
std::tanh(r   rV   r  s    rX   tanhzCppOverrides.tanh5  r  rW   c                 &    t         rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rT   r  s    rX   signbitzCppOverrides.signbit9  s-      /qc4	
 !1%	
rW   c                     d|  d| dS )Nz	std::pow(r   r   rV   r  s     rX   powzCppOverrides.powE  s    1#Rs!$$rW   c                     d|  dS )Nz	std::log(r   rV   r  s    rX   logzCppOverrides.logI  r  rW   c                     d|  dS )Nzstd::nearbyint(r   rV   r  s    rX   roundzCppOverrides.roundM  s     1%%rW   c                     d|  dS )Nzstd::floor(r   rV   r  s    rX   floorzCppOverrides.floorQ  r  rW   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rV   )r  r  quotrems       rX   floordivzCppOverrides.floordivU  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrW   c                     d|  dS )Nz
std::ceil(r   rV   r  s    rX   ceilzCppOverrides.ceil\  r  rW   c                     d|  dS )Nzstd::trunc(r   rV   r  s    rX   trunczCppOverrides.trunc`  r  rW   c                     |  d| S Nr  rV   r  s     rX   truncdivzCppOverrides.truncdivd  s     Cs|rW   c                     d|  d| dS )Nz
std::fmod(r   r   rV   r  s     rX   fmodzCppOverrides.fmodi  s    A3b1%%rW   c                     d|  dS )Nzstd::isinf(r   rV   r  s    rX   isinfzCppOverrides.isinfm  r  rW   c                     d|  dS )Nzstd::isnan(r   rV   r  s    rX   isnanzCppOverrides.isnanq  r  rW   c                     d|  dS )Nzstd::lgamma(r   rV   r  s    rX   lgammazCppOverrides.lgammau  r  rW   c                     d|  dS )Nz
std::acos(r   rV   r  s    rX   acoszCppOverrides.acosy  r  rW   c                     d|  dS )Nzstd::acosh(r   rV   r  s    rX   acoshzCppOverrides.acosh}  r  rW   c                     d|  dS )Nz
std::cosh(r   rV   r  s    rX   coshzCppOverrides.cosh  r  rW   c                     d|  dS )Nz
std::sinh(r   rV   r  s    rX   sinhzCppOverrides.sinh  r  rW   c                     d|  dS )Nz
std::asin(r   rV   r  s    rX   asinzCppOverrides.asin  r  rW   c                     d|  dS )Nzstd::asinh(r   rV   r  s    rX   asinhzCppOverrides.asinh  r  rW   c                     d|  d| dS )Nzstd::atan2(r   r   rV   r  ys     rX   atan2zCppOverrides.atan2      QCr!A&&rW   c                     d|  dS )Nz
std::atan(r   rV   r  s    rX   atanzCppOverrides.atan  r  rW   c                     d|  dS )Nzstd::atanh(r   rV   r  s    rX   atanhzCppOverrides.atanh  r  rW   c                     d|  d| dS )Nzstd::copysign(r   r   rV   r  s     rX   copysignzCppOverrides.copysign  s    s"QCq))rW   c           	         d|  dd|  df}t        d |D              rt        d |D              S t               }t        j                  j
                  j                  t        j                  | j                        }t        j                  j
                  j                  | j                  | j                        }|j                  d| d       |j                  d	| d
|  d| d       t        j                  j                  j                  |       ||f}t        ||      D ]/  \  }}t        j                  j
                  j                  ||       1 ||fS )Nfrexp()[0])[1]c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r4   rh  r  try_getr'  	cache_keys     rX   r)  z%CppOverrides.frexp.<locals>.<genexpr>  (     Wyqxx||##I.d:W   57c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r  s     rX   r)  z%CppOverrides.frexp.<locals>.<genexpr>  #     UY--i8U   35)r   shapezint32_t r   r   z = std::frexp(r   r   )r0  r   r7   r4   rh  r  newvarr   int32r  r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsr  cse_vars           rX   frexpzCppOverrides.frexp  s$   aS%s$'77
WJWWU*UUU~88<<&&U[[&H88<<&&QWWAGG&D(1-.xjqcXJbIJ	%h'"%j("; 	1IwHHLLY0	1!!rW   c                     d|  d| dS )Nzstd::hypot(r   r   rV   r  s     rX   hypotzCppOverrides.hypot  r  rW   c                     d|  dS )Nzstd::log10(r   rV   r  s    rX   log10zCppOverrides.log10  r  rW   c                     d|  dS )Nz
std::log2(r   rV   r  s    rX   log2zCppOverrides.log2  r  rW   c                     d|  d| dS )Nzstd::nextafter(r   r   rV   r  s     rX   	nextafterzCppOverrides.nextafter  s     2aS**rW   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rX   reluzCppOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rW   c                     d|  d| dS )Nzmin_propagate_nan(r   r   rV   r  s     rX   minimumzCppOverrides.minimum      #A3b1--rW   c                     d|  d| dS )Nzmax_propagate_nan(r   r   rV   r  s     rX   maximumzCppOverrides.maximum  r<  rW   c                     |  d| d| S )N ?  : rV   )r  r  cs      rX   wherezCppOverrides.where  s    Cs#aS!!rW   c                     d|  d| dS )Nzmod(r   r   rV   r  s     rX   r  zCppOverrides.mod  s    aS1#QrW   c                 (    t        | t        |         S r  )rP   rH   )valr   s     rX   constantzCppOverrides.constant  s    Ce!455rW   c                    t        t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |t        |             }t        j                  ||      S )Nbounds)
rD   r4   rh  rename_indexingr  r  r  r&   r2   r  )r  r   idx_strr   s       rX   
index_exprzCppOverrides.index_expr  sb    0067hhll##HHg.CD.I $ 
 ||C''rW   c                 "   t               }t        j                  j                  j	                         }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d| d      }|  d| d| S # 1 sw Y   exY w# 1 sw Y   ixY w)	Nr    = [&]return r   r  z())r@  z() : )r7   r4   rh  r  r  r   swap_buffersr   r  r  rP   )maskbodyotherr   body_varr  
other_codes          rX   maskedzCppOverrides.masked  s    ~ 88<<&&(xj/0XX""4( 	0$++- 	0VFNNWVHA./	0 	0 	s	% "%9XJc)BC
s8*E*66	0 	0 	0 	0s$   'D8C9D9D	>DDc                     |  d| S )N && rV   r  s     rX   logical_andzCppOverrides.logical_and      D}rW   c                     d|  S )N!rV   r  s    rX   logical_notzCppOverrides.logical_not      1#wrW   c                     |  d| S )Nr   rV   r  s     rX   
logical_orzCppOverrides.logical_or  r[  rW   c                     |  d| S )N != rV   r  s     rX   logical_xorzCppOverrides.logical_xor  r[  rW   c                     d|  d|  d| dS )Nr  )( & r   rV   r  s     rX   bitwise_andzCppOverrides.bitwise_and      1#Rs#aS**rW   c                     d|  d|  dS )Nr  z)(~r   rV   r^  s    rX   bitwise_notzCppOverrides.bitwise_not  r  rW   c                     d|  d|  d| dS )Nr  rg   | r   rV   r  s     rX   
bitwise_orzCppOverrides.bitwise_or  rj  rW   c                     d|  d|  d| dS )Nr  rg  r   r   rV   r  s     rX   bitwise_xorzCppOverrides.bitwise_xor  rj  rW   c                    t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d       |j                  d| d| d| d       |j                         5  |j                  d	|  d
       d d d        |j                  d	|  d| d|  d| d	       d d d        |j                  d       |S # 1 sw Y   BxY w# 1 sw Y   (xY w)N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r7   r   r   rH   r   r  r  r   scalar_ts       rX   bitwise_left_shiftzCppOverrides.bitwise_left_shift  s   ~w[[] 	#AGG,HNN%aS(=hZ}U NN6xjA3kRSQTTcd  <!1!E:;<NN"1#%H
RUVWUXX]^_]``bc	 	t< <	 	s$   AC&C&C&C#	C&&C/c           
         t               }|j                  d       |j                         5  t        | j                     }|j                  d| d| d| d       |j                  d| d| d| d	       |j                         5  |j                  d
|  d|  d       d d d        |j                  d
|  d|  d| d       d d d        |j                  d       |S # 1 sw Y   ?xY w# 1 sw Y   (xY w)Nrs  rt  ru  z ) * CHAR_BIT - std::is_signed_v<z>;rv  rw  rx  ry  rz  rg  z >> max_shift); >> r   r{  r|  r}  s       rX   bitwise_right_shiftz CppOverrides.bitwise_right_shift3  s   ~w[[] 
	A#AGG,HNN%aS(=hZGghpgqqst NN6xjA3kRSQTTcd  K!1!BqcIJKNN-aS1#T!B?@
	A 	t	K K
	A 
	As$   AC)C#C)C&	"C))C2seedrI  c                     d|  d| dS )Nznormalized_rand_cpu(r   r   rV   r  rI  s     rX   randzCppOverrides.randE  s    %dV2fXQ77rW   c                     d|  d| dS )Nz
randn_cpu(r   r   rV   r  s     rX   randnzCppOverrides.randnI  s    D6F81--rW   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rV   )r  rI  lowhighs       rX   	randint64zCppOverrides.randint64M  s#    vRxr#ba@@rW   c                     d|  d|  d|  dS )Nr  z)(1) / (decltype(z)(1) + std::exp(-r   rV   r  s    rX   sigmoidzCppOverrides.sigmoidQ  s    1#.qc1B1#RHHrW   c           
      N   t               }d|  d}d|  d}|j                  d       |j                         5  |j                  d|  d| d| d       |j                  d	|  d
| d| d       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr  )(0)r  rs  auto left = z > 0 ? rA  r   auto right = z < 0 ? return left - right;r{  r7   r   r   )r  r   scalar_zero
scalar_ones       rX   signzCppOverrides.signU  s    ~!!D) 4(
w[[] 	3NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12	3 	t	3 	3s   ABB$c                     d|  d| dS )Nr  z" ? 0 : (throw std::runtime_error("z"), 0))rV   )condmsgs     rX   device_assert_asyncz CppOverrides.device_assert_asyncb  s    4&:3%wGGrW   NT)Or  r  r  r  staticmethodr  r   rZ  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r&  r(  r*  r,  r.  r9  r;  r>  rC  r  rG  rM  rW  rZ  r_  rb  re  ri  rl  ro  rq  r  r  r   Exprr  r  r  r  r  r  rV   rW   rX   r  r    s   %7 7 7 7 7 7 ( (T < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + +  &  " 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I 
 
 H HrW   r  r  c                       e Zd ZdZ fdZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAed?        ZBed@        ZCedA        ZDedB        ZEedC        ZFedD        ZGedE        ZHedF        ZIedG        ZJedH        ZKedI        ZLedJ        ZMedRdK       ZNedL        ZOedM        ZPedN        ZQedO        ZReSdP        ZTeSdQ        ZU xZVS )SCppVecOverridesz.Map element-wise ops to aten vectorization C++c                     t         |   |       fd}t        t              j	                         D ]<  \  }}t        |dd       t        k(  s|dvs t        | ||j                               > S )Nc                       fd}|S )Nc                     | D cg c]@  }t        |t        t        j                  f      st        |t              r|j
                  s|B }}| D cg c]   }t        |t              r|j
                  r|" }}t        |       }|r|rg }| D ]  }t        |t        t        j                  f      rt        |t        j                        r1|j                  s%t        j                  |t        j                        }n$t        j                  |t        j                        }t        |t              r|j                  n|}|j                  |        |r>t!        |      dk(  rt#        |      }n$
t$        j&                  k(  rt#        |dd        |dd  |r|rt        t(        j*                  t,              sJ |D cg c]p  }t        |t              r\|j
                  sP
t$        j.                  t$        j0                  t$        j2                  fvrt(        j*                  j5                  |      n|r }}|r 
|i |S t7        t$              }t9        |
j:                        }|J  || i |S c c}w c c}w c c}w )Nr   r5   )r   rn   r   r  rG   r   r1  	is_numberr2   rM  r   int64rG  r3   r   appendr   rM   r  rC  r4   rh  CppVecKernelr  r  r  	broadcastr:  rL  r  )r  kwargsr  scalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr?  funcr<  s            rX   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper}  s8     $!#UZZ'89"37

    $!#~63::  
  :w!H# -%cC+<=)#uzz:3==&)nnS%++&F&)ll3&D/9#x/H#))cC ,-  8})#/#9!6!66'3HQRL'A w%ahh===  (0  $ !+7N C(/$($3$8$8$3$9$9$3$=$=("%"	 HH..w7 ")) H  $ 4V44 "'!=J")*dmm"DK&222&777@ s   AI)%I.6A5I3rV   )r  r  r?  r<  s   ` rX   wrapz%CppVecOverrides.__new__.<locals>.wrapp  s    @8D NrW   r?  )rW  rM  )	r:  __new__varsr  itemsrL  r  setattr__func__)r3  r  kargsr  r   methodr<  r?  s         @rX   r  zCppVecOverrides.__new__m  sx    ws#O	b !1779 	;LD&v{D1\Ad S G dD$9:	; rW   c                     |  d| S )Nr   rV   r  s     rX   r  zCppVecOverrides.add      Cs|rW   c                     |  d| S )Nr  rV   r  s     rX   r   zCppVecOverrides.sub  r  rW   c                     |  d| S Nr   rV   r  s     rX   rZ  zCppVecOverrides.mul  r  rW   c                     |  d| S r  rV   r  s     rX   truedivzCppVecOverrides.truediv  r  rW   c                     |  dS )Nz.abs()rV   r  s    rX   r  zCppVecOverrides.abs      F|rW   c                     |  dS )Nz.sin()rV   r  s    rX   r  zCppVecOverrides.sin  r  rW   c                     |  dS )Nz.cos()rV   r  s    rX   r  zCppVecOverrides.cos  r  rW   c                     |  dS )Nz.exp()rV   r  s    rX   r  zCppVecOverrides.exp  r  rW   c                     |  dS )Nz.exp2()rV   r  s    rX   r  zCppVecOverrides.exp2      G}rW   c                     d|  d}|  d| S )Nr  r  z	.exp() - rV   )r  vec_ones     rX   r  zCppVecOverrides.expm1  s#     aS%IgY''rW   c                     |  dS )Nz.erf()rV   r  s    rX   r  zCppVecOverrides.erf  r  rW   c                     |  dS )Nz.erfc()rV   r  s    rX   r  zCppVecOverrides.erfc  r  rW   c                     |  dS )Nz	.erfinv()rV   r  s    rX   r  zCppVecOverrides.erfinv      IrW   c                     |  dS )Nz.sqrt()rV   r  s    rX   r  zCppVecOverrides.sqrt   r  rW   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   == r   r   r4   rh  r  rG   r   _get_mask_typer  s     rX   eqzCppVecOverrides.eq  c    !((L111!^,,,ww"""(())!''231QCtA3a@@rW   c                    t        t        j                  t              sJ t        | t              sJ | j
                  t        j                  k(  rO|j
                  t        j                  k(  sJ t        t        j                  j                  | |f      \  }}| d| S | j
                  J t        j                  j                  | j
                         d|  d| dS )Nrd  r  r   )r   r4   rh  r  rG   r   r   rl   rO   r  r  )r  r  x_casty_casts       rX   nezCppVecOverrides.ne  s    !((L111!^,,,77ejj 77ejj(((1!((2B2BQFKNFFXT&**77&&&hh--agg67q4s!DDrW   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  r   r   r  r  s     rX   ltzCppVecOverrides.lt  c    !((L111!^,,,ww"""(())!''231QCs1#Q??rW   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  z > r   r  r  s     rX   gtzCppVecOverrides.gt  r  rW   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   <= r   r  r  s     rX   lezCppVecOverrides.le%  r  rW   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   >= r   r  r  s     rX   gezCppVecOverrides.ge,  r  rW   c                     |  d| S Nrh  rV   r  s     rX   and_zCppVecOverrides.and_3  r  rW   c                     |  dS )Nz.rsqrt()rV   r  s    rX   r  zCppVecOverrides.rsqrt7      H~rW   c                     |  d| dS )Nz.pow(r   rV   r  s     rX   r  zCppVecOverrides.pow;  s    E!ArW   c                     |  dS )Nz.log()rV   r  s    rX   r  zCppVecOverrides.log?  r  rW   c                     |  dS )Nz.round()rV   r  s    rX   r  zCppVecOverrides.roundC  r  rW   c                     |  dS )Nz.floor()rV   r  s    rX   r  zCppVecOverrides.floorG  r  rW   c                     |  dS )Nz.ceil()rV   r  s    rX   r  zCppVecOverrides.ceilK  r  rW   c                     |  dS )Nz.trunc()rV   r  s    rX   r  zCppVecOverrides.truncO  r  rW   c                     |  d| dS )Nz.fmod(r   rV   r  s     rX   r  zCppVecOverrides.fmodS  s    F1#QrW   c                     |  dS )Nz	.lgamma()rV   r  s    rX   r  zCppVecOverrides.lgammaW  r  rW   c                 .    t        | |      \  } }|  d| S r  rL   r  s     rX   rZ  zCppVecOverrides.logical_and[  "    ,Q21Cs|rW   c                     d|  S N~rV   r^  s    rX   r_  zCppVecOverrides.logical_not`  r`  rW   c                 .    t        | |      \  } }|  d| S Nrn  r  r  s     rX   rb  zCppVecOverrides.logical_ord  r  rW   c                 .    t        | |      \  } }|  d| S Nr   r  r  s     rX   re  zCppVecOverrides.logical_xori  r  rW   c                 .    t        | |      \  } }|  d| S r  r  r  s     rX   ri  zCppVecOverrides.bitwise_andn  r  rW   c                     d|  S r  rV   r^  s    rX   rl  zCppVecOverrides.bitwise_nots  r`  rW   c                 .    t        | |      \  } }|  d| S r  r  r  s     rX   ro  zCppVecOverrides.bitwise_orw  r  rW   c                 .    t        | |      \  } }|  d| S r  r  r  s     rX   rq  zCppVecOverrides.bitwise_xor|  r  rW   c                     |  d| S )Nz << rV   r  s     rX   r  z"CppVecOverrides.bitwise_left_shift  r[  rW   c                     |  d| S )Nr  rV   r  s     rX   r  z#CppVecOverrides.bitwise_right_shift  r[  rW   c                     t        t        j                  t              sJ t        j                  j	                  | |       S r  )r   r4   rh  r  load)r   rI  s     rX   	load_seedzCppVecOverrides.load_seed  s.    !((L111((--f-./rW   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r4   rh  r  r7   rF   r  rI  r   rand_functions       rX   r  zCppVecOverrides.rand  s@    !((L111~7v=ST 	 FD-88rW   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nzresult[offset_idx] = randn_cpu(r  r  r  s       rX   r  zCppVecOverrides.randn  s;    !((L111~9$?UVFD-88rW   c                     t        t        j                  t              sJ t	               }d|  d| d| d}t        |||t        j                        S )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r4   rh  r  r7   rF   r   r  )r  rI  r  r  r   r  s         rX   r  zCppVecOverrides.randint64  sS    !((L111~=dVCYZ]Y^^`ae`ffhiFD-EErW   c                 ~    | j                   |j                   k(  sJ d       |  dt        j                  | |       d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r  s     rX   	remainderzCppVecOverrides.remainder  sI    ww!''! 	
I	
! D11!Q78QC@@rW   c                     |  dS )Nz.tan()rV   r^  s    rX   r  zCppVecOverrides.tan  r  rW   c           	          t         j                  j                  r"d|  d}d|  d}d|  d}| d| d| d|  d| 	S |  d	S )
Nr  r  z)(2)z)(-2)z / (z + (r   z).exp()) - z.tanh())r   r  use_decompose_tanh)r  r  vec_twovec_minus_twos       rX   r  zCppVecOverrides.tanh  sl    ::((!!D)G!!D)G's%0M)4y]O3qcWIV S= rW   c                     |  dS )Nz.reciprocal()rV   r^  s    rX   
reciprocalzCppVecOverrides.reciprocal  s    M""rW   c                     |  dS )Nz.atan()rV   r  s    rX   r  zCppVecOverrides.atan  r  rW   c                     |  dS )Nz.acos()rV   r  s    rX   r  zCppVecOverrides.acos  r  rW   c                     |  dS )Nz.asin()rV   r  s    rX   r  zCppVecOverrides.asin  r  rW   c                     |  dS )Nz.cosh()rV   r  s    rX   r  zCppVecOverrides.cosh  r  rW   c                     |  dS )Nz.sinh()rV   r  s    rX   r  zCppVecOverrides.sinh  r  rW   c                     |  dS )Nz.log10()rV   r  s    rX   r*  zCppVecOverrides.log10  r  rW   c                     |  dS )Nz.log2()rV   r  s    rX   r,  zCppVecOverrides.log2  r  rW   c                     |  d| dS )Nz.nextafter(r   rV   r  s     rX   r.  zCppVecOverrides.nextafter  s    Ks!$$rW   c                     |  d| dS )Nz
.copysign(r   rV   r  s     rX   r  zCppVecOverrides.copysign  s    Jqc##rW   c                     |  d| dS )Nz.atan2(r   rV   r  s     rX   r  zCppVecOverrides.atan2      GA3a  rW   c                     |  d| dS )Nz.hypot(r   rV   r  s     rX   r(  zCppVecOverrides.hypot  r  rW   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	Nr  r  z)(0.5)z * ((r   z)/(r  z)).log()rV   )r  r  vec_one_halfs      rX   r
  zCppVecOverrides.atanh  sE     aS%"1#V,uWIS3wis1#XNNrW   c                     |  dS )Nz.asinh()rV   r  s    rX   r  zCppVecOverrides.asinh  r  rW   c                     |  dS )Nz.acosh()rV   r  s    rX   r  zCppVecOverrides.acosh  r  rW   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Nr0  r1  r2  r3  r  r  r  zat::vec::clamp_min(r4  r5  r6  r7  r  s     rX   r9  zCppVecOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rW   c                     d|  d|  d|  dS )Nr  z)(1)/(decltype(z)(1) + z.neg().exp())rV   r  s    rX   r  zCppVecOverrides.sigmoid  s    1#_QCwqcGGrW   c                     |  dS )Nz.neg()rV   r  s    rX   r  zCppVecOverrides.neg  r  rW   c                    t        | j                        r)| j                  |j                  k(  sJ d       d|  d| dS t        d | |fD              sJ d|  d}t        j                  j                  |j                        dk  r,| ddt        j                  j                  z  dz
   d	| d
| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d
| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   F   K   | ]  }t        |j                          y wr  )r   r   )r'  items     rX   r)  z+CppVecOverrides.floordiv.<locals>.<genexpr>  s     G'

3G   !r  r5   ::blend<r  (1), r  r  r  rd  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  rh  )r   r   r0  r4   rh  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rX   r  zCppVecOverrides.floordiv  s1   !''"77agg% V% -QCr!A66GAGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrW   c                     t         j                  j                  |j                        dk  r2d| d}| ddt         j                  j                  z  dz
   d| d| d}|  d| S )Nr5   r  r   r+  r  r,  r  )r4   rh  r-  r   r.  )r  r  r/  s      rX   r  zCppVecOverrides.truncdiv'  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rW   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nrh  at::vec::minimum(r   r   r   r   rl   rO   r4   rh  r  r  r  a_castb_casts       rX   r;  zCppVecOverrides.minimum0  l    77ejj 77ejj(((1!((2B2BQFKNFFXS))&qcA3a00rW   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nrn  at::vec::maximum(r   r   r5  r6  s       rX   r>  zCppVecOverrides.maximum9  r9  rW   c                     |  d|  S r  rV   r^  s    rX   squarezCppVecOverrides.squareB  r  rW   c                    t        t        j                  t              sJ |j                  t
        j                  k(  rY|j                  t
        j                  k(  sJ t        t        j                  j                  | ||f      \  }}}d| d| d| d| d	S d| d| d| dt        j                  j                  | |j                         d	S )Nr  
)::blendv(r   r   )
r   r4   rh  r  r   r   rl   rO   r  _get_mask_cast)r  r  rB  blendv_ablendv_bblendv_cs         rX   rC  zCppVecOverrides.whereF  s    !((L11177ejj 77ejj(((+?  1a),(Hh xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQRQXQX6Y5ZZ[\\rW   c                 ~   t               }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|j                  d       |j                         5  |j                  d	| d
       |j                  d| d
       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr  r  r  r?  r   r   r   rs  r  r   r  r  r{  r  )r  r   vec_zeror  blendv_lblendv_rs         rX   r  zCppVecOverrides.signR  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[] 	3NN\(156NN]8*A67NN12	3 	t	3 	3s   <B33B<c                 "   |t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  fv sJ t         d|        t        | t              sJ | j                  }t         j"                  j%                  | ||      }t         j"                  j&                  j)                  t         j"                  j*                  |      }|j-                  d| |fd|i       |t.        v r5|t         j                  k(  r"t         j"                  j1                  | |||       |S )Nz does not support r  r   )r   rl   float64ro   bfloat16float16uint8int8r  r  float8_e4m3fnfloat8_e5m2r  r   rG   r   r4   rh  r  r  r  r  r  r   r  )r  r   r   use_compute_dtypesr  r  s         rX   r  zCppVecOverrides.to_dtypea  s"   JJMMKKNNMMKKJJKKKK
 
 	2 Z)%1	2 
 !^,,,GG	xx))!UI>&&qxx'7'7>j1e*{I6NOM!i5;;&>HH((IvuErW   c                 z    t         j                  j                  }|dk(  r|  d|  dS ||  dS t        d|      )Nr  r  r  z.log1p()r  r  r  s     rX   r  zCppVecOverrides.log1py  sT    jj66*SQCt,,[S>! J3'R rW   c                 	   t        t        j                  t              sJ t	               }t        j                  j
                  j                         }t        j                  j                  |       5 }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        d d d        |j                  d       t        j                  j                  j                  |       j                  | d}fd}|j                  r|}	n ||      }	t        |t                  }
 ||
      }t        t"              sJ |       |j                  rt	               }|j                  d       t        j                  j                  |      5  |j                         5  |j                  d| d	       |j                         5  |j                  d| d       d d d        |j                  d
       |j                         5  t        j                  j
                  j%                  t        j                  j                  |	      }t        j                  j
                  j%                  t        j                  j                  |      }t        |t"              sJ |       t        |t"              sJ |       |_        |_        t        j                  j&                  }|j                  d|j)                  |||       d       d d d        d d d        d d d        |j                  d       t        j                  j
                  j%                  t        j                  j                  |      }n|j                  rKt        j                  j
                  j%                  t        j                  j                  |  d|	 d|       }nJt        j                  j
                  j%                  t        j                  j                  |  d| d|
       }|j+                  d| |||fi        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   yxY w# 1 sw Y   \xY w# 1 sw Y   axY w# 1 sw Y   fxY w)Nr   rO  rP  r   r{  c                     t         j                  k(  r$t        j                  j	                          d|  dS t        j                  j                         d|  dS )N::from(r   r  )r   rl   r4   rh  r  _get_vec_type)r   r   s    rX   maskify_or_vecifyz1CppVecOverrides.masked.<locals>.maskify_or_vecify  s]     EJJ& 88**,-WTF!< ..u56avQ?rW   [&]if (z.all_zero())elser@  rA  rW  )r   r4   rh  r  r7   r  r  rW  r   rQ  r   r  r  r   r   rP   rH   rG   r  	overridesrC  r  )rR  rS  rT  r   r   new_maskr  	body_coderV  body_code_vecrV  other_code_vecbody_vec_varother_vec_varrZ  r  r   s                   @rX   rW  zCppVecOverrides.masked  s   !((L111~hhll!!#XX__T" 	4hNNU3%v./&&t, 4dkkm 4234 4	4
 	s	%e2J		 ==%M-i8M!%e)<=
*:6(N3=X=3??>DNN5!&&t, dkkm hZ|<=[[] @NNW^,<A#>?@v&[[] #$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCQ\QC%m^DSmSD).L&*/M'**  NN!)//(L-"X!YYZ[# 2 NN4 XX\\**  F ]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	htUF(CRHI4 4 4 4	4 	4>@ @    s   35R(Q89Q+Q8RR9&R,R)R,DRR,R9+Q50Q88R	=RRRR,R)$R,,R6	1R99Sc                 X   t        t        j                  t              sJ t        j                  j	                  |       }t        j                  j
                  t        j                  j                     }t        j                  j                  ||      }|dk(  rt        j                  | |      S |t        j                  j                  j                  t        j                  j                  t        |      t        |             }t        j                   ||      }t        |t"              r|j$                  }t        j                  j'                  ||      }n:t        j                  j)                  d ||t        j                  j                        }|j+                  d| |fi        |S )Nr   rI  rM  )r   r4   rh  r  rK  itervars
tiling_idx_try_get_const_strider  rM  r  r  r  rD   r&   r2   r  r3   r   arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstrider^  r   r  s           rX   rM  zCppVecOverrides.index_expr  s3   !((L111((.XX&&qxx':':;
//zBQ;**477((,,''  %,7LT7R ( C LLe,E%*XX__UF3FXX;;eUAHH$4$4F 	lT5M2>rW   c           
      x   d|  dd|  df}t        d |D              rt        d |D              S t        | j                     }t        j
                  j                  rt        j
                  j                  nt        j
                  j                  }t               }t        j
                  j                  j                  t        j                        }t        j
                  j                  j                  | j                        }|j                  d| fi        |j                  d| fi        t        j
                  j                  | j                        }|d	k(  rd
| dnd| d| d}|j                  |d	k(  rd| dnd| d| d       |j                  | d| d       |j                  d       |j!                         5  |j                  d| dt        j
                  j                   d       |j                  |  dt#        |       d       |j                  dt        j
                  j                   d       |j                  d| dt        j
                  j                   d       |j                  dt#        |       d       |j!                         5  |j                  d       d d d        |j                  |d	k(  r| dt#        |       dn| d| d t#        |       d       |j                  | d!| d"t#        |       d       d d d        |j                  d#       t        j
                  j$                  j'                  |       ||f}	t)        ||	      D ]/  \  }
}t        j
                  j                  j+                  |
|       1 ||fS # 1 sw Y   xY w# 1 sw Y   xY w)$Nr  r  r  c              3   r   K   | ]/  }t         j                  j                  j                  |      d u 1 y wr  r  r  s     rX   r)  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  r  c              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr  r  r  s     rX   r)  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  r  r   r&  )r  r5   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   rs  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r0  r   rH   r   r4   rh  	tail_sizer.  r7   r  r  r   r  r  _get_num_vectorsr   r   rE   r  r  r  r   )r  r!  r   rH  r   r"  r#  n_vec
mantissa_tr$  r  r%  s               rX   r&  zCppVecOverrides.frexp  s{   aS%s$'77
WJWWU*UUUagg&%&XX%7%7qxx!!QXX=S=S~88<<&&U[[&988<<&&QWW&5!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[] 	NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKL V NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+	0 	u	%h'"%j("; 	1IwHHLLY0	1!!# 	 	s&   
CN0&N$8A$N0$N-	)N00N9c                     fd}|S )Nc                     |rJ t         j                  }t        |t              sJ t	               }|j                  d       | d   j                  }|j                  |      }|j                  r|j                  n|j                  }g }t        |   }j                  dv }	|	rdn|}
j                  dk(  rt        | d      n|
}
|j                         5  t        |       D ]  \  }}t        |t              r}|j                  sJ |j                  |k(  sJ |j                  d| d|j                   d	| d
       |j                  | d| dt!        |       d       |j#                  d| d       |j#                  |        |j                  d|
 d|j                   d        | }|j                  dt!        |       d       |j                         5  |j                  d| d
       d d d        |	r|j                  rJ d}d| d| d}n#dt!        |       }|dk(  rd|
 d}n	d|
 d| d}|j                  d| d| d       d d d        |j                  d       |S # 1 sw Y   {xY w# 1 sw Y   (xY w) Nrs  r   )r  r  r  rl   r  rp  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r5   rm  z>::loaduz at::vec::VectorizedN<rP  r  r{  )r4   rh  r   r  r7   r   r   rt  rs  r.  rH   r  r   r   rG   r   rE   r  )r  r  rh  r   	vec_dtyperu  rH  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rX   rg  z)CppVecOverrides._scalarize.<locals>.inner$  s   :XXFfl333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
  B#,T? 0KFC!#~6"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/0 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[] >NN%5cU!#<=>%//// 3I 1&5'IG"5k$6G5H IIz$8"I$:6("UG8"T	9+R@A?B@ NN4 K> >'B Bs&   DI6I*0AI6*I3	/I66I?rV   )r3  r  rg  s    ` rX   
_scalarizezCppVecOverrides._scalarize"  s    7	r rW   c                    t        t              }t        t              j                         D ]S  \  }}t	        |t
              s||vs| j                  |j                        }||_        t        | |t        |             U y r  )
r  r  r  r  r   r  r  r  r  r  )r3  vec_varsr   r  r  s        rX   _initialize_scalarizez%CppVecOverrides._initialize_scalarize_  sh    ( .446 	7LD&&,/D4H~~foo6 $T<#56		7rW   r  )Wr  r  r  r  r  r  r  r   rZ  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rZ  r_  rb  re  ri  rl  ro  rq  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r*  r,  r.  r  r  r(  r
  r  r  r9  r  r  r  r  r;  r>  r=  rC  r  r  r  rW  rM  r&  rm  r  r  rn  ro  s   @rX   r  r  j  sV   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A   	! 	! # #               % % $ $ ! ! ! ! O O        H H   S S$   1 1 1 1   	] 	]    . 	 	 J JX  , 6" 6"p : :x 7 7rW   r  cppvecc                       e Zd Zed        Zy)CppTile2DOverridesc                     t        t        j                  t              sJ t        j                  j	                  |       } t
        j                  | |      S r  )r   r4   rh  CppTile2DKerneltransform_indexingr  rM  )r  r   s     rX   rM  zCppTile2DOverrides.index_exprn  s=    !((O444xx**40))$66rW   N)r  r  r  r  rM  rV   rW   rX   r  r  m  s    7 7rW   r  c                       e Zd ZdZeZeZdZdZ	 fdZ
eefdZd Zd2dee   fdZej&                  d	        Z	 d3d
ej,                  fdZd
ej,                  defdZd
ej,                  dej4                  fdZd
ej,                  dej4                  fdZd Zdej,                  dej,                  dedefdZded
ej,                  fdZ d2dZ!de"e#ef   dedede$jJ                  fdZ&d2dee'   fdZ(d  Z)	 	 d4d!Z*	 d5d"Z+d# Z,d$ Z-d% Z.d& Z/d' Z0d( Z1e2defd)       Z3d* Z4ej&                  d+        Z5d, Z6d- Z7d. Z8	 	 d6ded/ee   d0eej4                     fd1Z9 xZ:S )7	CppKernela%  
    Base class for C++ kernel code generation in PyTorch Inductor.
    This class is responsible for generating C++ code from the intermediate representation.

    Args:
        args: Kernel arguments used for code generation
        num_threads: Number of threads for parallel execution
    r   r   c                    t         |   |       i | _        g | _        d | _        g | _        g | _        d | _        t               | _	        g | _
        t               | _        t               | _        t               | _        t               | _        t               | _        d| _        t               | _        t               | _        t'        | j(                  | j*                  d      | _        t'        | j(                  | j*                  d      | _        t'        | j(                  | j*                  d      | _        t               | _        t               | _        || _        i | _        g | _        y )NFtmp_acc)name_prefixwelford_helpercascade_helper)r:  r;  active_rangesinner_itervarscall_rangesr[  rb  reduction_depthr=   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixnon_parallel_reduction_suffixr8   newvar_prefixsuffixreduction_csewelford_helper_csecascade_helper_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r<  r  r  r?  s      rX   r;  zCppKernel.__init__  s7    HJ 35=A(*,.# . 0 <>( . 0)7)9&)7)9&$2$4!&4&6#!-;-=*-;-=* !3!3T[[iX"%9I#
 #&9I#
 '((*&=?.0 rW   c                 8   t         j                  j                  r'| j                  s| j                  j	                  d       | d}t         j                  j                  rdn	t               }| d}	| j                  j	                  | d| d |||       d       | j                  j                  t        ||||||             | j                  j	                  |	 d| d       | j                  j                  d| d	d
d| d ||||	|       ddg       y )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr  r   r+   r  r  r   r  r  r   )
r<  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr  acc_local_in_arrays
             rX   _gen_parallel_reduction_buffersz)CppKernel._gen_parallel_reduction_buffers  s=    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 !$uI.!!++j)C(9.%(P'QQRS	
 	&&--"!		
 	##--1C0DC	{RS.TU&&11*;-x@se33NCI[glmnnop		
rW   c                 Z    | j                   D ]  }t        | j                  || d        y )Nr  )r  r   stores)r<  var_names     rX   %update_stores_with_parallel_reductionz/CppKernel.update_stores_with_parallel_reduction  s0    00 	IHT[[(xj4GH	IrW   r   c                    |J t               }t        j                         5 }t        | d      rK|j	                  | j
                         | j                  |       |j                  |j                                |j	                  | j                         |j	                  | j                         |j	                  | j                         d d d        t        | d      r|j	                  | j                         | j                  r5| j                  D ]&  }| j                  |   \  }}t        ||| d||      }( |S # 1 sw Y   sxY w)Ncodegen_inner_loops_tail)r7   r   r   r   r  r  r  r   r   loadsr  r  r  r  r  r   )r<  r   r   r^  startends         rX   gen_bodyzCppKernel.gen_body  s   ||~!!# 	%ut23DMM*((.##DKKM2KK

#KK%KK$	% 4./KK(** X!//4
s1$uE]ESVWX 	% 	%s   B)D>>Ec              #     K   | j                   }|rYt        j                  ||      }t        |t              r3|j
                  }t        |t              sJ t        j                  |_	        || _         	 | || _         y# || _         w xY ww)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr2   r  r   r3   r   rG   r   rl   r   )r<  rR  priors      rX   rW  zCppKernel.masked  ss      88D%(D$)zz!$777 #ZZ
	$J#DOeDOs   A/B
2A> 6B
>	BB
r   c                 P    | j                   |   }|||z  |z   i}t        ||      }|S r  )rb  r0   )r<  r   scaleitervar_idxrI  r   r   r   s           rX   scale_index_with_offsetz!CppKernel.scale_index_with_offset  s7     mmK(C%K&01uk2	rW   r   c                 6    t        | j                  |            S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rD   rK  r<  r   s     rX   index_to_strzCppKernel.index_to_str	  s    
 T))%011rW   itervarc                 D     t         fd|j                  D              S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c              3   (  K   | ]  }|j                   j                  j                  v ret        j                  j                  |j                      t              r4j                  j                  |j                      j                          y wr  )r   r  varname_mapr   rG   
depends_on)r'  sr  r<  s     rX   r)  z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>  sj      
vv---488//7H HH  (33G<
s   BB)rg   free_symbolsr<  r   r  s   ` `rX   index_indirect_depends_onz#CppKernel.index_indirect_depends_on  s%      
''
 
 	
rW   c                 F    ||j                   v xs | j                  ||      S r  )r  r  r  s      rX   index_depends_onzCppKernel.index_depends_on  s,    %,,, 
0N0N71
 	
rW   c                 T    t        t        | j                  | j                              S r  )dictr  rb  r[  rA  s    rX   
var_rangeszCppKernel.var_ranges   s    Ct{{344rW   r  rH  lowerupperc                    |s|sy t        |t        j                        }|rIt        j                  |t
        j                        j                  }t        j                  j                  }nt        j                  j                  }	 | j                  t        j                  _
        t        j                  |t
        j                        j                  }|t        j                  _
        | j                  }|r.t        j                  j                  | j                  |            nd }	| j                  ||rdnd |	| j                        }
| j                   j#                  ||
d       y # |t        j                  _
        w xY w)N0F)
assignment)r   r   TMPr2   rM  r   r  r   r4   rh  r  r  sexprrK  indirect_assertr  r  r  )r<  r  rH  r  r  indirectr  r   prior_computesize_strr   s              rX   check_boundszCppKernel.check_bounds#  s
    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHdoo
 	&$59 $1 s   AE' 'E>r   c                 6   | j                   j                  |      }| j                  |      }| dt        |       d}| j                  j                  | j                  |t        j                  j                  |            }|j                  d| ||fi        |S )N[]rl  r  )r  inputrK  rE   r  r  r  r4   graph	get_dtyper  )r<  r   r   r   r   r  s         rX   r  zCppKernel.loadC  s    iiood#$$U+aE*+1-""4::t177;L;LT;R"SftT5&92>rW   c                    d|v sJ | j                   j                  |      }| j                  |      }|| dt        |       d| d}n|dk(  rt        j
                  j                  s$| j                  dk(  r| dt        |       d| d}nSt        j                  j                  |      }dt        |    d	| d
}d| dt        |       d| d}nt        d|       | j                  j                  t        ||             y )Nbufr  ] = r   
atomic_addr5   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputrK  rE   r   r  r  r  r4   r  r  rH   NotImplementedErrorr  r   r;   )r<  r   r   r   moder   r   r   s           rX   storezCppKernel.storeK  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rW   r   r   rtyper   c                 @    ddt         t           ffd}|S )NrH  c                 L    |  d d        dS t        |       S )Nr   r   r   )r   )rH  r   r   r   r   r  s    rX   rg  z.CppKernel._gen_reduction_prefix.<locals>.innerk  sH    |"1SEWUE-B,C1EE- rW   r  )r   rn   )r<  r   r   r  r   r   rg  s    ````` rX   _gen_reduction_prefixzCppKernel._gen_reduction_prefix]  s    	 	 	 rW   c                 h    | j                   D ]#  }| j                  j                   ||             % y r  )r  r  r  )r<  rH  gen_fns      rX   finalize_reduction_prefixz#CppKernel.finalize_reduction_prefixz  s/    66 	7F!!((6	7rW   c                 f   |dk(  r| S |dk(  r|t         j                  k(  r | j                  J t        j                  t
        j                  | j                  | j                  d        }t        j                  j                  r|}nt        |t                     }d}	 t        j                  j                  j!                  |      }||kD  r+t        j                  j                  j%                  ||       yt        j                  j                  j'                  ||       y# t"        $ r Y yw xY w)Nrh   rd      TF)r   ro   r  rW  rX  rY  rZ  r  r   r  r  r   r+   r4   r  sizevars	size_hint	Exceptioncheck_lt	check_leq)r<  r   r   
use_scalarreduction_sizert_size
chunk_sizert_size_hints           rX   need_use_acc_helperzCppKernel.need_use_acc_helper~  s    --!>! U"u';##///&--d..t/C/C/EFN zz)) )!.2F2HI J ww//99'B j(  ))*g>  **7J?  s    )D$ $	D0/D0c           
      `   |rt        ||      n|}t        |      }|dv sJ |dk(  rdnd}	t        ||	      }
|dk(  rdnd}|r
t        |   }n&t        | d      r| j	                  |      nt        |   }| d| d	|	 d
| d| d
}|dk(  r|S t        |
t        j                        r
|
dk  rd| S |S )N)rh   rd   rh   i   r  WelfordHelperCascadeSumHelperrU  <r   ro  r  r   rd   r5   zstatic )r   rE   rH   r   rU  r   r   r\  )r<  r   r   helper_ranger   r  r  num_range_threadnum_range_thread_exprr  
num_chunkshelper_typeh_typehelper_init_lines                 rX   _acc_helper_initzCppKernel._acc_helper_init  s
    3>GL+.< 	 !,,< =!::::+/??TU
-z:
 !11 # 	
 !%(F 41 ""5)!%(  m1VHBzl"ZL$% 	 U"##j%--0Z1_ -.//##rW   c           
      >   t         j                  j                  rdn	t               }| j                  j                  | j                  ||||d |             | j                  j                  | j                  ||||||             |r|n| d}|dk(  rI| j                  j                  | d| d| d       | j                  j                  | d| d| d       y | j                  j                  | d	| d       | j                  j                  | d
| d       y )Nr  _vecrh   z = welford_combine(r   r   z_local = welford_combine(z	_local, &z = cascade_sum_final(&z_local = cascade_sum_final(&)
r   r  r  r+   r  r   r  r  r  r  )	r<  r   r   r   r  r   r  r  r  s	            rX   _use_acc_helperzCppKernel._use_acc_helper  sA    $ZZ77M=Q=S 	 	**44!!
L%z	

 	!!++!!
L%j	

 #3%t--..88(-fXSBG ''11(3F89ZLPRS ..88(0B? ''11(6zl"ErW   c           
         |dv }|||f}|| j                   j                  v r| j                   j                  |   S | j                   j                  | j                  d| d      }| j                  j                  |        d| _        |r|n|}t        ||      }	| j                  j                  | j                  ||	||t                     | j                  ||d      rt        j                  t        j                  | j                   | j"                  d        }
| j$                  j                  | j&                  d| d      }d| }| j)                  ||||
|d       | j*                  j-                  | dt/        ||||       d	       n| j"                  J | j0                  | j"                     }t3        | j"                  d
z   t5        | j0                              D ]$  }|| j                   |   z  | j0                  |   z   }& | j*                  j-                  | dt/        ||||       d	       | j7                  ||	||       t9        ||      }|| j                   j                  |<   |S )Nrb   ra   
reduction FwriteTscalar_r  r   r   r5   )r   )r  reduction_cacher  r  r  r  r  r   r  r  r   r	  rW  rX  rY  rZ  r[  r  r  r  r  r  r   r   rb  rU  r   r  r   )r<  r   r   r   r   argmax_or_argminreduction_keyr   
init_dtyper   r  r   scalar_helper_valr   r   r  s                   rX   	reductionzCppKernel.reduction  s   )-AA!>58D..>>>%%55mDD  ))JJ*]O4E * 
 	  ''3%1 "2Y
%njA((//&&X~z>	
 ##NE4@&--dkk$*>*>*@AN 0099
=/:% : J #** 6  ! !  KK!!%s,^S%IZ[\\]^ ''333MM$"6"67E4//!3S5GH BA.q1AABKK!!%s,^S%uUVVWX 	,,S(NJW">37<B**=9rW   c                     | j                  |      }| j                  j                  |      }| j                  j	                  t        || dt        |       d| d             y )Nr  r  r   )rK  r  r  r  r   r;   rE   )r<  r   r   r   r   s        rX   store_reductionzCppKernel.store_reduction)	  s]    $$U+iit$''#aE(:';4waHI	
rW   c                    | j                   ri| j                   t        |      t        |      z   k(  s+J | j                    dt        |       dt        |              | j                  t        |      k(  sJ t        |      t        |      z   | _         | j                   D cg c]  }| j	                  |       c}| _        t        t        | j
                              D cg c]  }t        t        j                  |       c}| _
        t        |      | _        | j                  d | j                   | j                  | j                  d  fS c c}w c c}w )Nr  r   )r  r   r  r   rK  r[  rU  r.   r   XBLOCKrb  )r<  lengthsreduction_lengthsr  ns        rX   
set_rangeszCppKernel.set_ranges0	  s2   ##uW~>O8P'PP ##$Dw(8EBS<T;UVP ''3w<777$W~6G0HHD<@<L<LMq4//2MDK s4;;/0 /t{{A>DM $'w<D MM0D001MM$..01
 	
 Ns    E !Ec                     | j                   J t        j                  j                  j	                  t        | j                         d      S )N    fallback)r  r4   r  r  r   r/   rA  s    rX   r   zCppKernel.size_hintC	  sF    +++ww))$**+d * 
 	
rW   c                 ^   t        | t              sJ t               | j                  J t        |j                  t
              r+|j                  j                  |j                               n | j                  |j                               |j                  d uxr# |j                  j                     j                  t        j                         5 }j                  r6rj                          nj                         |j!                         n4dkD  r/j#                         r|j%                  j'                                dt(        ffddd	 ddt(        dt*        ffdddt(        dt*        ffd	 	 ddt(        dt*        dt,        ffd	|j%                  j'                                t        |j                  t
              r t        t.        j0                  t2              rt.        j0                  j4                  rt.        j0                  j4                  }|j7                         D ]  }t9        |j;                         j<                  D cg c]  }| j?                  |       c}      }t@        |j;                         jB                     }	d
|	 dtE        |       d}
|jG                         }jI                  d|	 d| d|
 d       jI                  |	 d| d| d         |       d d d        y c c}w # 1 sw Y   y xY w)Nr5   
_loop_nestc                      fd} j                         }t        |t              r|j                  D ]
             y t        |t              sJ  j
                   |       r|j                          t        j                         5 }|j                  j                                |j                         d d d        y # 1 sw Y   y xY w)Nc                      j                   sJ j                   j                     } | j                  xr | j                  S r  )rP  r  r  parallel)rootr3  	par_depths    rX   is_parallel_reductionzOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reductiong	  s=    %++++%++I,A,ABD,,>>rW   )
get_kernelr   re  rg  CppKernelProxyrP  r  r   r   r   r   r  )r3  r9  rh  r   r   gen_loop_nestr8  s   `   rX   
gen_kernelz0CppKernel.codegen_loops_impl.<locals>.gen_kernelf	  s    ?
 $..0f&:;&,ll 2
%j12 &fn===!''38M8ODDF#--/ .5++DKKM:-. . .s   1CCc                     |r0| j                   }|r| j                  |z   }|S | j                  |z   }|S | j                  }|r|| j                  z   }|S || j
                  z   }|S r  )r  r  r  r  r  r  )rh  r6  	is_suffixr  prefixs        rX   get_reduction_prefix_suffixzACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffixx	  s}    #44F!'!A!AF!J "M "(!E!E!N!M#44F!'&*J*J!J "M "(&*N*N!N!MrW   depthc                    | j                         }| j                  sJ | j                  |   }t        j                         5 }|j                  rI|sG 	||j
                  d      }|r|j                  j                                j                  |       
rR|j
                  rFj                         |j                  r)|j                  sJ j                  |j                          | |       
rC|j
                  r7|j                  rj                  |j                         j                          |j                  r&|s$j                   	||j
                  d             d d d        y # 1 sw Y   y xY w)NF)r?  T)r:  rP  r   r   r  r6  r   r   r  r  r  close)r3  rB  in_reductionrh  loopstack_outerr  r   gen_loop_atrA  is_reduction_loopthreadsworksharings          rX   gen_loop_with_reductionz=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction	  sA    $..0!''''!''.))+ {((+F"DMMU,( ,'55dkkmD$45(T]]#,,W5!66#)#@#@@#@ KK(C(CD
E2(T]]!88 KK(E(EF#))+((7 &+  s   D*E66E?c                 X   t        j                         5 }| j                  sJ | j                  |   }|j                         }|
	 d d d        y j	                  |       |j                  j                                 | |dz   |j                         d d d        y # 1 sw Y   y xY wr   )r   r   rP  linesr   r   r   r  )r3  rB  r   rF  
loop_linesr   r<  s        rX   rH  z1CppKernel.codegen_loops_impl.<locals>.gen_loop_at	  s    ))+ Lu%++++%++E2D!%J!)L L OOJ/''6!*eai9J9JKL L Ls   1B AB  B)rE  c                 t    | j                   |t        | j                         k(  r	 |        y  | ||       y r  )rP  r   )r3  rB  rE  r=  rL  s      rX   r<  z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest	  s7    
 ##+uJ<L<L8M/Mz*+J|LrW   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )%r   r;  r+   r  rh  re  decide_parallel_depthmax_parallel_depthrP  r  r  r   r   r  rD  r6  mark_parallelsingler   r   rT  rn   rl   r4   local_buffer_contextrK   local_buffersvaluesr/   
get_layoutrH  rK  rH   r   rD   get_namer  )r<  rV  r   rK  r   rV  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namer=  rH  r<  rL  rA  rI  r8  rJ  s     ``        @@@@@@@@rX   codegen_loops_implzCppKernel.codegen_loops_implI	  s   $///&(+++i&&(<=!((>>,,.I 22,,.I
 OO4' D	 5 56CC 	 !!# ~	%u''$%%'((1''	21%%'''6.x .$"" DI$-0 B	L 	L 	L %*M$MM #M . 9++-ABq557IJ**88 !" 6 6 D D$1$8$8$: L%2 -9,C,C,E,J,J ( !00:&N '3<3J3J3L3R3R&SO!2?2C5~I^H__`aH(4(=(=(?%KK*?*;9EVDWWZ[cZddef KK*+2.?-@HYGZZab" )$}~	% ~	%`a~	% ~	%s    !FL#4LB	L#L##L,c                 T    t         j                  |       }| j                  |||       y r  )rT  buildr`  )r<  r   rK  rV  s       rX   codegen_loopszCppKernel.codegen_loops	  s"    NN4(		4=rW   c                 :    t         j                  j                  ryy)NAOTI_TORCH_CHECKTORCH_CHECK)r4   r  aot_moderA  s    rX   assert_functionzCppKernel.assert_function	  s    77% rW   c                    | j                   J | j                   |j                  |j                  |j                  z    }| j                         }d}d}|D ]m  }t        j
                  j                  j                  |d      }|d|z  k\  s||k(  r n3||z  t        j                  j                  k  r n|dz  }||z  }||z  }o t        j                  j                  r|dk(  rt        |      dkD  rd}t        ||j                        S )Nr5   r   r/  r0  r   r  r  )r  r  r  r   r4   r  r  r   r  min_chunk_sizer  r   r  )	r<  rR  rJ  r[  seqparrB  r  hints	            rX   rQ  zCppKernel.decide_parallel_depth	  s   +++!!**"..1C1R1RR

 nn 		D77##--dT-BDa'k!SG^g~

 9 99QJE4KC4KC		 ::%%%1*VqE .@.L.L
 	
rW   c              #     K   | j                   | j                  | j                  | j                  f}t	               | _         t	               | _        t	               | _        | j                  j                         | _        d  | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         |\  | _         | _        | _        | _        y wr  )r  r  r  r  r=   cloner  r  )r<  r  s     rX   write_to_suffixzCppKernel.write_to_suffix
  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     t        |i |S r  )rG   )r<  r  r  s      rX   create_cse_varzCppKernel.create_cse_var
  s    t.v..rW   c                 "    dt         |    d| dS )Nzc10::convert<r  r   )rH   )r<  srcr   r   s       rX   r  zCppKernel.get_to_dtype_expr
  s    |E232cU!<<rW   c                 b    | j                  |||      }| j                  j                  ||       y r  )r  r  r   )r<  dst	dst_dtyperu  r   r  s         rX   r  zCppKernel.cache_dtype_convert
  s(    %%c9i@T3rW   r@  r   c                 V   
 |d} j                   syg 

 fd}|-| j                   v sJ  j                   |   \  }} ||||      s4y j                   j                         D ]  \  }}|\  }} ||||      r y dj                  
      }	|	r|j                  d| d|	 d       yy)	NrS   Tc                 8   | |k(  ryd }t        j                        D ]  \  }}||k(  s|} n t              t        k(  r|r| dk(  r|j                  |   k(  rd}j                  | dt        |               j                  | dt        |              y)NFr   r5   r  r   T)r   rb  r%  r  r[  r  rE   )r  r  r   var_idr   _var
conditionsr<  s         rX   genz)CppKernel.codegen_conditions.<locals>.gen%
  s    |F$T]]3 4$;F
 T
i'QJ4;;v..T+e*<)=>?SS)9(:;<rW   FrY  zif(r  r   )r  r  joinr   )r<  r   r@  r   r~  r  r  r|  _rangejoined_conditionsr}  s   `         @rX   codegen_conditionszCppKernel.codegen_conditions
  s     >F!!
	& ?$,,,,,++C0JE3uc3' $ 2 2 8 8 : !f#
s5#t, ! #KK
3NNS*;)<B?@rW   r  )r5   r   NFFNN);r  r  r  r  r  rZ  rD   r  r  r  r;  r   r   r  r  r   r7   r  r   contextmanagerrW  r   r  r  rp   r  r  r  r  r  rl   r  r  r  r	   r9   r   r   r  rn   r  r	  r  r  r%  r'  r-  r   r`  rc  propertyrh  rQ  rq  rs  r  r  r  rn  ro  s   @rX   r  r  u  s)    IEMF'1^ /('
RIXl3 ( $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$;#$  	
 {{:7hsm 7'^ *$Z PU@4l

&
O%b> ! ! !
: 
B 
B/=  !%&*	.. . ell#	.rW   r  c                   B    e Zd ZeZ	 d' fd	Zdej                  dej                  fdZ	de
j                  defdZde
j                  defd	Zde
j                  defd
Ze
j                  fde
j                  defdZdede
j                  defdZ	 d'dedej                  de
j                  dee   fdZ	 	 	 d(dee   dej                  de
j                  dee   deeeef      dedee   fdZdedej                  f fdZ	 d)deeef   dedej                  de
j                  def
dZd'dZd Zd ZdedefdZ dedej                  defdZ!d  Z"d! Z#ddde
jH                  fdeej                     d"ee   d#ee
j                     fd$Z%d' fd%	Z& fd&Z' xZ(S )*r  Nc                     t         |   ||       t        j                         | _        | j                  sJ |dkD  sJ d       || _        || _        || _        |r|| _        y || _        y )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r:  r;  r   pick_vec_isavec_isar.  rc  rs  	num_elems)r<  r  r  r.  rc  rs  r?  s         rX   r;  zCppVecKernel.__init__M
  si     	{+"//1|||q T"TT *$"&/]rW   r   r  c                       j                  ||      ry  fd|j                  D        D ]"  }t        |t              sJ |j                  s" y  t        || j                        }|j                  r|S d S )Nc              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  r   r   r  r  r  r   r'  r  r<  s     rX   r)  z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>a
  s:      
a* HH  (
   AA
)r  r  r   rG   r   r  r.  r  )r<  r   r  indirect_varrh  s   `    rX   rd  z"CppVecKernel._try_get_const_stride^
  s|    ))%9
''
 	L
 lN;;;""	 %UGT5G5GH))v3t3rW   r   r   c                     t        j                  | j                  |j                  z  dz  | j                  j                         z        }|dk\  sJ |S )N   r5   )mathr  r.  itemsizer  	bit_widthr<  r   num_vectorss      rX   rt  zCppVecKernel._get_num_vectorsl
  sO    ii/!3dll6L6L6NN
 arW   c                 p    | j                   |j                  z  dz  | j                  j                         z  S )Nr  )r.  r  r  r  )r<  r   s     rX   r-  z!CppVecKernel._get_raw_num_vectorss
  s0     !!ENN2Q69O9O9QQQrW   c                 h    | j                  |      }|dk(  rdt        |    dS dt        |    d| dS )Nr5   rm  r   rn  r|  )rt  rH   r  s      rX   rU  zCppVecKernel._get_vec_typex
  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrW   c                 l    |t         j                  k(  ry| j                  |      }dt        |    d| dS )NrS   r{  r|  r   )r   rl   rt  rH   r  s      rX   r  zCppVecKernel._get_mask_type
  s<    EJJ++E2"<#6"7qQGGrW   rR  c                     |j                   t        j                  k(  sJ t        |             | j	                  |      }| dt
        |    d| dS )Nz.template cast<r|  r   )r   r   rl   reprrt  rH   )r<  rR  r   r  s       rX   r@  zCppVecKernel._get_mask_cast
  sP    zzUZZ'3d3'++E2|E':&;1[MMMrW   r   	load_maskc                    t         |   }| j                  |      }d}|rS|j                  s&| j                  t        j
                         d| d}n!| j                  |t        j
                         }|dk7  r| dt        |       n|}|t        j                  k(  r| j                          d| d}	|	S |r| d| d| d| dn,| j                  |       d	| d
t        | j                         d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        NrT  r   r   r   z.template loadu<r|  r  ::loadu(r   )rH   rt  r   r  r   ro   r@  rE   rl   rU  r  )
r<  r   r   r   r  cpp_typer  load_mask_strloadbufr   s
             rX   _get_vec_load_linezCppVecKernel._get_vec_load_line
  s      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9A>D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rW   r   store_value
accu_storec                 ^    |r	|J d       |r|sJ  j                   dt        j                  dt        f fddt        j                  dt        f fddt        dt        f fd}t               }|j                  d	       |j                         5   |      }	 |      }
d
t        |    d|
 d}|j                  |       |r |j                  | dt        |	       d       t         j                   j                      d      }i } fd|j                  D        D ]4  }t        |t              sJ |j                  s" ||      }| d| d||<   6  j!                  | j                  |      }d} j"                  l|rJ d       t         j"                  t              sJ  j"                          j"                  j                  r j"                   d| d}n j"                   d}t%        j&                         r|j                  d j(                          n|j                  d j(                          |j                  d| d| dt         j*                         dz   | dz          |j                         5  t-        j.                         5 }t        |      }|D ]#  }t1        j2                  d| z   dz   ||   |      }% || d| dn| }|r4|j                  d | d       |j5                  |j                                |r!|rd!nd"}|j                  | d#| d$| d%       n|j                  d&| d'| d(       ddd       ddd       |s( j7                  d)d*|      }|j                  d+| d(       ddd       |j                  d,       |r#|j                  d(       j9                  |       y j:                  j=                  ||-      }t        |t              sJ d.|_        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)/a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S N   )r  r  r   r<  s    rX   get_result_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size
  s1    ~~!~~enn)<==~~%rW   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S r  )r  r.  r  s    rX   get_tiling_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size
  s5    ~~!))Q%..-@AA)))rW   vec_varc                 L   | j                   sJ t               }|j                  d       |j                         5  | j                  }|J |t
        j                  k(  rt
        j                  } |      } 	|      }|j                  dt        |    d| d       |  dt        |       d}|j                  |       |j                  d       d d d        |j                  d       
j                  j                  |      }t        |t              sJ |S # 1 sw Y   JxY w)	NrW  rp  r   rq  rr  r   zreturn tmpbuf;r{  )r   r7   r   r   r   r   rl   ro   rH   rE   r  r  r   rG   )r  r   r}  result_sizetiling_sizer   r  r   r  r  r<  s          rX   vec_to_arrayz@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array
  s   >>!>>DNN5! 1#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/01 NN4 XX&&vt4Ffn555M!1 1s   BDD#rW  rp  r   rq  rr  r   rR  c              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr  r  r  s     rX   r)  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>  s:      !!!TXX. $$QVV,!r  r  r  r  rI  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   rX  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   rP  r{  rl  T)r  r   r   rn   rG   r7   r   r   rH   rE   r-   rb  rc  r  r   r   r  r  r   is_gccr.  r  r   r   r   r   r   r  r  r  r  )r<  r   r   r   r   r  r  r  r   r  r  result_declareitervar_innerreplacementsr  	array_varr  r   index_crhsr   	load_liner  r  r  s   `   `                  @@rX   rf  z*CppVecKernel._load_or_store_non_contiguous
  s`   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[] ?	7)%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++! Q
 ",???&& ,\ :I4=;aa1PL.Q 004??= 1 E I*&I(II!$//>BSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!#!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
  H
 4 4 6 H%%e,$0 L ff<.1E9$\2G .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG!H H"  33OQN	156?	7@ 	tNN3MM$XX&&vt5&AFfn555 FMM;H H H HY?	7 ?	7sE   B1P#EP#P(B2PP"2P#PPP 	P##P,r   c                 :   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| j                  | j                     }| j                  ||      }|dk(  rt        	| )  ||      S |dk(  rG| j                  |||| j                        }| j                  j                  | j                  ||      }n| j!                  |||      }t#        |t$              sJ |j'                  d| ||fi        d|_        |S )Nr   r5   rl  r  T)r  r  rK  r4   r  r  rb  rc  rd  r:  r  r  r  r  r  r  rf  r   rG   r  r   )
r<  r   r   r   r   rg  rh  r   r  r?  s
            rX   r  zCppVecKernel.load=  s    iiood#$$U+!!$']]4??3
++E:>Q;7<e,,q[**3udooNDXX&&tzz4u&EF77UEJF&.111ftT5&92>rW   r   c           	         t        |t              s#t        |t              r|j                  sJ |       | j                  | j
                     }| dt        |       }| j                  ||      }t               }	|dk(  r|rl|t        j                  k(  r#| j                  | j                  |       d| dn,| j                  |       d| dt        | j                         d}
d| d|
 d}|t        j                  k(  r%| j                  |	j                  | d| d       |	S |	j                  | d| dt        | j                         d       |	S | j                  ||||	||	       |	S )
a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        r   r5   r  r   r   r  .store(r   )r   r  r  )r   rp   rG   r   rb  rc  rE   rd  r=   r   ro   rs  rU  r  r   rf  )r<  r   r   r   r   r  rg  var_exprrh  r   r  s              rX   _get_store_linezCppVecKernel._get_store_lineQ  s   " %%un-%,,		 
 ]]4??3
U#k%012++E:>Q; +0F ))%01(1E ..u56hxj;W[WeWeKfJgghi 
 E7#dV1-#(>%z<=  gWXJbT^^1L0MRP  ..UE$Ej /  rW   c                    dv sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                        }| j                  |      }t        j                  j                        }|B| j                  ||||      }| j                  j                  |j                  fd             y |dk(  r.t        j                  j                   sT| j"                  dk(  rE| j                  | |||d      }| j                  j                  |j                  fd             y | j%                  |      }| j%                  t&        j(                        }	t*        |   }
t-        j.                  |t&        j(                        j0                  }t        |t              r|j                  sJ d|
 d	|	 d	| d
| d	| d	| d}| j                  j3                  t5        |             y t7        d|       )Nr  c                     t        |       S r  r;   r  r   s    rX   <lambda>z$CppVecKernel.store.<locals>.<lambda>  s    ,tQ2G rW   r  r5   T)r  c                     t        |       S r  r  r  s    rX   r  z$CppVecKernel.store.<locals>.<lambda>  s    l46K rW   zatomic_add_vec<r   r  r   r  )r   rG   r   r  r  r  rK  r4   r  r  r  r  r  mapr   r  r  r  rt  r   r  rH   r2   rM  r   r   r;   r  )r<  r   r   r   r  r   r   r   n_srcn_idxr   r   s    `          rX   r  zCppVecKernel.store}  s   }}%07%70||NN5)Eiit$$$U+!!$'<''sE5ADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@!%8U\\II(5'E7"SEE7RTUZT[[]^%%l4&>?%D6&:;;rW   c           
      b   |t         v sJ |dv }| j                  | j                  k\  }|r|n|}t        |t              sJ |       |j
                  s| j                  |      }|||f}|| j                  j                  v r| j                  j                  |   S d}	|	 dt        |    d}
t        ||      }| j                  ||      }| j                  j                  | j                  d| d      }t        |t              sJ | d}d	| }d	| }| xj                  | ||gz  c_        d
| _        | j                   j#                  | j%                  ||||t&                     | j                   j#                  | j%                  ||||| j(                               | j+                  ||d      }|r| j                   j#                  | j%                  ||||| j(                               | j                  J t-        j.                  t0        j2                  | j4                  | j                  d       }|dk(  r,| j6                  j                  | j8                  d| d      }n+| j:                  j                  | j8                  d| d      }d	| }t=        | j4                  | j                     | j>                        rl| j                  | j                  k\  rQt=        || j4                  | j                           t=        | j4                  | j                     | j>                        z  n|ntA        jB                  d      }| j4                  | j                     | j>                  z  r>| j                  | j                  k\  r#t=        || j4                  | j                           n|ntA        jB                  d      }|dk(  rd| }| jE                  |||||d
       | jE                  |||||       | jE                  |||||       | jF                  r|n|}| jF                  r|n|}|dk(  r4| jH                  jK                  | d| jM                  ||||       d       n| jH                  jK                  | d| jM                  ||||       d       n| j                  J | jN                  | j                     }tQ        | j                  dz   tS        | jN                              D ]$  }|| j4                  |   z  | jN                  |   z   }& ||||d}| jH                  jK                  | d | jL                  ||fi | d       | jU                  ||||| jL                  | j(                         | jU                  ||||tV        t&               |r+| jU                  ||||| jL                  | j(                         |tX        jZ                  k(  }|rDt]        |      rS| j_                  |      dv sJ d       d| d}d| d}| j`                  jK                  | dtW        |||       d       n|r	| d| d}n|r|dv rd| d}n|dk(  sJ | d}nd | jM                  |d!d"      z   d#z   } |tX        jZ                  k(  }|rtX        jb                  n|}!d$t        |!    d}
d%t        |!    d&| j_                  |!       d}"| }#|r|dk(  sJ | d'| }#|" d(|
 d)|
 d*|  d&|# d
}| j`                  jK                  | dtW        ||||+       d       |}$nq|}$t]        |      r2d	|$ }%| j`                  jK                  |$ dtW        ||$|%       d       n2|r0|dk(  sJ d	|$ }%| j`                  jK                  |$ d|$ d'|% d       te        ||$      }&|&| j                  j                  |<   |&S ),aw  
        Perform vectorized reduction operation.

        This method handles vectorized reduction for different reduction types.
        It manages special cases for low-precision floating point types and
        employs precision improvement techniques for certain reduction operations.

        Args:
            dtype: The output data type for the reduction result
            src_dtype: The source data type of the input value
            reduction_type: Type of reduction operation (sum, min, max, etc.)
            value: The input value to reduce

        Returns:
            The result of the reduction operation
        r  zat::vecz::Vectorized<r   r  Fr  r  masked_TNrh   r   rd   r  r  r   r   r5   )r   r   horizontal_reductionr   )r  r  )r5   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rg   rd   r`   r]  z.all_zero()r_   z.all_masked()z	{ return r  r  z; }rm  zat::vec::vec_reduce_all<r   r   z([](z& x, z& y) r  )3VECTORIZABLE_RTYPESrc  r  r   rG   r   r  r  r   rH   r   reduction_acc_type_vecr  r  r  r  r  r  r  r   reduction_init_vecr	  rW  rX  rY  rZ  r[  r  r  r  r   r.  r   r\  r  rs  r  r   reduction_combine_vecrb  rU  r   r  r   r   rl   r*   rt  r  ro   r   )'r<  r   r   r   r   r!  r  r#  r"  vec_nsvecr   acc_type_vecr   acc_vec
masked_accmasked_acc_vecuse_acc_helperr  r   masked_helper_valhelper_vec_rangemasked_helper_vec_ranger$  acc_vec_helper_val_r   r   r  r   r   masked_next_valuereduce_all_bodyr}  vec_reduce_all_func
result_vectmpvarmasked_tmpvarr  s'                                          rX   r%  zCppVecKernel.reduction  s   & !4444)-AA#$2F2FF"2Y
%07%70||NN5)E!>58D..>>>%%55mDDl5&9%:!<%njA22>:N  ))JJ*]O4E * 
 #~...E,se_
"7),  uw$GG  ((//&&X~z>	

 	((//&&''	
 11.%O,,33**" "++ ''333&--dkk$*>*>*@AN !11!44==LLJ}o">e > 
 "44==LLJ}o">e > 
 #** 6 DKK8$:L:LM $*>*>> ^T[[-IJt{{4??;T=O=OPQ ( ]]1%   ;;t/$2D2DD $*>*>> ^T[[-IJ' ]]1% $ &&-j\$:!$$"%"# %    Z1A5   !' *.~WH/3~~+:K&%%jD$>$>~xY^`k$l#mmno %%jD$>$>~xY^`k$l#mmno ''333MM$"6"67E4//!3S5GH BA.q1AAB $(<&	F KK!!)39t99.'\U[\]]^_ 	,,!%!;!;"55 	- 	
 	,,!2, 	- 	
 00%)%?%?"&"9"9 1  5::%#N3,,U3 8  J JJ   7wiqA
&=n=MQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 'y
!)U222$+9C/?!@J 34DU3%u_L]]_`j_kklm
!!++%s,^S*Xabccde FF#N3")& 2%%//hc"3NFM"Z![[\]  %...")& 2%%//hc&]O1= #>6:<B**=9rW   c                 (   | j                  |      }| j                  j                        }t        j                  j                        }|j                  r$|t        j                  k7  rt        j                  }n|}t        j                  j                  |      }t        j                  j                  |      }t               }	| j                  | j                  k\  r.|	j                  | dt!        |       dt"        |    d| d       n||k7  rt"        |   j%                  dd       d| }
|t        j&                  k(  r&| d| j                  t        j&                         d}n?||cxk(  rd	k(  rn nd
t"        |    d| d}n d
t"        |    d| dt"        |    d| d| d}|	j                  d|
 d| d       |
}|	j)                  | j+                  ||||             | j,                  j)                  |	j/                  fd             y )Nr  z] = static_cast<r  r   z::r   z.template cast<bool,r   r5   at::vec::convert<r   r|  r   r   r   c                     t        |       S r  r  r  s    rX   r  z.CppVecKernel.store_reduction.<locals>.<lambda>  s    T18M rW   )rK  r  r  r4   r  r  is_floating_pointr   rk   ro   rh  rt  r=   rc  r  r   rE   rH   r  rl   r  r  r  r  )r<  r   r   r   r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rX   r'  zCppVecKernel.store_reduction  s   $$U+iit$GG%%d+	&&9+DKKEE((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 E!#I.66tSAB!E7K   

*!&';D<Q<QRWR\R\<];^^abG&/>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UC	JK$$TXX.M%NOrW   
scalar_varc                    |j                   rJ |j                  t        j                  k(  rE| j                  j                  | j                  | j                          d|j                   d      }n]|j                  J | j                  j                  | j                  | j                  |j                         d|j                   d      }t        |t              sJ |j                  |_        |j                  |_        d|_         |S )NrT  r   r  T)r   r   r   rl   r  r  r  r  r   rU  r   rG   dependent_itervars)r<  r  r  s      rX   r  zCppVecKernel.broadcast  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>222"((%/%B%B"rW   rh  c           	      "   |j                   rJ |j                  J | j                  j                  | j                  | j                  |j                         d| d| d      }t        |t              sJ |j                  |_        d|_         |S )Nz	::arange(r   r   T)r   r   r  r  r  rU  r   rG   )r<  r   rh  r  s       rX   re  zCppVecKernel.arange  s    <<{{&&&""LL!!%++./yr&K
 &.111{{rW   c                    t         |   }| j                  |      }t        |      rd| dS |dv rWt        |   }| j	                  ||      }|dk(  rt        |      rd| dnd| d}nt        |      rd| dnd| d	}| d
| dS |dk(  r| j                          dS t        ||      }| d
| d}	|t        j                  k(  r|dv sJ | j                          d| dS |	S )Nr   r   rz   ra   r}   r|   r   r{   r~   r  r   rg   z	::from(0))r_   r`   rd   rT  )
r<   rU  r*   rH   r  r   r  r   r   rl   )
r<  r   r   r   vec_typer   r   rF  scalar_initvec_inits
             rX   r  zCppVecKernel.reduction_init_vec  sA   07%%k2/hZs++11!+.F22>5IH) &e, +6(-@/xx@  &e, ,F8=A/xx@ 
 ZqQ''U"))+,I66$^U;ZqQ/EJJ!%::::))+,GK=BBrW   c                    t         |   }| j                  |      }t        |      rd| dS |dv rw| j                  |      }| j                  t        j
                        }|t        j                  k(  r!dt        t        j                      d| d| dS dt        |    d| d| dS |t        j                  k(  r|dv sJ | j                          S |S )Nr   r   rz   zIndexValueVec<r   )r_   r`   rg   rd   )
r<   rU  r*   rt  r   r  rl   rH   ro   r  )r<  r   r   r   r  r  r  s          rX   r  z#CppVecKernel.reduction_acc_type_vec  s    07%%k2/hZq))11))+6E))%++6E

"'U[[(A'B"UG2eWTUVV#L$=#>br%PQRREJJ!%AAAA))+,-rW   r  r   c                 Z   |t         j                  k(  }|dk(  r=| j                  rd| d| dt        | j                         dS |r| d| S d| d| dS |dk(  r=| j                  rd| d| dt        | j                         dS |r| d	| S d
| d| dS |dk(  rq|r4| j                  rd| dt        | j                         d| dS d| d| dS | j                  rd| d| dt        | j                         dS |rdnd}	| d|	 d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  rp|r:| j                  r"d| d| dt        | j                         d| d	S d| d| d| dS | j                  rd| d| dt        | j                         dS d| d| dS |dk(  rgt	        |t
              r|\  }
}}nt        ||      \  }
}}| j                  r%d| d|
 d| d| dt        | j                         dS d| d|
 d| d| d	S |dv r|J t        |   }|t         j                  k(  rt        t         j                     }| j                  |      }| j                  t         j                        }d}d}|%|J dt        |      j                          }d| }| j                  r.| d | d| d| | d!| d| | dt        | j                         dS | d | d| d| | d!| d| | dS |d"k(  r]t	        |t              rF|j                  t         j                  k(  sJ t        t         j"                  j$                  |f      \  }| d| S t&        )#Nr`   zmax_masked_reduce(r   r   rn  r;  r_   zmin_masked_reduce(rh  r4  rd   r   r   zsum_masked_reduce(r   r[   r   re   zprod_masked_reduce(r   rf   zxor_sum_masked_reduce(r   rh   r   ri   r   z}, r   rz   rS   z_combine_vec<r  rg   )r   rl   rs  rE   r   r   r   rH   ro   rt  r  rp   r  rG   r   rO   r4   rh  r  r  )r<  r   r   r   r   r   r  r   r   r   r   r   r   r   r  r  t_extra	arg_extras                     rX   r  z"CppVecKernel.reduction_combine_vec  s    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$>>1*RDNN@[?\\_`j_kklmm1*SANN>>/uBzl"[QUQ_Q_E`Daabcc)0#cK!U!K=*>>v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//>>-cU"ZL;t~~C^B__bcmbnnopp-cU"ZLJ<qQQ>>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,#- b& $5^Z#P b&~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),FEJJ&%ekk2)))4E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$*n5!''5::555 4QXX5E5E
} UU#j\**%%rW   c           	         t        |t              sJ |j                  J |j                  s4t        |t              r|j                  rd| d}t        	|   ||||      S |}|}|r!| j                  |j                         d| d}|r!| j                  |j                         d| d}|r|rd| d| d| d| d	}| d| d| }n#|r| d| }| d| }n|sJ | d| }| d| }| j                  |j                         d| d}|r6|j                  s!| j                  |j                         d| d}d| d| d}| j                  rS| j                  |j                         d| j                  |j                         d	| d
t        | j                         d}d| d}| j                   d| d| dS )Nr  z).all_masked()r   r  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rG   r   r   r:  r  rU  r  rs  rE   rh  )
r<  r   r  r  rR  lower_scalarupper_scalarr  
cond_printr?  s
            rX   r  zCppVecKernel.indirect_assertq  s$   #~...yy$$$zz$/DKK4&/7*3udCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii894&BtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrW   c                 *   t        |t              sJ |j                  st        	|   |||      S t
        |   }| j                  |      }t
        |   }| j                  |      }d| d}|t        j                  k7  r2|t        j                  k(  r| j                  |       d| d| d| d}|S |t        j                  k(  r |t        j                  k7  r| d| d| d}|S ||k7  r+||cxk(  rdk(  rn nd	| d| d}|S d	| d| d| d| d| d}|S )
Nr  r   z::from<r|  r  z.to<r   r5   r  )
r   rG   r   r:  r  rH   rt  r   rl   r  )
r<  ru  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsr  r?  s
            rX   r  zCppVecKernel.get_to_dtype_expr  s\   #~...zz7,S%CC#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD  %**$%**)<U$|nAo->cBD  %/6Q6*<.3%qA  +<./9J!L>YZ[jZkkmnqmrrstrW   r  )NNFr  ))r  r  r  r  rZ  r;  r   r  r  rd  r   r   rn   rt  ro   r-  rp   rU  r  rG   r@  r   r  r=   r	   rl   rf  r  r  r  r%  r'  r  re  r  r  r   r  r  r  rn  ro  s   @rX   r  r  J
  s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N /3## zz# {{	#
 N+#T ,0<@ Lc]L zzL {{	L
 (L eC$789L L 
.	!L\ UZZ 4 !*S.()* * zz	*
 {{* *X<Bun%PNN ~ $
N 
ELL 
^ 
 D* (,/3+0==c& %c& 'tnc& EKK(c&J#VJ rW   r  c                        e Zd ZdZeZ	 	 d fd	Zd Zd Z	 ddZ	de
dej                  f fdZd fd		Zd
 Z fdZdej                  dej                  fdZ xZS )r  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                     t         |   ||||d   |       || _        || _        || _        |r|n|| _        |r|n|| _        d| _        y )Nr5   T)r:  r;  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r<  r  r  r.  r
  r  r  r?  s          rX   r;  zCppTile2DKernel.__init__  s`     	1	
 -..2A}2A}#' rW   c                 L    t        | j                  | j                      d      S )NrR  )r-   rb  	outer_idxrA  s    rX   inner_itervarzCppTile2DKernel.inner_itervar  s"    !T]]4>>%B$C6"JKKrW   c                 b   | j                   | j                     }| j                   | j                     }t        ||| j                        }t        ||| j                        }| j
                  d u xr@ |dk(  xr9 |j                  |      xr& |j                  |       xr |j                  |       S r   )rb  r  rc  r  r.  r  r   )r<  r   	outer_var	inner_varouter_strideinner_strides         rX   need_vec_transposez"CppTile2DKernel.need_vec_transpose  s    MM$..1	MM$//2	*5)T=O=OP*5)T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rW   c                 n   t         j                  j                  |      }| j                  }| dt	        |       }d}	t	        t        || j                  | j                     | j                               }
t	        | j                         }|r|	|}	}||
}}
d}| j                  |z  r| j                  | j                  }}n| j                  | j                  }}|r|dk(  rdnd}t        |t        j                        r|j                  r&t        |t        j                        rA|j                  s5dt         |    d| d	| d
|
 d
|	 d
| d
t	        |       d
t	        |       d}n4dt         |    dt	        |       dt	        |       d| d	| d
|
 d
|	 d
| d}|r| j"                  j%                         }na| j"                  j'                  |      s)| j"                  j)                  | j*                  |d      }nd}| j"                  j-                  |      }|r>t         |   }d| d| d}| d| d| d| d| d
}| j*                  j/                  |       |j1                  dt3        |            }|r'| j4                  j/                  t7        ||             |S | j*                  j/                  |       |S )Nr   __place_holder__Tr  truefalseztranspose_mxn<r|  r  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r\   r   )r4   r  r  r.  rE   r  rb  rc  r  r  r  r  r   r   r  r  rH   r  r  containsr  r  getr   r  rp   r  r;   )r<  r   r   r   is_store
store_moder   factorru  rw  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rX   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_store  s    !!$'##SU+,-  3E4==;Y[_[m[m nop/0CC#VFF##h.'')=)=qA $$$$ A !)jL.HVPW
q%**%akkq%**%akk !e!4 5Qzl C56("SEF82k!n5ERTUGWWY[  !e!4 5Q{1~6FaTUGWWXYcXd e56("SEF827  xx(H""=1xx((U(SHKxx||M2H$U+I 6fX[SVWG$IQyk8*AfXQvhbQKMM##K0%--.@#h-POO%%l4&GH  MM##M2rW   r   r   c                 Z   | j                   j                  |      }| j                  |      }| j                         }| j	                  |      r| j                  |||d      }| dt        || j                  z         }t        j                  j                  |      }| j                  |d|      }| j                  j                  | j                  ||      }	|	j                  d| ||fi        t!        |	t"              sJ d|	_        |	S | j'                  |      }
t(        | U  ||
      S )NF)r  r   r   rl  r  T)r  r  rK  r  r  r,  rE   r  r4   r  r  r  r  r  r  r  r   rG   r   r  r:  r  )r<  r   r   r   rg  r(  r  r   r   r  r   r?  s              rX   r  zCppTile2DKernel.load/  s   iiood#$$U+""$""5)::c55 ; H "
#k%$..2H&I%JKGGG%%d+E**7Au=DXX&&tzz4u&EF!!&4u*=rBfn555 FMM//6I7<i00rW   c                    d|v sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                  |      }| j                         }| j                  |      }| j                  |      r| j                  |||d|      }| dt        || j                  z         }| j                  sFt        j                  j                  |      t         t"        j$                  t"        j&                  gz   v r| d| dt        | j                         d}	n| d| d}	| j(                  j+                  t-        ||	             y | j/                  |      }
t0        | e  ||
||       y )Nr  T)r  r   r   r  r   r   )r   rG   r   r  r  r  r  rK  r  r,  rE   r  rs  r4   r  r  r   r   rL  rM  r  r   r;   r  r:  r  )r<  r   r   r   r  r   rg  r(  storebufr   r   r?  s              rX   r  zCppTile2DKernel.storeE  sd   }}%07%70||NN5)Eiit$""$$$U+""5)::c54D ; H #3{54>>3I'J&KLH~~!2!24!8M

M = "  zK4O3PPRSz4KK!!,tT":;//6IGM$	5$7rW   c                    | j                         }| j                  r2|j                  d| d| dt        | j                         d| d	       y |j                  d| d| dt        | j
                         d| d	       y )Nr  r  r   r   r   )r  r  r   rE   r  r  )r<  r   rg  s      rX   r  z#CppTile2DKernel.codegen_inner_loopsb  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrW   c                    t         |   ||      }| j                  d   | j                  k  r| j                  nt	        | j                        \  | _        | _        | j                  | j                  d   k(  r+| j                  | _        | j                  | _
        d| _        |S | j                  | _        | j                  | _
        d| _        |S )Nr5   r   FT)r:  r-  r
  r  reversedr  rc  r  rs  r  r  r  r  r  )r<  groupreduction_groupr  r?  s       rX   r-  zCppTile2DKernel.set_rangesm  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rW   r   c                 Z    | j                  || j                  | j                               S )Nr  )r  r  r  r  s     rX   r  z"CppTile2DKernel.transform_indexing  s0    ++%%' , 
 	
rW   r  r  )r  r  r  r  r  rZ  r;  r  r  r,  rp   r   r  r  r  r  r-  r  rn  ro  s   @rX   r  r    ss    < #I (.L
 6::x1 1UZZ 1,8:	$


 
uzz 
rW   r  _bodyc                 j   | j                   gt        | j                  j                               z   }d}d}|D ]  }|j                  j
                  D ]  }|j                  dk(  s|j                  dv r!|j                  dvrd}t        |d      r|j                  rt        j                  |j                  v sJ |j                  t        j                     }|j                  r|j                  t        vrd}|&||j                  k7  st        j                  d       |j                  }d}  ||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexrM  )r  r  r  r  r  Try  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr1  	subblocksrW  r  nodesoptargetr   ry  rA   rx  r   r   warningswarn)r6  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr>  ru  s          rX   get_loop_body_lowp_fprE    s$    ""#d5??+A+A+C&DDJ+/MI !	__** 	!Exx=(ELL = -  || $  !	uf%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW$+MMM 	9	!!> )##rW   c                   J     e Zd ZdZ fdZdeee   ee   f   fdZd Z	 xZ
S )TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                 "    t         |           y r  )r:  r;  r<  r?  s    rX   r;  zTilingSelect.__init__  s    rW   r   c           	        # t        |      }t        |      }|sJ t        d |D              rg g fS t        j                  }t        |d         d   ##rt        #fd|dd  D              r#}t        j                         j                  |      }| j                  |||      }|rt        |d       \  }}	t        |      t        |	      z   }
t        j                  j                  rVd }d	 }d
 }t!        t#        |
            D cg c]  }t%        t&        j(                  |       }}t#        |      }|d | ||d  }}i }i }|D ]n  }|j*                  gt-        |j.                  j1                               z   }|D ]4  }|j2                  j4                  D ]  }|j6                  dv r|j6                  dk(  rdnd}|j8                  j;                  ||f      |j<                  |   j<                  d      } |||      r4 |||||      }|j6                  dk(  r|n|dvr ||j6                  |       t?        |j6                  t@              s|j6                  jC                  d      r|j6                  dv r|j6                  |vrd||j6                  <   ||j6                  xx   dz  cc<    7 q tE        |j1                               }tE        |j1                               }d}d}||k\  s|dkD  r||z  |k\  rg g fS |	s9|r7t#        |      dk(  r)tG        ||d      g      s||d      |dz  k  r	|dk  rg g fS |tH        v rt        j                         j                  |      } |D ]  }!|!dk  r|!t#        |
      z   }!|!dk  s|!t#        |
      k\  r*tG        |
      retJ        j2                  jL                  jO                  |
|!   d      }"|"| k  sitJ        j2                  jL                  jQ                  |"|        | dz  } n|
|!   | k  s| dz  } n t#        |      dk(  r|g|fS t#        |      dk(  r||g|fS g g fS c c}w )Nc              3   ,   K   | ]  }|t         v  y wr  )rw   r'  r   s     rX   r)  z-TilingSelect.select_tiling.<locals>.<genexpr>  s     HEu//H   r   c              3   @   K   | ]  }t        |      d    k(    yw)r   N)rE  )r'  	loop_body_lowp_fp_dtypes     rX   r)  z-TilingSelect.select_tiling.<locals>.<genexpr>  s(      "
 #9-a0NB"
   r5   rl  c                     t        | d         S r   r   sizess    rX   r  z,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh- rW   rx  c                 L    ||d      }t        | ||      }|j                  r|S d S rc  )r  r  )r   rb  r.  r
  r  rh  s         rX   _try_get_stridez3TilingSelect.select_tiling.<locals>._try_get_stride  s4     '~a'89G0OF%+%5%56?4?rW   c                 2    | |vrd|| <   y || xx   dz  cc<   y r   rV   )	node_namenon_contig_indexing_op_counters     rX   _update_negative_op_countz=TilingSelect.select_tiling.<locals>._update_negative_op_count  s(     !(FFDE6yA6yAQFArW   c                     t        |      dk(  xr: t        |       dkD  xr* |d   dk\  r|d   n|d   t        |       z   t        |       k  S Nr5   r   rS  )rb  r
  s     rX   _is_valid_indicesz5TilingSelect.select_tiling.<locals>._is_valid_indices  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rW   )rM  r  r  rM  r   r   r5   masked_subblock)r2   r  rG  r9  gQ?#   r  
   r0  ))rC   rB   rg   r   ro   rE  r0  r   r  	nelements_select_tiling_indicesr`   r   r   r  enable_tiling_heuristicsrU  r   r.   r   r)  r:  r1  r;  rW  r  r<  r>  rS  indexing_from_argsr  r   rp   
startswithrd   r(   r   r4   r  r   r  )$r<  fn_listvar_sizes_listloop_bodies
all_dtypesr   r.  r
  r3  r4  r  rX  r\  r_  r,  rb  r  r  reduction_vars
op_counterr[  r6  rA  rD  r>  arg_idxr   rh  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_rangerP  s$                                      @rX   select_tilingzTilingSelect.select_tiling  s    %W-/<
zHZHHr6M.{1~>qAc "
(_"
 
 #E#002<<5<I44^]
 %($?&"E?  ,)??Kzz22@G" #3{#34 34;;B  #&e*-o._-. % .0
 BD.( BE"'"2"2!3d5??;Q;Q;S6T!TJ%/ B	%.__%:%: BE$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X~#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,< % 7 78I J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7BBB@ Z..01-0299;.* #'%'"-1CCQJ2V;N
 r6M (N+q0,!."34
 nQ/0=13DD r6M% *668BBBO%3 M$q((5K8H(H$q(MS=M,M '4%&WW%5%5%?%?'6 &@ &
 &3GG,,55j+N,71,<M!$]3kA(3q(8" >"a'%66>"a'%}5~EE2vQs   
!Qc           	         g }t        ||      D ]`  \  }}t        j                  |g| }|t        j                  |j
                  |j                        D cg c]  }|j                   c}z  }b t        t                  }	g }
t        t                  }t        t                  }|D ]  }|j                  D ]  }t        j                  d|j                        s$t        |||      }|dk(  r7|dk(  rO|	j                  t        |j                  dd               |
j!                  t        |j                  dd               t#        d |j                  D              r(|j                  t        |j                  dd               |j                  t        |j                  dd                
 |	|z
  |z
  }t%        |d       \  }}t'        |      t'        |      z   }t'        |	      dk(  r|dz
  gS |rt)        |      dd  S |	|z  |z
  }t)        |	      }t'        |      dk(  r|d   |v r|d   |dz
  k(  r|S t)        ||
j*                        dd  S c c}w )	Nz^d\d+$r   r5   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   SIZEr'  r  s     rX   r)  z6TilingSelect._select_tiling_indices.<locals>.<genexpr>  s     S!4995S   $&c                     t        | d         S r   rS  rT  s    rX   r  z5TilingSelect._select_tiling_indices.<locals>.<lambda>      s5QR8} rW   rV  r  r   )r  r
   extract_read_writes	itertoolschainreadswritesr   r   rn   r  r   r   r   r  r  r  r0  r`   r   sortedcount)r<  ri  rj  r.  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   rh  contig_onlyr3  r4  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rX   re  z#TilingSelect._select_tiling_indicesg  sV    	 .9 	UMB	11"AyAByrxx/ST#))TTI	U !o'",S/"3",S/"3 	CE)) CyyCHH5,UCGQ;q[OOC$56$++C,=>Sv?R?RSS+//CHHQRL0AB+//CHHQRL0ABC	C "$;;>UU!$^9T!U5zC$88{q  1$%%+&rs++11##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   I)r  r  r  r  r;  r   r1  rn   rw  re  rn  ro  s   @rX   rG  rG    s7    
i 
tCy$s)#	$	iV.KrW   rG  c                        e Zd ZU eZee   ed<   eZee   ed<   e	Z
ee	   ed<    fdZd ZdefdZdefd	Zd
 Zd Zd Zdee   fdZd Zd Zddee   fdZdeded   fdZ xZS )r;  
kernel_clsvec_kernel_clstile2d_kernel_clsc                     t         |   |j                  |j                  j                         || _        d | _        d | _        t        j                         | _
        g | _        y r  )r:  r;  r  wsr  rd  rV  r  r   r  picked_vec_isakernelsr<  rd  r?  s     rX   r;  zCppKernelProxy.__init__  sQ    **LOO,G,GH(2=2J2J2L(*rW   c                 `    |D ])  }t        |t              sJ t        j                  |       + y r  )r   r$   r:   propagate_scheduler_node)r<  r<  r>  s      rX   data_type_propagationz$CppKernelProxy.data_type_propagation  s1     	@Ee]33388?	@rW   scheduler_nodec                     t        |j                  t              syt        j                  |       t        |j                        d   d uxr t        |j                        d    S )NTr   r5   )r   r6  r   r:   r  rE  )r<  r  s     rX   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler  s\    ...944^D!."6"67:$F C).*>*>?BB	
rW   rO  c                     dt         j                  j                  fd}|j                  gt	        |j
                  j                               z   }|D ]  } ||j                          y )N	sub_graphc                 l   dt         j                  j                  dt        t         j                     fddt         j                  j                  dt        t         j                     fddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffddt         j                  j                  dt         j                  ffd}t        | j                        }g |D ]9  }|j                  d	v r |      xt        v rt        fd
|j                  D              rB|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t         xj"                  dz  c_        d d d        |j                  dk(  r |      xt        v r|j                  \  }}}}} ||      rt$        j&                  j)                  |      | j+                  |      5  | j                  d||f      |j-                  |       t         xj"                  dz  c_        d d d        u|j                  dk(  r|j                  \  }}}	}
|t        v st         j                  t         j.                  t         j0                  t         j2                  fv sJ |t        v rt         j                  nt         j                  |	|
f|_        |j                  dk(  r`|j                  d   t        v rK|j                  \  }}
t        fd|j                  D              rk||
t         j                  f|_        |j                  dk(  rq|j                  d   t        v r\|j                  \  }}t        fd|j                  D              rډj5                  |       ||t         j                  f|_        |j                  dk(  r%|j                  \  }}}|t        v rd |||      s[| j+                  |      5  | j                  d|||f      |j-                  |       t         xj"                  dz  c_        d d d        t        v st        fd|j                  D              r|j                  d   }| j                  |      5  | j                  d||t         j                  f      |j                  fd       t         xj"                  dz  c_        d d d        :< dt         j                  j6                  ffd} ||        y # 1 sw Y   pxY w# 1 sw Y   }xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr(  r   c                 Z   | j                   dk(  r,t        j                  j                  | j                  d         S | j                   dk(  r| j                  d   S | j                   dk(  rCt        | j                        dkD  r| j                  d   S | j                  j                  dd      S y)	z6Get input dtype for nodes that may consumes lowp fp dtr  r5   r  r  r  r   r   N)r>  r4   r  r  r  r   r  r  r(  s    rX   get_input_dtypez]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype  s    ;;')77,,TYYq\::[[$6699R=([[J.499~)#yy|+#{{{DAArW   c                 $   | j                   dk(  rFt        | j                        dk(  sJ t        j                  j                  | j                  d         S | j                   dv r| j                  d   S | j                   dk(  r| j                  d   S y)	z6Get output dtype for nodes that may produce lowp fp dtr  r   r5   )r  rG  rM  r  r  r   N)r>  r   r  r4   r  r  r  s    rX   get_output_dtypez^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype  sz    ;;&(tyy>Q...77,,TYYq\::[[$JJ99R=([[$6699Q<'rW   r  c                 .    |t         v sJ  |       |k(  S )z]Check if the given node produces output with expected low precision floating point data type.)r   )r(  r  r  s     rX   is_lowp_fp_sourcez_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source  s!    ]***'-33rW   c                 X    |t         v sJ  |       x}r||k(  S | j                  dk(  ryy)zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   r>  )r(  r  input_dtyper  s      rX   is_lowp_fp_sinkz]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sink  s>    ]***"1$"77;7&",,[[J. rW   c                 Z     |       xr t        fd| j                  D              S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c              3   0   K   | ]  } |        y wr  rV   r'  userr  r  s     rX   r)  z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>  s      ;26OD"-;   r0  users)r(  r  r  r  s    `rX   is_lowp_fp_source_no_promotezjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote  s1     )r2 s ;:>**; 8 rW   )r  rM  c              3   0   K   | ]  } |        y wr  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>       M?44Mr  r   r  r  c                     | uS r  rV   r,  to_type_nodes    rX   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>  s    A\4I rW   r5   r  r%  rG  r  c              3   0   K   | ]  } |        y wr  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>.  r  r  c              3   0   K   | ]  } |        y wr  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>4  r  r  r  c              3   0   K   | ]  } |        y wr  rV   )r'  r  r   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>\  s     Ue <Ur  c                     | uS r  rV   r  s    rX   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>d  s    A\<Q rW   r  c                 V    dt         j                  j                  ffd} ||        y )Nr  c                 B   dt         j                  j                  fd}| j                  D cg c]  }|j                  dk(  s| }}|D cg c]  } ||      s||j
                  i }}|D ]  }|j                         D ]q  \  }| j                  v st        fd|D              sv s.t        d |D              sAj                  d   }j                  |       | j                         s  | j                  | j                          y y c c}w c c}w )Nto_nodec                 :    t        d | j                  D              S )Nc              3   :   K   | ]  }|j                   d k(    yw)r  N)r>  r'  usrs     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>s  s     "U3::#;"Us   r  )r  s    rX   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_tor  s    ""Uw}}"UUUrW   r  c              3   \   K   | ]#  }|j                   d    j                   d    k(   % ywr  Nr  )r'  r  r(  s     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>~  s&     #ScCHHRLDIIbM$A#Ss   ),c              3   F   K   | ]  }|j                   d    t        v   ywr  )r  r   r  s     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s"      ,&JM(E,&r*  r  )r   fxNoder<  r>  r  r  r0  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	r  r  r(  all_to_nodesall_to_nodes_and_users
node_usersr  val_nodeto_lowp_fp_legalized_nodess	     `     rX   _eliminate_duplicate_to_nodezCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_nodek  s-   VUXX]] V *3$!%DKK:<U$L $ 8D./3{SWGXtzz*.* . '= ;
+5+;+;+= ;KD%#y6 ##SU#S S$(,F$F(+ ,&QV,& )&
 ,0+?+?+C $ : :8 D ) 4 4T :;;, !..6!( 79$.s   DDDD)r   r  Graph)r  r  r  s     rX   eliminate_to_dtypez`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtypej  s"    ')EHHNN ')R -Y7rW   )r   r  r  r   r   r1  r<  r>  r   r0  r  r  inserting_aftercall_methodro   r  r   cpp_to_dtype_countr4   r  r  inserting_beforereplace_input_withrJ  rK  r  r  r  )r  r  sub_graph_nodesr>  r2   r   r   	value_varr   r   r   r  r  r  r   r  r  r  r  r  r  s                @@@@@@@@rX   add_to_dtypezDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s    ehhmm  8M  
 uxx}} 
 %++9N 
 4 45;; 4
	!ehhmm 	! 	!588== ekk  #9??3O)+&( wLL$::/66=H MMM **Q-C"2259 8'0'<'<&c5%++-F (= ( 33(*I  22a728 8 LLG+.u55-G16.Cq)Q3IrB GG--d3E"33E: 8'0'<'<&c9e-D (= ( 00LI22a728 8 \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 \\Z/EJJrNm4S',zz$S%MMM "%uekk!:EJ\\Z/EJJrNm4S#(::LS!RMMM  /55e<"%q%++!6EJ\\%779>6S)UI !M1  <IyQ!*!;!;E!B @/8/D/D$.c9i5P 0E 0" !& 8 8L Q ' : :a ? :@ -  UUU"'**Q-C!*!:!:5!A @/8/D/D$.c5%++5N 0E 0" !& ; ;$02Q!" !( : :a ? :@ @ owr*8ehhnn *8X y)y8 8 8 8J@ @ @ @s4   ?AVAV%AV:AV)V	V	V&	)V3	)r   r  r  r:  r1  r;  rW  r  )r<  rO  r  rA  rD  s        rX   legalize_lowp_fp_dtype_loopbodyz.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  s]    Z	*EHHNN Z	*x  **+d93F3F3M3M3O.PP
# 	*I)	*rW   c                     t         fd|D              r|D ]  }|j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]n  }|j                  dv s|j                  sJ t        j                  |j                  v sJ |j                  t        j                     }|j                  t        v rnJ    y |D ]^  }t        |t              sJ t        |j                  t               sJ |j                  }|j#                         rN j%                  |       ` y )Nc              3   d   K   | ]'  }t        |t              xr j                  |       ) y wr  )r   r$   r  )r'  r>  r<  s     rX   r)  z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  s3      
 um,Q1J1J51QQ
   -0)r  r  )r0  r6  r:  r1  r;  rW  r  r<  r>  ry  rA   rx  r   r   r   r$   r   is_memory_copyr  )r<  r<  r>  rA  rD  fx_noderu  rS  s   `       rX   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype  sD    

 

  B#kk445KK))0029 
 ", BI#,??#8#8 B">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#ABB	B  	;Ee]333ekk8444"[[D&&(44T:	;rW   c           	         !"# t              t              k(  sJ | j                  !t        d       \   "| j                   "       !#fd} "fd# || j                        }t
        j                  xj                  |j                  z  c_        t
        j                  xj                  |j                  z  c_        t        j                  |      | _        | j                  r| j                  s6|g| _        | j                  dd        | j                  j!                  |        y t"        j$                  j&                  j)                  d      5  t+               }|j-                        \  }}t        |      t        |      k(  sJ d}t/        t1                    }	t3        d |	D              rd}d}
d }|r{d}|d	   }|d
z   }t        | j                  j4                        |kD  rM| j                  j4                  |   j6                  }| j                  j4                  |   j6                  }|xr | }
t        |      d
k(  rt8        xj:                  d
z  c_        | j                  j=                  |d	   |d	         } || j>                  |d	   |d	         }|j@                  |jB                  z
  }|jD                  d	|jB                  fi|_#        t&        jH                  jJ                  r|r || j>                  |d	   |d	   |      }n|}|jD                  g|_&        |jD                  |jB                  |j@                  fi|_#        ||g| _        |}nt        |      dk(  r|d
   t        | j                        d
z
  k(  r|d	   |d
   k(  sJ t8        xj:                  dz  c_        | j                  j=                  |d	   |d	         }d	|jB                  f|jB                  |j@                  fd}|j@                  |jB                  z
  }| j                  j=                  |d
   |d	         }d	|jB                  f|jB                  |j@                  fd}|j@                  |jB                  z
  } || jN                  |d	   |      }|jD                  |d   |jD                  |d   i|_#        g }t&        jH                  jJ                  rn|rldD ]f  \  }}|dk(  r|nd }|dk(  r|nd } || jN                  |d	   |||      }|jD                  ||   |jD                  ||   i|_#        |jQ                  |       h n || j>                  |d	   |d	         }|jD                  |d   |jD                  |d   i|_#        |jD                  g|_&        |jQ                  |       |jD                  |d   |jD                  d	|j@                  fi|_#        |jD                  |jD                  g|_&        |jQ                  |       |g|z   | _        |}n|g| _        | j                  |
|       | j                  j!                  |        d d d        y # 1 sw Y   y xY w)Nc                     t        | d         S r   rS  rT  s    rX   r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>  r~  rW   rV  c                      j                   | g| 5 }t        xj                  dz  c_         |       |cd d d        S # 1 sw Y   y xY wr   )
new_kernelr   generated_kernel_count)r3  r  rh  rd  runs      rX   codegen_kernelz8CppKernelProxy.codegen_functions.<locals>.codegen_kernel  sL    (((4t4  ..!3.F  s   #AAc           	      T   | j                        \  }}d}t        	      D ]u  \  }}|ft        t        j                              dffv r|rJ  |||       ;d}|dfk(  sJ d| d d        | j                         5   ||d       d d d        w y # 1 sw Y   xY w)NFrV   Tzunexpected group: rd  r   )r-  r  r   r  r  rq  )
rh  r  rm  	in_suffixr  r  ri  r3  r4  rj  s
         rX   r  z-CppKernelProxy.codegen_functions.<locals>.run  s    #)#4#4UO#L D.I!$Wn!= %IO,9??5/BCRH!   )(=t^, $I$)  V ,I;d5'OCTUV 
  //1 %4% %%% %s   	
BB'	Finplace_buffersTc              3   ,   K   | ]  }|t         v  y wr  )rx   rL  s     rX   r)  z3CppKernelProxy.codegen_functions.<locals>.<genexpr>  s     Su5 ::SrM  r   r5   )r!  r   maintailr  )r  )r  r  )r  r  r  ))r   rd  r`   r-  r  r4   r  removed_buffersinplaced_to_removerT  rb  rV  r  rb  r  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchrG  rw  rB   rC   rg   rP  r  r   generated_cpp_vec_kernel_counttiler  rH  
tiled_sizer   r  r  enable_loop_tail_vecr  r  r  )$r<  ri  rj  r  scalar_kerneltiling_selecttiling_factorsr
  could_masked_vecrl  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionrF  
vec_kernelrs  tail_kernel
outer_loopr`  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerh  r3  rd  r4  r  s$    ``                             @@@@rX   codegen_functionsz CppKernelProxy.codegen_functions  s7   7|s>2222((!$^9T!U/		%( 't7	=#@#@@	""m&F&FF"!6""$--)?DL,,UD9NN%%d+ __##))%)@ C	,(NM-:-H-H.*NN ~&#n*====#3N74KLJS
SS#( .3+K',$#1!#4 #3a#7 t~~++,/??+/>>+?+?(,"l ) ,0>>+?+?(,"l ) -I5I1I 4 >"a'66!;6~~**>!+<^TUEV*W+''):N1<M
 !II7	,0HHq$//6J+K
(::227G"0++&q)&q)!	#K #0K48HH:M0-1XX7S,T) *K8"^$)"1%T]]);a)??&q)^A->>? 66!;6!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I .**"1%"! NNL$8NNL$8/+ !::227G- 3( 07&/@Od ) 07&/@Od ) "0 22*1-*,," 'NNL,A&NNL,A0, $**62-30 "0++^A->q@Q"J #V(<"V(<0J, 2<0@J-&&z2"V(<"JOO(<3M/ 5?NNJNN3SM0&&}5 -<( -,,/ NN%%d+GC	, C	, C	,s   S#YYc                     |D ](  }| j                  |       t        j                  |       * | j                  ||       y r  )r  r:   propagate_loopbodyr  )r<  rk  rj  rS  s       rX   codegen_loop_bodiesz"CppKernelProxy.codegen_loop_bodiesr  s?     	9D0062248	9 	{N;rW   r<  c                    | j                  |       | j                  |       t        |      dk\  sJ d }|D cg c]  }t        j                  ||       }}t        t        j                  t              r2t        j                  j                  rd }|D cg c]
  } ||       }}|D cg c]  }|j                  d    }}| j                  ||       y c c}w c c}w c c}w )Nr5   c                     | j                          | j                          t        t        j                  t
              r | j                  | S | j                  |      S r  )decide_inplace_updatemark_runr   r4   rh  r1   r6  codegen)r(  
index_varss     rX   r  z(CppKernelProxy.codegen_nodes.<locals>.fn~  sF    &&(MMO!(($56!tzz:..||J//rW   c                 R    t         j                  j                  |       }| |_        |S r  )r4   rU  localize_functionoriginal_fn)r  
wrapped_fns     rX   wrap_fnz-CppKernelProxy.codegen_nodes.<locals>.wrap_fn  s+    33EE
 *,
&!!rW   )r  r  r   rW  partialr   r4   rU  rK   rV  r3  r  )r<  r<  r  r(  ri  r%  rj  s          rX   codegen_nodeszCppKernelProxy.codegen_nodesx  s    ##E*""5)5zQ	0 <AA49$$R.AA q--/AB&&44" .55rwr{5G549:D$**Q-::w7# B 6:s   CC-C"c                 >    | j                  | j                  ||       y r  )r`  rV  )r<  r   rK  s      rX   rc  zCppKernelProxy.codegen_loops  s    kBrW   c                 F    | j                   D ]  }|j                           y r  )r  r  r<  rh  s     rX   r  z4CppKernelProxy.update_stores_with_parallel_reduction  s!    ll 	;F88:	;rW   r   c                 (   |J d}| j                   D ]q  }t        j                         5 }|j                  ||      r@d}|j	                  |j                                |j                  |j                                d d d        s y # 1 sw Y   ~xY w)N
C10_LIKELYC10_UNLIKELY)r  r   r   r  r   r   r  r  )r<  r   	if_prefixrh  r   s        rX   r  zCppKernelProxy.gen_body  s     	ll 	3F%%' 35,,T9= .I''6KK 12	3 3	33 3s   ABB	inner_loop_reduction_outer_notr  	LoopLevelc                     d fd} j                   d   }|r|sJ  ||       nZ|j                           j                  j                  |j                          j                  j                  |j                          j
                  j                  |j
                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                          j                  j                  |j                         y)z
        CppKernel/CppVecKernel/CppTile2dKernel have reduction buffers themselves.
        Here, we decide how to aggregate them together and place new reduction buffers
        under CppKernelProxy.
        c           
         t        j                        dk\  sJ j                  d   }j                  d   }t        |j                        sJ t	        |      j
                  k(  r^|j                  |j                         |j                          j                  j                  |j                  |j                  z          n5|j                          j                  j                  |j                         t               }t        j                         5 }|j                  |d| j                        r:|j                  |j!                                |j                  |j"                         d d d        t        j                         5 }|j                  |d| j                        r$|j                  |j!                                t	        |      j
                  k(  r|j$                  }|D ]X  }| d| j                   dt'        | j(                         d}t+        |j,                  ||       t+        |j"                  ||       Z t/        |j,                         |j                  t1        |j"                  | j                  | j                   d	| j(                  | j2                               n|j                  |j"                         d d d        |_        y # 1 sw Y   qxY w# 1 sw Y   |_        y xY w)
Nr   r   r  r,  r-  r   z_tail - r  r  )r   r  r   r  r%  r  r  r.  r  r  r7   r   r   r  r   r   r   r  r  rE   r  r   r  r   r   rH  )	r  main_loop_kerneltail_loop_kernel
suffix_bufr   rm  r   r   r<  s	           rX   !aggregate_reduction_prefix_suffixzUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix  s   t||$)))#||A#||B/.0C0CDDD $%8 !::$22 !::<%%,,$55&778
 !::<%%,,-=-N-NO &J%%' I5#66jnn ''
(9(9(;<%%&6&G&GHI %%' M5#66
 ''
(9(9(;<,-@)9)M)M$2 D*.uZ^^4DH[YcYnYnMoLppq'rH,-=-D-DdHU, 0 A A4 55E5L5LM"))6 0 A A *#->>"2% 8 * 5 5 * #))*:*K*KL5M6 %/D!CI IM6 %/D!s   AK"EK/"K,/K?r   N)r  r0  )r  r  r  r  r  r  r  r  r  r  r  )r<  r/  r  r6  main_kernels   `    rX   r  z*CppKernelProxy.aggregate_reduction_buffers  s   9	/v ll1o):-j9113!!(()E)EF!!(()E)EF&&--k.S.ST&&--k.S.ST!!(()I)IJ##**;+M+MN**1155	
 	**1155	
rW   r  )r  r  r  r  r  r%  r  r  r  r  r  r;  r  r$   r  r   r  r  r  r  r1  r'  rc  r  r   r7   r  rl   r  rn  ro  s   @rX   r;  r;    s     #,JY+)5ND&5/>tO,>+@
= 
_* _*B;<v,p<84#6 8BC;3Xl3 3U
.2U
@H@UU
rW   r;  c                   $     e Zd Z fdZd Z xZS )re  c                 p    t         |   |j                  |j                  j                         g | _        y r  )r:  r;  r  r  r  rg  r  s     rX   r;  zOuterLoopFusedKernel.__init__  s)    **LOO,G,GH%'
rW   c           
         g }| j                   D cg c]  }|j                          }}|D ]h  }|j                  }|J |j                  |j	                  t        t        |      |j                  z
  |j                        |      j                         j t        t        |j                  t        |            |j                        S c c}w )Nrj  )rg  r:  r  r  rQ  r  r   r  r  r_   r`   )r<  rR  rJ  kernels_parallel_depthrV  nested_kernelsrh  r  s           rX   rQ  z*OuterLoopFusedKernel.decide_parallel_depth  s    !#48JJ+
'0I  "+
 +
 % 	F !,,K***")),,!,/A/M/MM$6$B$B	  !.
	  "1137M3N +66	
 	
)+
s   C)r  r  r  r;  rQ  rn  ro  s   @rX   re  re    s    (
rW   re  c                       e Zd ZdZdZdZy)ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrV   rW   rX   r>  r>  %  s    )1%E"rW   r>  c                       e Zd ZU eZee   ed<   dZ ee	j                  e	j                  g      Zedej                  dee	   fd       Z fdZdefdZd	 Zd
 Zd Zdee   fdZd Zd Zd ZdededefdZd Zd Z d Z!d Z"de#e$   fdZ%de&fdZ'de(e&e)e$f   fdZ*dedefdZ+dede,e   de,e   fd Z-d! Z.d" Z/d# Z0d&d$Z1d% Z2 xZ3S )'CppSchedulingkernel_proxy_clsi  devicer   c                     | j                   S r  )backend_features)r3  rH  s     rX   get_backend_featuresz"CppScheduling.get_backend_features;  s    ###rW   c                 V    t         |   |       |r| j                          d| _        y r  )r:  r;  reset_kernel_group_ready_to_flush)r<  r/  r?  s     rX   r;  zCppScheduling.__init__?  s'    ###%$rW   statusc                     || _         y r  rN  )r<  rO  s     rX   _set_flush_statuszCppScheduling._set_flush_statusE  s
    %rW   c                 &    t        d |D              S )Nc              3      K   | ];  }t        t        t        j                  j                  j
                  |             = y wr  )r   r  r4   r  r  r   r{  s     rX   r)  z)CppScheduling.group_fn.<locals>.<genexpr>I  s,     M!U3qww//88!<=Ms   AA)r   )r<  rU  s     rX   group_fnzCppScheduling.group_fnH  s    MuMMMrW   c                 "    t               | _        y r  )KernelGrouprd  rA  s    rX   rM  z CppScheduling.reset_kernel_groupK  s    'MrW   c                    |j                         s|j                         rt        j                  ||      S |j                         r(|j                         rJ t	        j                  ||      S | j                  ||      t        j                  k(  rt        |t        t        f      sJ t        |t        t        f      sJ |j                  \  }\  }}|j                  \  }\  }}|dk(  r|dk(  s	J ||f       fdt        |      t        |      k  r|n|}t        |t              sJ t        |      t        |      k  r|n|}	 |	      }
|j                  |
       |j                  \  }\  }}|j                  \  }\  }}||k(  rt	        j                  ||      S  |      }t        |	t              r|	j                  |       ngt        |	t              sJ |	j                  D ]&  }t        |t              sJ |j                  |       ( t	        |	j                  |	j                        }	|j                  \  }\  }}|j                  \  }\  }}||k(  s	J ||f       t	        j                  ||      S | j                  ||      r't         j                  ||| j#                  ||            S t	        j                  ||      S )NrV   c                 D   t        | t              rt        | j                        dkD  sJ | j                         d }t	        t
                  }| j                  D ];  } 	|      \  }}||}||k(  sJ ||| j                  f       |j                  |       = |t        |      fS t        | t              sJ | j                  }t        |t        j                        sJ |j                         \  }}}|j                  t        |j                  j                               fS rc  )r   r"   r   snodesr   r   updater1  r$   r(  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsrW  )
r(  r  r^  snodevexprscomp_bufferr   rS  get_indexing_ranges_exprss
            rX   rc  z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs`  s   !$(:;"4;;/!3@T[[@3%)
)3C):%)[[ 9E'@'GHAu)1-.
#-?PZDKK4PP?*11%89  *4+???)$>>>&*ii)+r7H7HIII%0%G%G%I
4#T5H5H5O5O5Q0RRRrW   )extra_indexing_constraints)
is_foreachr!   r5  is_templater"   _why_fuse_nodesr>  rD  r   r$   r3  r   recompute_size_and_bodyrZ  r/  can_fuse_vertical_outer_loopr   _get_outer_loop_fusion_depth)r<  r!  r"  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsr_  rc  s                @rX   r5  zCppScheduling.fuseN  s   !1!1!3-225%@@ ((***%**5%88 $$UE2#BBC "%-9K)LMMM!%-9K)LMMM&+kk##E7&+kk##E7"}BJ'8JJ6S& +.e*s5z*Au!.-@@@$'JU$;5+DX+N(66/G 7  !&:E1 %:E1E>-225%@@ 7P"73 h6443V 5  &h0BCCC!) )%???557Z 6 
  2(2D2DhooVH %:E1 %:E1~5u~5~)..ue<<225%@2775$"C"CE5"Q  *..ue<<rW   c                     |j                   \  }\  }}|j                   \  }\  }}||k(  r||k(  rt        j                  S |dk(  r|||z   k(  rt        j                  S | j	                  ||      rt        j
                  S y )NrV   )r3  r>  rB  rC  &_can_fuse_nodes_with_compatible_rangesrD  )r<  r!  r"  r   rk  rl  rm  rn  s           rX   rg  zCppScheduling._why_fuse_nodes  s    #kkE7#kkE7E>g0#444b=Uego5#88866ueD#BBBrW   c                 <   |j                   \  }\  }}|j                   \  }\  }}|dk(  xr |dk(  }t        j                  |      t        j                  |      k(  }	t        |      dk(  xs t        |      dk(  }
|r|	r|
syt        |      t        |      k  r|n|}t        |      t        |      k  r|n|}t	        |t
              ryt	        |t              sJ t	        |j                  t        j                        ryt	        |j                  t        j                        sJ |j                  j                  j                         }d }t	        |t
              rt        t        t        df             }|j                   D ]  }t	        |j                  t        j                        r ndt	        |j                  t        j                        sJ |j#                  t        |j                  j                  j                                       t        |      dk7  ryt%        t'        t)        |                  }n\t	        |t              sJ t	        |j                  t        j                        sJ |j                  j                  j                         }||k7  ryy)NrV   r5   F.T)r3  r  re   r   r   r"   r$   r(  r   TemplateBufferr\  dataget_sizer   r   r   rZ  r  r1  nextiter)r<  r!  r"  r   rk  rl  rm  rn  c1c2c3ro  rp  ranges2ranges1
ranges_setr_  s                    rX   rt  z4CppScheduling._can_fuse_nodes_with_compatible_ranges  s    $kkE7#kkE7],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&89 .-888n))2+<+<=.--r/@/@AAA !%%**335h 23#E#s(O46J! Bejj"*;*;<!%**b.?.?@@@uUZZ__%=%=%?@A	B :!#4Z 012Gh666hmmR->->???mm((113GgrW   c                     t        |t        t        f      sJ t        |t        t        f      sJ t        d ||fD              ry| j	                  ||      d uS )Nc              3   <   K   | ]  }t        |t                y wr  )r   r   r&  s     rX   r)  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>  s      
>BJt89
s   F)r   r"   r$   rg   rg  r<  r!  r"  s      rX   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_impl  sd    %"4m!DEEE%"4m!DEEE 
GLen
 
 ##E51==rW   c                    |j                         s|j                         ryt        |j                               t        |j                               z   t        j                  j
                  kD  ry| j                  ||      S r  )rf  r   r9  r   r  max_horizontal_fusion_sizer  r  s      rX   can_fuse_horizontalz!CppScheduling.can_fuse_horizontal  sf    %"3"3"5!"S):%;;jj334 --eU;;rW   r!  r"  c                 p   |j                         x}rt        |j                  t        j                        xr~ t        |j
                  t        j                        xrX t        |j
                  j                        dk(  xr4 |j
                  j                  d   j                         |j                  k(  S y)Nr5   r   F)get_template_noder   layoutr   MultiOutputLayoutr(  MultiOutputr   inputsrY  r   )r<  r!  r"  template_bufs       rX   can_fuse_multi_outputs_templatez-CppScheduling.can_fuse_multi_outputs_template  s     !2244<4<..0D0DE Iuzz2>>:I

))*a/I JJ%%a(113|7H7HH	 rW   c                    d}t        d ||fD              s|S t        |t              r|j                         d   n|}t        |t        t
        f      sJ t        |t              r|j                         d   n|}t        |t        t
        f      sJ |j                  \  }\  }}|j                  \  }\  }	}
|dk(  r|	dk(  r|dk7  r|
dk7  r|S t        d ||fD              r'|j                  |j                  k(  r|j                  S |S t        t        |      t        |	            }|dk\  rI|d | |	d | k(  r>t        d ||fD              r(t        |      t        u r|n|}|j                  |k(  r|S |S |S |S )Nr   c              3   T   K   | ]   }t        |      t        t        t        fv  " y wr  )r%  r   r"   r$   r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s/      
  J+-?OP
r*  r  rV   c              3   >   K   | ]  }t        |      t        u   y wr  r,  r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>   r-  r.  r5   c              3   >   K   | ]  }t        |      t        u   y wr  r,  r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>+  s      >BT
99r.  )r0  r   r   r2  r"   r$   r3  r4  r_   r   rg   r%  )r<  r!  r"  DISABLE_OUTER_LOOP_FUSION_node1_node2r   rk  rl  rm  rn  r4  _compare_nodes                rX   rj  z*CppScheduling._get_outer_loop_fusion_depth  s   $%! 
 
 

 -, %!<= !!#B' 	
 &#5}"EFFF %!<= !!#A& 	
 &#5}"EFFF$llE7$llE7B;5B;7b=W],,TeU^TT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLen  "%[,GGEU  !88<SS2244 /.((rW   c                    |j                          xro |j                          xr\ |j                         |j                  z  xr= | j                  ||      xr |j	                           xr | j                  ||      dk\  S r   )rf  get_operation_names	ancestorsr  r  rj  r  s      rX   ri  z*CppScheduling.can_fuse_vertical_outer_loop;  s    !!## E%%''E))+eoo=E ..ue< -**,,E 11%?1D		
rW   c                 *    | j                  ||      ryyr^  )ri  r  s      rX   get_fusion_pair_priorityz&CppScheduling.get_fusion_pair_priorityG  s    ,,UE:rW   c                     |j                         ry|j                         r%t        ||g      \  }}|j                          xr |S | j                  ||      xr |j                          xs | j	                  ||      S r  )rf  rN   r  r  ri  )r<  r!  r"  template_fusion_supportedr   s        rX   can_fuse_verticalzCppScheduling.can_fuse_verticalN  s    +Sw,(%q ))++I0II**5%8UASASAU=U=..ue<	=rW   r<  c                    t        d |D              r|S ddd}d}d}d}|D ]  }t        |j                  t        j                        sJ |j                  j                         \  }}}|j                  j                         D ]0  \  }	t        |	t        j                        s"|	j                  t              D ]  t        fd|j                  D              r|k7  r}|dz  }|dkD  r|c c c S t        j                  d   t        j                  j                  j                         svj                  d   |j                  v st#        fd|j                  j                         D              sÉj                  d   dkD  s։j                  d   j                  d   d	}|} 3  |s|S dfd
}
|D ]  }||k(  s	|j%                  |
        |D ]  }||k7  s	|j%                  |
        |S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c              3      K   | ]X  }t        |j                  d    d          dk7  xs4 t        d |j                  j                  j                         D               Z yw)r5   r   c              3   F   K   | ]  }|j                  t                y wr  )r   r   )r'  r  s     rX   r)  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>p  s      .2)r*  N)r   r3  rg   r6  r^  rW  r&  s     rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>n  sc      

 	 

1a !Q&  6:jj6O6O6V6V6X 
s   AA Nr   Fc              3   @   K   | ]  }j                  |        y wr  )r   )r'  r   div_exprs     rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>  s     Q#HLL-QrQ  r5   c              3   d   K   | ]'  \  }}|k7  rt        |j                  d          dv  ) yw)r   r`  N)r  r  )r'  name_expr_r  r   s      rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>  s9        ,u$} 0x}}Q7GHFR r  r  Tc                    | \  }}|\  }}|j                        }|j                         }||   z  ||<   |j                  |dz          t        j                  ||d      \  \  }	}
}|	j                         }|j                  |dz         }||   z  |z   ||<   t        j                  |||g||	|      }s/|j                  t        |j                  j                               f||f||	|ffS )Nr5   r  )r@  )r   copyinsertr
   index_vars_no_squeezepopr   r   r  r1  r^  rW  )rU  rS  r  
index_sizereduce_sizer   reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_varrd  split_number	split_vars                 rX   
loop_splitz0CppScheduling.try_loop_split.<locals>.loop_split  s   &+#J&*#J"((3I'__.N(29(=(MN9%!!)a->.:.P.PC/+^Q '++-I#--	A6K#/)I2F#F#TIi ;;y+.
NKD .OO,,3356.*
  -- rW   )recompute_sizes_body_func)rd  r  )rg   r   r(  r   r\  r]  r^  r  r   r  findr   r  r  corenumbersr\  r0  rh  )r<  r<  num_div	div_expr_	match_divmatched_noder(  r   original_bodyr  r  r  rd  r   r  r  s              @@@@@rX   try_loop_splitzCppScheduling.try_loop_split[  s   &  

 
 
 L			 	,Ddii):):;;;"&))"B"B"DA}a+::@@B ,
d!$

3 $		( 3 ,HQ9P9PQQ$	1$,	1{$"8==#3UZZ5G5G5O5OP$MM!,0G0GG ,  0=0L0L0R0R0T  
 %MM!,q0$,MM!$4	'/}}Q'7$(	'+/,,	,@ L%)"	:  	SD|#,,z,R	S  	D|#,,/I.8 - 	 rW   r(  c                 X   	  j                   t        j                  }g g 	t        |t              sJ dt        f	 fd} ||      s|t        _        j                          	j                          t        j                  j                  j                  d      5  |j                         D ]^  }t        |t        t        f      sJ |j                         } j                        }|j                  |       j!                  ||       ` 	 ddd       yy# 1 sw Y   yxY w)a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r(  c           	          t         t              sJ j                          j                          dt        fdg }i t	         fd j                         D              rt                j                         D ]  t        t              sJ j                  j                                j                         st        j                               dk7  rbj                         d   t	         fdj                  D              sj                  }t        |t         j"                        sJ |j%                         } j&                  t                     z
  }fd}|j)                         r |       s	g }|j*                  d   }       |d	 }t-        |      D ]  }	|j/                  d|       ||	z  } t!        j0                  |j2                  |j4                  ||      }
fd
}d} ||
|      }|sEt!        j6                  | dt        |       |
      }|j9                  |       g |j:                  <   |j:                     j9                  |        t=        j>                        5 }t        |      dkD  r4|D ]/  }|j:                  J |jA                  ||j:                            1  j                         D ]z  }t        |tB        t        f      sJ jE                        }|jG                  |j                                j9                  |       j9                  |j                                |  jI                   j&                        sD|jJ                  D ]+  }tL        jN                  jJ                  jQ                  |       - 	 d	d	d	       ytR        jT                  j9                  tS        jV                  t              t        |jX                                      j[                        }j]                  |g t^        j`                  jc                               d	d	d	       y# 1 sw Y   yxY w)zN
            Codegen code with fused outer loop and local Buffer.
            r(  c                     t        | t        t        f      sJ | j                         }t	        |d       j
                  \  }\  }}t        |      t        |      z   }|S )Nc                 4    t        | j                               S r  )rn   r  r  s    rX   r  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>  s    Q^^-=)> rW   rV  )r   r$   r"   r9  r`   r3  r   )r(  r<  r   r3  r4  r  s         rX   get_call_rangeszlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  s`    !$8J(KLLL-1^^-=.1>/% ,+E? $ElU?-CC""rW   c              3   `   K   | ]%  }t         |            j                  d z   k(   ' yw)r5   N)r   r4  )r'  r>  r  r(  s     rX   r)  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s3       OE*+t/K/Ka/OOs   +.r5   r   c              3   V   K   | ]   }|j                   j                         v  " y wr  )r(  r9  )r'  r  r(  s     rX   r)  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s&      :>		T^^%55   &)c                  B   dd} t        j                  j                  j                               D ]  \  }}| |z  z  | |z  }  j                  j	                  j                               }fd |      xr t        fdj                  D              S )Nr   r5   c                     | k(  S r  rV   )r  contiguous_index_exprs    rX   is_contiguous_indexzCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArW   c              3      K   | ]Y  }t        |j                  t              xr9  |j                  j                  j	                  j                                      [ y wr  )r   r(  r$   r6  get_read_exprrY  )r'  r  r  scheduler_buffers     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s\      Q %) !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!"Qs   AA")r2  r6  r  r  get_write_exprrY  r0  r  )rh  r   rU  write_index_exprr  r  r  r  s       @@rX   is_all_write_read_contiguouszyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s    451%&F.6 . 4 4 ? ? E E G/ 0
U !6# E 5 &%	0
 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rW   r  Nc                 ~    |D ]7  }| |j                   k(  st        fd|j                     D              s5|c S  y )Nc              3      K   | ]]  }|j                   Ot        fdt        j                  j                  j
                  |j                      j                  D               _ y w)Nc              3   V   K   | ]   }|j                   j                         v  " y wr  )r(  rY  )r'  r  visited_scheduler_nodess     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>?  s,      (&,0 )-		(:(:(<@W(W(&r  )r   r0  r4   r  r/  name_to_bufr  )r'  global_bufferr  s     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>>  s`      S" )6 (5'9'9'E %( (&45GG4E4E4Q4Q,9,>,>5**/%	(& %&S"s   A#A&)r  r0  r   )local_buffer_layoutrV  	local_buflocal_to_global_buffersr  s      rX   try_share_local_bufferzsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer<  sS    -: 5	#6):J:J#Js S" :Q(1:&S" P" ,5$45 $(rW   local_buffer_datar   )r   r  F)local_buffer_numberT)2r   r   clearr   r0  r2  r   r9  r$   r  rY  r  r   get_outputsr  r(  r   r\  rX  r4  is_contiguousrh  r2  r  FixedLayoutrH  r   Bufferr  r   rK   r  add_local_bufferr"   rG  r'  ra  r  r4   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountrV  rl  finalize_kernelr  r  from_iterable)r(  rV  r  global_buffer_layoutsize_offsetr  local_buffer_striderh  local_buffer_sizeszr  r  local_buf_prefixlocal_buffer_usedscoperZ  r>  r_  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  r  r  r]  rd  
nodes_listr<  s   `                   @@@@@rX   $try_outer_loop_fusion_with_local_bufzSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  sO    d$?@@@!'')#&7 # .0MBD# !113  <F<'&*nn&6 cN%nmDDD+//0G0G0IJ&335~99;<A '5'A'A'CA'F$ BRBXBX  )9(=(=)-9J9JKKK/</G/G/I,&*&B&BS+N;F '4 1>>@ < >$9;+!5!<!<R!@,;N,K'L-) #++<"= )B/66q&A"bLF) /1nn077066-/	/+(" ,?(,B/-)  102		(8'93};M:N%O':1- *001BCNP34E4J4JK/0A0F0FGNN)CcJ $L$5$56 "%}%)(5 +00<<<..(*A,BSBS*T
 "113 9E%e.@--PQQQ'+'<'<\'J$$225??3DE)001AB%%eoo&789 >>)4+G+G +0*?*? G //66~FG !+" ", 99@@2212,/0C0C,D 150O0O)1- ,,1@ioo33J?@?"H I"H s   D'Q<BQQ Fr  N)rd  r   r  r   r   r  r   r  r   r  r2  r"   r$   r9  rG  r'  r  )
r<  r(  r  r  r>  _nodesr_  r]  rd  r  s
   `      @@@rX   codegen_outer_loop_nodez%CppScheduling.codegen_outer_loop_node  s%    (()0)O)O&=?02
$ ;<<<f	7R f	 f	P 4D95SG2!'') ''--e-D K!113 KE%e.@--PQQQ27//2CF'+'<'<\'J$$226: 001A6JKK K :K Ks   #A2D  D)c                 t   | j                   }t        |t              r| j                  |       nU|j	                         }| j                  |      }| j                  |      }|j                  |       |j                  ||       | j                         }|t        j                  kD  r| j                  d       yy)zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)rd  r   r   r  r9  r  rG  r'  r  _get_scheduled_num_argsrF  MAX_FUSED_KERNEL_ARGS_NUMrR  )r<  r(  rd  r<  r_  args_nums         rX   codegen_nodezCppScheduling.codegen_node  s     ((d78((.)-)9E''.E#44\B**51(()95A//1m===""4( >rW   c                 n    t        |t              xr$ t        |j                  t        j                        S r  )r   r$   r(  r   CppTemplateBuffer)r<  r(  s     rX   is_cpp_templatezCppScheduling.is_cpp_template  s,    $. 
:IIr++4
 	
rW   template_nodeepilogue_nodesprologue_nodesc                    |rJ |D cg c]  }t        |t        t        f      r| }}t        d   dxx   dz  cc<   t        d   dxx   t	        |      z  cc<   | j                  |      sJ d       t        t        |      }|j                  \  }\  }}|dk(  sJ t        t        j                  |j                        }|D cg c]  }|j                   }	}t        d |	D              sJ d       d	 }
 |
||j                  |	      }|j                  |||	
      \  }}|5  t        |j                        s|j                          |D ]  }|j                            |       }ddd       t!        j"                  |      5  |g|}| j%                  ||j&                        }ddd       t        |j                        rt	        |j(                        dk(  sJ d       |j(                  d   j*                  D ]r  }t        |j                  t,              sJ d       t        |j                  j                  t        j.                        sJ d       |j                  j                          t |j1                  |       t         j2                  xj4                  |j4                  z  c_        | j7                          yc c}w c c}w # 1 sw Y   fxY w# 1 sw Y   3xY w)zG
        Codegen a CPP template, possibly with fused epilogues
        inductorcpp_templated_kernel_counterr5   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrV   c              3   P   K   | ]  }t        |t        j                           y wr  )r   r   r\  )r'  r,  s     rX   r)  z1CppScheduling.codegen_template.<locals>.<genexpr>  s     O:a!2!23Or|  z9Epilogue nodes must all be instances of ir.ComputedBufferc                     sy| j                         |v sJ || j                            j                  }t        fd|D               S )NFc              3      K   | ]8  }t        |j                  t              xr |j                  j                  v  : y wr  )r   r(  r   )r'  r  r  s     rX   r)  zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  sA        499&78 5IINNn45s   >A)rY  r  r0  )template_bufferoutputs_by_namer  r  s     ` rX   template_buffer_has_other_userszGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  s\     ""++-@@@#O$<$<$>?EEE  "   rW   )$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r$   r"   r   r   r   r   r3  r   r  r(  r0  r  make_kernel_renderr)   r  r4   set_kernel_handlerdefine_kernelr  outputsr  r    r  call_kernelr  r  free_buffers_in_scheduler)r<  r  r  r  epilogue_noder   rnumelctbr,  epilogue_ir_nodesr  r  rh  renderr(  src_codenode_schedulekernel_namer  s                      rX   codegen_templatezCppScheduling.codegen_template  s    "!!
 "0
--9K)LM 
 
 	;<A<:;s>?RR;##M2 	
z	
2 ]M:&,,;Av||$()=)=}?Q?Q$R*;
AFF;
 ;
 O=NOO 	
G	
O	 0O..0A0
, //1U, 0 

  	 ,]-?-?@&&(&   xH	  !!&) 	S*<^<M,,X}fkkRK	S %]%7%78 },,-2 e2 &--a066 %!$))-FG UG "$)).."..A MA 		""$% 	;,	6#9#99&&(S
 ;
:	  	 	S 	Ss$   K K"AK
#K
KK!c                 6    | j                   j                         S r  )rd  get_num_argsrA  s    rX   r  z%CppScheduling._get_scheduled_num_args  s      --//rW   c                     | j                   S r  rQ  rA  s    rX   ready_to_flushzCppScheduling.ready_to_flush	  s    ###rW   c                      y r  rV   rA  s    rX   codegen_synczCppScheduling.codegen_sync  s    rW   c                 $   t         j                  j                  }t        j                  j
                  r$t        |t        j                  j
                        nd}dj                  d||j                         g      }t         j                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                        |      }|j                  dd      }|j                  d      }|j!                  d|      }	t"        r|j!                  d|	d	z         }	|||	d	z     d
}
t%               }|| j&                  j(                  n|}|j+                         \  }}}t         j                  j                  s|j-                  d|d       |j/                  |d       t         j                  j                  s|j-                  d       |j1                  ||j3                         d|
       |S )NrS   r   r  rh  z#pragma CMTz//z
extern "C"r   r5   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r4   r  wrapper_coder   r  descriptive_namesr'   r  next_kernel_suffixcpp_wrapperr  rp   r,   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  rU   r=   rd  r  cpp_argdefsr   r  r  getvalue)r<  r  r<  kernel_argsr  
fused_namer  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rX   r  zCppScheduling.define_kernel  s   ''&& zz++ "%)E)EF 	
 hhz73M3M3OPQ*+''*=*=;8##C(?(?$@BRS##C(D(D$E{S ##M48 ^^L1
MM#z2	 c9q=9I'
Y]CDCH(*)4)<t  %%+**,1iww""%%(DYMQV&WXxt4ww""%%f-$$&,	 	 	
 rW   c                    | j                   j                         }|r| j                  || j                   j                        }d }t        j
                  j                  dk7  r t        | j                   j                  |      }| j                   j                  t        j                  j                  ||       | j                          | j                  d       y )Nr   )debug_handleF)rd  codegen_groupr  scheduled_nodesr   traceprovenance_tracking_levelr   r  r4   r  r(  rM  rR  )r<  r  r  r:  s       rX   flushzCppScheduling.flush7  s    $$224,,$++;;K +/L||55:F%%55{  ))$$k *  	!u%rW   r  )4r  r  r  r;  rG  r%  r  r  r   r6   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTrJ  rm  r   rH  rK  r;  rl   rR  rU  rM  r5  r   r>  rg  rt  r  r  r   r  rj  ri  r  r  r1  r$   r  r   r  r	   r"   r  r   r   r  r  r!  r#  r  r?  rn  ro  s   @rX   rF  rF  +  s    .<d>*; !$!**33	
 $%,, $:n;U $ $%& &N*P=dx8H/I 6p>	<
&
/@
	
4)l

=oD$7 obFK)FKP)/1C]RS),
$5 
$ 

U)(U) !!23U) !!23	U)n0$&P&rW   rF  c                   R     e Zd Z fdZd Zd Zd Zd	defdZd	de	e
   fdZ xZS )
rW  c                    t         |           t               | _        t	               | _        t        | j
                        | _        t        j                         | _
        | j                  j                  | j                         g | _        y r  )r:  r;  r?   r  r7   
loops_codeWorkSharingr  r   r   r   r   r<  rI  s    rX   r;  zKernelGroup.__init__K  s^    L	&.doo.))+


  )!rW   c                 :     || j                   t               g| S r  )r  r+   )r<  r3  r  s      rX   r  zKernelGroup.new_kernelT  s    49924<t<<rW   c                     | xj                   |z  c_         | j                  }| j                  }|j                  ||       y r  )r<  rD  r  rc  )r<  r  r<  r   r  s        rX   r  zKernelGroup.finalize_kernelW  s5    %WW  r*rW   c                 X    | j                   j                         \  }}}t        |      }|S r  )r  r/  r   )r<  arg_defs
_call_args
_arg_typesr  s        rX   r  zKernelGroup.get_num_args]  s)    +/99+@+@+B(*jx=rW   r   c                 X   | j                   j                          | j                  syt               }t        j
                  j                  xr t        j                  dv }|r|j                  dg       |j                  d       |t        t        j                        n|}|t        t        j                        n|}| j                  j!                         \  }}}dj#                  d      j%                  |      }t'               }t        j
                  j(                  rdnd}	|j                  d| d	|	 d
| d| d	       |j+                         5  |rNt,        j.                  j0                  }
|
dt        |
      z   dz   nd}|j                  d||z    d||z    dg       | j                  j3                         D ]  \  }}|j                  d| d| d        |j5                  | j6                         d d d        |j9                         S # 1 sw Y   |j9                         S xY w)NrS   )linuxrQ   z3#include <torch/csrc/inductor/aoti_runtime/utils.h>z+#include <torch/csrc/inductor/cpp_prefix.h>z,
   C10_ALWAYS_INLINE_ATTRIBUTEzextern "C" z void r   r  r   graph_r   z9torch::aot_inductor::RAIIAtenRecordFunctionHandle record_z_("z", nullptr);r   r   r   )r   rD  r<  r7   r   r  enable_kernel_profilesysplatformr   r   rp   r,   r,  r-  r  r/  ljustr  rY   force_inline_kernelr   r4   r  graph_idaliasesr  rD  r0  )r<  r   r   rQ  r3  r  rI  r   func_export_declinline_attrrV  r@  oldnews                 rX   r;  zKernelGroup.codegen_groupb  s   

##~ !'

 @ @ !
S\\ V
 F
 !OORSTDE <@<3{667T;?<c+667T..0!Q;;r?''113-3ZZ-K-K)QS 	 	*+6+a@P?QQRS[R\\]^	

 [[] 	)$77++;C;OCM1C7UW&&,{&:%;3v?S>TT`b !II--/ 7Sse3se1567KK(	) }}	) }}s   B&HH)r:  c                 l    | j                   j                         \  }}}|j                  ||d||       y )NF)tritonr8  r:  )r  r/  generate_kernel_call)r<  r  r  r:  r   	call_argsr8  s          rX   r  zKernelGroup.call_kernel  s=    "&))"7"7"99i$$% 	% 	
rW   r  )r  r  r  r;  r  r  r  rp   r;  r   rn   r  rn  ro  s   @rX   rW  rW  J  s4    "=+
,# ,\
hsm 
rW   rW  c                   0    e Zd Zd Zd Zd Zd Zd Zd Zy)rE  c                 `    || _         d| _        d | _        t        j                         | _        y r  )r   in_parallelr  r   r   r   )r<  r   s     rX   r;  zWorkSharing.__init__  s)    	 ))+
rW   c                    | j                   r|| j                  k7  r| j                          | j                   s|| _        d| _         t        j                  j
                  r| j                  j                  d       n| j                  j                  d| d       | j                  j                  | j                  j                                | j                  j                  d       y y )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)rb  r  rD  r   r  r  r   r   r   r   r   )r<  rJ  s     rX   r6  zWorkSharing.parallel  s    4+;+; ;JJL&D#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1  rW   c                 h    | j                   r| j                  j                  d       | j                   S )Nz#pragma omp single)rb  r   r   rA  s    rX   rT  zWorkSharing.single  s*    II 45rW   c                 F    | j                   j                          d| _        y r  )r   rD  rb  rA  s    rX   rD  zWorkSharing.close  s    

 rW   c                 :    | j                   j                          | S r  )r   r{  rA  s    rX   r{  zWorkSharing.__enter__  s    

rW   c                 >    | j                   j                  |||       y r  )r   r  r}  s       rX   r  zWorkSharing.__exit__  s    

Hgv6rW   N)	r  r  r  r;  r6  rT  rD  r{  r  rV   rW   rX   rE  rE    s     ,  
!7rW   rE  c                      e Zd ZU dZeej                     ed<   dZeej                     ed<   ej                  j                  Zej                  ed<   ej                  j                  Zej                  ed<   ej                  j                  Zej                  ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   d	Zeed<   d Zd Zd Zy)r0  Nr   rH  rI  r  rJ  r   r6  Fsimd_ompsimd_vec	collapsedr  c                 j    t        j                         }|r|j                         | _        y d| _        y rc  )r   r  rd  simd_nelements)r<  r  s     rX   __post_init__zLoopLevel.__post_init__  s-     .9-E-E-GAO>#;#;#=UVrW   c                    t        j                  |      }t        | j                  | j                        }||_        d|_        t        |j                  |      |z  |_        | j                  |_	        d|_
        | j                  |_        |S )NTF)r   r\  r0  r   rH  rJ  rj  r   r  r6  rk  r  )r<  r!  sympy_factorrF  s       rX   r   zLoopLevel.tile  sn    }}V,499-!
"499l;lJ --rW   c                    t        | j                        }t        | j                        }t        j                  j
                  r||k(  ry | j                  r| j                  dkD  rd| j                   dnd}| j                  rFd}| j                  dkD  r|d| j                   dz  }| j                  r\|j                  dd|       }nF| j                  rd}n7| j                  rd	| }n%| j                  st        j                         rd
}nd}t         d| j                   d| }| j                   d| }| j                   j"                  r%| j                   dt        | j                          }n;| j                   dt        | j                          dt        | j                          d}d| d| d| d}| j$                  s|s|gS ||gS )Nr5   zsimd simdlen(z) rS   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r  r  r  z+=(z == 0 ? 1 : zfor(r   )rE   rI  rH  r   r  no_redundant_loopsri  rm  r6  r  rj  r  r   r  rJ   r   rJ  r  rk  )	r<  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rX   rN  zLoopLevel.lines  s   !$++.		*	::(([I-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7Nu~rW   )r  r  r  r   r   r   r  r  rH  r   r   rI  r  OnerJ  r6  rn   ri  rl   rj  rk  r  rn  r   rN  rV   rW   rX   r0  r0    s     $C%**	$!%D(5::
%FEJJ% #WW\\J

)E5::#HcHdHdItL$
W	'rW   r0  c                       e Zd ZU dZdZeee      ed<   dZ	ee
   ed<   ede
fd       Zd Zed        Zd Zd	 Zd
e
fdZd ZdefdZy)rT  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    NrP  rh  c                 $   | j                   }| j                  }| j                  }|J d}t        t	        ||            D ]B  \  }\  }}t        ||      }|s|g}n|j                  |       ||k\  s2| j                  |_        D t        |      }	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	rb  r[  r  r   r  r0  r  r  rT  )
rh  rb  r[  r  rP  loop_idxr   rH  rF  rV  s
             rX   rb  zLoopNest.build'  s     ?? 00***+/%.s8V/D%E 	8!HksDS$'DT"?*$*$7$7!	8 UO	rW   c                 ,    t        | j                        S r  )rl   rP  rA  s    rX   __bool__zLoopNest.__bool__<  s    DJJrW   c                    | j                   t        dd      S d}d}| j                   d   j                  }t        j                  d      }| j                   D ];  }|j                  |k7  r n*|t        |j                  |j                        z  }|dz  }= d } || j                         }dt        fd}|t        | j                         k  rt        |t        j                        rt        | j                   |   j                  t        j                        r|dz  t        | j                   |   j                  | j                   |   j                        k  r|&||kD  r!| j                   |   j                  r ||       sd|}d}| j                   |   j                  }t        |t        | j                               D ]%  }	| j                   |	   j                  |k7  r n|dz  }' t        ||      S )a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        r   rj  r5   c                 J    t        |       D ]  \  }}|j                  s|c S  y r  )r   rj  )rP  r   rF  s      rX   get_simd_vec_depthz7LoopNest.max_parallel_depth.<locals>.get_simd_vec_depthU  s+    $U+ 4==H rW   rV  c                     t        | j                  t              sJ t        d | j                  j                  D              S )Nc              3   >   K   | ]  }t        |t                 y wr  )r   r  )r'  rh  s     rX   r)  zILoopNest.max_parallel_depth.<locals>.has_scalar_kernel.<locals>.<genexpr>`  s"       v|44r.  )r   rh  r;  rg   r  )rV  s    rX   has_scalar_kernelz6LoopNest.max_parallel_depth.<locals>.has_scalar_kernel^  s>    i..??? '..66  rW   rS  )rP  r  r  r   r\  r   rH  rJ  rT  r   r   rU  )
r<  r  	max_depthr  	num_stepsrF  r  simd_vec_depthr  r   s
             rX   rR  zLoopNest.max_parallel_depth?  s    :: qAA	zz!}11MM!$	JJ 	D  L0!HTYY

$CCINI			 ,DJJ7	 	 DJJ'9emm44::i055u}}ECtzz),114::i3H3N3NOP *.JJy)66%d+ $KI::k2??L;DJJ8 ::a=--=Q	 I;OOrW   c                    |j                   | j                         j                   k  sJ d       | j                  J t        | j                        |j                   k\  sJ | j                  |j                     }|j                   |_        |j                  rt        xj                  dz  c_        t        |j                  dz   |j                         D ]  }d| j                  |   _
         y )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr5   T)r  rR  rP  r   r  r6  r  r   parallel_reduction_countrU  rk  )r<  r8  rF  r   s       rX   rS  zLoopNest.mark_parallel~  s    ''4+B+B+D+S+SS 	
M	
S zz%%%4::)":"::::zz)//0!00,,1,y,,q0)2J2JK 	+A&*DJJqM#	+rW   c                     | j                   sJ | j                   |   j                  |      | j                   |<   | j                   |   S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )rP  r   )r<  rB  r!  s      rX   r   zLoopNest.tile  sA     zzz JJu-226:

5zz%  rW   r   c                 6    | j                   sJ | j                   S r  rh  rA  s    rX   r:  zLoopNest.get_kernel  s    {{{{{rW   c                     || _         y r  r  r*  s     rX   r  zLoopNest.set_kernel  s	    rW   levelc                     | j                   sJ t        | j                         |k\  sJ |t        | j                         k(  rd n| j                   |d  }t        || j                        S r  )rP  r   rT  rh  )r<  r  rP  s      rX   rf  zLoopNest.from_loop_level  sV    zzz4::%'''TZZ0djj6Ht{{++rW   )r  r  r  r  rP  r   r1  r0  r  rh  r  r  rb  r  r%   rR  rS  r   r:  r  rn   rf  rV   rW   rX   rT  rT    s     (,E8DO$+"&FHY&i  (  <P <P|+
!I ,S ,rW   rT  )NNNr  )r   dataclassesrW  r  r  rY  r   rR  r?  collections.abcr   enumr   typingr   r   r   r   r	   r   r   torch.fxtorch._inductorr
   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rS   r   r   r   r   r   debugr   rO  r   r/  r   r   r    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   commonr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   	cpp_utilsrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rS  rU   cacherY   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPrJ  rK  r   rI  ro   rl   rL  rM  r  r  rN  rO  rw   r1  r   r  rx   r   r   r  r   r   r  rp   r   rn   r   r   r   	lru_cacher   r  r  	dataclassr  r   rq  r  r  _initialize_pointwise_overridesr  r  r  r  r  r  r   rE  rG  r;  re  r>  rF  rW  rE  r0  rT  rV   rW   rX   <module>r     sV         	 
  $  7 7    ( @ / K K O O % < < ;        > =       & llg% : : ~~//*EBC   !   #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK		* T%++&  
KK	NN	MM	KK	JJ1 D- )D $(/)
 ELL!/)d-
-jj- - 

	-
 jj- -`3#$  ;;	
 
sCxBV^ V3 V# V/ /2 -UZZ -ell - - ;uzz ;

 ;PS ; ;| FJ!::!!LL!6>sm! !   ]!"4 ]!@! !B&oH; oHd  , ,U 3|7l |7~  / / 9  % % '7 7R Rj^9 ^BY
l Y
x)$ )$eHU[[4I44O.P )$XbK bKJg	
Y g	
T 
9  
FFt F\&N \&~N
 N
b%7 %7P R R Rj H, H, H,rW   