
    pi                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZm Z  d dl!Z!d dl"Z"d dl#Z"d dl$m%Z% d d	l&m'Z( d d
l)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZV erid dlWmXZXmYZYmZZZ d dl#m[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZb ddlcmdZd ddlemfZfmgZgmhZh ddlAmiZi ddljmkZk  e d       Zleeeg   geff   Zmenek   Zoeepe!j                  f   ZrepZse"j                  j                  evd!      Zw ej                  ev      Zyd	d"Zzej                   G d# d$             Z| G d% d&ej$                        Z} G d' d(e      Z~ eId)*       G d+ d,e~             Z G d- d.      Zej                   G d/ d0             Zej                   G d1 d2             Zej                   G d3 d4             Zej                   G d5 d6             Zej                   G d7 d8             Zeeeeeef   Zi Zd9ed:<    G d; d<      Zi Zd=ed><   i Zd?ed@<   i ZdAedB<   	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddCZ G dD dEe      Z	 	 	 	 ddFZ	 	 	 	 	 	 ddGZddHZ	 d	 	 	 	 	 	 	 ddIZddJZddKZej*                  ddL       Z	 	 	 	 	 	 	 	 ddMZ	 	 	 	 	 	 ddNZddOZe"j4                  e"j6                  e"j8                  e"j6                  ie"j:                  e"j<                  e"j>                  e"j@                  e"jB                  e"jD                  e"jF                  e"jH                  e"jJ                  e"jL                  e"jN                  fD  ci c]  } | |  c} ZdPedQ<   	 	 	 	 	 	 	 	 ddRZ	 	 	 	 	 	 	 	 ddTZ G dU dV      Z G dW dXe1      Z0 G dY dZ      Z ejZ                  d[ej\                  \      Zdd]Z G d^ d_e?eeRe         Zej                   G d` da             Z edi db ee%jh                  dc dde      df ee%jh                  dg dh dij      dk ee%jh                  dl dm dnj      do ee%jh                  dp dq drj      ds ee%jh                  dt du dvj      dw ee%jh                  dx dy dwz      d{ ee%jh                  d| d} d~j      d ee%jh                  d d d d      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d d d d      d ee%jh                  d d dz      d ee%jh                  d d dj      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d d dj      d ee%jh                  d d dj      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d de      d ee%jh                  d d¬e      d ee%jh                  dĄ dŬe      d ee%jh                  dǄ dȬe      d ee%jh                  dʄ dˬe      d ee%jh                  d̈́ dάe      d ee%jh                  dЄ dѬe      d ee%jh                  dӄ dԬe      d ee%jh                  dք d׬e      d ee%jh                  dل dڬe      d ee%jh                  d܄ dݬe      d ee%jh                  d߄ de      d ee%jh                  d de      d ee%jh                  d de      Zded<   ddZ G d deE      Z G d deH      Z G d de      Zej                   G d d             Z G d d      Z e       Z G d d      Z G d d      Z e dep      Z e dSee      Zeree"j                  eTeeeedf   f   f   Z G d deeef         Z G d d      Z G d  deee         Zej                   G d d             Zej*                  dd       Z G d d      Z G d de@      Zyc c} w (      )annotationsN)ABCabstractmethod)autoEnum)chain)	AnyCallablecastClassVarGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)SelfTypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)ConfigModule)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)ShapePropagationOpsHandler)boolean_opsDeferredLineBasegenerate_assertget_current_backendIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)ops
OpsHandlerOpsValueReductionType	StoreModeV)IteratorMutableMappingSequence)GraphModule)CustomGraphModulePass)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode)BlockShapeType   PythonWrapperCodegen_Tschedulec                x    t         j                  t        j                        rt         j	                  d|        y y )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)msgs    `/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/common.pydata_type_loggerrT   Z   s*      /6< 0    c                  Z    e Zd ZU dZded<   ded<   ddZedd       ZddZedd	       Z	y
)FileBackedGraphModulez
    Output of FX wrapper codegen. Exposes the same methods as ModuleType, but these
    map back to a GraphModule instead of Python source.
    r;   gmzCallable[..., Any]compiled_fnc                &   t        j                  ddd      | _         t        j                  t        j
                  | j                   j                         | j                   5 }|j                  | j                         d d d        y # 1 sw Y   y xY w)Nzw+z.pyF)modesuffixdelete)	tempfileNamedTemporaryFileatexitregisterosremovenamewritevalue)selffs     rS   __post_init__z#FileBackedGraphModule.__post_init__i   si     !33eE
 			4==#5#56]] 	 aGGDJJ	  	  	 s   "BBc                .    | j                   j                  S N)r^   rd   rg   s    rS   __file__zFileBackedGraphModule.__file__s   s    }}!!!rU   c                      | j                   | S rk   )rY   rg   argss     rS   callzFileBackedGraphModule.callw   s    t&&rU   c                .    | j                   j                  S rk   )rX   coderl   s    rS   rf   zFileBackedGraphModule.valuez   s    ww||rU   NreturnNoneru   str)rp   	list[Any]ru   r	   )
__name__
__module____qualname____doc____annotations__ri   propertyrm   rq   rf    rU   rS   rW   rW   _   sF    
 	O##  " "'  rU   rW   c                  <    e Zd ZdZdZdZedd       Zedd       Zy)	WorkspaceZeroModer   rG   r   c                    | |k(  s|t         j                  k(  r| S | t         j                  k(  r|S t        d| d|d      )NzWorkspaceZeroMode.combine(, ))r   UNINITIALIZEDNotImplementedErrorabs     rS   combinezWorkspaceZeroMode.combine   sK    6Q+999H!///H!$>qe2aU!"LMMrU   c                F    | rt         j                  S t         j                  S rk   )r   ZERO_ON_CALLr   )	zero_fills    rS   	from_boolzWorkspaceZeroMode.from_bool   s    $111 ...rU   N)r   r   r   r   ru   r   )r   boolru   r   )	rz   r{   r|   r   r   ZERO_PER_GRAPHstaticmethodr   r   r   rU   rS   r   r      s9    MLNN N / /rU   r   c                  4    e Zd ZdZedd       Zedd       Zy)CodegenSymbolzP
    An IR object possibly corresponding to a variable in the wrapper code.
    c                     y rk   r   rl   s    rS   get_namezCodegenSymbol.get_name       rU   c                     y rk   r   rl   s    rS   get_examplezCodegenSymbol.get_example   r   rU   Nrw   ru   z!Union[torch.Tensor, sympy.Symbol])rz   r{   r|   r}   r   r   r   r   rU   rS   r   r      s/        rU   r   T)frozenc                  &   e Zd ZU dZded<   ded<   ded<   ded	<   d
Zded<   ej                  Zded<   e	ddd       Z
e	d d       Ze	d!d       Ze	d!d       Zd"dZeZd#dZd$dZd%dZed%d       ZeZeZeZd&dZd'dZd'dZd(dZd)dZd*dZy)+WorkspaceArga2  A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    
sympy.Exprcountr   	zero_modetorch.devicedevicerx   
outer_namews_ptr
inner_nametorch.dtypedtypec                P    |  t        t        j                  j                         S rk   )nextr7   graphworkspace_id)prefixs    rS   unique_namezWorkspaceArg.unique_name   s!    $qww334566rU   c                    | j                   |j                   k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rk   )r   r   r   r   s     rS   can_joinzWorkspaceArg.can_join   s@     LLALL(XQWW-?XAHHPQPXPXDX	
rU   c                    t        | j                  |j                  z   t        j                  | j                  |j                        | j
                  | j                  | j                  | j                        S N)r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   s     rS   joinzWorkspaceArg.join   sS    ''AGG#'//Q[[I''88||||
 	
rU   c                   | j                   |j                   k(  r2| j                  |j                  k(  r| j                  |j                  k(  sJ t        t	        j
                  | j                  |j                        t        j                  | j                  |j                        | j                   | j                  | j                  | j                        S r   )r   r   r   r   sympyMaxr   r   r   r   r   r   s     rS   maximumzWorkspaceArg.maximum   s     GGqww188qxx#7ALLALL<X	
X))AGGQWW-'//Q[[I''88||||
 	
rU   c                    | j                   S rk   r   rl   s    rS   
get_devicezWorkspaceArg.get_device   s    {{rU   c                    | j                   S rk   r   rl   s    rS   	get_dtypezWorkspaceArg.get_dtype   s    zzrU   c                >    | j                         j                         S rk   )
get_layoutr   rl   s    rS   r   zWorkspaceArg.get_example   s     ,,..rU   c                f    ddl m}  || j                  | j                  | j                  gdg      S )Nr   )r?   rG   )r   r   sizestride)irr?   r   r   r   )rg   r?   s     rS   r   zWorkspaceArg.get_layout   s.    $;;****3	
 	
rU   c                "    | j                         S rk   )r   rl   s    rS   layoutzWorkspaceArg.layout   s      rU   c                6    t         j                  j                  S rk   )r   SZerorl   s    rS   
get_offsetzWorkspaceArg.get_offset   s    ww||rU   c                    | j                   gS rk   )r   rl   s    rS   get_sizezWorkspaceArg.get_size   s    

|rU   c                8    t         j                  j                  gS rk   )r   r   Onerl   s    rS   
get_stridezWorkspaceArg.get_stride   s    }rU   c                    | j                   S rk   )r   rl   s    rS   r   zWorkspaceArg.get_name   s    rU   c                     y)NFr   rl   s    rS   get_is_pinnedzWorkspaceArg.get_is_pinned  s    rU   c                    g S rk   r   rl   s    rS   get_inputs_that_alias_outputz)WorkspaceArg.get_inputs_that_alias_output  s    	rU   N)
workspace_)r   rx   ru   rx   )r   r   r   r   ru   r   )r   r   r   r   ru   r   )ru   r   )ru   r   r   )ru   r?   )ru   r   )ru   list[sympy.Expr]rw   )ru   r   )ru   	list[str])rz   r{   r|   r}   r~   r   torchuint8r   r   r   r   r   r   r   get_device_or_errorr   r   r   r   r   get_output_specmaybe_get_output_specmaybe_get_layoutr   r   r   r   r   r   r   rU   rS   r   r      s    	   OJE;$7 7 
 

 
 
 
 
 %/
 ! ! !O&!rU   r   c                      e Zd ZddZddZy)TritonScratchWorkspacec                     || _         || _        y rk   )r   _generate_dtype_str)rg   r   generate_dtype_strs      rS   __init__zTritonScratchWorkspace.__init__
  s    	#5 rU   c                "    | j                         S rk   )r   rl   s    rS   r   z)TritonScratchWorkspace.generate_dtype_str  s    ''))rU   N)r   intr   Callable[..., str]rw   )rz   r{   r|   r   r   r   rU   rS   r   r   	  s    6*rU   r   c                  p    e Zd ZU ded<   ded<   ded<   ej
                  j                  Zded<   dZd	ed
<   y)	TensorArgrx   rd   bufferr   r   r   offsetNOptional[str]alias_of)	rz   r{   r|   r~   r   r   r   r   r   r   rU   rS   r   r     s.    
IKFJ%"Hm"rU   r   c                  4    e Zd ZU ded<   ded<   edd       Zy)SizeArgrx   rd   r   exprc                     y rk   r   rl   s    rS   r   zSizeArg.alias_of   s    rU   Nru   r   )rz   r{   r|   r~   r   r   r   rU   rS   r   r     s    
I
 rU   r   c                      e Zd ZU ded<   y)ConstexprArgrx   rd   Nrz   r{   r|   r~   r   rU   rS   r   r   %  s    
IrU   r   c                  6    e Zd ZU ded<   ded<   ded<   ded<   y)	TMADescriptorArgrx   rd   api_typezOptional[list[sympy.Expr]]block_shapeOptional[torch.dtype]r   Nr   r   rU   rS   r   r   *  s    
IM++  rU   r   c                  >    e Zd ZU ded<   ded<   dZded<   dZded<   y)	DeviceCodegenSchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNOptional[WrapperConstructor]cpp_wrapper_codegenfx_wrapper_codegen)rz   r{   r|   r~   r   r   r   rU   rS   r   r   2  s&    %%''8<5<7;4;rU   r   zdict[str, DeviceCodegen]device_codegensc                      e Zd ZddZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZddZddZddZddZ	 d	 	 	 	 	 	 	 ddZy)DeviceOpOverridesc                    t         rk   r   rg   rd   s     rS   import_get_raw_stream_asz*DeviceOpOverrides.import_get_raw_stream_as@      !!rU   c                    t         rk   r  rg   
device_idxs     rS   
set_devicezDeviceOpOverrides.set_deviceC  r  rU   c                    t         rk   r  rl   s    rS   synchronizezDeviceOpOverrides.synchronizeF  r  rU   c                    t         rk   r  r
  s     rS   device_guardzDeviceOpOverrides.device_guardI  r  rU   c                    t         rk   r  rl   s    rS   cpp_device_guardz"DeviceOpOverrides.cpp_device_guardL  r  rU   c                    t         rk   r  rl   s    rS   cpp_aoti_device_guardz'DeviceOpOverrides.cpp_aoti_device_guardO  r  rU   c                    t         rk   r  rl   s    rS   cpp_stream_guardz"DeviceOpOverrides.cpp_stream_guardR  r  rU   c                    t         rk   r  rl   s    rS   cpp_aoti_stream_guardz'DeviceOpOverrides.cpp_aoti_stream_guardU  r  rU   c                    t         rk   r  rl   s    rS   cpp_getStreamFromExternalz+DeviceOpOverrides.cpp_getStreamFromExternalX  r  rU   c                    t         rk   r  rl   s    rS   kernel_headerzDeviceOpOverrides.kernel_header[  r  rU   c                    t         rk   r  rl   s    rS   kernel_driverzDeviceOpOverrides.kernel_driver^  r  rU   c                    t         rk   r  rl   s    rS   cpp_stream_typez!DeviceOpOverrides.cpp_stream_typea  r  rU   c                    t         rk   r  rl   s    rS   aoti_get_streamz!DeviceOpOverrides.aoti_get_streamd  r  rU   c                    t         rk   r  rl   s    rS   cpp_kernel_typez!DeviceOpOverrides.cpp_kernel_typeg  r  rU   c                    t         rk   r  rl   s    rS   cpp_device_ptrz DeviceOpOverrides.cpp_device_ptrj  r  rU   c                    t         rk   r  rl   s    rS   tma_descriptor_helpersz(DeviceOpOverrides.tma_descriptor_helpersm  r  rU   Nc                    t         rk   r  )rg   idx	workspacer   s       rS   cpp_scratchzDeviceOpOverrides.cpp_scratchp  s
     "!rU   rd   rx   ru   rx   )r  r   ru   rx   rw   rk   )r*  r   r+  r   r   r   ru   zOptional[tuple[list[str], str]])rz   r{   r|   r  r  r  r  r  r  r  r  r  r  r  r   r"  r$  r&  r(  r,  r   rU   rS   r  r  ?  s~    """""""""""""""" TX""#9"CP"	("rU   r  zdict[str, DeviceOpOverrides]device_op_overrides_dictz*dict[str, Optional[CustomGraphModulePass]]custom_backend_passesz!dict[str, Optional[ConfigModule]]custom_backend_codegen_configsc                    t        ||||      t        | <   |t        | <   |r)t        |t              r|t
        usJ d|dt
               |t        | <   y )Nzdevice_custom_config=z: cannot be the same as the default inductor config config=)r   r  r/  
isinstancer   r    r0  )r   device_schedulingdevice_wrapper_codegendevice_cpp_wrapper_codegendevice_fx_wrapper_codegendevice_custom_passdevice_custom_configs          rS   register_backend_for_devicer9    sv     ,"!	OF %7&!+\:$F2	
 %#%%`Y_Xab		
3
 .B"6*rU   c                      e Zd Z e       Z e       Z e       Z e       Z e       Z e       Z	 e       Z
 e       Z e       Z e       Zy)BackendFeatureN)rz   r{   r|   r   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERTRITON_TEMPLATESREDUCE_TO_SINGLE_ELEMENTr   rU   rS   r;  r;    sL    fGIfO $6D6DfO"fv#vrU   r;  c                :   | 
t               S t                t        | t        j                        r| j
                  }n7t        | t              sJ t        |              | }t        j                  |      } t        |      }|sJ  |d       }|j                  |       S rk   )	r   init_backend_registrationr2  r   r   typerx   get_scheduling_for_deviceget_backend_features)r   device_typescheduling_ctorr   s       rS   rJ  rJ    s     ~|&%,,'kk&#&4V4&k*/<O? &J**622rU   c                @    t        |t              sJ |t        |       v S )zSee also V.graph.has_feature)r2  r;  rJ  )r   features     rS   has_backend_featurerO    s%     g~...*6222rU   c                <    | t         v rt         |    j                  S d S rk   )r  r   r   s    rS   rI  rI    s     17?1J?6"--TPTTrU   c                v    | t         v r1t         |    }|r|j                  S |r|j                  S |j                  S y rk   )r  r   r   r   )r   cpp_wrapper
fx_wrapperwrapper_codegen_objs       rS   get_wrapper_codegen_for_devicerU    sD      -<V-D&999&:::&666rU   c                (    | t         v r	t         |    S d S rk   )r/  r   s    rS   "get_custom_backend_pass_for_devicerW    s    ,26K,K (UQUUrU   c                (    | t         v r	t         |    S d S rk   )r0  r   s    rS   $get_custom_backend_config_for_devicerY    s&     33 	'v. rU   c                    ddl m}  ddlm} ddlm} ddlm} ddlm	} ddl
m} ddlm} dd	lm} dd
lm} ddlm}	 ddlm}
 ddlm} t1        d      4| ||	dt3        dfd|
t4        j6                  j8                  r|n||       t1        d      ||dt3        dfd|
||       t1        d      t3        d|	|
||       t1        d      t3        d||
||       t1        d      t3        d|	|||       t:        j<                  j?                         }|dk7  rLt1        |      @ddl m!} 	  |d      } |d      } |d      } |d      }|r|r|rt3        |||||       yyyyyy# tD        $ r Y yw xY w)z
    Register the backend for different devices, including the scheduling
    for kernel code generation and the host side wrapper code generation.
    rG   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CppWrapperMps)CUDACombinedScheduling)HalideScheduling)MetalScheduling)PythonWrapperMtia)TritonSchedulingrH   )WrapperFxCodegencpuN)cpphalidetritonc                6     t         j                     |       S rk   )r    cpu_backend)r   cpu_backendss    rS   <lambda>z+init_backend_registration.<locals>.<lambda>  s    ?|F,>,>?
K rU   cuda)ri  rh  c                6     t         j                     |       S rk   )r    cuda_backend)r   cuda_backendss    rS   rm  z+init_backend_registration.<locals>.<lambda>  s    A}V-@-@A*M rU   xpumpsmtiaprivateuseoner   )_get_custom_mod_func
SchedulingrI   CppWrapperCodegenre  )#rg  r[  cpp_wrapper_cpur\  cpp_wrapper_cpu_array_refr]  cpp_wrapper_gpur^  cpp_wrapper_mpsr_  cuda_combined_schedulingr`  rh  ra  rs  rb  python_wrapper_mtiarc  ri  rd  wrapperrI   wrapper_fxirre  rI  r9  r    aot_inductorallow_stack_allocationr   _C_get_privateuse1_backend_name torch.utils.backend_registrationrv  RuntimeError)r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  rI   re  private_backendrv  r3  r   r   r   rl  rq  s                     @@rS   rG  rG    s    #.@..@($6(-. '/ &&

 	$K ""99 "	
 !(0 -&
 	$M 	
 !'/# 	
 !'/# 	
 !(0#	
 hh<<>O?*%o6>I	 4\ B23IJO"67J"K!56H!I _9L+#%#'& :M_  ? 	+$  		s   /5E* *	E65E6c                L    ddl m} g | t        ||j                  |            S )Nr   )FlexibleLayout)r   r  r-   contiguous_strides)index
index_varssizesr  s       rS   index_prevent_reorderingr  T  s,    
 $ UUTIj.*K*KE*RSTTrU   c                    |t         | <   y rk   )r.  )r   device_op_overridess     rS   register_device_op_overridesr  _  s     (;V$rU   c                    t        | t              sJ t        |              t        sddlm}m} ddlm} ddl	m} ddl
m} t        |    S )NrG   )cpu_device_op_overridesmps_device_op_overrides)r  )r2  rx   rH  r.   r  r  rn  r  rt  rr  )r   r  r  r  mtia_op_overridesxpu_op_overridess         rS   get_device_op_overridesr  e  s5    fc"0DL0"#F-B@#F++rU   zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEc                r   | t               v rt        j                  S | dv rd|v r|d   S |d   S | dv rt        j                  S | dv rt        j                  S | dk(  rd|v r|d   S |d   S | dk(  rd|v r|d   S |d   S | d	v r$|d   }t
        j                  j                  |      S | d
k(  rd|v r|d   S |d   S y)zK
    Given op name and a list of input dtypes, deduce the output dtype
    )to_dtype
index_exprr   )randrandn)	get_index	randint64	load_seed	reductionrG   constant)loadstorestore_reductionto_dtype_bitcastN)r&   r   r   floatint64r7   r   r   )op_namerp   kwargsbuf_names       rS   deduce_output_dtype_by_namer    s
    +-zz	  
 #*V"3vgAbA	  
 {{	  

 {{	K	")V"3vg@a@	J	")V"3vgAbA	  

 7ww  **	&	&")V"3vgAbArU   CSEVariableTypec                   t               }t        j                  j                  r'|dk(  r"| j	                  d| dt        |       d       y t        j                  j                  r|dk(  rddlm}m	} t        ||      sJ t        |             |t        j                  k(  r|j                  rd| d	}n.d
| d| d}n$d| d}|j                  rd| d}d| d||    d}| j	                  d| d       y y y )Nri  ztl.static_assert(z
.dtype == r   rg  rG   )CppCSEVariableDTYPE_TO_CPPzIsVecMaskType<decltype(z	)>::valuezstd::is_same_v<decltype(z$), bool> || std::is_same_v<decltype(z), int>z	decltype(z	typename z::value_typezstd::is_same_v<r   >zstatic_assert(z);)r)   r    test_configsruntime_triton_dtype_assert	writeliner0   static_cpp_dtype_assert	cpp_utilsr  r  r2  rH  r   r   is_vec)r   varr   backendr  r  
is_same_dt
c_var_types           rS   check_dtyper    s    "#G667h;N,SEK<N;OqQR				4	4E9I;#~.9S	9.EJJzz6se9E
  8u<`ad`eelm
$SE+Jzz(LA
*:,be9L8MQOJ>*R89! :J	4rU   c                  `    e Zd Zd
dZddZddZddZddZddZe	dd       Z
e	dd       Zy	)DataTypePropagationc                    || _         d|j                  j                  i| _        |j                  j                         D ]  \  }}|j                  | j                  |<     y Nroot)body
root_blockr   graphs	subblocksitems)rg   r  kvs       rS   r   zDataTypePropagation.__init__  sU    	DOO))B
 NN((* 	%DAqWWDKKN	%rU   c                   |j                   }|D cg c]9  }t        |t        j                  j                        s(|j
                  dk7  s8|; }}t        |      dk(  ry t        d |D              }|sy t        j                  t        j                  |D cg c])  }|j                  t        j                     j                  + c}      S c c}w c c}w )Nplaceholderr   c              3     K   | ]K  }t         j                  |j                  v xr) |j                  t         j                     j                  d u M y wrk   )OptimizationContextkeymetar   ).0ns     rS   	<genexpr>zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>  sS      )
   ##qvv- B*../55TAB)
s   AA)all_input_nodesr2  r   fxNodeoplenall	functoolsreducepromote_typesr  r  r  r   )rg   nodeinputsr  input_nodesall_input_nodes_propagateds         rS   deduce_node_dtype_by_inputsz/DataTypePropagation.deduce_node_dtype_by_inputs  s    %%
Auxx}}!=!$$-BWA
 
 {q %( )
 !)
 &
"
 *<GHqQVV'++,22H
 	

  Is   )CCC.C
c                b    | j                   |j                     }| j                  |      }|sJ |S rk   )r  targetpropagate_graph)rg   r  	sub_graphr   s       rS   deduce_node_dtype_by_subgraphz1DataTypePropagation.deduce_node_dtype_by_subgraph  s0    KK,	$$Y/urU   c                   |j                   dk(  ry |j                  dk(  rt        |j                        dk7  ry |j                  t        j
                  k(  rT|j                  d   }t        |t        j                  j                        sJ t        |             | j                  |      S t        |j                  t              sJ t        |j                               |j                  j                  d      r| j                  |      S t        |j                  g|j                  i |j                   x}	 |S | j#                  |      S )Nr  outputrG   r   masked_subblock)r  r  r  rp   operatorgetitemr2  r   r  r  rH  deduce_node_dtyperx   
startswithr  r  r  r  )rg   r  node_argoutput_dtypes       rS   r  z%DataTypePropagation.deduce_node_dtype  s   77m#;;("s499~':;;(***yy|Hh6FXF6))(33$++s+>T$++->>+;;!!"3455d;; 8 ++ L
   //55rU   c                n   |j                   sJ d }|j                   D ]  }t        j                  |j                  v r|j                  t        j                     }n
t               }| j	                  |      |_        ||j                  t        j                  <   |j                  dk(  s|j
                  } |S )Nr  )nodesr  r  r  r  r   r  )rg   r   graph_dtyper  opt_ctxs        rS   r  z#DataTypePropagation.propagate_graph  s    {{{-1 KK 		,D"&&$))3))$7$;$;<-/ 2248GM18DII)--.{{h&%mm		, rU   c                >    | j                  | j                  d         S r  )r  r  rl   s    rS   	propagatezDataTypePropagation.propagate  s    ##DKK$788rU   c                .     | |      j                         S rk   )r  )clsr  s     rS   propagate_loopbodyz&DataTypePropagation.propagate_loopbody   s    4y""$$rU   c                    ddl m} ddlm} t	        ||      sJ t        |             t	        |j                  |      sJ t        |j                               t        j                  |j                        S )Nr   rA   )rE   )		loop_bodyrB   	schedulerrE   r2  rH  _bodyr  r  )r  r  rB   rE   s       rS   propagate_scheduler_nodez,DataTypePropagation.propagate_scheduler_node$  sX    (-$.:T
:.$**h/Adjj1AA/"55djjAArU   N)r  rB   ru   rv   )r  torch.fx.Noderu   r   )r  r  ru   r   )r   ztorch.fx.Graphru   r   )ru   r   )r  rB   ru   r   )r  rE   ru   r   )rz   r{   r|   r   r  r  r  r  r  classmethodr  r  r   rU   rS   r  r    sJ    %
*6:$9 % % B BrU   r  c                  D     e Zd Zddd	 	 	 	 	 	 	 d fdZdd fdZ xZS )r   T)simplifypc                   |r]t        |t        j                        rCt        t        j
                  d      r)t        j
                  j                  j                  |      }t        | %  |      S )Nsizevars)
r2  r   Exprhasattrr7   r   r  r  superdoprint)rg   r   r  r  	__class__s       rS   r	  zPythonPrinter.doprint/  sK     
44*9U77##,,T2Dwt$$rU   c                    t        |t        j                        rd| j                  |       dS t        |   |||      S N(r   )r2  r   Mod_printr  parenthesize)rg   itemlevelstrictr
  s       rS   r  zPythonPrinter.parenthesize7  s@    dEII& t{{4()++7'eV<<rU   )r   r   r  r   r  r   ru   rx   )F)r  r   r  r   r  r   ru   rx   )rz   r{   r|   r	  r  __classcell__r
  s   @rS   r   r   .  s7    48D%%-1%=A%	%= =rU   r   c                  T   e Zd ZdZedd       Zedd       Zedd       Zedd       Zedd       Z	edd       Z
edd       Zedd	       Zedd
       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zy)OpDecompositionsz!
    Decomposes inductor ops
    c                    | S rk   r   )rf   s    rS   identityzOpDecompositions.identityE  s	     rU   c                r    t        j                  t        j                  dt        j                        |       S NrG   )r2   truedivr  r   int32xs    rS   
reciprocalzOpDecompositions.reciprocalJ  s"    {{3<<5;;7;;rU   c                .    t        j                  | |       S rk   )r2   mulr  s    rS   squarezOpDecompositions.squareN  s    wwq!}rU   c                    t        j                  t        j                  dt        j                        t        j
                  |             S r  )r2   subr  r   float32erfr  s    rS   erfczOpDecompositions.erfcR  s*    wws||Au}}5swwqzBBrU   c                    t        j                  t        j                  t        j                  |             t        j                  |             S rk   )r2   r"  expr#  r(  r  s    rS   erfcxzOpDecompositions.erfcxV  s,    wwswwszz!}-sxx{;;rU   c                    t        j                  t        j                  |       t        j                  dt        j
                              S r  )r2   r%  r*  r  r   r&  r  s    rS   expm1zOpDecompositions.expm1Z  s*    wwswwqz3<<5==#ABBrU   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrG   
   r2   r"  logr  mathr   r&  r  s    rS   log10zOpDecompositions.log10^  s7    wwswwqz3<<DHHRL0@%--#PQQrU   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrG   r   r0  r  s    rS   log2zOpDecompositions.log2b  s6    wwswwqz3<<DHHQK#OPPrU   c           
         t        j                  t        j                  | t        j                  t	        j
                  d      t        j                                    S )Nr   )r2   r*  r"  r  r2  r1  r   r&  r  s    rS   exp2zOpDecompositions.exp2f  s3    wwswwq#,,txx{EMM"JKLLrU   c           	         t        j                  t        j                  | t        j                  dt        j
                                    S r  )r2   r1  addr  r   r  r  s    rS   log1pzOpDecompositions.log1pj  s+    wwswwq#,,q%++">?@@rU   c                    t        j                  dt        j                        }t        j                  |t        j
                  |t        j                  t        j                  |                         S r  )r2   r  r   r  r  r9  r*  neg)r  ones     rS   sigmoidzOpDecompositions.sigmoidn  sC    ll1ekk*{{3SWWSWWQZ-@ ABBrU   c                r    t        j                  | t        j                  dt        j                              S Nr   )r2   r   r  r   r  r  s    rS   reluzOpDecompositions.relus  s"    {{1cll1ekk:;;rU   c                V    t        j                  t        j                  | |      |      S rk   )r2   r9  r"  r  yzs      rS   fmazOpDecompositions.fmaw  s     wwswwq!}a((rU   c                T    t        j                  t        j                  |       |      S rk   )r2   r  floorr   r   s     rS   floor_to_intzOpDecompositions.floor_to_int|      ||CIIaL%00rU   c                T    t        j                  t        j                  |       |      S rk   )r2   r  ceilrI  s     rS   ceil_to_intzOpDecompositions.ceil_to_int  s    ||CHHQK//rU   c                T    t        j                  t        j                  |       |      S rk   )r2   r  truncrI  s     rS   trunc_to_intzOpDecompositions.trunc_to_int  rK  rU   c           	        t        j                  | |      }t        j                  t        j                  |t        j                  dt
        j                              t        j                  t        j                  |      t        j                  |                  }t        j                  |t        j                  ||      |      S r@  )
r2   modand_ner  r   r  signbitwherer9  )r   r   rconds       rS   	remainderzOpDecompositions.remainder  sy    GGAqMxxFF1cll1ekk23FF3;;q>3;;q>2
 yyswwq!}a00rU   c                T    t        j                  t        j                  |       |      S rk   )r2   r  roundrI  s     rS   round_to_intzOpDecompositions.round_to_int  rK  rU   N)rf   OpVarTru   r^  r  r^  ru   r^  )r  r^  rD  r^  rE  r^  ru   r^  )r   r^  r   r   ru   r^  r   r^  r   r^  ru   r^  )rz   r{   r|   r}   r   r  r   r#  r(  r+  r-  r3  r5  r7  r:  r>  rA  rF  rJ  rN  rQ  rZ  r]  r   rU   rS   r  r  @  s}      < <   C C < < C C R R Q Q M M A A C C < < ) ) 1 1 0 0 1 1 1 1 1 1rU   r  z[a-z0-9_.]+|\([^)]*\)|)flagsc                    | d   dk7  st        |       dk  ryd}t        | dd        D ]3  \  }}|dk(  r|dz  }n
|dk(  r|dz  }|dk(  s!|t        |       dz
  k7  s3 y |dk(  sJ y)Nr   r  r   FrG   r   T)r  	enumerate)stringr   ichars       rS   _all_in_parensrg    s    ayC3v;?EVABZ( 43;QJES[QJEA:!s6{Q. A::rU   c                  H   e Zd Zed d       Zed!d       Zed"d       Zed#d       Zed$d       Zed$d       Z	ed$d       Z
ed$d       Zed$d	       Zed%d
       Zed&d       Z	 	 d'	 	 	 	 	 	 	 	 	 d(dZ	 	 	 	 	 	 	 	 	 	 d)dZd*dZ	 d+	 	 	 	 	 	 	 	 	 d,dZd-dZ	 	 	 	 	 	 	 	 	 	 d.dZ	 	 	 	 	 	 	 	 d/dZ	 	 	 	 	 	 	 	 	 	 d0dZ	 	 d1	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2dZd3dZdej4                  ddd	 	 	 	 	 	 	 	 	 	 	 	 	 d4dZd5dZd6dZed7d       Zed8d       Z ed9d       Z!y):OpOverridesc                r    t        | t              s t        j                  |       st	        |       r| S d|  dS r  )r2  CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchrg  )rd  s    rS   parenzOpOverrides.paren  s9     v{+#--f5f% M6(!}rU   c                    t        |       S rk   )repr)rf   r   s     rS   r  zOpOverrides.constant  s    E{rU   c                2    dt         j                  |        S )N~ri  rn  r  s    rS   bitwise_notzOpOverrides.bitwise_not  s    ;$$Q'())rU   c                2    t         j                  |        dS )Nz == 0rs  )r   s    rS   logical_notzOpOverrides.logical_not  s    ##A&'u--rU   c                \    t         j                  |        dt         j                  |       S )Nz & rs  r  rD  s     rS   bitwise_andzOpOverrides.bitwise_and  +    ##A&'s;+<+<Q+?*@AArU   c                \    t         j                  |        dt         j                  |       S )Nz | rs  rx  s     rS   
bitwise_orzOpOverrides.bitwise_or  rz  rU   c                \    t         j                  |        dt         j                  |       S )Nz ^ rs  rx  s     rS   bitwise_xorzOpOverrides.bitwise_xor  rz  rU   c                \    t         j                  |        dt         j                  |       S )Nz << rs  rx  s     rS   bitwise_left_shiftzOpOverrides.bitwise_left_shift  +    ##A&'tK,=,=a,@+ABBrU   c                \    t         j                  |        dt         j                  |       S )Nz >> rs  rx  s     rS   bitwise_right_shiftzOpOverrides.bitwise_right_shift  r  rU   c                .    t        j                  | |      S rk   )r2   r  r   s     rS   int_truedivzOpOverrides.int_truediv  s    
 {{1a  rU   c                T    t        j                  | t        j                  |            S rk   )r2   r  r   Integer)rd   r   s     rS   r  zOpOverrides.load_seed  s    xxemmF344rU   Tc                *    t        t        |            S rk   )r.   rx   )rg   r  r   checkwrap_negs        rS   indirect_indexingzOpOverrides.indirect_indexing  s     "#c(++rU   c                D    t        t        |       j                   d      )Nz,: check_bounds should be handled by CSEProxyr   rH  rz   rg   r   r   loweruppers        rS   check_boundszOpOverrides.check_bounds  s'     "Dz""##OP
 	
rU   c                D    t        t        |       j                   d      )Nz$: load should be handled by CSEProxyr  rg   rd   r  s      rS   r  zOpOverrides.load  s%    !Dz""##GH
 	
rU   Nc                D    t        t        |       j                   d      )Nz%: store should be handled by CSEProxyr  rg   rd   r  rf   r[   s        rS   r  zOpOverrides.store  s'     "Dz""##HI
 	
rU   c                D    t        t        |       j                   d      )Nz/: store_reduction should be handled by CSEProxyr  rg   rd   r  rf   s       rS   r  zOpOverrides.store_reduction  s%    !Dz""##RS
 	
rU   c                D    t        t        |       j                   d      )Nz): reduction should be handled by CSEProxyr  rg   r   	src_dtypereduction_typerf   s        rS   r  zOpOverrides.reduction   s'     "Dz""##LM
 	
rU   c                D    t        t        |       j                   d      )Nz$: scan should be handled by CSEProxyr  rg   dtypes
combine_fnvaluess       rS   scanzOpOverrides.scan  s'     "Dz""##GH
 	
rU   c                D    t        t        |       j                   d      )Nz$: sort should be handled by CSEProxyr  rg   r  r  stable
descendings        rS   sortzOpOverrides.sort  s'     "Dz""##GH
 	
rU   c                D    t        t        |       j                   d      )Nz): bucketize should be handled by CSEProxyr  rg   r  
boundariesboundary_indicesindexing_dtyperightsortersorter_indicess           rS   	bucketizezOpOverrides.bucketize#  s'     "Dz""##LM
 	
rU   c                D    t        t        |       j                   d      )Nz2: halide_clamp only implemented for Halide backendr  )rg   rf   r   r  s       rS   halide_clampzOpOverrides.halide_clamp1  s%    !Dz""##UV
 	
rU   rG   )constraintsr   is_purepackc               D    t        t        |       j                   d      )Nz<: inline_asm_elementwise only implemented for Triton backendr  )rg   asmr  r   r  r  r  s          rS   inline_asm_elementwisez"OpOverrides.inline_asm_elementwise6  s'     "Dz""##_`
 	
rU   c                D    t        t        |       j                   d      )Nz.: ops.output should not appear at codegen timeAssertionErrorrH  rz   ro   s     rS   r  zOpOverrides.outputC  s%    Dz""##QR
 	
rU   c                D    t        t        |       j                   d      )Nz3: ops.placeholder should not appear at codegen timer  rg   r  s     rS   r  zOpOverrides.placeholderH  s%    Dz""##VW
 	
rU   c                0     d fd} |_         d|_        |S )Nc                J    t        t        |       j                   d       )Nz does not implement ops.r  )rg   rp   r  rd   s      rS   unimplementedz1OpOverrides._unimplemented.<locals>.unimplementedO  s*    %:&&''?vF rU   T)rg   ri  rp   r	   r  r	   ru   r^  )rz   is_unimplemented)rd   r  s   ` rS   _unimplementedzOpOverrides._unimplementedM  s     	
 "&)-&rU   c                p    t        | |d       }t        t        |d       }| xs ||k(  xs t        |dd      S )Nr  F)getattrr3   )r  rd   fn
default_fns       rS   _is_unimplementedzOpOverrides._is_unimplementedX  s?    S$%Zt4
vSz)SWR9KU-SSrU   c                P   |dv sJ |       t         j                         D ]  \  }}t        ||      }|/| j                  |      s&t	        | || j                  |             C|| j                  vsJ d| d| j                          ||_        t	        | |t        |              y )N)ri  rg  cppvecrh  rs  zmultiple definitions of z on )	pointwise_overrides_datar  r  r  setattrr  __dict__rz   r   )r  r  funcnamedataimpls        rS   _initialize_pointwise_overridesz+OpOverrides._initialize_pointwise_overrides^  s    EEMvME6<<> 
	;NHd4(D|((2C3+=+=h+GHs||3 .xjS\\NK3 !)X|D'9:
	;rU   )rd  r^  ru   r^  )rf   zUnion[bool, float, int]r   r   ru   r^  r_  )r   r^  ru   r^  )r  r^  rD  r^  ru   r^  r`  )rd   rx   r   r^  ru   r^  TT)
r  r^  r   Union[sympy.Expr, int]r  r   r  r   ru   sympy.Symbol
r   r   r   r   r  r   r  r   ru   rv   )rd   rx   r  r   ru   r^  rk   )
rd   rx   r  r   rf   r^  r[   r6   ru   rv   )rd   rx   r  r   rf   r^  ru   rv   )
r   r   r  r   r  r5   rf   !Union[OpVarT, tuple[OpVarT, ...]]ru   r  )r  tuple[torch.dtype, ...]r  zFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]r  tuple[OpVarT, ...]ru   r  )
r  r  r  r  r  r   r  r   ru   r  NN)r  r^  r  .tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  r^  r  r   r  r   r   Optional[tuple[str, sympy.Expr]]r  zOptional[OpVarT]ru   r^  )rf   r^  r   r   r  r   ru   r^  )r  r^  r  rx   r  r   r   r   r  r   r  r   ru   r^  )rp   r^  ru   rv   )r  r   ru   r^  )rd   rx   ru   zCallable[..., OpVarT]rd   rx   ru   r   )r  rx   ru   rv   )"rz   r{   r|   r   rn  r  rt  rv  ry  r|  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r&  r  r  r  r  r   r  r  r   rU   rS   ri  ri    s'       * * . . B B B B B B C C C C ! ! 5 5 ,, %, 	,
 , 
,

&0
9=
FJ
	

 NR

 *
39
AJ
	


	
	
 	
 &		

 1	
 
+	

'


 #
 

	
'	
 #	
 		

 	
 
	
$ 48+/

 C
 !	

 $
 
 1
 )
 


 &*"]]

 
 #	

 
 
 
 





   T T
 ; ;rU   ri  c                  |    e Zd ZU ded<   ded<   dZded<   dZded<   ej                  Zd	ed
<   dZ	ded<   dZ
ded<   y)OverridesDatarx   rd   r   rg  NzOptional[Callable[..., str]]ri  r  r   type_promotion_kindrh  rs  )rz   r{   r|   r~   ri  r  r   DEFAULTr  rh  rs  r   rU   rS   r  r  o  sQ    
I	+/F(/+/F(/'// 8  ,0F(/(,C	%,rU   r  airy_aic                    d|  dS )Nzairy_ai_forward(r   r   r  s    rS   rm  rm    s    (1- rU   special_airy_ai)r  rg  rd   	bessel_j0c                    d|  dS )Nzbessel_j0_forward(r   r   r  s    rS   rm  rm        *1#Q/ rU   c                    d|  dS )Nzlibdevice.j0(r   r   r  s    rS   rm  rm        =1- rU   special_bessel_j0)r  rg  ri  rd   	bessel_j1c                    d|  dS )Nzbessel_j1_forward(r   r   r  s    rS   rm  rm    r  rU   c                    d|  dS )Nzlibdevice.j1(r   r   r  s    rS   rm  rm    r  rU   special_bessel_j1	bessel_y0c                    d|  dS )Nzbessel_y0_forward(r   r   r  s    rS   rm  rm    r  rU   c                    d|  dS )Nzlibdevice.y0(r   r   r  s    rS   rm  rm    r  rU   special_bessel_y0	bessel_y1c                    d|  dS )Nzbessel_y1_forward(r   r   r  s    rS   rm  rm    r  rU   c                    d|  dS )Nzlibdevice.y1(r   r   r  s    rS   rm  rm    r  rU   special_bessel_y1digammac                    d|  dS )Nzcalc_digamma(r   r   r  s    rS   rm  rm    s    aS* rU   c                    |  dS )Nz
.digamma()r   r  s    rS   rm  rm    s    A3j) rU   )r  rg  r  rd   r+  c                    d|  dS )Nzcalc_erfcx(r   r   r  s    rS   rm  rm        A3a( rU   c                    d|  dS )Nzlibdevice.erfcx(r   r   r  s    rS   rm  rm    s    +A3a0 rU   special_erfcxrF  c                    d|  d| d| dS )Nz	std::fma(r   r   r   rC  s      rS   rm  rm    s    is"QCr!A6 rU   c                    d|  d| d| dS )Nzfmadd(r   r   r   rC  s      rS   rm  rm    s    s"QCr!A6 rU   c                    d|  d| d| dS )Nzlibdevice.fma(r   r   r   rC  s      rS   rm  rm    s    s"QCr!A> rU   )r  rg  r  ri  rd   igammac                    d|  d| dS Nzcalc_igamma(r   r   r   rx  s     rS   rm  rm        <s"QCq1 rU   igammacc                    d|  d| dS Nzcalc_igammac(r   r   r   rx  s     rS   rm  rm        =2aS2 rU   gammaincc                    d|  d| dS r   r   rx  s     rS   rm  rm    r  rU   special_gammainc	gammainccc                    d|  d| dS r  r   rx  s     rS   rm  rm    r  rU   special_gammaincci0c                    d|  dS )Nzcalc_i0(r   r   r  s    rS   rm  rm        1o rU   c                    d|  dS Nzlibdevice.cyl_bessel_i0(r   r   r  s    rS   rm  rm        3A3a8 rU   c                    |  dS )Nz.i0()r   r  s    rS   rm  rm    s    A3e rU   )r  rg  ri  r  rd   i0ec                    d|  dS )Nz	calc_i0e(r   r   r  s    rS   rm  rm        	!A& rU   c                    |  dS )Nz.i0e()r   r  s    rS   rm  rm    s    A3f rU   special_i0ei1c                    d|  dS )Nzcalc_i1(r   r   r  s    rS   rm  rm    r  rU   c                    d|  dS Nzlibdevice.cyl_bessel_i1(r   r   r  s    rS   rm  rm    r  rU   
special_i1i1ec                    d|  dS )Nz	calc_i1e(r   r   r  s    rS   rm  rm    r  rU   special_i1elog_ndtrc                    d|  dS )Nzcalc_log_ndtr(r   r   r  s    rS   rm  rm    s    qc+ rU   special_log_ndtrmodified_bessel_i0c                    d|  dS )Nzmodified_bessel_i0_forward(r   r   r  s    rS   rm  rm        3A3a8 rU   c                    d|  dS r  r   r  s    rS   rm  rm    r  rU   special_modified_bessel_i0modified_bessel_i1c                    d|  dS )Nzmodified_bessel_i1_forward(r   r   r  s    rS   rm  rm    r%  rU   c                    d|  dS r  r   r  s    rS   rm  rm    r  rU   special_modified_bessel_i1modified_bessel_k0c                    d|  dS )Nzmodified_bessel_k0_forward(r   r   r  s    rS   rm  rm    r%  rU   special_modified_bessel_k0modified_bessel_k1c                    d|  dS )Nzmodified_bessel_k1_forward(r   r   r  s    rS   rm  rm    r%  rU   special_modified_bessel_k1ndtrc                    d|  dS )Nz
calc_ndtr(r   r   r  s    rS   rm  rm    s    
1#Q' rU   special_ndtrndtric                    d|  dS )Nzcalc_ndtri(r   r   r  s    rS   rm  rm    r  rU   special_ndtri	polygammac                *    |  d| d|  d| d| d|  dS )Nz == 0 ? calc_digamma(z) : (z == 1 ? trigamma(z) : calc_polygamma(r   z))r   rx  s     rS   rm  rm  	  s8    S%aSaS0A!DWXYWZZ\]^\__ab rU   scaled_modified_bessel_k0c                    d|  dS )Nz"scaled_modified_bessel_k0_forward(r   r   r  s    rS   rm  rm        :1#Q? rU   !special_scaled_modified_bessel_k0scaled_modified_bessel_k1c                    d|  dS )Nz"scaled_modified_bessel_k1_forward(r   r   r  s    rS   rm  rm    r<  rU   !special_scaled_modified_bessel_k1spherical_bessel_j0c                    d|  dS )Nzspherical_bessel_j0_forward(r   r   r  s    rS   rm  rm    s    4QCq9 rU   special_spherical_bessel_j0zetac                    d|  d| dS )Nzzeta(r   r   r   rx  s     rS   rm  rm  !  s    52aS* rU   special_zetachebyshev_polynomial_tc                    d|  d| dS )Nzchebyshev_polynomial_t_forward(r   r   r   rx  s     rS   rm  rm  &      :1#Rs!D rU   special_chebyshev_polynomial_tchebyshev_polynomial_uc                    d|  d| dS )Nzchebyshev_polynomial_u_forward(r   r   r   rx  s     rS   rm  rm  +  rI  rU   special_chebyshev_polynomial_uchebyshev_polynomial_vc                    d|  d| dS )Nzchebyshev_polynomial_v_forward(r   r   r   rx  s     rS   rm  rm  0  rI  rU   special_chebyshev_polynomial_vchebyshev_polynomial_wc                    d|  d| dS )Nzchebyshev_polynomial_w_forward(r   r   r   rx  s     rS   rm  rm  5  rI  rU   special_chebyshev_polynomial_wlegendre_polynomial_pc                    d|  d| dS )Nzlegendre_polynomial_p_forward(r   r   r   rx  s     rS   rm  rm  :      9!BqcC rU   special_legendre_polynomial_pshifted_chebyshev_polynomial_tc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_t_forward(r   r   r   rx  s     rS   rm  rm  ?      B1#Rs!L rU   &special_shifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_uc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_u_forward(r   r   r   rx  s     rS   rm  rm  D  rZ  rU   &special_shifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_v_forward(r   r   r   rx  s     rS   rm  rm  I  rZ  rU   &special_shifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_wc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_w_forward(r   r   r   rx  s     rS   rm  rm  N  rZ  rU   &special_shifted_chebyshev_polynomial_whermite_polynomial_hc                    d|  d| dS )Nzhermite_polynomial_h_forward(r   r   r   rx  s     rS   rm  rm  S  s    82aSB rU   special_hermite_polynomial_hhermite_polynomial_hec                    d|  d| dS )Nzhermite_polynomial_he_forward(r   r   r   rx  s     rS   rm  rm  X  rV  rU   special_hermite_polynomial_helaguerre_polynomial_lc                    d|  d| dS )Nzlaguerre_polynomial_l_forward(r   r   r   rx  s     rS   rm  rm  ]  rV  rU   special_laguerre_polynomial_lzdict[str, OverridesData]r  c                     t         fdt        j                  j                  t        j                  j                  t        j                  j
                  t        j                  j
                  fD              S )Nc              3  &   K   | ]  }|v  
 y wrk   r   )r  r  rd   s     rS   r  z$is_buffer_removed.<locals>.<genexpr>d  s       		   )anyr7   r   removed_bufferskernelinplaced_to_removerd   s   `rS   is_buffer_removedrv  c  sU      GG##HH$$GG&&HH''	
  rU   c                  4     e Zd ZdZd fdZddZddZ xZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersc                V    t         |   |       || _        t        |t              rJ y rk   )r  r   rd   r2  r'   )rg   rd   liner
  s      rS   r   zDeferredLine.__init__r  s+    	d$45555rU   c                F    t        | j                        s| j                  S y rk   )rv  rd   rz  rl   s    rS   __call__zDeferredLine.__call__w  s     +99rU   c                .    t        | j                  |      S rk   )rx  rd   )rg   rz  s     rS   	_new_linezDeferredLine._new_line|  s    DIIt,,rU   )rd   rx   rz  rx   r   )rz  rx   ru   rx  )rz   r{   r|   r}   r   r|  r~  r  r  s   @rS   rx  rx  o  s    R6

-rU   rx  c                      e Zd ZdddZy)BracesBufferc                H     t         j                  d fd       } |       S )Nc               3    K   t              D ](  } j                  d       xj                  dz  c_        * t               D ](  } xj                  dz  c_        j                  d       * d  t               D ](  } j                  d       xj                  dz  c_        * t              D ](  } xj                  dz  c_        j                  d       * y w)N{rG   })ranger  _indent)_r   rg   s    rS   ctxz BracesBuffer.indent.<locals>.ctx  s     6] "s#!" F7^ $!s#$ F7^ "s#!" 6] $!s#$s   C C#)ru   Iterator[None])
contextlibcontextmanager)rg   r   r  s   `` rS   indentzBracesBuffer.indent  s$    		"	"	$ 
#	$ urU   N)rG   )r   r   ru   z'contextlib.AbstractContextManager[None])rz   r{   r|   r  r   rU   rS   r  r    s    rU   r  c                  "    e Zd ZU ded<   ded<   y)InplacedBufferrx   r   r   other_namesNr   r   rU   rS   r  r    s    OrU   r  c                  .    e Zd ZU ded<   dZded<   ddZy)	ArgNamerx   rd   Fr   is_constexprc                B    | j                    | j                  rd S d S )Nz : tl.constexprr  )rd   r  rl   s    rS   	full_namezArgName.full_name  s*    ))$2C2C.LMMLMMrU   Nrw   )rz   r{   r|   r~   r  r  r   rU   rS   r  r    s    
IL$NrU   r  c                      e Zd ZddZy)
RemovedArgc                     y)NREMOVEDr   rl   s    rS   __str__zRemovedArg.__str__  s    rU   Nrw   )rz   r{   r|   r  r   rU   rS   r  r    s    rU   r  c                      e Zd Ze	 	 	 	 	 	 	 	 dd       ZddZddZedd       ZddZddZ	ddZ
ddZdd	Zdd
ZddZd dZd!dZd"dZd#dZ	 d$	 	 	 d%dZ	 	 d&dZd'dZd(dZd)dZy)*
KernelArgsc                ~    |j                  |t              }t        |t              r|  t	        |       x||<   }|S |S rk   )getr  r2  r  r  )r   odictrd   result
new_results        rS   _lookupzKernelArgs._lookup  sD     */4)Afj)*0#e*'>>E$K*rU   c                J    i | _         i | _        i | _        i | _        g | _        y rk   )input_buffersoutput_buffersinplace_buffersr  workspace_argsrl   s    rS   r   zKernelArgs.__init__  s)    -/ACMO/124rU   c                    dj                  dj                  t        t        | j                  | j
                  | j                  | j                  g                  S )NzKernelArgs({})r   )formatr   maprp  r  r  r  r  rl   s    rS   __repr__zKernelArgs.__repr__  sS    &&II**++,,	

 	
rU   c                "    t        | t              S rk   )r2  r  ru  s    rS   _buffer_is_marked_removedz$KernelArgs._buffer_is_marked_removed  s     $
++rU   c                :   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v rt        t        | j                  |         S || j                  v r't        t        | j                  |         j                  S |j                  d      r| j                  d| j                  |      S | j                  d| j                  |      S )Nseedin_ptr)r7   r   r  mutation_real_namer  rr  r  r   rx   r  r  r   r  r  r  r  s     rS   inputzKernelArgs.input  s    7777$$77;;D$GD1772228D824&&&T006774'''(<(<T(BCNNN??6"<<(:(:DAA||Hd&8&8$??rU   c                   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v r't        t        | j                  |         j                  S | j                  d| j                  |      S )Nout_ptr)r7   r   r  r  r  rr  r  r   r  r   r  r  r  s     rS   r  zKernelArgs.output  s    7777$$77;;D$GD1772228D824'''(<(<T(BCNNN||It':':DAArU   c                   |t         j                  j                  v r)t         j                  j                  j                  |       || j                  vsJ |       || j                  v rL| j                  |   }t        |t              rJ |j                  j                  |       || j                  |<   y | j                  j                         D cg c]  }t        |t              s| }}| j                  j                         D cg c]  }t        |t              r| }}t        t        |            t        |      z   }t        d| ||g      }|| j                  |<   || j                  |<   y c c}w c c}w )N
in_out_ptr)r7   r   unaligned_buffersr9  r  r2  r  r  appendr  r  r1   r  )rg   
input_nameoutput_namebufvalalive_buffersrr  inplace_buffer_idxs           rS   make_inplacezKernelArgs.make_inplace  sk   222GG%%))+6$"6"66CC6---&&z2C!#z222OO"";/03D  -  //668!#z2 M   //668c:. O 
 "%VM%:!;c/>R!R /01[)C 03D  ,03D  -!
s   E3E8c                J   t        |t        j                  |      t        j                  j                         t         j                               }t        | j                        D ]  \  }}t         j                  ||      r?|j                  }t         j                  ||      | j                  |<   |j                  |fc S |j                  |j                  k7  r|j                  |j                  k7  rJ |        | j                  j                  |       |j                  dfS )a  
        Allocate or extend a workspace buffer of nbytes bytes.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
        a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
        this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nbytes (sympy.Expr): The number of bytes to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.

        Returns:
            Tuple[str, int]: A tuple containing:
                - "ws_ptr": A string identifier for the workspace pointer.
                - offset: An integer representing the byte offset in the workspace.
        )r   r   r   r   r   )r   r   r   r7   r   get_current_device_or_throwr   rc  r  r   r   r   r   r   r  )rg   nbytesr   argre  existing_argr   s          rS   r+  zKernelArgs.workspace  s   . '11)<77668#//1	
  ))<)<= 	OA|$$\37%++)5):):<)M##A&#..66''3>>9 ++s~~= >	 	""3'~~q  rU   c           
        t         j                  j                         }t        |t        j
                  t        j                  dd|j                   d|j                   |      }| j                  D ]*  }|j                  |j                  k(  s||k(  r#J ||f        | j                  j                  |       |j                  S )a  
        Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
        all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        sem_ptrsemaphores_r  )r   r   r   r   r   r   )r7   r   r  r   r   r   r   uint32rH  r  r  r   r  )rg   min_sizecurrent_devicer  r  s        rS   
semaphoreszKernelArgs.semaphores-  s     <<>'66,, $^%8%8$9>;O;O:PQ!
 !// 	@L&&#..8l*?S,,??*	@ 	""3'~~rU   c                f   t        |t              sJ t        |      |f       t        j                  |      }|| j
                  v r| j
                  |   S | j
                  j                         v r0 t        fd| j
                  j                         D               | j
                  |<   S )Nc              3  F   K   | ]  }|j                        sd   yw)rG   N)r  )r  r  rd   s     rS   r  z)KernelArgs.seed_offset.<locals>.<genexpr>Q  s     U1!,,tBTQUs   !!)r2  r   rH  r   r  r  r  sum)rg   rd   rf   s    ` rS   seed_offsetzKernelArgs.seed_offsetI  s    %%;UU';;%e$DMM!==''4==''))&U(<(<(>UUVW   $erU   c                    t        |t        j                        sJ t        |      |f       |j                  dk(  rd| j
                  |<   y| j                  d| j
                  |      S )Nr  ks)r2  r   SymbolrH  rd   r  r  r  s     rS   r   zKernelArgs.sizeV  sX    $-AT
D/AA-99"(DMM$||D$--66rU   c                    t        | j                  j                         | j                  j                         | j                  j                               S rk   )r   r  keysr  r  rl   s    rS   
call_nameszKernelArgs.call_names]  sA    ##%t':':'?'?'A4==CUCUCW
 	
rU   c                   | j                   j                  |d      }|t        |t              s|j                  S | j
                  j                  |d      }|t        |t              s|S | j                  j                  |d      S )z;
        Returns inner name of a given outer name.
        N)r  r  r2  r  r   r  r  )rg   rd   inplacedr  s       rS   arg_namezKernelArgs.arg_nameb  s}     ''++D$7
8Z(H&&&))--dD9":k:+N!!%%dD11rU   c                    |S rk   r   )rg   r  r   s      rS   wrap_ptr_argzKernelArgs.wrap_ptr_argn  s    
rU   c                    t        |      S rk   )rx   )rg   r   s     rS   wrap_size_argzKernelArgs.wrap_size_argq  s    4yrU   Nc                   ddl m} |ddl m} |}g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  d   }|j                  }	t        j                  j                  |      }
||
   }|j                  | d|	        |j                  | j                  ||
             |j                  | d        | j                  j!                         D ]  \  }}	|| j                  v rt        j                  j                  |      }
||
   }|j                  d| d|	        |j                  | j                  ||
             |j                  d| d        | j"                  j!                         D ]  \  }}|| j                  v st        |t              r%t        j                  j                  |      }
||
   }|j                  | d|        |j                  | j                  ||
             |j                  | d        | j$                  j!                         D ]  \  }}	|j                  d| d|	        |j                  | j'                  |             |j                  d|        t        j                  j(                  slt        j                  j(                  j+                  |        | j,                  rJ d	       |||fS )
NrG   )
INDEX_TYPE)r  r  z* *zconst  zWorkspace not supported on CPU )r  r  r  r1   r  r  r2  r  r  r   r7   r   r   r  r  r  r  r  r  r  wrapper_codeensure_size_computedr  )rg   dtype_to_cpp_typer  r  	call_argsarg_defs	arg_typesr  outerinnerr   	cpp_dtypemaybe_inners                rS   cpp_argdefszKernelArgs.cpp_argdefst  s    	*$/ ,		t33::<= 		.H(J/((,E''EGG%%e,E)%0IOOykE734T..ue<=	{!_-		. !..446 	4LE5,,,GG%%e,E)%0IOOfYKr%9:T..ue<=vi[23	4 #'"5"5";";"= 	.E;,,,
;
0SGG%%e,E)%0IOOykK=9:T..ue<=	{!_-	. !MM//1 	ALE5OOfZL%9:T//67vj\23ww##$$99%@	A &&I(II&I--rU   c                   g }g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  t        |j                               |j                  |j                  d          |j                  t        j                  j                  |j                  d                |j                  t        |j                  |j                  d   t        j                  j                  |j                  d                       t        | j                  j                         | j                   j                               D ]  \  }}|| j                  v st        |t              r%|j                  t        |             |j                  |       |j                  t        j                  j                  |             |j                  t        ||t        j                  j                  |                    | j"                  j                         D ]  \  }}|j                  t        |             |j                  |       |j                  t%        |             |j                  t'        ||             t        j                  j(                  st        j                  j(                  j+                  |        | j,                  D ]m  }|j                  t        |j                               |j                  |j.                         |j                  |       |j                  |j0                         o ||||fS )Nr  )rd   r   r   )r1   r  r  r2  r  r  r  r   r  r7   r   r   r   r   r  r  r  r  rH  r   r  r  r  r   r   )	rg   r  r  r  precompile_argsr  r  r  r  s	            rS   python_argdefszKernelArgs.python_argdefs  s    #%!	!	/1t33::<= 	H(J/OOGH$7$789X11"56QWW..x/C/CB/GHI""!,,#//3''++H,@,@,DE	 "$$&(;(;(A(A(C
 	LE5 ,,,
5*0MOOGEN+U#QWW..u56"" ''++E2	 !MM//1 	ALE5OOGEN+U#T%[)""75%#89ww##$$99%@	A && 	(COOGCNN34S^^,""3'SYY'		(
 OY>>rU   c              #    K   t        | j                  j                               D ]  }t        |t              r|j
                  D ]  }|t        j                  j                  v s|t        j                  j                  v r<|| j                  v r| j                  |   |j                  f || j                  v svt        t        | j                  |         |j                  f   y wrk   )r1   r  r  r2  r  r  r7   r   rt  rs  r  r   r  r   rx   )rg   r  others      rS   aliaseszKernelArgs.aliases  s     t33::<= 	UH(J/!-- 	UQWW777 ; ;;D...,,U3X5H5HHHD///sD$7$7$>?ATATTT	U	Us   B9C,<0C,c                    t        | j                  j                  |t              t              xr. t        | j
                  j                  |t              t              S rk   )r2  r  r  r  r  r  r  s     rS   
is_removedzKernelArgs.is_removed  sK    ##D'2J
 N--11$@*M	NrU   c                l   t               }t        | j                  j                               D ]1  }t	        |t
              r|j                  |j                  d          3 | j                  j                         D ]5  \  }}|| j                  v st	        |t
              r%|j                  |       7 |S )Nr  )
r   r1   r  r  r2  r  r9  r  r  r  )rg   	live_outsr  r  r  s        rS   live_output_bufferszKernelArgs.live_output_buffers  s    %/\	t33::<= 	4H(J/MM(..r23	4 !//557 	!LE5,,,
5*0MMM% 	! rU   )r   rx   r  z6Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]]rd   rJ   ru   rx   rt   rw   )rd   r	   ru   r   r-  )r  rx   r  rx   ru   rv   )r  r   r   r   ru   ztuple[str, int])r  r   ru   rx   )rd   rx   rf   r   ru   rx   )rd   r  ru   rx   )ru   zIterator[str])rd   rx   ru   r   )r  rx   r   r   ru   rx   )r   
SymbolLikeru   rx   rk   )r  z Optional[dict[torch.dtype, str]]ru   z&tuple[list[str], list[str], list[str]])ru   z?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]])ru   zIterator[tuple[str, str]]r  )ru   zOrderedSet[str])rz   r{   r|   r   r  r   r  r  r  r  r  r+  r  r  r   r  r  r  r  r  r  r  r  r  r   rU   rS   r  r    s    		E	 	 
		 	5
 , ,
@B48'!R87


2 EI..!A..	/..`/?	H/?bUN
rU   r  c                  `     e Zd ZdZ	 	 d	 	 	 	 	 	 	 d	 fdZd
dZddZddZddZd
dZ	 xZ
S )rk  aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    c                    t         |           t        |t              sJ t	        |             || _        || _        d| _        || _        || _	        y r  )
r  r   r2  r   rH  rd   bounds	use_countr   shape)rg   rd   r  r   r  r
  s        rS   r   zCSEVariable.__init__   sL     	&+.<V<.	

rU   c                    | j                   S rk   ru  rl   s    rS   r  zCSEVariable.__str__  s    yyrU   c                ,    t        | j                        S rk   )hashrd   rl   s    rS   __hash__zCSEVariable.__hash__  s    DIIrU   c                X    t        |t              xr |j                  | j                  k(  S rk   )r2  rk  rd   )rg   r  s     rS   __eq__zCSEVariable.__eq__  s!    %-I%**		2IIrU   c                     y rk   r   )rg   rd   rp   r  s       rS   update_on_argszCSEVariable.update_on_args  s    rU   c                N    | j                   j                   d| j                  dS r  )r
  rz   rd   rl   s    rS   r  zCSEVariable.__repr__  s$    ..))*!DII=::rU   r  )rd   rx   r  ValueRanges[Any]r   r   r  rF   rw   )ru   r   )r  objectru   r   )rd   rx   rp   r	   r  r	   ru   rv   )rz   r{   r|   r}   r   r  r  r  r  r  r  r  s   @rS   rk  rk    sU     (, $ ! %	
 J;rU   rk  AugmentedKeyT)default)boundr  .c                  P   e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZddZddZddZddZ	dd	Z
dd
ZddZ ej                         ddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 	 	 ddZy)CSEz Common subexpression eliminationNc                    || _         || _        i | _        || _        |xs i | _        |xs i | _        |xs t        j                         | _        t               | _
        |xs i | _        y rk   )r   r\   _cachename_prefixstore_cachereduction_cache	itertoolsr   iter_buffer_idsr   invalidated_storesvarname_map)rg   r   r\   r  iter_buffersr  r  r  s           rS   r   zCSE.__init__-  sm     FH&ALARPR!r 	 6B5VY__EV3=<7B7HbrU   c                6   g | j                   j                         D ]2  \  }}||vs| j                   |= | j                  j                  |       4 |r9| j                  j                         D ci c]  \  }}||v s|| c}}| _        y i | _        y c c}}w rk   )r  r  r  r9  r  )rg   	keep_varsrd   tmpr  r  s         rS   
invalidatezCSE.invalidateE  s    44++1134 	2ID#)#$$T*''++D1	2 ,0KK,=,=,?RDAq1	>1a4RDKDK Ss   1B>Bc           	          t        |       | j                  | j                  | j                  | j                  | j
                  | j                  | j                        S )N)r   r\   r  r  r  r  r  )rH  r   r\   r  r  r  r  r  rl   s    rS   clonez	CSE.cloneO  sP    tDz;;;;((--(((( 00
 	
rU   c                    | j                         }t        | j                        |_        t        | j                        |_        t        | j                        |_        |S )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)r  r,   r  r  r  )rg   new_cses     rS   scoped_copyzCSE.scoped_copyZ  sH    **,#DKK0",T-A-A"B()9)9:rU   c                "    t        t        |      S )z@Override this method to augment cache key with backend specifics)r   r  rg   	cache_keys     rS   augment_keyzCSE.augment_keyb  s    M9--rU   c                @    || j                   | j                  |      <   y rk   r  r"  )rg   r!  r  s      rS   putzCSE.putf  s    36D$$Y/0rU   c                <    | j                  |      | j                  v S rk   )r"  r  r   s     rS   containszCSE.containsi  s    	*dkk99rU   c                X    | j                   j                  | j                  |      d       S rk   )r  r  r"  r   s     rS   try_getzCSE.try_getl  s"    {{t//	:DAArU   c                >    | j                   | j                  |         S rk   r$  r   s     rS   r  zCSE.geto  s    {{4++I677rU   T)r  re   
assignmentr   r  c          	        t        |t              r|j                  }|s|sJ t        |t              rE|j                  j                  |      |_        |xj                  dz  c_        t        t        |      S t        |t              r|j                         }n1t        |t              r|j                  }nt        |t              sJ |}| j                  |      }	||sd}|	s| j                  |||      }	| j!                  ||	       |rt"        j$                  j&                  r+t"        j$                  j&                  j)                  |d       t        |t              rP|r |j+                  | j,                   |	 d       |j/                  |       |j+                  | j0                         |	S t        |t              rM|sJ |j+                  |j3                  | j,                   |	 d|j                   | j0                                |	S |r | j,                   |	 d| | j0                   }
n| | j0                   }
|j+                  |
       |rPt4        j6                  j8                  st4        j6                  j:                  r|t=               dk7  rt?        ||	|       |	S |	j                  j                  |      |	_        |	xj                  dz  c_        |	S )NrG   r   T)	only_oncez =z = rg  ) r2  r4   rf   rk  r  tightenr  r   r  r*   getvaluer'   rz  rx   r)  newvarr%  r7   rs  current_nodecodegen_originating_infor  r   splicer\   r~  r    r  r  r  r)   r  )rg   r   r   r  re   r+  r   r  r!  r  rz  s              rS   generatezCSE.generater  sl    dH%::D
""dK( ++--f5DKNNaN..n-I./		IdC(((Ill9%= E++feU3CHHY$88((HH))BB$ C  dN3!((DKK=R)@AMM$'$$T[[1: 
9  &67%%:$$$++se3tyyk$++'WX4 
- ""&++se3tfT[[MJ"&}5$$T* #"//KK%22JJ!-/1U:#FC7 
 ++F3CJMMQM
rU   c                    | j                    t        | j                         }t        j                  j                  ||||      }|| j                  |<   |S rk   )r  r   r  r7   rs  create_cse_varr  )rg   r  r   r  var_namer  s         rS   r0  z
CSE.newvar  sT     &&'T-A-A(B'CDhh%%huE%("
rU   c                    t        j                  | j                  vfd       t        j                  j                  |||      }|| j                  <   |S )Nc                     d  S )Nzduplicate name: r   ru  s   rS   rm  zCSE.namedvar.<locals>.<lambda>  s    4DTF2K rU   )r   _check_valuer  r7   rs  r6  )rg   rd   r  r   r  r  s    `    rS   namedvarzCSE.namedvar  sU     	(((*K	
 hh%%dFE5A!$
rU   )r  r  r  NNNN)r   rx   r\   rx   r  rx   r  zOptional[itertools.count[int]]r  z.Optional[MutableMapping[str, CSEVariableType]]r  z<Optional[MutableMapping[ReductionCacheKey, CSEVariableType]]r  z$Optional[dict[str, CSEVariableType]])r  zOrderedSet[CSEVariable]ru   rv   ru   r   )r!  rx   ru   r  )r!  rx   r  r  ru   rv   )r!  rx   ru   r   )r!  rx   ru   zOptional[CSEVariableType])r!  rx   ru   r  )r   r*   r   zCUnion[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase]r  r  re   r   r+  r   r   r   r  rF   ru   r  )r  r  r   r   r  rF   ru   r  )
rd   rx   r  r  r   r   r  rF   ru   r  )rz   r{   r|   r}   r   r  r  r  r"  r%  r'  r)  r  r   unknownr4  r0  r;  r   rU   rS   r  r  *  s   *  7;FJ <@II I 	I
 5I DI
I :I0	
.7:B8 $7;#6#6#8'+ $KK RK
 !K K K %K K 
K^ $7;#6#6#8'+ $		 	 %	 		
 
	 $7;#6#6#8'+ $ ! %	
  
rU   r  c                  0     e Zd Zd fdZddZddZ xZS )CodeGenc                T    t         |           t        j                         | _        y rk   )r  r   r  	ExitStack
exit_stackrg   r
  s    rS   r   zCodeGen.__init__  s    $..0rU   c                :    | j                   j                          | S rk   )rB  	__enter__rl   s    rS   rE  zCodeGen.__enter__  s    !!#rU   c                >    | j                   j                  |||       y rk   )rB  __exit__)rg   exc_typeexc_valexc_tbs       rS   rG  zCodeGen.__exit__  s      7F;rU   rt   r<  rH  r	   rI  r	   rJ  r	   ru   rv   )rz   r{   r|   r   rE  rG  r  r  s   @rS   r?  r?    s    1<rU   r?  c                      e Zd ZU dZded<   dZded<   dZded<   	 d 	 	 	 	 	 d! fdZej                  d"d	       Z
ej                  	 	 d#	 	 	 	 	 	 	 d$d
       Zd%dZd%dZd&dZ	 d'	 	 	 	 	 	 	 	 	 d(dZ	 	 	 	 	 	 	 	 	 	 d)dZ	 	 	 	 	 	 	 	 d*dZ	 	 	 	 	 	 	 	 	 	 d+dZd,dZ	 	 d#	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d-dZed.d       Z	 d'	 	 	 	 	 	 	 	 	 d/dZ	 	 	 	 	 	 	 	 	 	 d0dZd1dZd2 fdZd3 fdZd4dZd5dZd5dZ	 	 	 	 d6dZd7dZ d8dZ! xZ"S )9Kernelr  rx   newvar_prefixr\   Nz'Optional[Callable[[], OpsHandler[Any]]]	overridesc                   t         |           |rt        xj                  dz  c_        |xs
 t	               | _        t               | _        t               | _        t               | _	        d| _
        d| _        t        | j                  | j                        | _        t!               | _        t!               | _        d | _        d | _        d | _        d | _        t!               | _        t!               | _        i | _        d| _        d | _        y )NrG   r   )r  r   r!   generated_kernel_countr  rp   r*   loadscomputestoresnum_loadnum_reductionr  rN  r\   cser   must_keep_buffersstore_buffer_names
_load_mask_load_otherr1  node_to_boundsrr  rt  inplace_update_buffersmin_elem_per_threadkernel_name)rg   rp   increase_kernel_countr
  s      rS   r   zKernel.__init__  s     	 **a/*(JL	#%
%'$&.1$2D2Ddkk.R2<,3=<)-4859OS0:3=<
 79##$ *.rU   c              #     K   | j                   }|| _         |j                  j                         j                         | _        	 d  || _         y # || _         w xY wwrk   )r1  r  r  
get_boundsr\  )rg   r  priors      rS   set_current_nodezKernel.set_current_node  sO     !! "jj//1<<>	& %DDs   AAA A	AAc              #    K   ||}|d u x}r
t               }| j                  }| j                  }| j                  }| j                  }|| _        || _        || _        |j                         | _        	 d  || _        || _        || _        || _        |r
|rJ d       y y # || _        || _        || _        || _        |r
|rJ d       w w xY ww)Nz$unexpected store inside swap_buffers)r*   rR  rS  rT  rW  r  )	rg   lbcbsbdisallow_storesrR  rS  rT  rW  s	            rS   swap_bufferszKernel.swap_buffers  s      :B Dj(?(!B

,,hh
??$	FDJ"DL DKDHEEEv2 	 DJ"DL DKDHEEEv2 s   A/C2B 6)C*C		Cc                    t         rk   r  r  s      rS   r  zKernel.load4  r  rU   c                    | j                   }	 | j                  | _         | j                  ||      || _         S # || _         w xY w)z+A load the depends on an index we have read)rR  rS  r  )rg   rd   r  rc  s       rS   indirect_loadzKernel.indirect_load7  s8    

	DJ99T5)DJDJs	   "8 	Ac                    t         rk   r  r  s       rS   r  zKernel.store_reductionA  r  rU   c                    t         rk   r  r  s        rS   r  zKernel.storeD  
     "!rU   c                    t         rk   r  r  s        rS   r  zKernel.reductionI  
     "!rU   c                    t         rk   r  r  s       rS   r  zKernel.scanR  s
     "!rU   c                    t         rk   r  r  s        rS   r  zKernel.sort\  rr  rU   c                    t         rk   r  rl   s    rS   
var_rangeszKernel.var_rangese  r  rU   c                    t         )z3
        See [Note: Inductor bucketize op]
        r  r  s           rS   r  zKernel.bucketizeh  s
     "!rU   c                    t         rk   r  rl   s    rS   assert_functionzKernel.assert_functionw  s    !!rU   c           	     v   t        |t              rt        |      }t        |t              sJ t        |             |t        |t              sJ |t        |t              sJ |r|rd| d| d| d| d	}| d| d| }n|r
| d| }|}n|sJ | d| }|}|r	d| d| d}| j                   d| d| dS )	Nr  z <= z) & (z < r   z) | ~(z, "index out of bounds: z"))r2  rk  rx   rH  ry  )rg   r  r  r  maskrY  
cond_prints          rS   indirect_assertzKernel.indirect_assert{  s    c;'c(C#s#.T#Y.#}
5# 666}
5# 666U ugT#eC5E7!<D!7$se3ug6JWD&DJL5U#eW%DJtfF4&*D&&'q.FzlRTUUrU   c                    t         rk   r  r  s        rS   r  zKernel.check_bounds  rp  rU   c                    t         rk   r  r  s     rS   index_to_strzKernel.index_to_str  r  rU   c           	     (   t         |           | j                  sJ | j                  j	                  t        j                  t        | | j                                            | j                  j	                  t        j                  |              | S rk   )	r  rE  rO  rB  enter_contextr7   set_ops_handlerCSEProxyset_kernel_handlerrC  s    rS   rE  zKernel.__enter__  sl    ~~~%%htT^^-=>?	
 	%%a&:&:4&@ArU   c                H    | j                          t        | 	  |||       y rk   )remove_kernel_local_buffersr  rG  )rg   rH  rI  rJ  r
  s       rS   rG  zKernel.__exit__  s     ((*7F3rU   c                   t         j                  j                  syt        fd| j                  D              }t               | j                  D ]N  }|| j
                  vs|| j                  j                  vs+j                  ||      s>j                  |       P D ]  }|| j                  j                  v rw| j                  j                  |   }t        |t              rEt        fd|j                  D              }|r| j                  |       | j                   j                  |       | j#                  |        y)z
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        Nc              3  t   K   | ]/  }|j                   v rj                   |   j                          1 y wrk   )name_to_bufdefining_op_name)r  r  r  s     rS   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>  s;      &
i+++ !!#&779&
s   58c              3  &   K   | ]  }|v  
 y wrk   r   )r  r  names_to_removes     rS   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>  s     KaQ/1Krp  )r7   r   r  r   rY  rX  rp   r  $can_buffer_be_removed_through_fusionr9  r  r2  r  r  r  remove_inplace_bufferrt  remove_buffer)rg   fused_node_namesrd   r  rc   r  r  s        @@rS   r  z"Kernel.remove_kernel_local_buffers  s%    GG%%	% &
..&
 

 ,6<++ 	*DD222		 7 77BB*  ##D)	* $ 
	)Dtyy000ii//5c:.K3??KK..t4''++D1""4(
	)rU   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremove_buffer(%r))r1  rQ   r  rp   r  rr  r9  r  s     rS   r  zKernel.remove_buffer  s;     			%t,)0		  &  &rU   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremoving_inplace_buffer(%r))r1  rQ   r  rp   r  rr  r9  r  s     rS   r  zKernel.remove_inplace_buffer  s9    		/6*1		!!$'  &rU   c           	        t        |t        t        f      r|D cg c]  }| j                  |       c}S t        j
                  j                  j                  |      }t        |j                  d       }|D ci c]W  }t        |t        j                  t        j                  t        j                  f      r|| j                  j!                  |      Y }}t#        ||      S c c}w c c}w )Nc                    | j                   S rk   ru  )ss    rS   rm  z(Kernel.rename_indexing.<locals>.<lambda>  s
    !&& rU   )r  )r2  listtuplerename_indexingr7   r   r  r  sortedfree_symbolsr   r   UNBACKED_INTSIZEPRECOMPUTED_SIZErp   r   r/   )rg   r  r  sorted_symbolsreplacementss        rS   r  zKernel.rename_indexing  s    
 edE]+5:;D((+;;  ))%0 2 28HI $
%%II)) tyy~~a  
 
 %.. <
s   C%;AC*c                    t        |i |S rk   )rk  )rg   rp   r  s      rS   r6  zKernel.create_cse_var  s    D+F++rU   c                Z    |y| j                   j                  |j                               S )zC
        Returns arg name of a given input or output node.
        N)rp   r  r   )rg   r  s     rS   r  zKernel.arg_name  s'     <yy!!$--/22rU   )NT)rp   zOptional[KernelArgs]r`  r   ru   rv   )r  rE   ru   r  r  )rf  r*   rg  Optional[IndentedBuffer]rh  r  ru   r  rd   rx   r  r   ru   rk  rd   rx   r  r   rf   rk  ru   rv   rk   
rd   rx   r  r   rf   rk  r[   r6   ru   rv   
r   r   r  r   r  r5   rf   +Union[CSEVariable, tuple[CSEVariable, ...]]ru   r  r  r  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]ru   r  
r  r  r  r  r  r   r  r   ru   r  )ru   zdict[sympy.Symbol, sympy.Expr]r  rk  r  r  r  rk  r  r   r  r   r  r  r  zOptional[CSEVariable]ru   rk  rw   )
r  zUnion[CSEVariable, str]r  r   r  r   r{  z!Optional[Union[CSEVariable, str]]ru   rx   r  )r  r   ru   rx   r<  rK  rt   rd   rx   ru   rv   )r  z;Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]ru   r   )rp   r	   r  r	   ru   rk  )r  r@   ru   r   )#rz   r{   r|   rN  r~   r\   rO  r   r  r  rd  rj  r  rm  r  r  r  r  r  rv  r  r   ry  r}  r  r  rE  rG  r  r  r  r  r6  r  r  r  s   @rS   rM  rM    s   M3FC9=I6= PT /( /HL /	 /D & &  (,'+	FF %F %	F
 
F F8"" SW"" *"3>"FO"	"
"" " &	"
 ;" 
5""'"
" (" 
!""'" (" 	"
 " 
!"" 4804"" C" &	"
 $" " 1" ." 
" " " 37V$V V 	V
 0V 
V<""&0"9="FJ"	"
"4%)N''
/P/	/.,3rU   rM  c                  8    e Zd ZU dZded<   dZded<   dZded	<   y)
r  r  zClassVar[str]r  Nr   r   r  rx   ops_name)rz   r{   r|   r  r~   r   r  r   rU   rS   r  r  	  s!    "C"#'E 'HcrU   r  c                 b    	 dd l } | j                  | j                        S # t        $ r Y y w xY w)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  s    rS   
jinja2_envr  	  s?    !!,, " 
 	
  s   " 	..c                      e Zd ZdZe	 d	 	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 dd       ZddZe	dd       Z
ddZ	 	 	 	 	 	 ddZdd	Zy
)KernelTemplatezg
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
    c                    | j                  d      }t        |      dkD  r|dd  D cg c]  }d|z  |z  |z    c}|dd  dj                  |      S c c}w )NTrG   r  r  )
splitlinesr  r   )sourcenum_indentsindents_spacinglinesrz  s        rS   indent_except_firstz"KernelTemplate.indent_except_first 	  sd     !!$'u:>INqrAE&4<E!"I wwu~s   Ac                    t               }|y t        j                  |j                  d<   ddlm} 	 |j                  |       S # |$ r} G d d|      } ||      |d }~ww xY w)Nr  r   )TemplateSyntaxErrorc                  (     e Zd Zd fdZddZ xZS )IKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrorc                    t         |   |j                  |j                  |j                  |j
                         || _        y rk   )r  r   messagelinenord   filenameoriginal_error)rg   r  r
  s     rS   r   zRKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__8	  s>    G$&..&--&++&//	 +9D'rU   c                F   d| j                    d}|d| j                   dz  }t        | j                  d      r| j                  j                  j                  d      }|dz  }t        d| j                   dz
        }t        t        |      | j                   dz         }t        ||      D ]s  }|| j                   dz
  k(  rN||dz    d	||    dz  }t        | j                  d
      s=|dd| j                  j                  dz
  z  z   dz   z  }c||dz    d||    dz  }u |S )NzError in template at line 
zError message: r  z	Context:
r   r   rG   z: --> columnz     r  z^
z:     )r  r  r  r  r  splitmaxminr  r  r  )rg   
error_infor  startendre  s         rS   r  zQKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__A	  sA   #=dkk]"!MJODLL>"DDJt22H= $ 3 3 : : @ @ F"l2
 #At{{Q 7!#e*dkkAo>!&uc!2 
KA DKK!O3 *QveAhZr.J J
#*4+>+>#I$.(/*-1D1D1K1Ka1O*P)Q*/)0%&J !+QveAhZr.J J

K &%rU   )r  r  ru   rv   rw   )rz   r{   r|   r   r  r  r  s   @rS   DetailedTemplateSyntaxErrorr  7	  s    9&rU   r  )r  r  r  filtersr  r  from_string)r  envr  er  s        rS   _template_from_stringz$KernelTemplate._template_from_string+	  sj    l;-;-O-O)*.#	8??6**" !	8&.A &> .a0a7C!	8s   A A!AA!c                   t         j                  j                  t        | t        t
        f      r.| D ci c]!  }|j                         |j                         # c}n | j                         | j                         idfd}|S c c}w )Nc                >    j                  |       }||S  |       S rk   )r  )rd   r  _get_dtype_reallookups     rS   r   z1KernelTemplate._fake_get_dtype.<locals>.get_dtypeb	  s'    ZZ%F!"4((rU   )rd   rx   ru   r   )r7   r   r   r2  r  r  r   )	fake_outsr  r   r  r  s      @@rS   _fake_get_dtypezKernelTemplate._fake_get_dtypeX	  sr     ''++i$/AJK#cllncmmo5KF((*I,?,?,ABF	)  Ls   &B
c                    || _         y rk   ru  r  s     rS   r   zKernelTemplate.__init__j	  s	    	rU   c                    | j                   S )a  
        entry point to override for templates to ensure a uid e.g. through a prefix

        the purpose of this is that every KernelTemplate/ExternKernelChoice is unique
        in the system, but reproducible e.g. restarting pytorch should yield the same id
        ru  rl   s    rS   uidzKernelTemplate.uidm	  s     yyrU   c                X    g } | j                   |fi |}|t        |      dk(  r|d   S y)z
        Maybe generates a new ChoiceCaller and returns it, or None if generation fails.

        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        NrG   r   )maybe_append_choicer  )rg   r  temp_choicesr  s       rS   choice_or_nonezKernelTemplate.choice_or_nonex	  s>     #%))),A&A>c,/14?"rU   c                   	 |j                   | j                  di |       y# t        $ rQ}t        j	                  d|t        |       t        j                         t        j                  k         |cY d}~S d}~ww xY w)a%  
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_infor   )	r  r4  r   r1  inforH  getEffectiveLevelrO   INFO)rg   choicesr  r  s       rS   r  z"KernelTemplate.maybe_append_choice	  sn    
	NN=4==2623" 	HHET
002W\\A	   H	s   !$ 	A>AA93A>9A>c                    t         )zM
        Generates a ChoiceCaller instance from the given arguments.
        r  )rg   r  s     rS   r4  zKernelTemplate.generate	  s
    
 "!rU   N)   )r  rx   r  r   r  r   ru   rx   )r  rx   ru   r	   )r  zUnion[list[Buffer], Buffer]ru   zCallable[[str], torch.dtype]r  rw   )r  r	   ru   zOptional[ChoiceCaller])r  ry   r  r	   ru   zOptional[NotImplementedError])r  r	   ru   r>   )rz   r{   r|   r}   r   r  r  r  r   r   r  r  r  r4  r   rU   rS   r  r  	  s     >?"%8;	  *8 *8X .	% "  
 ,/	&."rU   r  c                  "    e Zd Zd Zd fdZddZddZ	 	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZddZ	ddZ
	 d	 	 	 	 	 	 	 	 	 ddZdd	Z	 	 	 	 	 	 	 	 	 	 dd
Z	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZ	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )r  c                b    t         |           ddlm}  |       | _        || _        || _        y )Nr   ValueRangeAnalysis)r  r   r  r  vr_analysisrs  parent_handler)rg   rs  r  r  r
  s       rS   r   zCSEProxy.__init__	  s+    /-/,rU   c           	     d  
  | j                   gi  t        | j                        i }t               }t	               }t               
t        |      }d d dk(  r
dk(  r|j                  |j                  nydk(  rT
dk(  rOt        j                  j                  j                  j                  t        j                  d       j                  d n 
dv rt        |      } |i  |i 
dv rJ dd
fd}	t        j                   |	|      S )	Nmaskedri  rg  )ri  rg  rs  )ri  rg  r   c                   t        	t        t        f      r	
   n	}t        t        t        f      r,t              dkD  rt        d   t        t        f      r
   n}
dz  
t        | t              r+dk(  r| j
                  || _        | j                  || _        t        j                  j                  j                  t        j                  j                  | 	      }|j                         t        j                  j                  st        j                  j                   r)|J t#        t        j                  j                  ||       |S )Nr   rG   rg  r  r   r  )r2  r  r  r  rk  r   r  r7   rs  rW  r4  rS  r  r    r  r  r  r  )r  	var_dtype	var_shapecsevarrp   r  r  r  rd   r  
output_idxoutput_shapes       rS   do_csez!CSEProxy._default.<locals>.do_cse	  s7   
 lT5M: Z(!  lT5M:%)|Au> Z( "  !OJ ![)e#'AG77?'AGXX\\**  "" + F !!$f5 ##??&&>> ,,,AHH,,fi@MrU   )r  zUnion[str, CSEVariable]ru   rk  )_bound_variabler  r  r"   r%   r)   r   r  r7   interpreterr1  r  r  r  r  pytreetree_map)rg   rd   rp   r  rf   dtype_handlershape_handlershape_opdtype_opr  r  r  r  r  r  s    ```      @@@@@rS   _defaultzCSEProxy._default	  s@   %%%d<T<V<2++T2DCFC2424%'=$/88 3 ;;L ;;LX'U"2==55::>>#''e   L00}d3H#T4V4L#T4V4L''+++
(	 (	T vu--rU   c                z  	 ddl m} ddlm} ddlm} t        t        j                  |      rt        j                         S t        t        j                  |      rt        j                         S t        j                  j                  		j                  |k(  r| j                  j                  t        | j                  j                  t              s$J t!        | j                  j                               | j                  j                  j#                  	t        j                               S t$        j&                  rjt)        ||      r^t+        	fddD              rt        j                         S |rJ d	d}t-        t/        ||            } t1        | j2                  |      | S t        j                         S )
z
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        r   r  )TritonTemplateKernelrG   )CUDATemplateKernelc              3  :   K   | ]  }|j                   v   y wrk   )r  )r  r  fx_nodes     rS   r  z+CSEProxy._bound_variable.<locals>.<genexpr>
  s     V11&Vs   )set_indirectr  r  c                    t        | t              r| j                  S t        | t        j                        rt        |       S | S rk   )r2  rk  r  r   r  r   r  s    rS   arg_to_boundz.CSEProxy._bound_variable.<locals>.arg_to_bound
  s2    a-88O5::.&q>)HrU   )r  r	   ru   r	   )r  r  select_algorithmr  cuda.cuda_kernelr  r2  r7   rs  r   r=  r  r1  r  r\  dictrH  r  r    compute_all_boundsr  rq  r  r  r  r  )
rg   rd   rp   r  r  r  r  r  
arg_boundsr
  s
            @rS   r  zCSEProxy._bound_variable	  sP   
 	0;8ahh 45&&((ahh 23&&((--,,>>T!dkk&@&@&Ldkk88$? **B ? ;;--11';;N;N;PQQ&&73Et+L V0UVV"**,, : c,56J274++T2J??""$$rU   c                P   t        |t              rt        j                  |      }t        |t        j                        sJ t        |      |f       |j                  j                  dk  r|rt        j                  |t        j                  |t        j                              }|j                  j                  dk\  r0t        j                  |d      }t        j                  |||      }n|}t!        j"                         }|j                  t!        j"                         k7  rt        |t        j$                        r|j                  t!        t&         d      z  }t!        |j                  |z   |j                  |z         }|j                  j                  dk\  r"|j                  t!        dt&              z  }	||	z  }| j(                  j*                  j-                  | j(                  j.                  |||j0                  |j2                        }| j4                  j7                  |||      }
t9        |      ro|j                  j                  dk\   }t        |t        j$                         xs |j                  j                  |k   }| j(                  j;                  |
|||       |
S )Nr   r  r  )r2  r   r   r  r  rH  r  r  r2   r9  r  r   longr  ltrW  r   r=  Numberr   rs  rW  r4  rS  r   r  r  r  r(   r  )rg   r  r   r  r  stmr  
new_bounds
neg_boundspos	sympy_varassert_lowerassert_uppers                rS   r  zCSEProxy.indirect_indexing(
  s    dC ==&D$

+?d4j$-??+ ::aggc3>>$

#CD::##q(QB))BS1C %,,.Jzz[0022z$7U !ZZ+vgr*BB
($$t+Z-=-=-D
 ::##q(**{1f'==C!+c!1J++//**##!iiii + C ''99#tUK	5! #

 0 0A 56L)$== 

  4'BL KK$$YlLQrU   c                >    | j                   j                  ||||      S rk   )rs  r  r  s        rS   r  zCSEProxy.check_bounds_
  s     {{''dE5AArU   c                   || j                   j                  j                  v r)t        j                   j                  j                  |       t        |t        j                        r| j                   j                  ||      S | j                   j                  j                  }||v r||   S | j                   j                  ||      }|j                  dk(  r| j                   xj                  dz  c_        |S r  )rs  rW  r  r7   rX  r9  r   r   TMPrm  r  r  r  rU  )rg   rd   r  r  outs        rS   r  zCSEProxy.loadd
  s    4;;??555 HH&&**40udhh/;;,,T599kkoo11;t$$kktU+ ==AKK  A% 
rU   c                l   || j                   j                  j                  |<   | j                   j                  r{|t        j
                  j                  v r^| j                   j                  j                  |      }|j                         D ]%  }|| j                   j                  j                  |<   ' y y y rk   )	rs  rW  r  r1  r7   r   name_to_buffer
get_outputget_mutations)rg   rd   rf   r  
other_names        rS   _update_store_cachezCSEProxy._update_store_cacheu
  s    ,1##D);;##0F0F(F++**55d;C!//1 @
:?++J7@ )G#rU   c                    | j                   j                  j                  |       || j                  ||       |t        j
                  j                  vr | j                   j                  ||||       y y )N)r[   )rs  rY  r9  r'  r7   r   rr  r  r  s        rS   r  zCSEProxy.store|
  sc     	&&**40<$$T51qww...KKdE5t< /rU   c                    | j                   j                  j                  |       | j                  ||       |t        j
                  j                  vr| j                   j                  |||      S y rk   )rs  rY  r9  r'  r7   r   rr  r  r  s       rS   r  zCSEProxy.store_reduction
  sZ    &&**40  u-qww...;;..tUEBB /rU   c                |    | j                   xj                  dz  c_        | j                   j                  ||||      S r  )rs  rV  r  r  s        rS   r  zCSEProxy.reduction
  s4     	!!Q&!{{$$UI~uMMrU   c                <    | j                   j                  |||      S rk   )rs  r  r  s       rS   r  zCSEProxy.scan
  s     {{
F;;rU   c                >    | j                   j                  ||||      S rk   )rs  r  r  s        rS   r  zCSEProxy.sort
  s     {{
CCrU   c           	     D    | j                   j                  |||||||      S )a  
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values".  Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor.  This must be int64 or int32.  This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries.  0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to.  This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)).  Otherwise, the result is undefined.
        )rs  r  r  s           rS   r  zCSEProxy.bucketize
  s1    L {{$$
 	
rU   )rs  zKernel[Any]r  zOpsHandler[Any])rd   rx   rp   ztuple[Any, ...]r  zdict[str, Any]ru   r	   )rd   rx   rp   r	   r  r	   ru   r  r  )
r  rk  r   r  r  r   r  r   ru   r  r  r  )rd   rx   rf   rk  ru   rv   rk   r  r  r  r  r  r  r  )rz   r{   r|   rd   r   r  r  r  r  r  r'  r  r  r  r  r  r  r  r  s   @rS   r  r  	  s   D-K.Z+%b 55 %5 	5
 5 
5nBB&0B9=BFJB	B
"@ SW== *=3>=FO=	=CNN N &	N
 ;N 
5N	<'	<
	< (	< 
!	<D'D (D 	D
 D 
!D  4804N
N
 CN
 &	N

 $N
 N
 1N
 .N
 
N
rU   r  )rR   rx   ru   rv   )NNNN)r   rx   r3  r   r4  r   r5  r   r6  r   r7  Optional[CustomGraphModulePass]r8  Optional[ConfigModule]ru   rv   )r   Union[torch.device, str, None]ru   zOrderedSet[BackendFeature])r   r0  rN  r;  ru   r   )r   rx   ru   zOptional[SchedulingConstructor])FF)r   rx   rR  r   rS  r   ru   r   )r   rx   ru   r.  )r   rx   ru   r/  rt   )r  Sequence[sympy.Expr]r  r1  r  r1  ru   r   )r   rx   r  r  ru   rv   )r   rx   ru   r  )r  rx   rp   r	   r  r	   ru   r   )r   r*   r  r  r   r   ru   rv   )rd  rx   ru   r   r   r  )ru   r	   )
__future__r   r`   r  dataclassesenumr  r  rO   r2  r  rb   rer^   abcr   r   r   r   r   typingr	   r
   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   torch.fxtorch._prims_commonr   torch.utilsr   r  torch.utils._config_moduler   torch.utils._ordered_setr   torch.utils._sympy.numbersr   torch.utils._sympy.printersr   _PythonPrintertorch.utils._sympy.symbolr   r   r   torch.utils._sympy.value_rangesr   r   r  r    r!   dtype_propagationr"   ops_handlerr#   r$   shape_propagationr%   utilsr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   virtualizedr2   r3   r4   r5   r6   r7   collections.abcr8   r9   r:   r;   custom_graph_passr<   r   r=   r>   r?   r@   r  rB   r  rC   rD   rE   rF   r  rI   rJ   r   rH  r   rx   r  r  r^  _logginggetArtifactLoggerrz   rM   	getLoggerr1  rT   	dataclassrW   r   r   r   r   r   r   r   r   r   KernelArgTyper  r~   r  r.  r/  r0  r9  r;  rJ  rO  rI  rU  rW  rY  cacherG  r  r  r  bfloat16r  float16r   r&  float64int8int16r  r  r   uint16r  uint64r  r  r  r  r  compile
IGNORECASErl  rg  ri  r  r  INT_TO_FLOATr  rv  rx  r  r  r  r  r  r  rk  r  r  r  r   ReductionCacheKeyr  r?  rM  r  r  r  r  r   s   0rS   <module>r[     s   "          	 	  #  
 
 
 ,    ? ) 3 / - G O O D  : ; :    Q P BB$9>>$DD2-	B$hy&9%:N%JK23sELL()J F~~//*Eg!=
   >/		 /(C  Td= d dN* * # # #       ! ! ! < < < lIw8H,VW,.) .5" 5"p :< 6 ;DF A FDF  A F8 @D>B:>37BB,B /B !=	B
  <B 8B 1B 
B4
&T 
&3*33$3*35C3	3U
 @E"8<!V ` `FUU$U  U 	U;;&7;	;	, 
NNEKK	MM5;;> JJMMMMJJKKKKKKKKLLLLLL
 	u> : ,''' ' 	'T::!0:9D:	:2aB aBH=N =$S1 S1l "rzz";2==Q D;#%5z# D;N - - -  6: `6;HH-`6 ;HH/- 	`6 ;HH/- 	`6$ ;HH/- 	%`60 ;HH/- 	1`6< ;HH*)	=`6L ;HH(0	M`6X 	;HH66>	Y`6h ;HH1i`6r ;HH2s`6| ;HH1}`6F ;HH2 G`6P ;HH%8$Q`6^ 	;HH&%		_`6j ;HH%8	k`6v 	;HH&	w`6@ ;HH+A`6L %;HH88)	M`6X %;HH88)	Y`6d %;HH8)e`6n %;HH8)o`6z 
;HH'
{`6D ;HH(E`6N ;HHc	O`6^ ,;HH?0_`6h ,;HH?0i`6t &;HH9*u`6~ 
;HH*
`6H );HHD-I`6R );HHD-S`6\ );HHD-]`6f );HHD-g`6p (;HHC,q`6z $1;HHL5${`6D $1;HHL5$E`6N $1;HHL5$O`6X $1;HHL5$Y`6b ';HHB+c`6l (;HHC,m`6v (;HHC,w`6 2 `F	-# -"> *Z 
 N N N 
 ,J JZ
#; #;L 5+;Tk5c!1223	5l'/=0
1 l^
< 
<\3Wgo. \3~     G" G"TU
~ U
_9s   
e-