
    pi                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZmZmZ d
dlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, erd dlm-Z- d
dl.m/Z/m0Z0 d
dl1m2Z2m3Z3 ddl"m4Z4  ejj                  e6      Z7ejp                  dejr                  dejt                  dejv                  dejx                  dejz                  dej|                  dej~                  dej                  di	ZAd'dZB G d de      ZC G d  d!e'      ZDeDj                  d"       eDj                           G d# d$e+      ZG G d% d&e,      ZHy)(    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    t        | t              r:| t        j                  k(  ry| t        j                   k(  ry| | k7  ryt	        |       S t        | t
              r| rdS dS t	        |       S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfstrr%   )vals    ]/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr9   8   s_    #u%))UYYJCZ3x	C	v)')s8O    c                  |    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZeZddZddZy)MetalExprPrinterz/Converts sympy expression to Metal code snippetc                    |j                   \  }}| j                  |      }| j                  |      }|j                  r	d| d| dS d| d| dS )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r8   _print_FloorDivz MetalExprPrinter._print_FloorDivI   s[    3LLOll3??.qcC5::qcse1--r:   c                    |j                   \  }}}| j                  |      }|dk7  r0| j                  |      }|j                  r
d| d| d}n	d| d| d}| j                  |      }d| d| dS )Nr   (rB   r@   rA   z) % (rC   )rG   rH   rI   rJ   mods        r8   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingQ   s    ii3LLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(r@   zmetal::min(r?   lenrD   RuntimeErrormap_printrG   rH   ab
typecast_a
typecast_bs         r8   
_print_MinzMetalExprPrinter._print_Min]       tyy>QEFF4;;		*1,QCq3qc;
,QCq3qc;
ZL:,a88r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::max only supported for 2 argsrQ   rR   rS   r@   zmetal::max(r?   rT   rY   s         r8   
_print_MaxzMetalExprPrinter._print_Maxe   r_   r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   metal::abs(r   r@   rU   rD   rX   rG   rH   s     r8   
_print_AbszMetalExprPrinter._print_Absm   s9    499~"""T[[167q99r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   zstatic_cast<long>(metal::rint(r   ))rd   re   s     r8   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntq   s9    499~"""/DIIaL0I/J"MMr:   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  d	S )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er@   )rU   rD   rF   
ValueErrorparenthesizer   )rG   rH   numberndigits
number_strs        r8   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimalu   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r:   c                n    |j                   \  }}d| j                  |       d| j                  |       dS )Nstatic_cast<float>(z) / static_cast<float>(r@   )rD   rX   )rG   rH   lhsrhss       r8   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDiv   s;    99S$T[[%5$66MdkkZ]N^M__`aar:   c                    t        |j                        dk(  sJ t        | j                  |j                        \  }}d| d| dS )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rh   )rU   rD   rW   rE   )rG   rH   rI   ys       r8   _print_PowByNaturalz$MetalExprPrinter._print_PowByNatural   sF    499~"""4<<+1/s2H2NNr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   ru   r@   rU   rD   rE   rG   rH   rI   s      r8   _print_ToFloatzMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r}   r~   s      r8   _print_FloorToIntz"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zstatic_cast<int>(metal::trunc(rh   r}   r~   s      r8   _print_TruncToIntz"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zmetal::log2(r@   r}   r~   s      r8   _print_OpaqueUnaryFn_log2z*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&aS""r:   N)rH   
sympy.Exprreturnr6   )__name__
__module____qualname____doc__rK   rO   r^   ra   rf   ri   rs   rx   r{   r   r   _print_floorr   r    r:   r8   r<   r<   F   sR    9.
"99:N
]b
O
*
J
 %L6
#r:   r<   c                  b   e Zd ZdZe	 	 d0	 	 	 	 	 	 	 	 	 d1d       Ze	 	 	 	 	 	 	 	 d2d       Zed3d       Zed4d       Zed5d       Z	ed6d       Z
ed7d	       Zed8d
       Zed8d       Zed8d       Zed8d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed:d       Zed9d       Zed9d       Zed9d       Z ed9d       Z!ed9d        Z"ed8d!       Z#ed9d"       Z$ed9d#       Z%ed8d$       Z&ed9d%       Z'ed8d&       Z(ed9d'       Z)ed;d(       Z*ed;d)       Z+e	 	 	 	 	 	 	 	 	 	 d<d*       Z,ed9d+       Z-ed8d,       Z.d=d-Z/d>d.Z0e1d?d/       Z2y)@MetalOverrideszXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                ~    |t         j                  k(  rt        j                  d       d|  dS dt        |    d|  dS )Nz>float64 cast requested, probably from tensorify_python_scalarsru   r@   static_cast<>()r4   doublelogwarningDTYPE_TO_METAL)rI   dtype	src_dtypeuse_compute_typess       r8   to_dtypezMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r:   c                6    dt         |    dt         |    d|  dS )Nzas_type<z>(static_cast<r   rh   r   )rI   r   r   s      r8   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar:   c                    t        |       S Nr9   )r7   r   s     r8   constantzMetalOverrides.constant   s    c""r:   c                @   t         j                  j                  t         j                  j                  |             }t         j                  j                  j                  t         j                  j                  |t        |             }t        j                  ||      S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rH   r   idx_strvars       r8   
index_exprzMetalOverrides.index_expr   sl    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r:   c                    t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t        j                  ||      S # 1 sw Y   AxY wr   )r   r   
mask_loadsr   is_boolr%   r   where)maskbodyothernew_maskresults        r8   maskedzMetalOverrides.masked   sa     XX  u- 	VF	 ==  KEyy6511	 	s   A))A2c                (    |  d| dt        |       S )Nz ? z : r   )rZ   r[   cs      r8   r   zMetalOverrides.where   s    Cs#nQ/011r:   c                    d|  d| dS )Nzc10::metal::remainder(r?   r@   r   rZ   r[   s     r8   	remainderzMetalOverrides.remainder   s    's"QCq11r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::max(r?   r   rZ   r[   r\   r]   s       r8   maximumzMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::min(r?   r   r   s       r8   minimumzMetalOverrides.minimum   r   r:   c                    |  d| S )Nz || r   r   s     r8   
logical_orzMetalOverrides.logical_or       D}r:   c                    |  d| S )Nz && r   r   s     r8   logical_andzMetalOverrides.logical_and   r   r:   c                    d|  dS )Nzmetal::isnan(r@   r   rI   s    r8   isnanzMetalOverrides.isnan       qc##r:   c                    d|  dS )Nzmetal::isinf(r@   r   r   s    r8   isinfzMetalOverrides.isinf   r   r:   c                    d|  dS )Nzmetal::log(r@   r   r   s    r8   r   zMetalOverrides.log       QCq!!r:   c                    d|  dS )Nzmetal::exp(r@   r   r   s    r8   expzMetalOverrides.exp   r   r:   c                    d|  dS )Nrc   r@   r   r   s    r8   abszMetalOverrides.abs   r   r:   c                    d|  dS )Nzmetal::signbit(r@   r   r   s    r8   signbitzMetalOverrides.signbit   s     1%%r:   c                    d|  dS )Nzmetal::precise::sin(r@   r   r   s    r8   sinzMetalOverrides.sin      %aS**r:   c                    d|  dS )Nzc10::metal::sinc(r@   r   r   s    r8   sinczMetalOverrides.sinc  s    "1#Q''r:   c                    d|  dS )Nzmetal::precise::cos(r@   r   r   s    r8   coszMetalOverrides.cos  r   r:   c                    d|  dS )Nzmetal::tan(r@   r   r   s    r8   tanzMetalOverrides.tan  r   r:   c                    d|  dS )Nzmetal::asin(r@   r   r   s    r8   asinzMetalOverrides.asin      aS""r:   c                    d|  dS )Nzmetal::acos(r@   r   r   s    r8   acoszMetalOverrides.acos  r   r:   c                    d|  dS )Nzmetal::atan(r@   r   r   s    r8   atanzMetalOverrides.atan  r   r:   c                    d|  d| dS )Nz::metal::atan2(r?   r@   r   )rI   rz   s     r8   atan2zMetalOverrides.atan2   s     2aS**r:   c                    d|  dS )Nzmetal::sqrt(r@   r   r   s    r8   sqrtzMetalOverrides.sqrt$  r   r:   c                    d|  d|  dS )NrQ   z)>(-r@   r   r   s    r8   negzMetalOverrides.neg(  s     'qcaS22r:   c                    d|  dS )Nzmetal::rsqrt(r@   r   r   s    r8   rsqrtzMetalOverrides.rsqrt.  r   r:   c                    d|  dS )Nzmetal::tanh(r@   r   r   s    r8   tanhzMetalOverrides.tanh2  r   r:   c                    d|  dS )Nzmetal::atanh(r@   r   r   s    r8   atanhzMetalOverrides.atanh6  r   r:   c                    d|  d| dS )Nr>   r?   r@   r   r   s     r8   floordivzMetalOverrides.floordiv:  s     +1#Rs!44r:   c                    d|  dS )NrA   r@   r   r   s    r8   floorzMetalOverrides.floor?  r   r:   c                    d|  dS )Nzmetal::sign(r@   r   r   s    r8   signzMetalOverrides.signC  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::fmod(r?   r   r   s       r8   fmodzMetalOverrides.fmodG  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r:   c                    d|  dS )Nmetal::trunc(r@   r   r   s    r8   trunczMetalOverrides.truncM  r   r:   c                    |  d| }| j                   | j                   j                  s"|j                   |j                   j                  rd| dS |S )Nz / r   r@   )r   is_floating_point)rZ   r[   quots      r8   truncdivzMetalOverrides.truncdivQ  sQ    Cs|GGAGG$=$=GGAGG$=$="4&**r:   c                    d|  dS )Nzmetal::ceil(r@   r   r   s    r8   ceilzMetalOverrides.ceilZ  r   r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nrandomzc10::metal::rand(r?   r@   r   r   headersaddseedoffsets     r8   randzMetalOverrides.rand^  s/    	X&"4&6(!44r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nr  zc10::metal::randn(r?   r@   r  r  s     r8   randnzMetalOverrides.randnc  s/    	X&#D6F8155r:   c           	     r    t         j                  j                  j                  d       d|  d| d| d| d	S )Nr  zc10::metal::randint64(r?   r@   r  )r  r	  lowhighs       r8   	randint64zMetalOverrides.randint64h  s=     	
X&'vRxr#baHHr:   c                    d|  dS )Nzmetal::rint(r@   r   r   s    r8   roundzMetalOverrides.roundo  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::pow(r?   r   )rZ   r[   cast_acast_bs       r8   powzMetalOverrides.pows  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r:   c                f    t         j                  j                  j                  d       d| d| dS )Nspecial_mathc10::metal::rM   r@   r  )rG   rZ   names      r8   _special_unaryzMetalOverrides._special_unaryy  s/    	^,dV1QCq))r:   c                l    t         j                  j                  j                  d       d| d| d| dS )Nr  r  rM   r?   r@   r  )rG   rZ   r[   r  s       r8   _special_binaryzMetalOverrides._special_binary}  s5    	^,dV1QCr!A..r:   c           
        dD ].  }t        | |t        j                  | j                  |             0 t        j                  | j                  d      | _        dD ]1  }t        | |t        j                  | j                  |dz                3 dD ].  }t        | |t        j                  | j
                  |             0 dD ]1  }t        | |t        j                  | j
                  |dz                3 y )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)r  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammaigammaigammaczeta)
chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_heshifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_w)setattr	functoolspartialmethodr  lgammar  )clsr  s     r8   _initialize_special_opsz&MetalOverrides._initialize_special_ops  s    	
 
	WD Cy66s7I7IPTUV
	W ,,S-?-?kR

 	D ''(:(:
ARS	&
 	XD Cy66s7J7JQUVW	X
 	D ''(;(;$BST	r:   )NT)
rI   r   r   torch.dtyper   zOptional[torch.dtype]r   r%   r   r6   )rI   r   r   rG  r   rG  r   r6   )r7   zUnion[bool, float, int]r   rG  r   r6   )rH   r   r   rG  r   r6   )r   r   r   r   r   r   r   r6   )rZ   r$   r[   r$   r   r$   r   r6   )rZ   r$   r[   r$   r   r6   )rZ   r   r[   r   r   r6   )rI   r   r   r6   )rI   r   rz   r   r   r6   )r  r   r	  r   r   r6   )
r  r   r	  r   r  r   r  r   r   r6   )rZ   r   r  r6   r   r6   )rZ   r   r[   r   r  r6   r   r6   r   None)3r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r
  r  r  r  r  r  r  classmethodrF  r   r:   r8   r   r      s   b ,0"&	<<< )<  	<
 
< < bb*b7Bb	b b
 # # ( ( 2 2 2 2 2 2 > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + " " # # # # # # + + # # 3 3
 $ $ # # $ $ 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I # # 1 1
*/ = =r:   r   mpsc                      e Zd ZU dZeZdZdZdZdZ	 e
       j                  Z e       j                  Z e       j                  ZeZ edg      Zded<   g Zd	ed
<   	 	 	 	 	 	 d fdZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZddZddd ej8                         f	 	 	 	 	 	 	 	 	 	 	 d dZ	 	 	 	 	 	 	 	 	 	 d!dZ	 	 	 	 	 	 	 	 	 	 d!dZd"dZ d#dZ!dd$dZ"dd%dZ#	 	 	 	 	 	 	 	 	 	 d&dZ$ xZ%S )'MetalKernelz;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]r  zlist[IterationRangesEntry]multistage_reduction_entryc                X    t        |   |fi | t        j                         | _        y r   )super__init__	itertoolscountacc_var_ids)rG   tilingkwargs	__class__s      r8   rV  zMetalKernel.__init__  s&    
 	*6*$??,r:   c                    t         |   S r   r   )rG   r   s     r8   dtype_to_strzMetalKernel.dtype_to_str  s    e$$r:   c                   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| d| j                  |       d}|t        j                  t        j                  fv rd| d}t        j                  }| j                  j                  | j                  ||      S )z"Codegen a load from an InputBuffer[]ru   r@   r   )rD   inputr   r   graph	get_dtyper   r4   float16bfloat16float32r   r   loads)rG   r  indexr   r   lines         r8   loadzMetalKernel.load  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r:   Nc                h   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }d| d| d}|| d| j                  |       d| d}n[|dk(  rH| j                  j                  d       d	| d
}	d|	 d| d}
|	 d|
 d| j                  |       d| d}nt        d|       | j                  r&| j                  j                  t        ||             y | j                  j                  t        ||             y )Nr   r   r@   r`  ] = rO  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r?   );zUnimplemented store mode )rD   outputr   r^  r   rd  re  r   r  r  rV   inside_reductionr   	writeliner   stores)rG   r  rj  valuemoder   	dtype_strcast_valrk  atomic_typecast_vars              r8   storezMetalKernel.store  s<    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<d#;<KK!!,tT":;r:   c                   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }t        d | j                  D              }| d| j                  |       d| d| d}d|j                   d| }| j                  j                  t        ||             y )Nc              3  :   K   | ]  }|j                   s|  y wr   is_reduction.0ts     r8   	<genexpr>z.MetalKernel.store_reduction.<locals>.<genexpr>  s     K1ANNQK   r`  z] = static_cast<r   rr  if (z == 0) )rD   rs  r   r^  r   rd  re  nextrange_treesr   r  rv  ru  r   )rG   r  rj  rw  r   ry  reduction_dimrk  s           r8   store_reductionzMetalKernel.store_reduction  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r:   Tc                   t        |t        j                        r| j                  |      }dt	        | j
                         }t        j                  j                  |||      }|rdnd}|| d| z  }|r|d| j                  |       dz  }||rJ d       |d| z  }| j                  j                  || j                  z          |S )	Ntmp_acc_zthreadgroup   r`  ra  z+Thread group var can not have default value = )r3   r4   r   r^  r  rY  r   r   create_cse_varsexprindexing_coderu  suffix)	rG   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r8   _new_idxvarzMetalKernel._new_idxvar  s     eU[[)%%e,Ed4#3#3456hh%%h>$2.eWAhZ((4::j12!44G$%T'TT%]O,,G$$Wt{{%:;
r:   c                    |||f}|| j                   j                  v r| j                   j                  |   S | j                  ||||      }|| j                   j                  |<   |S )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rG   r   r   reduction_typerw  	cache_keyr   s          r8   	reductionzMetalKernel.reduction#  sf     6	00088++I66((	>5Q.4  +r:   c                P   | j                   sJ | j                  rJ dAd}d}d}| j                  D ]  }|j                  s|r|dz  }||j                   d| z  }t        |j                  t        j                        r||j                  z  }_|t        j                  |j                   ddd      z  } t        j                  || j                        }| j                  |      }	t        |t        j                        rt        || j                        n| j                  }
|d	k(  r| j!                  |      }| j"                  j%                  | d
       | j"                  j%                  d       | j&                  j)                  d| d| d       | j*                  j%                  d       |S | j,                  j/                  d       |dv rt0        |   }| j!                  ||
      }| j2                  s|}nD|dk(  rdnd\  }}| j!                  ||d      }| j&                  j)                  | d| d| d       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |dv r| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}nY|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j&                  j)                  | d&| d| d| d'       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |d(v r]| j!                  ||
      }| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}dt8        |    d | d}n|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j!                  |d)d      }t=        d* | j>                  jA                         D              }|d+k(  rd,nd-}|jB                  rd.| d/nd}| j&                  j)                  d| d| d| | d| d0| d1| d0|j                   d2       | j4                  j7                  | j*                  d| d| d| d| d| d| d|	 d|      S |d3k(  r+| j2                  s~| j!                  ||      }| j&                  j)                  | d4| d5| d       | j4                  j7                  | j&                  d| d| d|	 dtD        jF                        } ||      S | j!                  d6|      }| d4| d7}| j"                  j)                  | d8       | j&                  j%                  | d9| d:| d;       | j4                  j7                  | j*                  d<| d| dtD        jF                        } ||      S |d=k(  r4t        |tH              sJ d>       | j!                  d6|      }| d4| d7}d?|d)    d|d    d|d@    d}| j"                  j)                  | d8       | j2                  rC| j"                  j)                  | d8       | j&                  j%                  | d9| d| d'       n!| j&                  j%                  | d0| d       | j4                  j7                  | j2                  r| j*                  n| j&                  d| d| d|	 dtD        jF                        } ||      S tK        |      )BzeCodegen a reduction operation.
        Only sum and prod operations are somewhat reasonable optimizedc           
         t        j                  dD cg c](  }t        |  d| | j                  | j                        * c}      S c c}w )Nxyzrk   )r   _unwrapr   r   r   )res3r  s     r8   _unwrap_helperz6MetalKernel._reduction_nocache.<locals>._unwrap_helper?  sA    %%NSTvQqc]DKKDT Ts   -Ar  r    + rm   numelTintegerpositiveanyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rR   )r   *F)r  r  r  z= rO  zc10::metal::threadgroup_rM   r?   r@   rb  )maxminr   r   r  lowestz::metal::numeric_limits<z>::z()z = ::c10::metal::rr  )argminargmaxr   c              3  :   K   | ]  }|j                   s|  y wr   r  r  s     r8   r  z1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      Ar  r  rq  <z || ::metal::isnan(z) r  z;
                    z$;
                }
                welford_reducer`  rn  float3ra  z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   )r  r   r   ztuple[CSEVariable, ...])&rt  
_load_maskr  r  r  r3   r  sympyIntegerSymbolprefixMinmax_threadgroup_sizer  r   simd_group_sizer  r  ru  r   splicerv  r  r  r   rS  r   r   r   endswithr  range_tree_nodesvaluesr   r4   rh  tupleNotImplementedError)rG   r   r   r  rw  r  reduction_idxacc_buf_sizerdacc_buf_size_strshmem_buf_sizeacc	acc_dtypeacc_bufr7   default_valreduction_opsrc_metal_type
cast_valuelim_fn	limit_valdata_acc_bufidx_acc_bufidx_validx_varcmp_op
nan_suffixwf_resacc_thread_var	inp_values                                 r8   r  zMetalKernel._reduction_nocache3  s^    $$$$??""	 "" 	B??&yL>::M"((EMM2(yyk'! 	 yyt/H/HI::l3 ,6 L$"6"67%% 	 U"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&y.AG22 !/% 7HX *\ &&[ '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 ^+&&y.AG+I6N''7r%BJ22 %3%<%<U%C6~6Fc&QST	&&Yu '  ##e,^,<AcU"ZLPRS 88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 11++I~FL**5.AK+I6N''7r%BJ22 ()>(?r-PQR%3%<%<U%C6~6Fc&QST	&&Yu '  **5RW*X #44;;=  !/( : !22 *%3 
 ## )G1VHAcU:, 7EUG $IS /%  88$$*>*:!L>K=XZ%r'"]O26F5GqJ	 %   --22**9lC##wiqtE7!$LM**LL.~.>ayK[J\\]^-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU+U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9BGWFXXYZmm ' F
 "&))!.11r:   c                ,   | j                  |j                        }| j                  |      }|j                  rQt	        |j
                  j                  t        j                        r\|j
                  j                  | j                  k  r9| j                  j                  | j                   d|j                   d| d       y t	        |j
                  j                  t        j                        r|j
                  j                  n.t        j                  |j
                  j                   ddd      }| j                   j#                  |       |t%        | j                  dz
        z   t%        | j                        z  }| j                  |      }| j&                  j                  d|j                   d	|j                   d
| d|j                   d	       | j&                  j)                         5  t	        |t        j                        rS| j&                  j                  | j                   d|j                   d| j                   d|j                   d| d
       nH| j&                  j                  | j                   d|j                   d| d| d|j                   d
       t	        |t        j                        s|| j                  z  |k7  r,| j&                  j                  d|j                   d| d       d d d        y # 1 sw Y   y xY w)Nr  r  rO  r  Tr  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rm   z_cnt + r  z_cnt;r  z >= z) break;)rename_indexingrH   r  r  r3   rootr  r  r  r  r  ru  index_dtyper  r  r  rS  appendr+   r   indent)rG   entryr   	index_stracc_size	loop_sizeloop_size_strs          r8   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry  s   ))%**5
JJz*	!!uzz''7

  D$=$==((##$Aejj\YKqA  %****EMM: JJ!2!2 3594RVW 	 	''..u5 d&?&?!&C DD%%J
 
	 

9-		

|:ejj\tTYT^T^S__gh	
 YY 	O(ELL1		##''(%**S9R9R8SSVW\WaWaVbbijsittuv 		##''(%**Ss9+UXY^YcYcXddij 8U\\2t888HD		##d5::,d8*H$MN	O 	O 	Os   3DL

Lc                   | j                   r-| j                  j                         5  | j                  j                  | j                         | j                  j                  | j
                         ddd       | j                  j                  dt        | j                         z         | j                  j                  t        d | j                  j                  j                         D                     | j                   r| j                   j                         j                          | j                   r5nJ| j                  j                  | j                         | j                  j                  | j
                         | j                  j                  | j                         | j                  j!                          | j
                  j!                          | j                  j!                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}c              3  T   K   | ]   }t        |t              r|n|fD ]  }|  " y wr   )r3   r  )r  itemvs      r8   r  z+MetalKernel.codegen_body.<locals>.<genexpr>.  s:      &0u&=dD7  s   &()rS  r   r  r  ri  r   ru  rU   r   
invalidater
   r  r  popcache_clearrv  clear)rG   s    r8   codegen_bodyzMetalKernel.codegen_body  sc    **!!# /		  ,		  ./ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 11 IITZZ(IIT\\*		%

1/ /s   AG99Hc                0   | j                          t               }t        j                  j                  r|j                  d       n|j                  d       | j                         }|j                         5  t        j                  j                  s'| j                  D ]  }|j                  d| d        nr| j                  D cg c]  }d| d
 }}t        |t        t              j                  j                  j                  dz  gt                     }|j                  |       | j                  rwt        j                   d | j"                  D              }t%        |t&        j(                        rt+        || j,                        n| j,                  }|j                  d| d       |j                  d	       |j                         5  | j.                  j0                  j3                         D ]Z  \  }	}
|	| j4                  v r| j7                  t        j                  j9                  |	            }|j                  d
| d|
 d       \ | j.                  j:                  j3                         D ]  \  }	}
t        j                  j9                  |	      }|t<        j>                  k(  rBt        j                  jA                  |	      }||jC                         g k7  rtE        d      d}n| j7                  |      }|j                  d| d|
 d        | j.                  jF                  j3                         D ]  \  }	}
|j                  d|
 d        |D ]F  }t%        |jH                  t&        j(                        r(|j                  d|jJ                   d       H tM        |      dk  sJ d       tM        |      dkD  rdtM        |       nd}tM        |      dk(  r|d   jN                  nd}| j                  rdnd}|j                  | d| d|        | j                  r|j                  | d       ddd       |j                  d       |j                         5  tM        |      dkD  rAtQ        |      D ]3  \  }}|j                  d|jN                   dtS        d |z          d!       5 |jU                  | jV                         |jU                  | jX                         ddd       |j                  d"       ddd       t        j                  j                  r!|j                  d#       |j[                         S |j                  d$       |j[                         S c c}w # 1 sw Y   3xY w# 1 sw Y   xY w# 1 sw Y   xY w)%z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''z#include <c10/metal/z.h>includec              3  N   K   | ]  }|j                   s|j                    y wr   )r  r  r  s     r8   r  z-MetalKernel.codegen_kernel.<locals>.<genexpr>Z  s      1 !AGG1s   %%z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long& znumel,   z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {rP  z = thread_pos.x   rO  r  z)MTL");z''')).r  r   r   rd  cpp_wrapperru  active_range_treesr  r  r	   r   __file__parentr
   rt  mathr  r  r3   r  r  r  r  rD   output_buffersitemsremoved_buffersr^  re  input_buffersr4   float64try_get_bufferget_sizerV   sizevarsr  r  rU   r  	enumeratechrr  r  r   getvalue)rG   r  codeidx_varsheaderr  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerry  r   	outer_bufr  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                       r8   codegen_kernelzMetalKernel.codegen_kernel?  s   77NN9%NN34**,[[] Q	 77&&"ll GFNN%9&#EFG FJ\\;A*6(#6  #1(^**11889DEL#
 /$$'+yy 1%)%5%51 ($ ""6F ,d.G.GH22 !
 :;K:LCP NN:; )$(II$<$<$B$B$D DLE5 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	D
 %)II$;$;$A$A$C 
FLE5GG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE
F %)II$6$6$<$<$> ?LE5NN_UG1#=>?  ( QG!'--?8H'OP	Q 8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZO)T NN5! 'x=1$$-h$7 S#CHH:^Cc	N;K1M D../DII&' NN3cQ	 f 77NN9% }} NN6"}}g2) )V' 'SQ	  Q	 sR   8AVU.C;VI2U3)V+BV 1V.V3U=	8V V		VVc           	        t         j                  j                  }| j                  j                  j                         D ]  }|j                  |        | j                  j                         \  }}}}t        ||      D 	ci c]  \  }}	t        |      |	 }
}}	g | j                  j                  j                         | j                  j                  j                         }|D cg c]  }|| j                  vs| }}|| j                  j                  j                         D cg c]  }t        |       c}z  }|D cg c]  }|
|   	 }}| j                  D ]  }t        |j                  t         j"                  t$        f      r.t        |j                  t         j&                        r|j                  }n4t         j                  j                  j)                  ||      j*                  }|j,                  r| j.                  s|j1                  t        |             |j1                  t$                t         j                  j2                  r| j4                  n| j6                  }dd}t9        | j;                               dkD  r| j;                         D cg c]J  } ||j,                  r*t!        j<                  |j                  | j>                        n|j                        L }}|j1                   ||d             |j1                  t@               n%t         j                  j2                  rtC        d      | j.                  r| j;                         D cg c]@  }|j,                  r0 |t!        j<                  |j                  | j>                              ndB }}|j1                   ||d             |j1                  t@               n1t         j                  j2                  r|dgz  }|j1                  d       |jE                  ||tG        jH                  d      d	|
       yc c}	}w c c}w c c}w c c}w c c}w c c}w )z0
        Codegens a call to this kernel
        threadsc                    t         j                  j                  r(| D cg c]  }d| d
 } }ddj                  |        dS | ddj                  |        dS c c}w )Nzstatic_cast<uint64_t>(r@   {r?   r  z=[ra  )r   rd  r  join)r  kwargr  s      r8   format_threadsz/MetalKernel.call_kernel.<locals>.format_threads  sh    ww""BIJQ3A3a8JJDIIg./r22499W#5"6a88 Ks   Ar   zWe should always have threads?1
group_sizeNrL  F)devicetriton	arg_types)r  z	list[str]r!  r6   r   r6   )%r   rd  wrapper_coderD   r
  keysensure_size_computedpython_argdefszipr6   r  r  r  r  r3   r  r  r  r(   r  generate_numel_exprr  r  rt  r  r  cexprpexprrU   r  r  r  listrV   generate_kernel_callr4   r%  )rG   r  nodewrapperr  _	call_argsr'  call_argarg_typearg_name_to_typerD   argtreerH   expr_printerr"  r  s                     r8   call_kernelzMetalKernel.call_kernel  s    ''&&##((* 	,A((+	, &*YY%=%=%?"9a>A)Y>W
(:(CM8#
 
 S))..0R4993J3J3O3O3QR#Gs$2F2F'FGG!3!3!8!8!:;AQ;;6:;s%c*;	; $$ 	&D$**u}}c&:;DJJ5zzww++??dKQQ$$(=(=CI&  %	& &'WW%8%8tzzdjj	9 t&&()A- 002  ~~ IIaggt'@'@AG  KKw	:;T"ww"""#CDD  
 002	  >> UYYqww0I0IJKG  KKw=>T"ww""   &$$<<& 	% 	
E

 H;;8 s,   P3*P9>P9+P>Q!AQ#AQc                    |s|sy | j                  |      }|r| dnd}|r| d| j                  |       nd}|r|r
d| d| d}nd| | d}| j                  j                  | j                  |d	
       y )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rG   rH   sizelowerupperexpr_str
lower_expr
upper_exprrk  s	            r8   check_boundszMetalKernel.check_bounds  s      $$T**/z&R
BGzT%6%6t%<$=>R
U:,fZL	BD*j\:D$,,?r:   )rZ  zdict[str, sympy.Expr]r[  r   r   rI  )r   rG  r   r6   )r  r6   rj  r   r   r   r   )
r  r6   rj  r   rw  r   rx  r!   r   rI  )r  r6   rj  r   rw  r   r   rI  )r   zUnion[str | torch.dtype]r  zOptional[int]r  zOptional[Any]r  r%   r   zValueRanges[Any]r   r   )
r   rG  r   rG  r  r    rw  +Union[CSEVariable, tuple[CSEVariable, ...]]r   rF  )r  r   r   rI  rH  )r  zOptional[str]r   r6   )r  r6   r2  r   r   rI  )
rH   r   r?  r   r@  r%   rA  r%   r   rI  )&r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rE   r/  r   r.  r<   r  kexprr
   r  __annotations__rS  rV  r^  rl  r}  r  r   unknownr  r  r  r  r  r  r<  rE  __classcell__r\  s   @r8   rN  rN    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*8 %)'+##6;#6#6#8' " %	
  ! 
,  &	
 ; 
5 s2s2 s2 &	s2
 ;s2 
5s2j0Od#JcJR
h@@&0@9=@FJ@	@r:   rN  c                  <     e Zd ZeZd fdZ	 	 	 	 	 	 	 	 ddZ xZS )MetalSchedulingc                    t         |   |       t        j                  j                  }|7t        j                  j
                  s|j                  j                  d       y y y )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)rU  rV  r   rd  r(  r  r  r  )rG   	schedulerr3  r\  s      r8   rV  zMetalScheduling.__init__  sQ    #''&&77&&%%Z ' r:   c                \   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }| }||j                  |<   t         j                  j
                  rd| |z   }t        ||      \  }}| d| }	|j                  |||	d       |S )Nmps_lib_z+at::native::mps::DynamicMetalShaderLibrary 
F)gpu)r   rd  r(  src_to_kernelnext_kernel_suffixr  r   define_kernel)
rG   src_codenode_scheduler   r3  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r8   rX  zMetalScheduling.define_kernel  s     ''&&w,,,!//9K&  &g&@&@&B%CDL)NK.9G!!(+ww""A,P 
 )<M7(S%G%")"-=,>?!!,:JPU!Vr:   )rQ  zOptional[Scheduler]r   rI  )rY  r6   rZ  zlist[SchedulerNode]r   rN  r   r6   )r   r   r   rN  kernel_typerV  rX  rL  rM  s   @r8   rO  rO  
  s2    K,?IT	r:   rO  )r7   z)Union[float, int, bool, str, CSEVariable]r   r6   )I
__future__r   rB  rW  loggingr  pathlibr   typingr   r   r   r  sympy.printing.precedencer   r4   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   rR  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   rQ  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   rg  r   r9   r<   r   _initialize_pointwise_overridesrF  rN  rO  r   r:   r8   <module>rv     s,   #      / /  0  9 / O 7 G G , ,  C B 64g! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
Y#| Y#x][ ]@	  . .u 5  & & (A	@* A	@H$n $r:   