
    pi!                    j   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl#m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJ erd dlKmLZL ddl*mMZMmNZN ddlOmPZP  ej                  eR      ZSd ZT G d deU      ZV G d  d!e@      ZW eW       j                  ZY e@       j                  ZZej                  d"ej                  d#ej                  d$ej                  d%ej                  d&ej                  d'ej                  d(ej                  d)ej                  d*ej                  d+ej                  d,ej                  d-ej                  d.iZhd/ Zid0 Zj G d1 d2e?      Zkekj                  d3        G d4 d5e;      Zmej                   G d6 d7             Zod8 Zpd9 Zq G d: d;eI      Zr G d< d=eJ      Zsy)>    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreMode)BlockShapeTypec                "   t        | t              rVd| cxk  rdk  sKn t        j                  t        j                        }| |j
                  k(  ry| |j                  k(  ryd| dS t        | t              rdt        |        dS t        |       S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr1   repr)valinfos     `/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrF   >   s~    #s[C%E:%E{{5;;'$((?%$((?%q!!#us+,A..9    c                        e Zd Zd fdZ xZS )Unsupportedc                *    t         |   d|        y )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rE   rL   zUnsupported.__init__L   s    <UGDErG   returnNone)__name__
__module____qualname__rL   __classcell__rO   s   @rE   rI   rI   K   s    F FrG   rI   c                       e Zd Zed        Zed        Zd Zd Zd ZeZ	d Z
e
Zd Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z fdZd ZeZd Zd Z  xZ!S )HalidePrinterc                D    dt         j                  j                   d|  dS )Nhl.cast(, r9   )r$   kernelindex_dtypeexprs    rE   
cast_indexzHalidePrinter.cast_indexQ   s"    !((../r$q99rG   c                    d|  dS )Nhl.cast(hl.Float(32), r9    r_   s    rE   
cast_floatzHalidePrinter.cast_floatU   s    'vQ//rG   c                    d| dS )Nhl.f32(r9   rd   rM   r`   s     rE   _print_FloatzHalidePrinter._print_FloatY   s    a  rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   rg   r   r9   lenargs_printrh   s     rE   _print_ToFloatzHalidePrinter._print_ToFloat\   9    499~"""TYYq\23155rG   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.floor(r   r9   rl   rm   ra   rn   rh   s     rE   _print_floorzHalidePrinter._print_floor`   B    499~"""4;;tyy|+D*EQGHHrG   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.trunc(r   r9   rs   rh   s     rE   _print_TrunczHalidePrinter._print_Truncf   ru   rG   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   hl.ceil(r   r9   rs   rh   s     rE   _print_ceilingzHalidePrinter._print_ceilingl   sB    499~"""$++diil*C)DAFGGrG   c                J    d| j                  | j                  |             dS Nzhl.sqrt(r9   )re   rn   rh   s     rE   _helper_sqrtzHalidePrinter._helper_sqrtp   s$    $//$++d*;<=Q??rG   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   r%   r   
hl.select(r\   r9   )doprintrm   )rM   r`   cpqs        rE   _print_WherezHalidePrinter._print_Wheres   s_    LL1&LL1&LL1&A3b2aS**rG   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr%   r   r   hl.min(r\   r9   )rl   rm   rn   sympyMinrM   r`   midabs        rE   
_print_MinzHalidePrinter._print_Miny   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rG   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr%   r   r   hl.max(r\   r9   )rl   rm   rn   r   Maxr   s        rE   
_print_MaxzHalidePrinter._print_Max   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rG   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   hl.abs(r   r9   rs   rh   s     rE   
_print_AbszHalidePrinter._print_Abs   sB    499~"""TYYq\)B(C1EFFrG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.cos(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_cosz&HalidePrinter._print_OpaqueUnaryFn_cos   rp   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.cosh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_coshz'HalidePrinter._print_OpaqueUnaryFn_cosh   9    499~"""$++diil34A66rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.acos(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_acosz'HalidePrinter._print_OpaqueUnaryFn_acos   r   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.sin(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_sinz&HalidePrinter._print_OpaqueUnaryFn_sin   rp   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.sinh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_sinhz'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.asin(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_asinz'HalidePrinter._print_OpaqueUnaryFn_asin   r   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.tan(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_tanz&HalidePrinter._print_OpaqueUnaryFn_tan   rp   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.tanh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_tanhz'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rG   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr%   hl.atan(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_atanz'HalidePrinter._print_OpaqueUnaryFn_atan   r   rG   c                    t        d      Nlog2NotImplementedErrorrh   s     rE   _print_OpaqueUnaryFn_log2z'HalidePrinter._print_OpaqueUnaryFn_log2   s    !&))rG   c                   |j                   rt        | 	  |      S |j                  \  }}| j	                  | j                  |            }| j	                  | j                  |            }| j                  d| d| d      S )Nrr   z / r9   )
is_integerrK   _print_FloorDivrm   re   r   ra   )rM   r`   xdivrO   s       rE   r   zHalidePrinter._print_FloorDiv   sr    ??7*4003OODLLO,oodll3/01#SQ788rG   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr%   	hl.round(r   r9   rs   rh   s     rE   _print_RoundzHalidePrinter._print_Round   ru   rG   c                2    |j                   \  }}d| d| dS )N() / (z+hl.f32(0)))rm   )rM   r`   r   r   s       rE   _print_IntTrueDivzHalidePrinter._print_IntTrueDiv   s$    yy11#U1#[))rG   c                ~    |j                   \  }}| j                  |      }t        |      }dd| z  d| dd|z  dS )Nrg   g      $@z)*hl.round((z	)*hl.f32()))rm   rn   r;   )rM   r`   rC   ns       rE   _print_RoundDecimalz!HalidePrinter._print_RoundDecimal   sL    Qkk#F1"(SE47+RPPrG   )"rS   rT   rU   staticmethodra   re   ri   ro   rt   _print_FloorToIntrx   _print_TruncToIntr{   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rV   rW   s   @rE   rY   rY   P   s    : : 0 0!6I %I %H@+##G677677677*9I %*
QrG   rY   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    t         |    S N)_halide_typedtypes    rE   halide_typer      s    rG   c                    t        |       r/| j                  r#| t        j                  k7  rt        j                  } | t        j
                  t        j                  fv rt        j                  } t        |       S r   )	r   	is_signedr<   r>   int32float16bfloat16float32r   r   s    rE   halide_acc_typer      sM    5??u7K//urG   c                  .   e Zd Ze	 	 dC	 	 	 dDd       ZedEd       Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@edFd>       ZAed?        ZBed@        ZCedA        ZDedB        ZEy)GHalideOverridesNc                X    |t         j                  k(  rd|  dS dt        |       d|  dS )Nr   z != 0)r[   r\   r9   )r<   boolr   )r   r   	src_dtypeuse_compute_typess       rE   to_dtypezHalideOverrides.to_dtype   s9     EJJqc= +e,-Rs!44rG   c                    |t         j                  t         j                  fv rdt        |       d|  d} dt        |       d|  d}|t         j                  t         j                  fv rd| d}|S )Nr[   r\   r9   zhl.reinterpret(rc   )r<   r   r   r   )r   r   r   lines       rE   to_dtype_bitcastz HalideOverrides.to_dtype_bitcast   ss    77;y12"QCq9A U!3 4Bqc;U]]ENN33+D63DrG   c                8    | j                  t        |      |      S r   )r   rF   )clsvaluer   s      rE   constantzHalideOverrides.constant  s    ||OE2E::rG   c                    d|  dS )Nr   r9   rd   r   s    rE   abszHalideOverrides.abs      1~rG   c                R    t        | d      sd|  dS d|  d| j                   d|  dS )Nnamezhl.exp(r9   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   s    rE   expzHalideOverrides.exp  s=    q&!QCq>!3A3fQVVHDefgehhijjrG   c                    d|  dS r}   rd   r   s    rE   sqrtzHalideOverrides.sqrt      !ArG   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r\   r9   r[   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   s     rE   minimumzHalideOverrides.minimum       q&!QCr!A&&qvvhis!,QCq<s#aS1#U166(JijkillnopnqqrssrG   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r\   r9   r[   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   s     rE   maximumzHalideOverrides.maximum"  r   rG   c                X    t        |d      rd|j                   d| d}d|  d| d| dS )Nr   r[   r   r9   r   r\   r   )r   r   r   s      rE   wherezHalideOverrides.where*  s?    1f166()A3a0AA3b2aS**rG   c                    d|  dS )Nr   r9   rd   r   s    rE   coszHalideOverrides.cos0  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   sinzHalideOverrides.sin4  r   rG   c                    t        d      )NlgammarI   r   s    rE   r  zHalideOverrides.lgamma8      (##rG   c                    d|  dS )Nzhl.erf(r9   rd   r   s    rE   erfzHalideOverrides.erf<  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   coshzHalideOverrides.cosh@  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   sinhzHalideOverrides.sinhD  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   acoszHalideOverrides.acosH  r   rG   c                    d|  dS )Nz	hl.acosh(r9   rd   r   s    rE   acoshzHalideOverrides.acoshL      1#QrG   c                    d|  dS )Nr   r9   rd   r   s    rE   asinzHalideOverrides.asinP  r   rG   c                    d|  dS )Nz	hl.asinh(r9   rd   r   s    rE   asinhzHalideOverrides.asinhT  r  rG   c                    d|  d| dS )Nz	hl.atan2(r\   r9   rd   r   ys     rE   atan2zHalideOverrides.atan2X      1#Rs!$$rG   c                    d|  dS )Nr   r9   rd   r   s    rE   atanzHalideOverrides.atan\  r   rG   c                    d|  dS )Nz	hl.atanh(r9   rd   r   s    rE   atanhzHalideOverrides.atanh`  r  rG   c                    t        d      )Ncopysignr  r  s     rE   r#  zHalideOverrides.copysignd  s    *%%rG   c                    t        d      )Nerfinvr  r   s    rE   r%  zHalideOverrides.erfinvh  r	  rG   c                    d|  d| dS )Nz	hl.hypot(r\   r9   rd   r  s     rE   hypotzHalideOverrides.hypotl  r  rG   c                    t        d      )N	nextafterr  r  s     rE   r)  zHalideOverrides.nextafterp  s    +&&rG   c                    |  d| S Nz & rd   r   s     rE   logical_andzHalideOverrides.logical_andt      Cs|rG   c                    |  dS )Nz == 0rd   r   s    rE   logical_notzHalideOverrides.logical_notx  s    E{rG   c                    |  d| S Nz | rd   r   s     rE   
logical_orzHalideOverrides.logical_or|  r-  rG   c                    d|  d| dS )Nr    ^ r9   rd   r   s     rE   logical_xorzHalideOverrides.logical_xor  s    1#S1~rG   c                    |  d| S r+  rd   r   s     rE   bitwise_andzHalideOverrides.bitwise_and  r-  rG   c                    d|  S )N~rd   r/  s    rE   bitwise_notzHalideOverrides.bitwise_not  s    1#wrG   c                    |  d| S r2  rd   r   s     rE   
bitwise_orzHalideOverrides.bitwise_or  r-  rG   c                    |  d| S )Nr5  rd   r   s     rE   bitwise_xorzHalideOverrides.bitwise_xor  r-  rG   c                    |  d| S )Nz << rd   r   s     rE   bitwise_left_shiftz"HalideOverrides.bitwise_left_shift      D}rG   c                    |  d| S )Nz >> rd   r   s     rE   bitwise_right_shiftz#HalideOverrides.bitwise_right_shift  rB  rG   c                    d|  d| dS )Nzhalide_helpers.rand(r\   r9   rd   seedoffsets     rE   randzHalideOverrides.rand  s    %dV2fXQ77rG   c                    d|  d| dS )Nzhalide_helpers.randn(r\   r9   rd   rF  s     rE   randnzHalideOverrides.randn  s    &tfBvha88rG   c           	          d|  d| d| d| d	S )Nzhalide_helpers.randint64(r\   r9   rd   )rG  rH  lowhighs       rE   	randint64zHalideOverrides.randint64  s#    *4&6("SED6KKrG   c                    t        j                  | d       dt        j                  j                  j                  d|       S )Nr    + load_seed_offset)opsloadr$   r]   rm   seed_offset)r   rH  s     rE   	load_seedzHalideOverrides.load_seed  s7    ((4#$C(A(ABTV\(]'^__rG   c                    d|  dS )Nz1./hl.sqrt(r9   rd   r   s    rE   rsqrtzHalideOverrides.rsqrt  s     QCq!!rG   c                    d|  dS )Nr   r9   rd   r   s    rE   tanzHalideOverrides.tan  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   tanhzHalideOverrides.tanh  r   rG   c                    d|  dS )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rd   r   s    rE   signbitzHalideOverrides.signbit  s    DQC~VVrG   c                    |  d|  d| d| S )Nz - hl.trunc(/z)*rd   r   s     rE   fmodzHalideOverrides.fmod  s!     L1QCr!--rG   c                    d|  d| dS )Nzhl.pow(r\   r9   rd   r   s     rE   powzHalideOverrides.pow  s    2aS""rG   c                    d|  dS )Nzhl.log(r9   rd   r   s    rE   logzHalideOverrides.log  r   rG   c                    t        d      r   r   r   s    rE   r   zHalideOverrides.log2  s    !&))rG   c                    d|  dS )Nz hl.is_inf(hl.cast(hl.Float(32), r   rd   r   s    rE   isinfzHalideOverrides.isinf       2!B77rG   c                    d|  dS )Nz hl.is_nan(hl.cast(hl.Float(32), r   rd   r   s    rE   isnanzHalideOverrides.isnan  ri  rG   c                    d|  dS )Nr   r9   rd   r   s    rE   roundzHalideOverrides.round  r  rG   c                    d|  dS )Nrr   r9   rd   r   s    rE   floorzHalideOverrides.floor  r  rG   c                    d|  d| dS )Nr   r   z + hl.f32(0))rd   r   s     rE   int_truedivzHalideOverrides.int_truediv  s    1#U1#]++rG   c                .    d| j                    d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r9   r   r   s     rE   floordivzHalideOverrides.floordiv  s)     18J1#TRSQTTUV	
rG   c                4   t        j                  t        j                  d|      t        j                        }t        j                  t        j                  |d      t        j                        }t        j
                  ||      }d|j                   d| dS )N0r[   r   r9   )rS  r   ltr<   int8subr   )r   r   leftrightr{  s        rE   signzHalideOverrides.sign  sg    ||CFF3NEJJ7SVVAs^UZZ8ggdE"!&&3%q11rG   c                    d|  dS )Nrw   r9   rd   r   s    rE   trunczHalideOverrides.trunc  r  rG   c                .    d| j                    d|  d| dS )Nz"hl.trunc(hl.cast(hl.Float(max(32, rs  rt  r9   ru  r   s     rE   truncdivzHalideOverrides.truncdiv  s)    
 18J1#TRSQTTUV	
rG   c                    d|  dS )Nrz   r9   rd   r   s    rE   ceilzHalideOverrides.ceil  r   rG   c                    d|  dS )Nr   z, 0)rd   r   s    rE   reluzHalideOverrides.relu  s    4  rG   c                ~   t         j                  j                  |      }t         j                  j                  t         j                  j	                  |      t         j                  j                  |      t        |            }|t        j                  t        j                  fvrt        j                  ||      S |S )N)bounds)r$   r]   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r<   r   r>   rS  r   )r   r`   r   indexvars        rE   
index_exprzHalideOverrides.index_expr  s    ))$/hhHH!!%(HH))%0(.  

 ekk22<<U++
rG   c                    t        j                  |t        j                        }t        j                  |||      }||_        t        t        |            S r   )rS  r   r<   r   halide_clampindirect_indexing_sizer!   str)r   	index_varsizecheckwrap_negs        rE   indirect_indexingz!HalideOverrides.indirect_indexing  sC     LLEKK8	$$Ye<	+/	(!#i.11rG   c                    t         j                  j                  t         j                  j                  |      dz
        }t	        |t
        t        j                  f      sd|j                   d| d}d| d| dS )Nr%   r[   r   r9   z	hl.clamp(z, 0, )	r$   r]   kexprrename_indexingr:   r;   r   Integerr   )r   r   r  r  ends        rE   r  zHalideOverrides.halide_clamp  sj    hhnnQXX55d;a?@$emm 45UZZL	#a8C 5'se1--rG   c                   t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t         j                  j                  d|j                   dt        |       dg t        j                  |      |j                        }t        j                  ||      S # 1 sw Y   xY w)Nr[   r   r9   r  shape)r$   r]   
mask_loadsr  is_boolr   r  r   rF   r   wrapr  rS  r  )maskbodyothernew_maskresults        rE   maskedzHalideOverrides.masked#  s    XX  u- 	VF	 ==  KE   v{{m9_U-C,DAF##E*,,	 ! 
 yy6511	 	s   CCc                    t        d      )Nfrexpr   r   s    rE   r  zHalideOverrides.frexp5  s    !'**rG   c                    t        d      )Ndevice_assert_asyncr   )condmsgs     rE   r  z#HalideOverrides.device_assert_async9  s    !"788rG   )NT)r   torch.dtyper   Optional[torch.dtype])r   r  r   r  )TT)FrS   rT   rU   r   r   r   classmethodr   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r,  r0  r3  r6  r8  r;  r=  r?  rA  rD  rI  rK  rO  rV  rX  rZ  r\  r^  ra  rc  re  r   rh  rk  rm  ro  rq  rv  r~  r  r  r  r  r  r  r  r  r  r  rd   rG   rE   r   r      s:    ,0	55 )5 5   ; ;   k k
   t t t t + +
     $ $                   % %       & & $ $ % % ' '                     8 8 9 9 L L ` ` " "     W W . . # #   * * 8 8 8 8         , , 
 
 2 2     
 
   ! ! 	 	 2 2 . . 2 2" + + 9 9rG   r   halidec                  r     e Zd Z ej                  d      Z	 	 d	 	 	 	 	 	 	 d fdZd Zd Zd	dZ	d Z
 xZS )
HalideCSEVariablez\b(tmp\d+)\[\?\]c                :    t         |   ||||       d | _        y Nr  )rK   rL   	used_dims)rM   r   r  r   r  rO   s        rE   rL   zHalideCSEVariable.__init__D  s"     	vuE:7;rG   c                T   t        | j                  xs d      }t        j                  ||j	                               D ]D  }t        |t              s|j                  
J |||f       |j                  |j                         F t        j                  j                  |      | _        y )Nrd   )r   r  	itertoolschainvaluesr:   r  updater$   r]   sort_used_dims)rM   r   rm   kwargsusedargs         rE   update_on_argsz HalideCSEVariable.update_on_argsN  s    $...B/??49 	+C#01}}0C4d2CC0CMM*	+ 006rG   c                    t        |      dk(  r| j                   dS | j                   ddj                  t        t        |             dS )Nr   z[()][r\   ])rl   r   joinmapr  )rM   dimss     rE   	index_strzHalideCSEVariable.index_strV  sE    t9>ii[%%))AdiiC78::rG   c                n    | j                   | j                   dS | j                  | j                         S )Nz[?])r  r   r  )rM   s    rE   __str__zHalideCSEVariable.__str__\  s0    >>!ii[$$~~dnn--rG   c           	         | j                   t        d | j                   D              sJ | j                  | j                   D cg c]  }|j                  ||       c}      S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r:   r   Expr.0r   s     rE   	<genexpr>z-HalideCSEVariable.subs_str.<locals>.<genexpr>c  s       2
*+Jq%**%2
   $&)r  allr  get)rM   replacementsr   s      rE   subs_strzHalideCSEVariable.subs_strb  s^    ~~)c 2
/3~~2
 /
 	
 
 ~~t~~N!|//15NOONs   A$)NN)r  zValueRanges[Any]r   r  r  r7   rQ   rR   )rQ   r  )rS   rT   rU   recompileundefined_rerL   r  r  r  r  rV   rW   s   @rE   r  r  A  s^    2::12L (, $< !< %	<
 < 
<7;.PrG   r  c                  H     e Zd ZU ded<   ded<   ded<   d fdZd	dZ xZS )
DimensionInfozOptional[sympy.Expr]r`   
sympy.Exprr  stridec                    t         |           t        j                  j                  j                  |d      r| }| }|| _        || _        || _        y Nr   )	rK   rL   r$   graphsizevarsstatically_known_ltr`   r  r  )rM   r`   r  r  rO   s       rE   rL   zDimensionInfo.__init__o  sK    77//:WF5D		rG   c                   | j                   J | j                   }|r|dk(  ry|ri |}|j                  D ]  }t        |t        j                        st        |t        j                        sJ t        j                  j                  |j                        }t        |t              sJ t        |j                  |            ||<    t        ||      }t        j                  j!                  |      S )Nr   hl.Var())r`   free_symbolsr   r   TMPr:   r   Symbolr$   r]   lookup_cse_varr   r  r!   r  r"   r  )rM   r  	zero_varsr`   symr  s         rE   r  zDimensionInfo.index_strx  s    yy$$$yy+l+L(( W!#txx0%c5<<888((11#((;C%c+<===(:3<<;U(VL%W dL1Dxx$$T**rG   rP   NF)rS   rT   rU   __annotations__rL   r  rV   rW   s   @rE   r  r  i  s    

+rG   r  c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k(  r*t         j                  j                  j                  | |       ||k(  S # t
        $ r Y yw xY wNTF)r$   r  r  statically_known_equalssize_hint_or_throw	TypeErrorcheck_equals)r|  r}  r   r   s       rE   eqr    s    ww//e<GG//5GG//6 	Av	%%dE26M	  s   AB3 3	B?>B?c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k  r*t         j                  j                  j                  | |       ||k  S # t
        $ r% t        j                  | |      }|| k(  r| |k7  cY S Y yw xY wr  )	r$   r  r  r  r  r  r   gcdcheck_lt)r|  r}  r   r   r  s        rE   ry  ry    s    ww++D%8GG//5GG//6 	1u	!!$.q5L  iie$$;5= 	s   AB3 3)C! C!c                      e Zd ZU eZeZded<   	 	 	 	 d$ fdZd%dZ	d&dZ
d' fdZd Zd	 Z	 	 d( fd
Zd Zd)dZd Zd Zd(dZd Zd*dZd+dZd,dZ	 d-	 	 	 	 	 	 	 	 	 d.dZ	 	 	 	 	 	 	 	 	 	 d/dZd Z	 	 	 	 	 	 	 	 d0dZ ej:                         dd	 	 	 d1dZddd1dZd,dZ d Z!d2dZ"d-dZ#e$d         Z%d-d,d!Z&d" Z'	 	 	 	 	 	 	 	 d3d#Z( xZ)S )4HalideKernelzCallable[[sympy.Expr], str]r  c                x   t        |   |fi | | j                  | _        | j                  | _        | j                  | _        t               | _        | j                  | _	        | j                  | _
        i | _        i | _        i | _        i | _        i | _        i | _        t#        t$              | _        d| _        y r  )rK   rL   r  computeloadsstoresr)   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rM   tilingr  rO   s      rE   rL   zHalideKernel.__init__  s    
 	*6*yyYY
ii!/!1"&"7"7!22AC57;=@BCEHJ4?4E%*"rG   c                    t        |      S r   )r   )rM   r   s     rE   dtype_to_strzHalideKernel.dtype_to_str  s    5!!rG   Nc                `    | j                   j                  | d|d       t        ||||      S )Nz = hl.Func(r9   )r  	writeliner  )rM   r   r  r   r  s        rE   create_cse_varzHalideKernel.create_cse_var  s2    		tfKxq9: vue<<rG   c           
        | j                   s| j                  s| j                  rJ t        j                  t
        j                  j                  j                  t              t        j                  t        t         | 8  |            }t        t                   t"        j$                  j'                  | j(                  D cg c]  }|j*                  j-                          c}      D ci c]  }|j/                         | c}d }fd}fd}|D ]  }|j1                  t2              rV|j5                  t3        t7        j8                  d      t7        j8                  d      t7        j8                  d            |       |j1                  t:              rB|j5                  t;        t7        j8                  d      t7        j8                  d            |       j=                  t         | 9  |      j>                          tA        d D              | _!        d	}tE        | j(                        D ]%  }|j*                  j-                         D cg c]  }|j/                         v s| }	}|	jG                  fd
       |	s+|	jI                  |jK                  d|jL                               d}
t6        jN                  jP                  }g }|
tS        |	      k  rtU        |jL                  |      s|	D cg c]+  }tU        |jV                  |      s ||jX                        - }}|
tS        |      z  }
|sJ |	       |t        jZ                  t
        j                  j                  j\                  |      z  }|j_                  |	D cg c]C  }ta        ||jV                        r+ta        |jV                  |      r ||jV                  |z        E c}       |rt        jZ                  t6        jb                  |      }tU        |d      r2 ||jL                  |z        }tU        |d      rJ g }tS        |	      }
d}te        dtS        | j                               }|jf                  r.te        dtS        | j                               | j                  |<   || j                  |<   |jI                  ||f       ||z  }|	D cg c]%  }tU        |jV                  |      s|jX                  ' }}|
tS        |      z  }
tS        |      }|D cg c]&  }tU        ||      st7        jh                  ||z        ( }}tS        |      |k  s|dk(  sJ |j_                  |       |r|
tS        |	      k  rtU        |jL                  |      s|	D ]  }	 d}d}tU        |jV                  |      s)||   \  }}|dz  }||z  }tU        |jV                  |      s)d}t6        jN                  jj                  }tU        |jX                  |      s1||   \  }}|dz  }|||z  z  }||z  }tU        |jX                  |      s1|| j                   |j/                         <    ( | j                  D ]-  }| jp                  js                  | d|jt                  d       / | j                  rL| jw                  d| j                  jy                         D ci c]  \  }}|| j                  |    c}}       yyc c}w c c}w c c}w c c}w c c}w c c}w c c}w # tl        $ r |sJ t6        jN                  jj                  }t6        jN                  jP                  }|D ]  \  }}|||z  z  }||z  } t
        j                  j                  jo                  t3        ||jV                  |jX                        | j                        | j                   |j/                         <   Y Mw xY wc c}}w )a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                z    t        j                  t        j                  j                  j                  |             S r   )r   simplifyr$   r  r  remove_precomputed_replacementsr_   s    rE   r  z0HalideKernel.finalize_indexing.<locals>.simplify  s+    >>  @@F rG   c                   | v r|    }j                  |j                  j                  |j                  |z  t        j
                  j                  j                  |t        |j                  |                  j                                y y r   )addrootlookupdivisorr$   r  r  evaluate_minr   lengthsymbol)baser  modulusnodeall_used_symbolssym_to_nodes       rE   visit_modular_indexingz>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  sw    {""4( $$II$$w.((55#Xdkk7%C
 fh #rG   c           	         | v r`|    }j                  |j                  j                  |j                  |z  t	        |j
                  |            j                                y y r   )r  r  r  r  r   r  r  )r   r  r"  r#  r$  s      rE   visit_floor_divz7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s]    {""4( $$II$$w. g6 fh	 #rG   r   r  r!  c              3  P   K   | ]  }t        |t        j                           y wr   )r   r   INDIRECT)r  r  s     rE   r  z1HalideKernel.finalize_indexing.<locals>.<genexpr>  s       )
36N3.)
r  Fc                (     | j                         S r   )r  )r   	size_hints    rE   <lambda>z0HalideKernel.finalize_indexing.<locals>.<lambda>  s    Yqyy%9 rG   keyr%   r   Thhrz
 = hl.Var(r9   rdomN)=r  r  r  	functoolspartialr$   r  r  r+  r   dictfromkeysr  rK   r  r   r   r  r  from_iterablerange_treesnodesr  r  hasr   replacer   Wildr   r  r  anyr  reversedsortappendr  numelSOnerl   r  r  r  reduceevaluate_maxextendry  r  r!   is_reductionr  Zero
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)!rM   indicestreer   r  r%  r'  r  had_fallbackr8  handled_countr  added_sym_sizesizes_to_addr  	next_sizer  	new_sizes	prior_lensr"  idxr  r  r`   
full_indexr  vrvr#  r+  r$  rO   s!                                @@@rE   finalize_indexingzHalideKernel.finalize_indexing  s    ##t'7'74;Q;Q	
 
 %%agg&6&6&@&@3O	--EG$<g FG%c?, __22151A1AB""$B
 HHJM
	

		  	REyy)#

6*

9-

9-
 + yy"

6*

9- $ ##EG$<U$C$P$PQ%	R( &) )
:J)
 &
" T--. S	D $

 1 1 3V1qxxzEU7UQVEVJJ9J:T[[DJJ78MggkkGN#e*,R

G5L05 +,AIIw9OHQXX&    \!22#*U*|	 0 0GG$$11<!  ## "'gqyy1bC6H !W!45 # ) 0 0L II)Q' %-TZZ'-A$B	#%i#333')(+E
'+,qT5E5E1F0G-HIC((6H T%5%5!6 787..s3 -6D$$S)"))3	*:;y(G38 SaBqyy'<R SI S!S^3M #L 1I ".$!!Y/ q9}5$L $
 |,y8INJJ ''	27 #!  #e*,R

G5L\  CG w7$23$7	Tq4 !w7 F 77<<D f5$23$7	Tq,$	 !f5
 >BD++DKKM:oS	l ## 	JC((C5
388,a)HI	J!!6:6L6L6R6R6TUUQT%%a((U "c C
z W 0 !T$2 " ''<!&J"WW[[F%3 '	T"fsl2
$' ((==+JdkkR ,, ++DKKM:( Vsd   ![81[=\2\\\A\
\+\+\A\A"\\_(C_%$_%c           
        | j                   rdnd}|| j                  v r| j                  |   S i }| j                  j                         D ]c  }| j                   s|| j                  v rt        j                  d|j                        }|sJ t        d| |j                  d             ||<   e | j                  | d|j                         D ci c]  \  }}|| j                  |    c}}       || j                  |<   |S c c}}w )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r/  r%   dom)r   r  r  keysr  r  matchr   r!   grouprK  rL  )rM   prefixrenamesr  mrY  rZ  s          rE   setup_dom_indexingzHalideKernel.setup_dom_indexingw  s   --3T%%%##F++##((* 	HC((SD4J4J-Jchh/AH1-&!''!*.FGGCL	H 	hcN'--/RBR!1!1!!44R	
 $+ 	 Ss   Dc           	     v   |j                         D cg c]&  }d| j                  | j                  |             d( }}| j                  j	                  | ddj                  |       d       t        |j                               D ])  \  }}| j                  j	                  | d| d| d       + y c c}w )	Nhl.Range(0, r9   z = hl.RDom([r\   ]) = r  r  )r  r  r  rJ  r  r  	enumerater`  )rM   r   varsr  rsizesr]  rsyms          rE   rK  zHalideKernel.codegen_rdom  s     
 4::d&:&:4&@AB!D
 
 	$$v\$))F:K9LB%OP - 	BGAt((D6TF!A3a)@A	B
s   +B6c                    t         |   |      }t        || j                        }t        j
                  j                  j                  || j                        S r   )	rK   r  r"   r  r$   r  r  rI  r  )rM   r  rO   s     rE   r  zHalideKernel.prepare_indexing  sI     (/5$"9"9:ww44UD<L<LMMrG   c                    t        |t        j                        r%| j                  |j                        j
                  S | j                  |   S )zThe size of an index symbol)r   r   r  r  r   r  r  )rM   r  s     rE   sym_sizezHalideKernel.sym_size  s<    #txx(&&sxx0GGG$$rG   c           	      	    g t        |j                  d       D ]~  }t        |t        j                  t        j
                  f      rj                  |       ?t        |t        j                  t        j                  t        j                  f      ryJ |        t        j                  j                  }t        j                  t        j                  j                        }g }t        j                   j!                  |            }t#        |t        j$                        r|j&                  n|gD ]  }|j                  D 	cg c]	  }	|	|v s|	 }
}	t)        |
      dk(  r||z  }5t)        |
      dk(  r||
d   xx   |z  cc<   Tg }t+        t)        |            D ]e  }||   J ||   \  }}t-        |      t-        |
      z  r*|
j/                  |D 	cg c]	  }	|	|
vs|	 c}	       ||z  }S|j                  ||f       g g ||
|f}  fd}g }|D ]8  \  }}|D ]  }	||j1                  |	      z  } |j                   |||             : |j3                         D ]  \  }}|j                   |||g               |j5                  d        |sA j6                  r|j                  t9        t        j                  j                  dd             n}t:        j<                  j>                  jA                  |d   jB                  d      sF|jE                  dt9        t        j                  j                  rdn|d   jB                  d             |rs| jF                  v rit:        j<                  j>                  jI                  | jF                  |         r2 jK                  || jF                  |   z
          jF                  |   }n>t:        j<                  j>                  jM                  |d      r jK                  ||       d}|}tO        jP                         D ]W  } jS                  |||      r||fc S rJ | d| }| jT                  |   vs: jT                  |   j                  |       Y yc c}	w c c}	w )	zEConvert address-based indexing into dimensions using self.halide_varsc                    | j                   S r   ru  r   s    rE   r,  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s
    AFF rG   r-  r   r%   Nc                    t        j                  |       } t        |      dk(  rUt        j                  d
      }| j	                  ||d   z        }|r%t        |d   	j                  |d         ||         S rJ |        t        j                  t        | |D ci c]  }|	j                  |      dz
   c}      dz         }t         j                  j                  }t        | t         j                        rt| j                  D ]e  }t        |t         j                        s||z  }t        j                  | |z        } t        j                  t        j                  ||z              }g t        | ||      S c c}w )Nr%   wild)excluder   )r   factorrl   r;  ra  r  rq  r  r"   rA  rB  r:   Mulrm   r  ceiling)r`   symsstride_wildre  r  r  r  termis_storerM   symbolss           rE   expr_to_dimensionz>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  sG   <<%D4yA~#jjAJJ{T!W45(QtAw!7;   %%<^^4!N##t}}S'9A'="=!NORSSF WW[[F$		* II ND!$6$$~~dTk:!&ftm0L!M	N
 !vv66 "Os   E;c                t    t         j                  j                  j                  | j                  t
              S )Nr  )r$   r  r  r+  r  r   )ds    rE   r,  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s$     0 0 : :188c : R rG   _view)+sortedr  r   r   HALIDEr  r?  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   rA  rG  r4  r5  expandr  r:   Addrm   rl   ranger   rE  poprL  r>  r  r  r$   r  r  r  r  insertr  statically_known_geqapply_offset_to_dimensionstatically_known_gtr  countinstall_dimsr
  )rM   r  r  r}  r  rH  
split_exprsplit_failedpartrY  	part_varsnew_split_failedr]  
other_vars
other_partr  r  rz  r`   orig_varr~  s   `  `                @rE   indexing_to_dimensionsz#HalideKernel.indexing_to_dimensions  s)   %,,2BC 	CcDKK#:;s#%))		--   		 ]]7EGGLL9
DFT11%89",UEII">EJJUG 	FD$($5$5IqjIII9~"$Y1$9Q<(D0(#% s<01 JA'?666-9!_*J
!*-
90EE!((Z)V1ICU!)VW
*(//Z0HIJ  F!1EIt3DE!	F$	7. & 	7JD$ *
q))*KK)$56	7 $))+ 	8ICKK)$67	8		R	S))M%'',,1=>!!99$q'..!LKK=Hq$q'..RST d)))agg.>.>.S.S++C0/ ..tVd>Q>QRU>V5VW,,S1!!55 ..tV<" 	:A  dFH=Dy <JeA3'C$--h77##H-44S9	:U J *Ws   	RR	R"Rc                f   || j                   vr|| j                   |<   || j                  |<   y| j                  |   |k7  s$t        | j                   |         t        |      k7  ry|r| j                   |   |k(  S t        | j                   |   |      D ]  \  }}|j                  |j                  k7  r y|j
                  |j
                  k7  s|j                  |j                  k7  sTt        j                  j                  j                  |j
                  |j
                        |_        d|_         y)z>Try to set self.buffer_dimensions[var], return True on successTFN)r  r  rl   zipr  r  r`   r$   r  r  rD  )rM   r  r  rH  r}  oldnews          rE   r  zHalideKernel.install_dims  s   d,,,*.D""3''-D$s#v-""3'2
Y2 ))#.$66D2237> 	 HCzzSZZ'xx388#sxx388';77++88388L	  rG   c                   |dk(  ry t        t        t        |                  D ]  }||   j                  dk(  s8t        j
                  j                  j                  |||   j                        sMt        |||   j                        }||||   j                  z  z  }||   xj                  |z  c_	         |dk(  sJ y )Nr   r%   )
r=  r  rl   r  r$   r  r  r  r   r`   )rM   r  rH  r]  r  s        rE   r  z&HalideKernel.apply_offset_to_dimension  s    Q;%D	*+ 	%AAw~~"agg&6&6&K&KQ'  Q7$a//Q$	% {{rG   c                   t        t        j                            }|j                  D ]  }t	        |t        j                        sJ t        |t        j                        rU| j                  |j                        }t	        |t              r|j                  J |j                  |j                         t        |t        j                        r|j                  |       t        |t        j                  t        j                   t        j"                  t        j$                  f      rt'        d|        | j)                  |      S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimszunhandled symbol )r   r   r  r  r:   r   r   r  r  r   r  r  r  r  r  r  r  r  INDEXr   r  )rM   r  r  r  cse_vars        rE   r  z!HalideKernel.used_dims_from_index+  s   u||,.	%% 	ECc5<<000c488,--chh7w(9:))56   !2!23T[[1c"d''D4I4I4::V ),=cU*CDD#	E$ ""9--rG   c                    t        d |D              sJ t        j                  | j                  | j                  j                               D cg c]  }||v r|
 }}t        |      t        |      k(  sJ |S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r  r  s     rE   r  z.HalideKernel.sort_used_dims.<locals>.<genexpr>C  s     @:a,@r  )r  r  r  r  r  r  rl   )rM   r  r  ordereds       rE   r  zHalideKernel.sort_used_dimsB  s    @i@@@@ !  $"8"8"?"?"A
 i	 
 
 7|s9~---
s   A9c                    dj                  fd|D              }t        |      dk(  rd}|S t        |      dk(  r| d}|S )Nr\   c              3  B   K   | ]  }|j                          y wr   )r  )r  r  r  r  s     rE   r  z.HalideKernel.make_index_str.<locals>.<genexpr>O  s     Qqakk,	BQs   r   ()r%   ,)r  rl   )rM   r  r  r  r  s     `` rE   make_index_strzHalideKernel.make_index_strN  sM    IIQDQQ	t9>I  Y!^$+QIrG   c                "   | j                   j                  |      }| j                  |      }| j                  ||d      \  }}| d| j	                  |       d}t
        j                  j                  |      }|t        j                  t        j                  fv rt        j                  }d| d}| j                  rt        | j                  t              r| j                  j                  J t!        g | j#                  |      | j                  j                        }| j%                  | j'                  |            }|j                  r| j(                  j+                  |j,                   d       | j(                  j+                  |j,                   d| j                   d       | j/                  | j0                  xs d      }	| j(                  j+                  | d	t3        |       d
|	 d       | j(                  j+                  | d| dt3        |       d
|j,                   d       |S | j(                  j+                  | d| j                   d
| dt3        |       d       |S | j5                  || j#                  |            S )z"Codegen a load from an InputBufferFr  r  rc   r9   z!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(r\   rj  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rm   inputr  r  r  r$   r  	get_dtyper<   r   r   r   
_load_maskr:   r  r  r   r  newfuncr  r  r  r   r  _load_otherr   r  )
rM   r   r  r  r  r   r   r  r  r  s
             rE   rT  zHalideKernel.loadW  sL   iiood#%%e,//UEB	Ta++D12!4!!$'U]]ENN33MME+D63D??4??,=>OO--9: #O$++E2OT__5N5NOI \\$"5"5i"@AF		##v{{m3T$UV		##v{{m<?PPQ$RS

4#3#3#8q9		##hk+e*<)=RwaH 		##hc${;u3E2FbU[\ M 		##hmDOO+<BtfJ{[`OaNbbgh M<<d&?&?&FGGrG   c                ^    | j                   j                  t        j                  dd|         S )Nz\[.* )csevarname_mapr  r{  rM   r   s     rE   r  zHalideKernel.lookup_cse_var~  s$    xx##BFF7B$=>>rG   c                (   t        |t              sJ | j                  j                  |      }| j	                  |      }| j                  ||d      \  }}| j                  |      s|| j                         }| j                  ||      }|j                  |      }	dj                  dgt        |      z        xs d}
| j                  j                  t        || d|
 d| d             n| j                  |d	      }t        |      }	t         j"                  j%                  |      }|| d| d
t'        |       d|	 d}n+|dk(  r| d| dt'        |       d|	 d}nt)        d|       | j                  j                  t        ||             y)z"Codegen a store to an OutputBufferTNr\   r  r  r  z] = hl.undef(z.type()))r  z] = hl.cast(r9   
atomic_addz] += hl.cast(zstore mode=)r:   r  rm   outputr  r  is_indirect_indexingrf  r  r  r  rl   r  r  r(   r  r$   r  r  r   r   )rM   r   r  r   moder  r  r  r  	value_str
undef_dimsr   r   s                rE   storezHalideKernel.store  s    %!2333iit$%%e,//UDA	T$$U+t/?224L++D,?I|4I))ZL3t9$<=F$JIITcU!J<}SE#RS ++DD+AIE
I!!$'<U!I;l;u3E2FbSTUD\!U!I;mK4F3Gr)TUVD%D6&:;;		Lt45rG   c           	     v   | j                   sJ | j                  rJ |||f}|| j                  j                  v r| j                  j                  |   S t	        |t
              r1|dk(  sJ  | j                  | x| j                  j                  |<   }|S t	        |t              r|j                  J t        | j                        }| j                  |j                  D cg c]	  }||vs| c}      }	|t        |j                        z
  rF| j                  | | j                  t        g |j                  |            |j                        }|j                  | j                        }
t         j"                  j%                  ||      }t'        |      }|dv r|	j(                   d| }| j*                  j-                  | d| d|
 d       g }d	}t/        | j                        D ]C  \  }}|j1                  | d
| d       |d	k7  r|dxx   d| z  cc<   || j2                  |   z  }E | j*                  j-                  |	 ddj5                  |              n|dk(  r| j7                  ||      }	nt9        ||      }t;        j<                  t?        tA                           5   ||	|
      }ddd       d| dtC        |       d}| j*                  j-                  |	 d|        | j*                  j-                  |	 d        |	| j                  j                  |<   |	S c c}w # 1 sw Y   {xY w)zCodegen a reduction operationwelford_combineNr  )argmaxargmin_z = hl.z(rdom, r9   r%   r  r  *rj  rQ  welford_reducer[   r\   )"r   r  r  reduction_cacher:   tuplewelford_combine_implr  r  r   r  r  r  r  r  r  r   	Reductiondefault_accumulatorr   r   r  r  rk  r?  r  r  welford_reduce_fallbackr   r$   set_ops_handlerr   r   rF   )rM   r   r   reduction_typer   	cache_keyresult_tuplereduction_varsrY  
result_varr  defaultacc_typer  partsr  r]  r  
combine_fncombine_strdefault_strs                        rE   	reductionzHalideKernel.reduction  s-    $$$$??""6	00088++I66eU#!%6666)))51DHH$$Y/,  %!238SSS#D$:$:;\\C11N+BQC

 Ju77LL'##J/R/R>/R$STkk ! E
 NN4#9#9:	,,22>9M"5)11!'q(89EII5'/?wykQR STEF#D$:$:; 03was!_-Q;"I1VH-I$**3//	0
 II:,c%**U2C1D EF//55eUCJ1.(KJ""??3D#EF @(Y?@$XJb1I0J!LKII:,c+ ?@II:,c+ ?@.8  +I D:@ @s    	L**L*+
L//L8c                   t        |t              r|j                  J t        |t              r|j                  J t        |t              r|j                  J t        g |j                  |j                  |j                  xs | j                        }|t        | j
                        z  }| j                  | j                  |            }|||fD cg c]  }d|j                   d }}|j                  }| j                  j                  | ddj                  |       d       | j                  j                  | d| d       | j                  j                  | d| d	       | j                  j                  | d
| d       | j                  j                  | d|j                  | j
                                | j                  j                  | d|j                  | j
                                | j                  j                  | d|j                  | j
                                | j                  j                  | d| d| d       | j                  j                  | d| d| d       | j                  j                  | d| d| d| d       | d| d| d| d| d| d| d| d| d| dg}	| j                  j                  | ddj                  |	       d       g }
t        d       D ]S  }|
j                  | j                  |j                               | j                  j                  |
d!    d"| d#| d$       U t        |
      S c c}w )%Nr[   z.type(), 0)z = hl.Tuple([r\   ri  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   r  rj  r  r  )r:   r  r  r   r  r  r  r  r   r  r  r  r  r  r?  r  )rM   meanm2weightr  r  r   r  pfxr  unpackedr]  s               rE   r  z!HalideKernel.welford_combine_impl  se   $ 12t~~7QQQ"/0R\\5MMM&"349I9I9UUU?dnn?r||?f.>.>?S4CSCS
	 	Z 6 677	\\$"5"5i"@A
<@"f;MNaXaffX[1NNoo		zl-		'8J7K2NO		se:j\=>		se8J<s;<		se<
|3?@		se:dmmD<R<R.S-TUV		se8BKK8N8N,O+PQR		e<0F0F GHI	
 			se9SEC5HI		se>#l3%yQR		e*3%/H\Z]Y^^jk	
 e:cU)C5
;e8C5Yse9SEVYUZZdee;

 			zl-		&8I7J"MNq 	GAOODLL)=)=>?II8B<.J<q1 EF	G X7 Os   M+c           
     N   | j                   sJ t        |      t        |      k(  sJ g }t        t        j                            }|D ]  }t        |t              r|j                  J t        |j                        t        | j                        z  r|j                  |       nK|j                  | j                  | g |j                  g | j                  d d |j                               |j                  |j                          | j                  | j                  |            }|j                  r+t        |j                        t        | j                        z  sJ t        ||      D cg c]  \  }}dt!        |       d| d }	}}| j#                  | j%                  | j&                  d   j(                              }
|j*                   d}| d}| j,                  j/                  | d	|
 d
       t        | j                        dk(  sJ d       g | j                  \  }|t1        |      i}|t1        |      dz
  i}t        |      dk(  r(d }|j3                  |      g}|j3                  |      g}nqd }t5        t        |            D cg c]  }|j3                  |      d| dz    }}t5        t        |            D cg c]  }|j3                  |      d| dz    }}| j,                  j/                  | d ||	              t7        j8                  t;        t=                           5   |||      }d d d        | j,                  j/                  |j3                  |       d |              t        |      dk(  r|fS |D cg c]"  }| j                  | j                  |            $ }}t?        |      D ])  \  }}| j,                  j/                  | d| d| d       + tA        |      S c c}}w c c}w c c}w # 1 sw Y   xY wc c}w )Nr%   r  r[   r\   r9   r  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    | d   S r  rd   r   s    rE   maybe_tuplez&HalideKernel.scan.<locals>.maybe_tuple3  s    trG   c                ,    ddj                  |        dS )Nz
hl.Tuple([r\   ri  )r  r   s    rE   r  z&HalideKernel.scan.<locals>.maybe_tuple:  s    #DIIaL>44rG   r  r  rj  )!r   rl   r   r   r  r:   r  r  r  r?  r  r  r  r  r  r  r   r  r  r7  r@  r   r  r  r!   r  r  r$   r  r   r   rk  r  )rM   dtypesr  values_origr  all_used_dimsr   r  r   initialr  scan_domscanscan_varscan_renames_curscan_renames_prir  	read_left
read_rightr]  r  r  unpack_varsrY  s                           rE   r  zHalideKernel.scan  s5    $$$$6{c+....*,"5<<02  	2Ee%67EOO<WWW%//*Z8N8N-OOe$LL 'I%//I+DT-C-C+DRa+HI#kk !    1	2 \\$"5"5m"DE
##
:3G3G(H:""L
 )
 	
 

 !$FF 3
u u-.bq9
 

 D001A1A"1E1K1KLM oo&e,2		xj(@LM4))*a/ 	
4	
/ 0../$&8&>?$&8&>&BCv;! $,,-=>?I$--.>?@J5
 s6{+ ##$45!A3a@I  s6{+ ##$45!A3a@J 
 			zl#k'.B-CDE /@AB 	<$Y
;K	<		""#345S[9Q8RS	
 v;!= QWXAt||D$7$7$FGXXk* 	<DAqII1#SAaS :;	<[!!k
:	< 	< Ys$   5PPP?
P'P"Pr  c                   | j                   j                  | j                  |||      }t        |t              sJ ||_        |S )Nr  )r  generater  r:   r  r  )rM   r   r  r  r  r  s         rE   r  zHalideKernel.genfuncW  sB     hh		4eL#0111!
rG   r  c               p    | j                   j                  |      }t        |t              sJ ||_        |S r  )r  newvarr:   r  r  )rM   r  r  r  s       rE   r  zHalideKernel.newfuncd  s3    hhooEo*#0111!
rG   c                x    t         j                  j                  |      j                         j	                         S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r$   r  
get_buffer
get_layoutstorage_sizer  s     rE   halide_buffer_numelz HalideKernel.halide_buffer_numelj  s+     ww!!$'224AACCrG   c                   d }g }| j                   j                         \  }}}}t        t        ||      |      D ]  \  }|j	                  |f       t        t              s*j                  dk(  rj                  J |j                  fd| j                  j                  j                  d      D                |S )zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                n    | \  }}t        |t              ryd|j                  v ryd|j                  v sJ y)Nr%   out_ptrr   in_ptrr   )r:   r-   r   )	arg_tuple	_call_strr  s      rE   	arg_orderz.HalideKernel.halide_argdefs.<locals>.arg_orderw  s<    &NIs#w'chh&388+++rG   r-  r   c           	   3     K   | ]>  }d t        |j                  j                  j                  j                        f @ y w)N)alias_of)r.   bufferr   rH  r   )r  aliasr  s     rE   r  z.HalideKernel.halide_argdefs.<locals>.<genexpr>  sG        !!JJIIJJ%(XX	s   AArd   )rm   python_argdefsr  r  r?  r:   r.   rH  r  rE  r
  r  r   )rM   r  r  r  r   r   call_strr  s          @rE   halide_argdefszHalideKernel.halide_argdefsr  s    
	 =?YY--/
1a#C1I9= 	MHcMM8S/*#y)zzQ3<<+???  "&!4!4!8!82!F 		" rG   c                   g }| j                         D ]4  \  }}t        |t              r	d}d}d}d}n| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}t        |      t        |      k(  sJ t        | j                  |j                           }t        |j                      d}|j                  t        ||j                  ||||j                               7 t         j"                  j%                         }	|	j&                  dk(  rDt(        j*                  j,                  g}
t(        j*                  j.                  }dt1               i}d}n|	j&                  dk(  sJ d       |	j2                  d	k  sJ d
       t(        j*                  j4                  g}
t(        j*                  j6                  }t8        j:                  j=                  |	      }d|
d	   vrAdD ]<  \  }}|j>                  |k\  s|j@                  |k\  s&|
j                  d| |         n |
j                  d       d|jB                  i}tE        d	|	j2                        }|
j                  d       |
j                  d       t(        j*                  jF                  s|
j                  d       t(        j*                  jH                  r|
j                  d       d| jJ                  v r|
j                  d       tM        |djO                  |
      |||      S c c}w c c}w )z)Compute metadata required by codecache.pyNlongr  )r  r  rH  r  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r  r   )      )r  r   )r  r%   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)(r
  r:   r-   r  r   r0   r  r  r  rl   r  r/   r   r?  r   r  r$   r  get_current_device_or_throwtyper   r  
cpu_targetscheduler_cpur    r  
gpu_targetscheduler_cudar<   r  get_device_propertiesmajorminormulti_processor_countr@   assertsr  r^   r   r  )rM   argtypesr  r  r  r  rH  r   r   current_devicer  r  r   r!  
capabilityr)  r*  s                    rE   halide_kernel_metazHalideKernel.halide_kernel_meta  s   ))+ 	FAs#w' "33CHH= $..qvv67  "33CHH= $..qxx89  5zS[000t22388<='		2315OOHH!! \\	%	: <<>%'mm../F33I35O K!&&&0K2KK0!''1,M.MM,mm../F44I99.IJ q	1$L LE5!''50Z5E5E5N(8w&GH MM.)z??O
 a!5!56K 	n% 	l#}}$$MM,'==MM'"4###MM/*88F#+#
 	
Cs   +M/+M4c                
     j                   j                  rt        d       j                         }t	               }|j                  dd       |j                           j                         D ]  \  }}t        |t              r,|j                  |j                   d j                   d       B|j                  sJ |       d|j                  v rdnd	}t        |j                        }t!         j"                  |j                           }|j                  |j                   d
| d| d| d        |j                  d       |j                           j                         D ]/  \  }}|j                  |j                   d|j                          1  j                   j%                         D ]  \  }	}
|j                  |	 d
|
         |j                   j&                          fd} j(                  j*                  D ]C  }t        |t,              r t.        j0                  j3                  ||      }|j                  |       E |j                  d       |j                  d        j                         D ]  \  }}t        |t              rWt4        j6                  j8                  j;                  |j<                  d      }|j                  |j                   d| d       n j"                  |j                     }g }t?        |      D ]   \  }} jA                  t4        j6                  j8                  j;                  |jB                  d      |      }|jE                  d| d       d|j                  vsp|j                  |j                   d| d       	 |j                  |j                   d| dtG        |jH                         d       	 |j                  |j                   d| dtG        |jB                         d        |j                  |j                   ddjM                  |       d        |jO                  d       |j                  djQ                                |jR                  rk|j                  dtU        jV                  |jR                        d|jX                  d |jR                  d|jZ                  d!	d       |j]                         S |j                  d"|jX                  d#d       |j]                         S # tJ        $ r Y Rw xY w# tJ        $ r Y +w xY w)$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r9   outzhl.OutputBufferzhl.InputBufferrj  r   r\   z&
            def generate(g):
        z = g.c                    t        t        j                  j                  | j	                  d               }|j
                  J |       t        |      S )Nr%   )r   r  r  r  rb  r  r  )re  r  rM   s     rE   update_indexz1HalideKernel.codegen_kernel.<locals>.update_index  sE    ($((*>*>qwwqz*JKC==,1c1,s8OrG   r  zassert g.using_autoscheduler()r%   r  z.set_estimate(rh  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([ri  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rm   r2  rI   r0  r)   splice	do_indentr
  r:   r-   r  r   r^   r  r   r   rl   r  aliasesrJ  r  _linesr  r  r  r{  r$   r  r  r+  r`   rk  _autoscheduler_workaroundsr  r?  r;   r  r  r  do_unindentrstripr  r   find_libautoscheduler  r   getvalue)rM   r   metacoder  r  argclsargtypendimr  r  r7  r   hintr  range_hintsr]  dims   `                 rE   codegen_kernelzHalideKernel.codegen_kernel  s   99$$/00&&(  	 
	
 	))+ 	LFAs#w'#((+=d>N>N=OqQRzz&3&z.3sxx.?*EU%cii0411#((;<#((3vhay4&JK	L 		

 	))+ 	9FAsNNchhZuSXXJ78	9		))+ 	-HCNNcU#cU+,	-D&&'	
 II$$ 	!D$$(5599,MNN4 		!
 	r78))+ 	XFAs #w'ww''11#((Q1G#((>$qAB--chh7 'o !FAs::((22388a2H$D  &&dV1'=>CHH,#((5<'HI! NN#&88*E!M#cjj/ARRS T
! NN#&88*E!M#chh-PQ R!& #((+;DIIk<R;SSUVW;	X> 	 		
 >>KK$$3$H$H$X#[ \((, 7<<@NN;MRPTPdPdOg h	  #  8 }} KK::>++ I
    }}]  ) ! !  ) ! !s$   &7T#7T3#	T0/T03	U ?U c                    t        |      dk(  rTt        j                  j                  dk(  r7t        j
                  j                         j                  dk(  rt        d|       } | S )Nr%   Anderson2021r  r   )	rl   r   r  r'  r$   r  r"  r#  r@   )r   r  s     rE   r<  z'HalideKernel._autoscheduler_workaroundsb  sN     IN,,>335::fD Aq	ArG   c                   t         j                  j                  }| j                         D cg c]  \  }}|j                  |  }}}t         j                  j                         }|j                  dk(  rE|j                  |j                  t         j                  j                        }|j                  |       |j                  |||d       yc c}}w )zCodegen a call to this kernelNr  F)devicetriton)r$   r  wrapper_coder
  r  r"  r#  write_get_raw_streamr  r   r?  generate_kernel_call)	rM   r   r"  wrapperr   r  	call_argsr.  stream_names	            rE   call_kernelzHalideKernel.call_kernelm  s    ''&&*.*=*=*?X33<<CWsVX	X<<>&(!66$$aggllK [)$$!	 	% 	
 Ys   CCc                     yr  rd   )rM   r  s     rE   generate_assertzHalideKernel.generate_assert~  s    rG   c                     y r   rd   )rM   r`   r  loweruppers        rE   check_boundszHalideKernel.check_bounds  s     	rG   )r  zdict[str, sympy.Expr]rQ   rR   )r   r  rQ   r  )NNN)rM  zSequence[sympy.Expr])r  r  )r  r  r  r  r}  r   r  )r   r  r  r  )r   r  r   )
r   r  r  r  r   r'   r  r6   rQ   rR   )
r   r  r   r  r  r5   r   +Union[CSEVariable, tuple[CSEVariable, ...]]rQ   r\  )r  ztuple[torch.dtype, ...]r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]rQ   r]  )r  r7   rQ   r  )rQ   r   )r`   r  r  r  rY  r   rZ  r   )*rS   rT   rU   r   	overridestexprr  r  rL   r  r  r[  rf  rK  r  rq  r  r  r  r  r  r  rT  r  r  r  r  r  r   unknownr  r  r  r
  r0  rI  r   r<  rU  rW  r[  rV   rW   s   @rE   r  r    s   I).E&.+%+ 
	+6"=iV*BNN%f:P(
..
%HN? SW66 *63>6FO6	6:<< < &	<
 ;< 
5<|$LS"'S"
S" -S" 
!S"t #{""$ $  
 =A D"HQ
fwr  
"&09=FJrG   r  c                  (    e Zd ZeZedd       Zd Zy)HalideSchedulingc                    t        t        j                  t        j                  t        j                  g      }t
        j                  j                  r|j                  t        j                         |S r   )
r   r&   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r  scan_kernelsr  SCAN)r   rM  r  s      rE   get_backend_featuresz%HalideScheduling.get_backend_features  sR    ..6677
 ==%%JJ~**+rG   c                   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }||j                  |<   |j                  d       t               }|j                  d|j                         d       |j                  |d       |j                  d       t        ||      \  }}| d| }	|j                  ||j                         |	       t        d	      rt        |d
|       |S )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr3  z''')
kernel_metadatar  )r$   r  rO  src_to_kernelnext_kernel_suffixadd_import_oncer)   r  r0  r8  r   define_kernelr@  r   r   )
rM   src_codenode_scheduler]   rR  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rE   rq  zHalideScheduling.define_kernel  s"   ''&&w,,,!//9K. + +7+E+E+G*HIK.9G!!(+##W -.O%%'(A(A(C'FeL ""84"8%%f-(;M7(S%G%")"-=,>?!!_5579I ''89#KX>rG   N)rM  ztorch.devicerQ   zOrderedSet[BackendFeature])rS   rT   rU   r  kernel_typer  ri  rq  rd   rG   rE   rb  rb    s    K
 
rG   rb  )t
__future__r   dataclassesr2  r  loggingr  collectionsr   mathr   typingr   r   r   r	   r
   r   r   r<   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r    r!   r"   virtualizedr#   rS  r$   commonr&   r'   r(   r)   r*   r+   r,   r-   r.   cppr/   	cpp_utilsr0   simdr1   r2   r3   collections.abcr4   r5   r6   shape_propagationr7   	getLoggerrS   re  rF   RuntimeErrorrI   rY   r   r_  pexprr   r   r   r   float64rz  int16r   r>   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesr  	dataclassr  r  ry  r  rb  rd   rG   rE   <module>r     s   "     	 #  F F    - , ? 7 4  ' ) B ) 7  )
 
 
   ; ; (62g!
F, F
QM QD 	 
JJ	NNO	MM>	MM>	MM>	JJ	KK	KK	KK	KK	LL-	LL-	LL-"H9k H9V
  / / 9%P %PP + + +>
 ^: ^B+~ +rG   