
    pi5                   b   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl;m@Z@mAZA ddlBmCZC ddlDmEZE ddlmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ ddlRmSZS ddlTmUZUmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z` erd dlambZbmcZc d dldZdddlemfZf ddlgmhZh  ej                  ej      Zk eX       j                  Zmenej                  ej<                  epeqf   Zree;j                  eYf   Zted gdf   Zud[d!Zvd\d"Zwexepeyf   Zzeeneeyej.                  f   d#f   eezgeneyd#f   f   f   Z{	 	 d]	 	 	 	 	 	 	 	 	 	 	 d^d$Z|d_d%Z}ej                   G d& d'             Z G d( d)      Z G d* d       Zej                   G d+ d,e             Zej                   G d- d.e             Zej                   G d/ d0e             Zej                   G d1 d2e             Z G d3 d4e      Zej                   G d5 d6e             Zej                   G d7 d8e             Zej                   G d9 d:e             Zej                   G d; d<e             Zej                   G d= d>e             Zej                   G d? d@e             Z G dA dB      Zej                   G dC dDe             Zej                   G dE dFe             Zej                   G dG dHe             Zej                   G dI dJe             Z G dK dLe      Zej                   G dM dNe             Zej                   G dO dPe             Zej                   G dQ dRe             Zej                   G dS dTe             Zej                   G dU dVe             ZepZeeeMf   Z G dW dXeV      Z G dY dZe      Zy)`    )annotationsN)chaincount)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)'set_kernel_post_grad_provenance_tracing)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)FxConverterWrapperLinec                8   t         j                  j                  |       }| j                         t         j                  j                  v}| j                         | j                         t        t         j                  j                  j                  |            |fS N)
r2   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper/   sizevarssimplify)nodestorage_size	alignments      a/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrQ   ]   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                   | j                         |j                         k7  ry| j                         |j                         k7  ryt        j                  j                  j                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |            }t        |      t        |      k(  sWt        j                  j                  j                  |d|z        r+t        j                  j                  j                  ||      ryy)NFgffffff?T)
rI   rJ   r2   rE   rK   rL   rF   r/   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rP   can_match_buffer_sizerZ   k   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMGG11+zJrR   .c                    t               dd	 d	 	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        j                  j                  rV|rTt        j                  j                  r:|t        j                  j                  v rt        j                  j                  |   }	nd gt        |      z  }	t        |      dk(  r" ||d   |	d         \  }
} |d	|
 d	|        nt        |      dkD  sJ t        |      t        |      k(  sJ t               }t        t        |||	      d
 d      D ]  \  }
}}g }|j                  r:|j                  D ]+  }|dvs|j!                  d| d|j                  |           - |rdj#                  |      }nd} ||
|      \  }
}d| d|
 }||v r~|j%                  |        ||d| d|         d d d        d d d        |j'                         fS # 1 sw Y   #xY w# 1 sw Y   'xY w)Nc                d    t        | t        j                        r| S t        j                  |       S rD   )
isinstancesympyr   Integer)items    rP   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLrR   c                    t        |       r| | fS t        fd| D              }|s|}j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wrD    ).0gra   s     rP   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wrD   generate_example_arg_valuetype)re   rf   wrappers     rP   rg   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s*        ::1d1gF   %()callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)gridexample_grid
sympy_gridra   rl   s      rP   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rR   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y rD   )	writeliner   rq   rr   kernel_autotune_nameskernel_autotune_calls)linert   nameoutputrl   s     rP   rx   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rR   grid_wrapper_for_def z(meta):r3   r   zreturn c                2    t        | d   j                        S Nr3   lenkwargsxs    rP   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>   s    c!A$++. rR   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )r`   Union[int, sympy.Expr]return
sympy.ExprrD   )rs   
TritonGridrt   zOptional[TritonGrid])r{   strrt   Optional[str])r*   r   rq   rr   rz   indent
contextlibnullcontextr2   rE   autotuning_gridsr   r   sortedzipr   appendjoinaddgetvalue)r|   configsgridsrl   original_fxnode_namerv   rx   fn_namekernel_autotune_calls_indentexample_gridsrs   rt   seenc
guardslistkwargguards	statementra   r}   s   `  `              @@rP    user_defined_kernel_grid_fn_coder      s    FM
 .2

*
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
 .L6 .LMM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KD,v&',(@Au:>!>u:W---$.LD *0E7M2.* L%a
  
88!" W  ) 
 '--ugVAHHUOCT.UVW $\\*5F#F%3D,%G"l!&4&9	$#)s6()L>%JK1L-.L .L` FOO%%%a.L .L .L .Ls,   I!D%I<A7I3I!I	I!!I*c                    t               j                  | j                  d       ddlddlm ddlm t        | j                  g      fd |        j                         S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   N)JITFunction)	constexprc           	        t        d t        j                  | j                        D              }| j                  j                  j                  di       }| j                  j                  j                  D ]w  }|v r	|| j                  j                  v s"| j                  j                  |   }t        |      rX	j                          	j                  d       	j                  |j                  d       j                  |        |       t        d      rt        |j                  j                   j"                        rY	j                          	j                  d       	j                  |j                  d       j                  |        |       .t        |t$        t&        t(        
f      r	j                          t        |
      rd|j*                  d	}n|}|j                  |      x}rKt        |t,              rd
|j.                   d|j0                   }nd
|}	j                  | | d|        n	j                  | d|        j                  |       ||v s|dk7  st        |d      s|j.                  j3                  d      s9	j                  d|j.                   d|j0                   d|        j                  |       z y )Nc              3  R   K   | ]  }|j                   d k(  r|j                   ! yw)LOAD_GLOBALN)opnameargval)re   insts     rP   rg   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>  s(      '
{{m+ KK'
s   %'__annotations__z@triton.jitTr   constexpr_functionz@triton.constexpr_functionztl.constexpr(): . = tl
__module__rq   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesr]   newlinerx   splicesrcr   hasattrruntimejitConstexprFunctionintr   boolvaluerk   r   __name__
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverserq   s           rP   r   zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse  s   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11:: 5	6K..jmm777#22;?fk2#++-#--m<#**6::T*B$((5V$V%9:zFNN..@@@ $++-#--.JK#**6::T*B$((5V$c4(CD#++-!&)4'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$7"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((5#44#t+5 ))44X>
 $-- 1 12(6??:K4P[}] %((5k5	6rR   )
r*   r   r   rq   r   triton.languager   r   r   r   )kernelr   r   r   r   r   rq   s    @@@@@@rP   9user_defined_triton_kernel_transitive_closure_source_coder      si    
 %&O6::T2 ") "6??"34@6 @6D V##%%rR   c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgsympy.Symbolinnerr   
inner_exprc                ,    t        | j                        S rD   )r   r   selfs    rP   __str__zSymbolicCallArg.__str__[  s    4::rR   N)r   r   __qualname__r   r   rd   rR   rP   r   r   U  s    rR   r   c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                l    t         |           t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rP   r   zMemoryPlanningState.__init__`  s-    ##D) 	 12(rR   c                L    t        | j                  j                  |d             S rD   )r   r   r   )r   r   s     rP   __contains__z MemoryPlanningState.__contains__g  s    DOO''T233rR   c                \    | j                   |   j                         }|j                  rJ |S rD   )r   pop	is_reusedr   r   r`   s      rP   r   zMemoryPlanningState.popj  s+    s#'')>>!!rR   c                \    |j                   rJ | j                  |   j                  |       y rD   )r   r   r   r   s      rP   pushzMemoryPlanningState.pusho  s&    >>!!##D)rR   )r   ReuseKeyr   r   )r   r   r   FreeIfNotReusedLine)r   r   r`   r   r   None)r   r   r   r   r   r   r   __classcell__r   s   @rP   r   r   _  s    24
*rR   r   c                      e Zd ZddZy)rB   c                    t        d      )Nz2FX codegen not yet supported for type {type(self)})NotImplementedErrorr   	converters     rP   
codegen_fxzWrapperLine.codegen_fxu  s    !"VWWrR   Nr  rA   r   FxConversionFuncr   r   r   r	  rd   rR   rP   rB   rB   t  s    XrR   c                  :    e Zd ZU ded<   ded<   d	dZd
dZddZy)EnterSubgraphLinePythonWrapperCodegenrl   r@   rE   c                b    | j                   j                  | j                   j                         y rD   )rl   push_computed_sizescomputed_sizesr   s    rP   __post_init__zEnterSubgraphLine.__post_init__~  s    (()D)DErR   c                n    | j                   j                  | j                         |j                          y rD   )rl   push_codegened_graphrE   	do_indentr   codes     rP   codegenzEnterSubgraphLine.codegen  s"    ))$**5rR   c                    |j                   S rD   )_generate_enter_subgraphr  s     rP   r	  zEnterSubgraphLine.codegen_fx  s    111rR   Nr   r  r  r*   r   r  r
  r   r   r   r   r  r  r	  rd   rR   rP   r  r  y  s    !!F2rR   r  c                  2    e Zd ZU ded<   ddZedd       Zy)CommentLiner-   r{   c                :    |j                  | j                         y rD   )rx   r{   r  s     rP   r  zCommentLine.codegen  s    tyy!rR   c                    | j                   S rD   )_generate_comment)r  s    rP   r	  zCommentLine.codegen_fx  s    ***rR   Nr  r
  )r   r   r   r   r  staticmethodr	  rd   rR   rP   r   r     s!    
" + +rR   r   c                  0    e Zd ZU ded<   ddZddZd	dZy)
ExitSubgraphLiner  rl   c                V    | j                   j                         | j                   _        y rD   )rl   pop_computed_sizesr  r   s    rP   r  zExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rR   c                X    | j                   j                          |j                          y rD   )rl   pop_codegened_graphdo_unindentr  s     rP   r  zExitSubgraphLine.codegen  s    ((*rR   c                    |j                   S rD   )_generate_exit_subgraphr  s     rP   r	  zExitSubgraphLine.codegen_fx  s    000rR   Nr  r  r
  r  rd   rR   rP   r&  r&    s    !!H1rR   r&  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterDeviceContextManagerLiner   
device_idxOptional[int]last_seen_device_guard_indexc                x   t         j                  j                  r|j                  d       t         j                  j                  rg| j
                  ;|j                  t         j                  j                  j                          d       y | j
                  | j                  k(  s{J d       | j
                  H|j                  t         j                  j                  j                          d| j                   d       y |j                  d| j                   d       y y |j                  dt         j                  j                  j                  | j                         d       |j                          |j                  t         j                  j                  j                  | j                               y )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r2   rE   cpp_wrapperrx   aot_moder2  
device_opscpp_aoti_stream_guardr0  cpp_aoti_device_guarddevice_guardr  
set_devicer  s     rP   r  z%EnterDeviceContextManagerLine.codegen  sP   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ P NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrR   c                    |j                   S rD   )&_generate_enter_device_context_managerr  s     rP   r	  z(EnterDeviceContextManagerLine.codegen_fx  s    ???rR   Nr  r
  r   r   r   r   r  r	  rd   rR   rP   r/  r/    s    O"//K:@rR   r/  c                      e Zd ZddZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y rD   )r2   rE   r7  r+  r  s     rP   r  z$ExitDeviceContextManagerLine.codegen  s     ww"" #rR   c                    |j                   S rD   )%_generate_exit_device_context_managerr  s     rP   r	  z'ExitDeviceContextManagerLine.codegen_fx  s    >>>rR   Nr  r
  r   r   r   r  r	  rd   rR   rP   rB  rB    s    ?rR   rB  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelAllocLiner  rl   ir.ExternKernelAllocrM   c                    | j                   }g |j                         |j                         }| j                  j	                  | j                   |       y rD   )rM   codegen_argscodegen_kwargsrl   $_generate_extern_kernel_alloc_helper)r   r  rM   argss       rP   r  zExternKernelAllocLine.codegen  sD    yy=""$=t':':'<=99$))TJrR   c                    |j                   S rD   )_generate_extern_kernel_allocr  s     rP   r	  z ExternKernelAllocLine.codegen_fx  s    666rR   Nr  r
  r@  rd   rR   rP   rH  rH    s    !!
K
7rR   rH  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelOutLiner  rl   ir.ExternKernelOutrM   c                n   | j                   }g |j                         |j                  d      }|j                         }t        j
                  j                  r|j                  dk(  rd}n|j                         }|j                         x}r|j                  nt        j
                  j                  }d }t        j                  j                  dk7  rt        ||d      }| j                  j!                  ||j#                         |j$                  r|j$                  j#                         nd |||       y )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_outr   )	is_extern)rM   rK  rL  get_kernel_namer2   rE   r7  cpp_kernel_name
get_devicerk   device_typer   traceprovenance_tracking_levelr!   rl   "_generate_extern_kernel_out_helpercodegen_referenceoutput_view)r   r  rM   rN  kernel_nameddeviceprovenance_debug_handles           rP   r  zExternKernelOutLine.codegen  s   yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L15<<11Q6&MkT'# 	77""$484D4DD..0$#	
rR   c                    |j                   S rD   )_generate_extern_kernel_outr  s     rP   r	  zExternKernelOutLine.codegen_fx      444rR   Nr  r
  r@  rd   rR   rP   rR  rR    s    !!

85rR   rR  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
FreeLiner  rl   %Union[BufferLike, ir.TorchBindObject]rM   c                    | j                   j                         t        j                  j                  vsJ |j                  | j                  j                  | j                                y rD   )rM   rG   r2   rE   removed_buffersrx   rl   make_buffer_freer  s     rP   r  zFreeLine.codegen  sF    yy!!#177+B+BBBBt||44TYY?@rR   c                    |j                   S rD   )_generate_freer  s     rP   r	  zFreeLine.codegen_fx      '''rR   Nr  r
  r@  rd   rR   rP   ri  ri    s    !!
//A(rR   ri  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ddZddZy)KernelCallLiner  rl   r   ra  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   rq   zdict[str, Any]triton_metaztorch.devicerc  
graph_namer   c                   | j                   j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  
       y )N)rq   rw  rt  ru  rx  rc  ry  r   )rl   _generate_kernel_call_helperra  rs  rq   rw  rt  ru  rx  rc  ry  r   r  s     rP   r  zKernelCallLine.codegen!  se    11NN;;nn]]]]((;;!%!:!: 	2 	
rR   c                    |j                   S rD   )_generate_kernel_callr  s     rP   r	  zKernelCallLine.codegen_fx/      ...rR   Nr  r
  r@  rd   rR   rP   rr  rr    sL    !!LO
/rR   rr  c                  f    e Zd ZU ded<   ded<   ded<   dZded<   d	Zd
ed<   dZded<   ddZddZy)KernelDefinitionLiner  rl   r   ra  kernel_bodyNr   metadataTr   gpucpp_definitionc                    | j                   j                  | j                  | j                  | j                  | j
                  | j                         y N)r  r  r  )rl   _define_kernel_helperra  r  r  r  r  r  s     rP   r  zKernelDefinitionLine.codegen<  sB    **]].. 	+ 	
rR   c                    |j                   S rD   )_generate_kernel_definitionr  s     rP   r	  zKernelDefinitionLine.codegen_fxE  rg  rR   r  r
  )	r   r   r   r   r  r  r  r  r	  rd   rR   rP   r  r  3  s<    !!"Hm"C$(NM(
5rR   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner  rl   c                    | S )zFirst pass to find reuserd   r   states     rP   planzMemoryPlanningLine.planM  s    rR   c                     y)zSecond pass to output codeNrd   r  s     rP   r  zMemoryPlanningLine.codegenQ  s    rR   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        rl   =(, r   )dataclassesfieldsr|   getattrr   rk   r   BufferrG   r   r   )r   rN  fieldvals       rP   r   zMemoryPlanningLine.__str__T  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::rR   Nr  r   r   r  r  r   r   )r   r   r   r   r  r  r   rd   rR   rP   r  r  I  s    !!);rR   r  c                  *    e Zd Zd ZddZddZddZy)EfficientPeakEstimatec                   ddl m}m} t        j                  j
                  j                  }t        t        j                  j                  j                               }t        t        j                  j                               } |||      } ||||      \  | _        }ddlm}  ||t        j                  t         d      | _        y )Nr   )estimate_peak_memoryget_freeable_input_bufr3   )SegmentedTreer   )memoryr  r  r2   rE   	schedulernodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer  operatorr   max)	r   r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder  s	            rP   r   zEfficientPeakEstimate.__init__d  s    I''++11!!''"6"6";";"=>"177#;#;#=>!7!V;O"<
8 "8 	2+"HLL#q
rR   c                    t         j                  j                  j                  t         j                  j	                  |      d      t        |j                               z  S )Nr   fallback)r2   rE   rK   	size_hintrF   r)   rJ   r   rM   s     rP   	_get_sizezEfficientPeakEstimate._get_sizew  sL    ww))GG//5 * 
4>>+,- 	-rR   c                n    | j                   j                  |j                  dz   |j                  dz
        S r   )r  summarize_rangescheduler_node_indexr   line_aline_bs      rP   peak_betweenz"EfficientPeakEstimate.peak_between|  s6    ""22''!+V-H-H1-L
 	
rR   c                    |j                   dz   |j                   k(  ry | j                  j                  |j                   dz   |j                   dz
  | j                  |j                               y r   )r  r  update_ranger  rM   r  s      rP   update_peak_betweenz)EfficientPeakEstimate.update_peak_between  s^    &&*f.I.II((''!+''!+NN6;;'	
rR   N)rM   
BufferLiker   r   )r  r   r  AllocateLine)r   r   r   r   r  r  r  rd   rR   rP   r  r  c  s    
&-



rR   r  c                  >    e Zd ZU ded<   d Zd	dZd
dZddZddZy)r  r  rM   c                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rD   r2   rE   r  current_noder  indexr  r   s    rP   r  zAllocateLine.__post_init__  T    ww  --999$%GG$5$5$;$;$A$AGG**%
!rR   c                    |j                   dz   | j                   k(  ry| j                  j                  j                  }| j                  j                  j	                  ||       }||z   }||k  S )Nr3   T)r  rl   estimate_peakr  r  )r   	free_linesizer  peak_memory_in_rangenew_peak_memorys         rP   should_reuse_bufferz AllocateLine.should_reuse_buffer  si    ))A-1J1JJ"ll88LL#||99FFyRVW!55"555rR   c           	        | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  r
||v r|j                  |      }t        j                  j                  j                  t        j                  j                  | j                         d      t        | j                   j                               z  }| j!                  ||      rXd|_        | j                  j$                  j'                  ||        t)        | j                  |j                   | j                         S |j+                  ||       | S | j                   j-                         j.                  dk(  rh| j                  j1                  | j                         }|A|xj2                  t5        t7        j8                  t:        j<                  |d            z  c_        | S )Nr   r  Tcpur3   )rM   rG   r2   rE   rl  NullLinerl   rQ   r   allow_buffer_reuser   rK   r  rF   r)   rJ   r  r   r  r  	ReuseLiner   rI   rk   static_shape_for_buffer_or_noner   r   	functoolsreducer  mul)r   r  r   r  r  static_shapes         rP   r  zAllocateLine.plan  s}   99177#:#::DLL)) tyy)$$		#I77##--33DII> . tyy22456D ''	48&*	#**>>y$O y~~tyyII

3	*99((*//58<<GG		RL'11S$$X\\<C6 1 rR   c                    | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y rD   )rM   rG   r2   rE   rl  rl   make_buffer_allocationrx   r   r  r{   s      rP   r  zAllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trR   c                    |j                   S rD   )_generate_allocater  s     rP   r	  zAllocateLine.codegen_fx  s    +++rR   N)r  r   r  r   r   r   r  r  r
  )	r   r   r   r   r  r  r  r  r	  rd   rR   rP   r  r    s!    

68
,rR   r  c                  D    e Zd ZU ded<   dZded<   d ZddZddZdd	Zy
)r   r  rM   Fr   r   c                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rD   r  r   s    rP   r  z!FreeIfNotReusedLine.__post_init__  r  rR   c                   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j!                  t#        | j                        |        | S r   )r   rM   get_inputs_that_alias_outputr]   layoutr   MultiOutputLayoutr   rG   r2   rE   rl  r  rl   r   r  r   rQ   r  s     rP   r  zFreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$JJ'		2D9rR   c                    | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y rD   )	rM   rG   r2   rE   rl  r   rx   rl   rm  r  s     rP   r  zFreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rR   c                    |j                   S rD   )_generate_free_if_not_reusedr  s     rP   r	  zFreeIfNotReusedLine.codegen_fx  s    555rR   Nr  r  r
  )	r   r   r   r   r   r  r  r  r	  rd   rR   rP   r   r     s'    
It

E
6rR   r   c                  D    e Zd ZU ded<   ded<   ded<   d
dZddZddZy	)ReinterpretLiner  rM   	reused_asz	ir.Layoutr  c                    | S rD   rd   r  s     rP   r  zReinterpretLine.plan  s    rR   c                @   t        | j                  t        j                        sJ t        | j                  j                  t        j
                        sJ | j                  j                  | j                  j                         | j                  j                         y rD   )
r]   r  r   NonOwningLayoutviewr#   rl   codegen_deferred_allocationr  rG   r  s     rP   r  zReinterpretLine.codegen  sj    $++r'9'9:::$++**B,>,>???00NN##%t{{'7'7	
rR   c                    |j                   S rD   )_generate_reinterpretr  s     rP   r	  zReinterpretLine.codegen_fx  r~  rR   Nr  r  r
  )r   r   r   r   r  r  r	  rd   rR   rP   r  r    s#    

/rR   r  c                  H    e Zd ZU ded<   ded<   dZded<   ddZddZdd	Zy
)r  r  rM   r  Tr   
delete_oldc                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S rD   )rM   rG   r2   rE   rl  r  r  rl   r  s     rP   r  zReuseLine.plan  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrR   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y rD   )
rM   rG   r2   rE   rl  r  rx   rl   make_buffer_reuser  r  s     rP   r  zReuseLine.codegen  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rR   c                    |j                   S rD   )_generate_reuser  s     rP   r	  zReuseLine.codegen_fx
  s    (((rR   Nr  r  r
  )r   r   r   r   r  r  r  r	  rd   rR   rP   r  r    s'    
J
)rR   r  c                      e Zd ZddZy)r  c                    |j                   S rD   )_generate_nullr  s     rP   r	  zNullLine.codegen_fx  rp  rR   Nr
  r  rd   rR   rP   r  r    s    (rR   r  c                  X    e Zd ZU ded<   ded<   ed	d       Zed
d       Zedd       Zy)CommBufferLiner  rl   	ir.BufferrM   c                    ddl m} | j                  j                         }| j                  j	                         } ||      rt        d| j                         t        |      |j                  z  S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr   rM   	get_numelrJ   AssertionErrorr   itemsize)r   r   numelr   s       rP   r  zCommBufferLine.size  sa    5		##%		##%u ?		{K  5zENN**rR   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rD   )rM   get_output_specr]   r   CommBufferLayoutcomm_buffer_typer   r  s     rP   r	  zCommBufferLine.comm_buffer_type$  s6    **,&""5"5666&&&rR   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rD   )rM   r  r]   r   r  
group_namer
  s     rP   r  zCommBufferLine.group_name*  s6    **,&""5"5666   rR   Nr   r   )r   zir.CommBufferTyper  )r   r   r   r   propertyr  r	  r  rd   rR   rP   r  r    sG    !!
O	+ 	+ ' '
 ! !rR   r  c                  ,    e Zd ZddZed        ZddZy)CommBufferAllocateLinec                "   | j                   j                         t        j                  j                  vsJ | j                   j                         }| j                   j                         }| j                   j                         }t        | j                   j                               }t        | j                   j                               }|j                  | j                  | j                  | j                  | j                  |||||             y rD   )rM   rG   r2   rE   rl  rZ  rJ   ro   get_size
get_striderx   make_allocation_liner	  r  rl   )r   r  r|   rc  r   shapestrides          rP   r  zCommBufferAllocateLine.codegen3  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rR   c                    | t         j                  j                  k(  rS| d|j                  |       d|j                  |       d| d|j                   d| dt        j                  dd       dS t        d	|        )
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupler  randomrandintr  )r	  r  rl   r|   rc  r   r  r  s           rP   r  z+CommBufferAllocateLine.make_allocation_lineG  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rR   c                    |j                   S rD   )_generate_comm_buffer_allocater  s     rP   r	  z!CommBufferAllocateLine.codegen_fxZ  s    777rR   Nr  r
  )r   r   r   r  r$  r  r	  rd   rR   rP   r  r  1  s     
(  $8rR   r  c                      e Zd ZddZddZy)CommBufferFreeLinec                    | j                   j                  | j                        }|j                  | d| j                  j
                   d       y )Nz # z buffer free)rl   rm  rM   rx   r	  r   r  s      rP   r  zCommBufferFreeLine.codegen`  s@    ||,,TYY7$s4#8#8#>#>"?|LMrR   c                    |j                   S rD   )_generate_comm_buffer_freer  s     rP   r	  zCommBufferFreeLine.codegen_fxd  s    333rR   Nr  r
  rF  rd   rR   rP   r   r   ^  s    N4rR   r   c                  J    e Zd ZU dZded<   ded<   ded<   ded<   dd	Zdd
Zy)MultiOutputLinezU
    Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
    r  rl   r   result_namearg_nameSequence[Any]indicesc                      fd  j                    j                        }|j                   j                  j                    j
                   d|  j                  j                          y )Nc                l   t        |      dkD  r|d   \  }}t        |t              r |  d| d|dd        S t        |t              r<j                  j                  | j                  t        |            } ||dd        S t        |t              r |  d| d|dd        S t        d|      | S )Nr   []r3   z['z']znon supported index type: )
r   
issubclassr   ro   rl   codegen_tuple_accessr&  r   dictr  )basenamer)  itypeituple_accesscodegen_list_tuple_accessr   s        rP   r5  z:MultiOutputLine.codegen.<locals>.codegen_list_tuple_accesst  s    7|a"1:qeT*4z1#Q5GQRQSUUu-#'<<#D#D $"2"2CF$L 5\712;OOt,4zA3b5I7STSU;WW()EuMMrR   r   )r'  r)  rx   rl   declarer&  ending)r   r  r   r5  s   `  @rP   r  zMultiOutputLine.codegens  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rR   c                    |j                   S rD   )_generate_multi_outputr  s     rP   r	  zMultiOutputLine.codegen_fx  s    ///rR   Nr  r
  )r   r   r   __doc__r   r  r	  rd   rR   rP   r%  r%  h  s*     "!M
00rR   r%  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)SymbolicCallArgLiner  rl   r   argr@   rE   c                d    | j                   j                  | j                  | j                         y rD   )rl   "_generate_symbolic_call_arg_helperr=  rE   r  s     rP   r  zSymbolicCallArgLine.codegen  s    77$**MrR   c                    |j                   S rD   )_generate_symbolic_call_argr  s     rP   r	  zSymbolicCallArgLine.codegen_fx  rg  rR   Nr  r
  r@  rd   rR   rP   r<  r<    s    !!	N5rR   r<  c            	          e Zd ZdZdZ fdZe	 d	 	 	 	 	 	 	 dd       ZddZddZ	ddZ
dd	Zdd
Zedd       ZddZedd       ZddZedd       ZddZ	 	 ddZddZddZddZddZddZddZddZddZddZddZd Zd Z d Z!d Z"d  Z#dd!Z$dd"Z%dd#Z&dd$Z'dd%Z(dd&Z)dd'Z*dd(Z+dd)Z,d* Z-	 	 	 	 dd+Z.	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd,Z/dd-Z0dd.Z1dd/Z2d0 Z3d1 Z4d2 Z5	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd3Z6d4 Z7dd5Z8e9jt                  dd6       Z;dd7Z<d8 Z=d9 Z>d: Z?d; Z@dd<ZA	 	 	 	 	 	 dd=ZBd> ZCdd?ZDd@ ZEddAddBZFddAddCZGddDZHddEZIddFZJddGZK	 	 ddHZL	 d	 	 	 ddIZMddJZNddKZOdL ZPdM ZQdN ZRdO ZS	 	 	 d	 	 	 	 	 	 	 	 	 ddPZTe	 d	 	 	 	 	 ddQ       ZU	 	 	 d	 	 	 	 	 	 	 	 	 ddRZVddSZW	 	 ddTZXdddUZY	 	 	 	 	 	 ddVZZddWZ[ddXZ\dY Z]dZ Z^d[ Z_d\ Z`d] Zad^ Zbd_ Zcd` ZdddaZedb Zfdddddddddc	 	 	 dddZgdddddddeddf	 ddgZhdh Zidi Zjdj ZkddkZlddlZmedm        Zn	 ddnZodo ZpddpZqddqZrddrZsddsZt	 d	 ddtZudduZvddvZwddwZxdx ZyddyZzdz Z{dd{Z|d| Z}	 	 	 	 	 	 	 	 dd}Z~d~ Z	 	 	 	 ddZddZd Zd Zd Zd Zd Zd Zd Zd Zed        Zed        Zed        Zed        Zed        Z xZS )r  zB
    Generate outer wrapper in Python that calls the kernels.
    Tc                    t                    t                _        i  _        t                _        t                _        t                _        t                _	        t                _
        t                _        t                _        t                _        t                _        t                _        i  _        d _        i  _        t                _        g  _        d _        d _        d _        d _        d _        t6        j8                  j:                  rdnd _        t6        j8                  j:                  rdnd _        d  _         d _!        i  _"        t                _#        t                _$        d  _%         jM                          g  _'        g  _(         jS                          tU               s jW                           jY                          t6        j8                  jZ                  sBt6        j8                  j\                  j_                         D ]  \  }} ja                  ||        t        tb                   _2        t        tb                   _3        i  _4         tk        jl                  d        jn                         _7        tj        jp                  d
 fd       }| _9        i  _:        t                _;        ty                _=        t                _>        i  _?        t        t        j                  j                  t        j                  j                  	       _E        g  _F        y )Nr    #r  z
std::move(r   Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y rD   )importsrx   r   rq   rr   rz   )r{   r   s    rP   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rR   )debug_printer_leveluse_array_ref)r{   r   r   r  )Gr   r   r   _names_iterargs_to_buffersr*   rG  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsrz   subgraph_definitionsr   ry   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr6  declare_maybe_referencer7  commentnone_strr2   rE   r7  
move_beginmove_endr2  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr+   write_prefix!write_kernel_autotune_defs_headerr8  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamcacherH  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   r|   hashedrH  r   s   `   rP   r   zPythonWrapperCodegen.__init__  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(QS&L 	" 9C $!!# &("$&!248..0ww ! 6 6 < < > 2f##D&12 $J/1
+-
 57$=I$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rR   Nc                D    | r|J |J t        |||      S t               S rD   )SubgraphPythonWrapperCodegenr  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rP   createzPythonWrapperCodegen.create  s?      ,,,!---/~/C  $%%rR   c                    d| _         y )Ncall)ra  r   s    rP   rb  z)PythonWrapperCodegen.set_launcher_fn_name  s
     &rR   c                D    | j                   j                  | d|        y )Nz = None  # )rM  rx   )r   r|   r|  s      rP   rj  z#PythonWrapperCodegen.write_constant  s    k&:;rR   c           	        t         j                  j                  j                         }d}||j                  d|j                   }d}t        t        j                  j                        dkD  rd}n0t         j                  j                  j                  j                  rd}| j                  j                  d| dt        j                   d| d	d
       | j                   j                  dd
       	 ddlm} | j                   j                  dd
       t        j*                  r| j                   j-                  d       y y # t&        t(        f$ r Y >w xY w)NrD  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   rw  rx  	_inductortest_configstrack_memory_lifecyclerG  r   r   r   rM  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingrx   )r   contextaot_config_commentinductor_debug_utilsr  s        rP   re  z!PythonWrapperCodegen.write_header  sc   --..6687#9#9#E#-g.D.D-E!F!v""CCDqH#w __##00GG#l #$ % $,,- .%& '!$ ' 	 	
* 	 ! 	 	
$	 DKK 	   ##KK!!"?@ $ , 		s   6#E EEc                     y rD   rd   )r   rM  s     rP   include_extra_headerz)PythonWrapperCodegen.include_extra_header]      rR   c                ^    | j                   j                  dt        j                   d       y )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )rR  r   r   r   r   s    rP   rg  z6PythonWrapperCodegen.write_kernel_autotune_defs_header`  s3    !!((
 $,,- .	
rR   c                   dt         j                   d}t        j                  j                  r]| j
                  j                  |       | j
                  j                  t        j                  j                  j                  d             t        j                  j                  s`| j                  j                  |d       | j                  j                  t        j                  j                  j                  d             y y )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r$   r   r   rq   rr   rz   r   rx   r2   rE   r9  import_get_raw_stream_asr7  rG  r   
import_strs     rP   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_oncep  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rR   c                   t         j                  j                  j                  d      }t        j
                  j                  r6| j                  j                  |      s| j                  j                  |       t         j                  j                  s8| j                  j                  |      s| j                  j                  |       y y y )Nr  )r2   rE   r9  r  r   rq   rr   rz   containsrx   r7  rG  )r   import_get_raw_stream_strs     rP   write_get_raw_stream_headerz0PythonWrapperCodegen.write_get_raw_stream_header  s    $%GG$6$6$O$O%
! ==11--667PQ**445NOww""<<(()BC&&'@A D #rR   c                $    | j                          y rD   )r  r   s    rP    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_once  s    ((*rR   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmetar   )reprrr  r   rM  rx   r   rq   rr   rz   rs  r   )r   r  vars      rP   add_meta_oncez"PythonWrapperCodegen.add_meta_once  s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  rR   c                z    | j                         D cg c]  }|j                  | j                         c}S c c}w rD   )get_graph_outputsr_  rQ  r   r   s     rP   get_output_refsz$PythonWrapperCodegen.get_output_refs  s<     =A<R<R<T
78A 1 12
 	
 
s   "8c                     y rD   rd   r   s    rP   mark_output_typez%PythonWrapperCodegen.mark_output_type      rR   c                6    t         j                  j                  S rD   )r2   rE   r  r   s    rP   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputs  s     ww###rR   c                6    t         j                  j                  S rD   )r2   rE   r  r   s    rP   r  z&PythonWrapperCodegen.get_graph_outputs  s    ww$$$rR   c           
        | j                         j                         D ]  \  }}t        |t        j                  t
        j                  f      r1|t        j                  j                  vst        |t
        j                        rht        |j                               dk(  r| j                  |j                               }| j                  |j                               }| j                  j!                  d| d| d| d        y )Nr   zassert_size_stride(r  r   )r  ri  r]   r^   r   r   TorchBindObjectr2   rE   graph_input_namesGeneratorStater.   r  rp   r  rN  rx   )r   r|   bufr  r  s        rP   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_asserts  s    ..0668 	SID##

B,>,>?@ 177444
R&&9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR	SrR   c                `   | j                   j                  d       | j                         j                         D ]r  \  }}t	        |t
        j                  t        j                  f      r1d| d}| j                   j                  |       d| d}| j                   j                  |       t y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	rN  rx   r  ri  r]   r^   r   r   r  )r   r|   r  r{   s       rP   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_asserts  s    HI..0668 	(ID##

B,>,>?@ &;<DKK!!$' &;<DKK!!$'	(rR   c                :    | j                   j                  d       y )NzV

            async_compile.wait(globals())
            del async_compile
            )rN  r   r   s    rP   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_wait  s    	
rR   c                    dj                  |      }t        |      dk(  r|dz  }| j                  j                  | d       | j                  j                  d       y )Nr  r3   ,z = argszargs.clear())r   r   rN  rx   )r   input_nameslhss      rP   
write_argszPythonWrapperCodegen.write_args  sP    ii${q 3JCWo.n-rR   c                    t         j                  r| j                  j                  d       d}|S | j                  j                  d| j                   d       d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r3   )r   graph_partitionrN  r   ra  r   prefix_indents     rP   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sm    !!KK M  KK**+ ,
 MrR   c                6    t         j                  j                  S rD   )r2   rE   r  r   s    rP   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_names  s    ww(((rR   c                   | j                   J | j                          | j                         }| j                  j	                  |      5  t
        j                  j                  rA| j                  j                  t        j                  j                  j                                t        j                  j                         }t
        j                  r| j                  j                  d| d       | j                         x}r| j!                  |       | j#                          t%               rt'        |       r| j)                          d d d        y # 1 sw Y   y xY w)Nz0training_annotation = nvtx._device_range_start(''))ra  r  r  rN  r   r   rq   debug_sync_graphrx   r2   rE   r9  synchronizeget_training_phaser  r  r  codegen_inputsr,   r+   "codegen_input_size_and_nan_asserts)r   r  phaser  s       rP   rf  z!PythonWrapperCodegen.write_prefix  s   $$000%%'>>@[[. 	:}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!
 -.<TB779)	: 	: 	:s   
DEEc                    t         j                  r| j                          t         j                  r| j	                          y y rD   )r   size_assertsr  nan_assertsr  r   s    rP   r  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rR   c                   | j                          d| }t        j                  j                  r=| j                  j                  | d| d       t        j                  j                  r|S | j                  | d| d       |S )Nstream = get_raw_stream(r   )	r  r   rq   rr   rz   rx   r2   rE   r7  )r   r0  ry  r|   s       rP   rp  z)PythonWrapperCodegen.write_get_raw_stream  s    ((*
|$==11&&00&*:,a8 ww""$1*Q?@rR   c                     | j                   d   S )N)rc  r   s    rP   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph*  s    ))"--rR   c                :    | j                   j                  |       y rD   )rc  r   )r   rE   s     rP   r  z)PythonWrapperCodegen.push_codegened_graph-  s    ""))%0rR   c                6    | j                   j                         S rD   )rc  r   r   s    rP   r*  z(PythonWrapperCodegen.pop_codegened_graph0  s    ))--//rR   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyr  rd  r   )r   r  r  s      rP   r  z(PythonWrapperCodegen.push_computed_sizes3  s!    !((//0HIIrR   c                6    | j                   j                         S rD   )rd  r   r   s    rP   r(  z'PythonWrapperCodegen.pop_computed_sizes8  s    ((,,..rR   c                .    t        | j                         S rD   )nextrK  r   s    rP   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix;  s    t''()*rR   c                   | j                  t        || j                               t        j                  j
                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          t        |       r| j                          | j                  j                  d| d| d       || _        y )Nr5  r6  r  r  r   )rx   r/  r2  r   rq   rr   r  rz   r2   rE   r9  r<  r  r+   r  )r   r0  s     rP   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter>  s    )*d6W6WX	
 ==11))+&&00**77
CDAF &&002248002&&00$6zl!D -7)rR   c                    | j                  t                      t        j                  j                  r| j
                  j                          y y rD   )rx   rB  r   rq   rr   rz   r+  r   s    rP   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exitQ  s6    356==11&&224 2rR   c                   |r,t         j                  r| j                  j                  ddj	                  |      z   dz          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d	dj	                  |      z   dz          y | j                  j                  d
       y )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r   r  rQ  rx   r   r  r+  )r   output_refss     rP   generate_returnz$PythonWrapperCodegen.generate_returnV  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rR   c                     y rD   rd   r   results     rP   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffixh  r  rR   c                    t         j                  rNdj                  | j                        t	        | j                        dk(  rdndz   }|j                  d| d       y y )Nr  r3   r  rD  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  r   all_partition_namesr   r   )r   r  all_partition_name_lists      rP   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffixk  se    !!&*ii0H0H&I43349r'# MM--D,E F "rR   c                     y rD   rd   r  s     rP   generate_endz!PythonWrapperCodegen.generate_endy  r  rR   c                :    | j                  t        | |             y rD   )rx   rH  r  s     rP   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernel|  s    ,T489rR   c                    |j                  |        | j                  t        | |             t        |j                  t
        j                        r|j                  |        y y rD   )codegen_commentrx   rH  r]   r  r   Layoutcodegen_size_assertsr  s     rP   generate_extern_kernel_allocz1PythonWrapperCodegen.generate_extern_kernel_alloc  sI    T",T489dkk299-%%d+ .rR   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr3   zrun_intermediate_hooks()r]   r  r   
NoneLayoutrG   get_origin_noderX  r7  r   memory_planningrx   r6  r   r^  generate_intermediate_hooksr   r|   )r   extern_kernelrN  	no_returnoutput_nameorigin_nodera  r7  s           rP   rM  z9PythonWrapperCodegen._generate_extern_kernel_alloc_helper  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rR   c                \    |j                  |        | j                  t        | |             y rD   )r  rx   rR  r  s     rP   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out  s&     	T"*467rR   c                D   t         j                  j                  j                  }|j	                  ||d d d       |j                  d|r|n|        | j                  ||       |5  | j                  | ddj                  |       d       d d d        y # 1 sw Y   y xY w)Nexternzout=r  r  r   )	r2   rE   wrapper_coderz  set_printer_argsr   write_provenance_debug_handlerx   r   )r   r   outout_viewrN  rc  debug_handledebug_printer_managers           rP   r^  z7PythonWrapperCodegen._generate_extern_kernel_out_helper  s     !" 4 4 B B..tVT4Rdx8S9:;**6<@" 	;NNfXQtyy&7q9:	; 	; 	;s   &'BBc                    |j                   }|j                  }|r$t        d |D              }t        d |D              }|j                  j	                          d}dj                   fd|D              }dj                   fd|D              }t        j                   |j                        }d}| d|j                   d	}| d| d| d| }	| d
|	 d}
|
S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrD   r2   rE   rK   atomically_apply_size_hintre   rb  s     rP   rg   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s%     VA))DDQGV   35c              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrD   r  r  s     rP   rg   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s*      CD  ;;A>r  z.data_ptr()r  c              3  J   K   | ]  }t         j                  |        y wrD   r  val_to_arg_strre   dimr   s     rP   rg   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s     XC-<<T3GX    #c              3  J   K   | ]  }t         j                  |        y wrD   r   r"  s     rP   rg   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s$      
?B //c:
r$  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsro   tensorr_  r   r  r!  element_sizerank)r   descapply_size_hintsr'  r(  ptrr*  rN  r   rN  r  s   `          rP   *_generate_tma_descriptor_call_experimentalz?PythonWrapperCodegen._generate_tma_descriptor_call_experimental  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArR   c                    |j                   }|rt        d |D              }d}| d}|j                  j                          d| }| d| d}|S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrD   r  r  s     rP   rg   zLPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>  s*       CD  ;;A> r  z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shapero   r)  r_  )r   r,  r-  r2  rN  r   rN  r  s           rP   $_generate_tma_descriptor_call_stablez9PythonWrapperCodegen._generate_tma_descriptor_call_stable  so    &&  HS  K Cx|$++//12"[MBQtfArR   c                    t        |t        j                        r| j                  ||      S t        |t        j                        sJ | j                  ||      S rD   )r]   r   TMADescriptorExperimentalr/  TMADescriptorStabler3  )r   r,  r-  s      rP   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_call  sW    dB889BB&  dB$:$:;;;<<TCSTTrR   c                    | j                  |      }|j                   d| | j                   }| j                  |       y Nr   )r7  r|   r7  rx   )r   r,  r  r{   s       rP   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trR   c                    | ddj                  t        t        |             }|j                  d      r|dj                  dg|z         z  }n|r|dt	        |       z  }|dz  }| j                  |       y )Nr  r  zaten.scatter_reducer  rD  z	, reduce=r   )r   mapr   r   r  rx   )	r   r}   inputsrY  python_kernel_namesrc_is_tensorr  r   r{   s	            rP   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22trR   c                |    ddj                  |       d}||||g}| j                  | j                  ||             y )Nr,  r  r-  )r   rx   wrap_kernel_call)r   r   r   r)  values
accumulateindices_strrN  s           rP   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  sA    $))G,-Q/;
3t,,VT:;rR   c           
     `    | j                  | d| ddj                   |              d       y )Nr   r  r  r   )rx   r   )r   buf_namer>  get_argsop_overloadru  outputss          rP   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrR   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzPythonWrapperCodegen.generate)r   	_generater   is_inferences     rP   generatezPythonWrapperCodegen.generate  s,    9: 	0>>,/	0 	0 	0s   '0c                &    t         j                  ryy)Nr   r3   )r   r  r   s    rP   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rR   c              #  b   K   | j                   }	 || _         | || _         y # || _         w xY wwrD   rx   )r   newolds      rP   set_writelinez"PythonWrapperCodegen.set_writeline  s.     nn	! DNI DNSDNs   /# /	,/c                    | j                   j                  }t        j                  j                  r| j
                  j                  |       y | j                  j                  |       y rD   )rt  kernel_defsr   rq   rr   rR  r   rM  )r   rZ  s     rP   _write_multi_kernel_defsz-PythonWrapperCodegen._write_multi_kernel_defs!  sF    --99==11%%,,[9KK{+rR   c                	   t         j                  r| j                          t        j                         5 }|j                  | j                  j                                t         j                  r| j                  |       t         j                  r| j                          | j                  |       t         j                  j                  r*t         j                  j                  s| j                          | j!                  | j                  j"                        5  | j$                  D ]I  }t'        |t(              r|j+                  | j                         /| j                  j#                  |       K 	 d d d        | j-                          | j/                         }| j1                          t         j                  j2                  rA| j                  j#                  t4        j6                  j8                  j;                                t         j                  r| j=                          t         j                  j                  r*t         j                  j                  s| j?                          t         j                  j                  r| jA                          t         jB                  r+t         jD                  s| j                  j#                  d       | jG                  |       d d d        tI               }|jK                  | jL                         |j#                  d       |jK                  | jN                         t4        j6                  jP                  r>t4        j6                  jD                  r$t4        j6                  jR                  r
tI               }|jK                  | jT                         | jW                          |jK                  | jX                         | j[                         }|j                  |      5  |jK                  | j                         d d d        | j]                  |       |jK                  | j^                         | ja                  |       | jc                  |       | je                  |       |jg                         | jh                  jg                         fS # 1 sw Y   4xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nz+nvtx._device_range_end(training_annotation)rD  )5r   profile_bandwidthr  r   	ExitStackenter_contextrQ  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesrq   store_cubinrr   !generate_reset_kernel_saved_flagsrX  rx   rX  r]   rB   r  r[  r  r  r  r2   rE   r9  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  r7  r  r*   r   rG  rM  r8  is_const_graphrS  finalize_prefixrN  rS  r  rO  r  r  add_benchmark_harnessgetvaluewithlinemaprP  )r   rP  stackr{   r  r  wrapper_call_indents          rP   rN  zPythonWrapperCodegen._generate(  se   ##))+!!# *	.u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ : JJ :D!$4T%6%67))33D9	:: ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -U*	.Z  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./ 	-MM$++,	- 	##F+dkk"""6*&!""6* &&($$88:
 	
u: :*	. *	.z	- 	-s2   C%R3AR&4E-R39S &R0	+R33R= S	c                6   | j                   j                  d       i }t        j                  j                  r_t
        j                  j                  rEt        t
        j                  j                        D ci c]  \  }}| j                  |      | }}}| j                   j                         dz   | j                  j                         z   }t        j                  t        j                  k(  rkt!        j"                  t%               dd      5 }|j'                  |j)                  d             |j*                  }ddd       t        j,                  d       	 t/        ||       yc c}}w # 1 sw Y   3xY w# t0        $ r}t3        d	|       |d}~ww xY w)
z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        r4  z.pyF)dirrO  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )rR  r   r   rq   rr   r2   rE   autotuning_inputs	enumerateget_autotuning_input_namer   rz   r    levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder|   debugexec	ExceptionRuntimeError)r   scopeidxvtuning_codef	file_pathes           rP   rh  z4PythonWrapperCodegen.generate_and_run_autotune_blocky  sp   
 	!!((	
 ==11agg6O6O ((A(ABC ..s3Q6E 
 %%..0((1134 	
   GMM1 ,,Ke #**734FF		#
 !!0
	Se$/# #  	S!FqcJKQRR	Ss*   8E)-E/E; /E8;	FFFc                \    ddl m}  ||       j                  | j                        | _        y )Nr3   )MemoryPlanner)r  r  r  rX  )r   r  s     rP   memory_planz PythonWrapperCodegen.memory_plan  s     2"4(--djj9
rR   c                   | j                         }t        j                  j                  |      }| j                  rt        | j                  d   t              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt        | j                  d   t              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t        |t              r"|j                  |d         | j                  |<   Dt        |t              r|j                  t                      nt        |t               s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t#        d |D              }y )Nr  r   c              3  4   K   | ]  }|j                     y wrD   )r   )re   ss     rP   rg   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>  s      +
./A))+
s   )r  r2   rE   _get_output_namesrX  r]   r  rM   r|   r   r   ranger   r  r  r   r&  sum)r   rK  	out_namesplanning_statespast_planning_statesr3  r{   _total_allocated_buffer_sizes           rP   memory_plan_reusez&PythonWrapperCodegen.memory_plan_reuse  s   ((*GG--g6	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rR   c                    |r!t         j                  r| j                          y t         j                  rt	               | _        | j                          y rD   )r   r  r  r  r  r  r  rO  s     rP   rc  z*PythonWrapperCodegen.run_wrapper_ir_passes  s9    F22((%:%<"""$rR   c           	        	 | j                   	t        j                  	fd       }t        j                  	fd       }t        |t        j
                        rGt        |t        j                        r||v ry 	j                  | d|        |j                  |       y t        |t        j                        rt        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X t        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X y t        |t        j                        ry t        |t        j                        ry t         j"                  j$                  j&                  ry t)        dt+        |             )Nc                <    j                  |  d|  d       |  dS )Nz_size = z.size()_sizerU  r|   r  s    rP   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s(    NNdV8D69:V5>!rR   c                <    j                  |  d|  d       |  dS )Nz
_stride = z	.stride()_striderU  r  s    rP   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s)    NNdV:dV9=>V7##rR   r   r,  r-  zUnknown value type: )rN  r  rq  r]   r^   r   Symbolrx   r   r   	TensorBoxrs  r  r  r  r  r  r  r   r  r  rk   )
r   r|   r   
bound_varsr  r  r#  r  r  r  s
            @rP   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignment  s    {{		" 
	" 
	$ 
	$ eUZZ(eU\\2ez6INNeWCv./NN5!r||,&u~~'78 )	TdELL1d*6LNNdV3vd|nAcU!#DENN4()  ))9)9);< +Vfell3j8PNNfXS$0@#a#HINN6*+ r112r001%%55$';DK=%IJJrR   c           	        t        t        j                            }| j                         }|j	                         D cg c]$  \  }}t        |t        j                        s!||f& c}}|j	                         D cg c]$  \  }}t        |t        j                        r!||f& c}}z   }|D ]  \  }}| j                  |||        	 	 	 	 dd}|D ])  \  }	}t        |t        j                        s! |||       + yc c}}w c c}}w )z$Assign all symbolic shapes to localsc                P   t        j                  | j                         | j                         g      D ]k  }t	        |t
              rt	        |t        j                        r.|j                  D cg c]	  }||vs| }}t        |      dkD  s[t        d| d| d       y c c}w )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterabler  r  r]   r   r^   r  free_symbolsr   r  )r   r  exprsymundefined_symbolss        rP   _verify_input_symbol_assignmentzLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS 
!$-D%,,1O $(#4#4%:8MC%! % ()A-(tfK0A/BB[\ 
%s   0	B#:B#N)r   ir.TensorBoxr  OrderedSet[sympy.Symbol])	r   r^   r  r  ri  r]   r  r   r  )
r   r  r  kr  r=  r|   r   r  _s
             rP   r  z#PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
q!z!U\\7RQF
 , 2 2 4X1Jq%,,<WaVXY " 	JKD%00ujI	J		0	&  	?HAueR\\2+E:>	?3
Xs   "D"D>"D!Dc                ~   t        |t        j                        rt        |t        j
                        r|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }t        ||      }| j                  t        | |t        j                               y y y rD   )r]   r^   r  r   r   PRECOMPUTED_SIZEr  r   r2   rE   rK   inv_precomputed_replacementsr   rx   r<  )r   r  r  r=  s       rP   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed$  s    c5<<(^CAVAV-Wd)))##C(77##@@ED!#t,CNN.tS!''BC .X(rR   c                     y rD   rd   r   s    rP   rj  z$PythonWrapperCodegen.finalize_prefix-  r  rR   rL   c                   t        d      )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r   r   rL   s      rP   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar0  s    UVVrR   c                   t        ||      S )Nr  )pexprr  s      rP   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar3  s    Q**rR   c                $    | j                  |      S rD   )r  r  s     rP   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar6  s    **1--rR   c                    | d| dS )Nr,  r-  rd   )r   r1  r|   r  s       rP   r/  z)PythonWrapperCodegen.codegen_tuple_access9  s    1UG1%%rR   c                    g t        | j                  |      }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj                  |       dS )Nr   ()r3   r  r  r  r   )r<  r  r   r   )r   r  partss      rP   rp   z/PythonWrapperCodegen.codegen_python_shape_tuple<  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rR   c                $    | j                  |      S rD   )rp   )r   r  s     rP   r  z(PythonWrapperCodegen.codegen_shape_tupleD  s    ..u55rR   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            g fS )Nzalloc_from_pool({})r  )formatr   r  r   rp   )r   r|   offsetr   r  r  s         rP   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_poolG  s_     %++II&MJ33E:33F;

 
 
	rR   c                   ||j                   j                  k(  rk||j                   j                  k(  rR||j                   j                  k(  r9|&||j                  k7  rd|j                          d| dS |j                          S | j                  |      }| j                  |      }| j                  |      }|/||j                  k7  r d|j                          d| d| d| d| dS d|j                          d| d| d| d	S )Nzaten.view.dtype(r  r   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r  r  r  r  r   rG   rp   r  )r   datar  r  r  rx   r   s          rP   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_viewV  s    DKK$$$$++,,,$++,,, Udjj%8)$--/):"UG1EE--/*+2248D44V<F))&1F Udjj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrR   c                8    | j                  | d| d| d       y )Nz.copy_(r  r   rU  )r   r   dstnon_blockings       rP   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copys  s!    #gcU"\N!<=rR   c                    |j                         }|j                  d      }| j                  t        | |||j                               y r   )rG   
input_namerx   r%  r)  )r   rM   r&  r'  s       rP   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_outputv  s6    mmo??1%t[(DLLQRrR   c           
     @   |j                    d|j                   d|j                    d|j                    }| j                  |j                   d|j                   d|j
                   d| d       | j                  j                  t        |j                               y )Nz + z if z
 < 0 else r   z * (r   )	r  r  rx   unbacked_offset_symbolbase_offsetbase_dim_strider`  r   r   )r   rM   	index_strs      rP   codegen_dynamic_select_indexz1PythonWrapperCodegen.codegen_dynamic_select_index{  s    zzl#dii[TZZL
4::,W	**+3t/?/?.@DDXDXCYY]^g]hhij	
 	""&&s4+F+F'GHrR   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wrD   )r_  )re   ts     rP   rg   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s     >Q1&&(>s   r   r   .item()r3   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)r=  r   keypathrx   r  r]   r   r   divisorr  rG   )r   rM   r  s      rP   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rR   c           
     0     fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d       t*              rdt        |t,        j.                        rct        t        j                  j                        d	k(  rj                  d
       j                  d|         |||j1                                t        |t         j2                        r4 ||t        j                  j$                  j5                  |d             /t        |t,        j6                        r# ||d|j                  j8                   d       l|j;                         D cg c]-  }t        j                  j$                  j5                  |d      / }	}|j=                         D cg c]-  }t        j                  j$                  j5                  |d      / }
} |||	|
|j?                         |jA                                " ddjC                  t        j                  j                  jE                                d}j                  d|        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(r  
, device='	', dtype=r   )rx   rp   )r|   r  r  rc  r   r}   r   s        rP   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rR   c                2    j                  |  d|        y r9  rU  )r|   r  r}   s     rP   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rR   c                    dd l }t        |t        j                        sJ j	                  |  d|j                  |      d       y )Nr   z = pickle.loads(r   )pickler]   r  ScriptObjectrx   dumps)r|   r   r  r}   s      rP   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  sB    eU%7%7888v%5fll56I5LANOrR   )rD  rD  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   r  ztorch.cuda.default_generators[z].graphsafe_get_state()zcall([r  ])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r2   rE   	constantsri  rx   r  r  rc  r   r   torchbind_constantsr  r]   r^   r  rK   
var_to_valr   r   r   r  get_real_objr   r  r  r  r  r  rZ  rJ   r   r  )r   r}   r  r  r  r|   r   torchbind_objr   r  r  call_strs   ``          rP   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module  sy   		0	P 	K	
 ]]_ E	YMM     !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; (eeU\\2zGG$$//33E4@,8 eR%7%781776671<((9$$wtf%56'e.@.@.BCuzz2
 #4)9)9)C)CETV)C)WXr'8'89"89K9K8LLcd "'!1 ((221r2BE  "'!1!1!3 ((221r2BF  #((*)E(T  		!''*>*>*C*C*E FGrJH}XJ78WXKE	Y E	YfoE	Y E	Ys+   J>P42P&P:2P,BP
PPc                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)rD  rD  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r(   r   r}   s     rP   rk  z*PythonWrapperCodegen.add_benchmark_harness  ss     ''&&v.@A]]_ 	X,-?-A,BB_`	 	 	s    A//A8c           
     D    | j                  t        | |||||             y r  )rx   r  )r   ra  r  r  r  r  s         rP   define_kernelz"PythonWrapperCodegen.define_kernel  s*     	 !-		
rR   c                ,    |r| dnd}d| |  d| }|S )Nr4  rD  z

r   rd   )ra  r  r  metadata_commentbodys        rP   _format_kernel_definitionz.PythonWrapperCodegen._format_kernel_definition  s1     /7hZr?B&'}C}ErR   c                *   t         j                  j                  rJ| j                  ||d       }| j                  j                  |       t        j                  j                  ry | j                  |||      }| j                  j                  |       y )N)r  )
r   rq   rr   r  rR  r   r2   rE   r7  rM  )r   ra  r  r  r  r  r  s          rP   r  z*PythonWrapperCodegen._define_kernel_helper"  s     ==1111[4 2 D %%,,T2ww""--x . 
 	4 rR   c                :    | j                   j                  |       y rD   )rS  r   )r   fn_codes     rP   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fn9  s    !!((1rR   c                  ,-./01 ddl m}m}m}	 ddlm,m}
m}m}m	} ddl
m}m} |j                  }g 1i /g .g }.1fd-d-,-/fd	}t        |j                        D ]  \  }}||j                   v r || ,|      d	
       (|vr-|   }|    || ,|      d	       Jt#        |t$        j&                        r[t#        |t$        j(                        r'd|j*                  |j,                  j/                         fnd\  }}} || |||||             t#        |t$        j0                        r0 || |||j3                         |j/                                      	t#        |t$        j4                        rO || |||j6                  j3                         |j/                         |j8                  j:                               rt#        |t<        t>        j@                  f      xr* tB        jD                  jF                  jI                  |d      } || |||      |        tK        1d .|j                  D cg c]  }tM        |       c}      }|tO        jP                  tB        jD                  jS                               i /tT        jW                  |d      tY        1.      gd}|rt[        |      |d<   |rt[        |      |d<   t]        |      dk(  r0|j_                         }g ta        t>        jb                  |d         }nd.0fd}i 0|D  cg c]  } g ta        ||        }} |rt]        |      t]        |      k(  sJ g }!te        tg        ||      d d	      D ]Q  \  } }"|!ji                   ||"      g ta        tj        |       g ta        tl        |       g ta        tj        |       d       S |	j                  |!g ta        tn        0jq                               d}g 0js                         }tu        |jv                        g}#t]        |      dkD  rQjq                         D ]>  }t#        |t$        j0                  t$        j4                  f      r.|#ji                  |       @ |#ji                  to        |             |#jy                  to        |             t[        |#      }#|#| jz                  v rg | jz                  |#   |S | dt]        | jz                         }$t}               }%t~        j                  j                  r|%j                  d|$d       n|%j                  d|d       |$|d <   |j                  |j                                |%j                   |              |%j                  d!g ta        ||      d"|d#|d$       t        |      }&t~        j                  j                  r|&j                  d%| d&d%|$ d&      }&|&j                  d'd(      }&|%j                  |&       tB        jD                  jS                         }'|%j                  d)|'j                   d*       t        j                  |jv                        \  }(})t        j                  |jv                        }*d+|* d,|) }+| j                  |$|%j                         |+       |$|f| jz                  |#<   |$||fS c c}w c c} w )/Nr   )config_to_dict	FixedGridPrecomputedGridr3   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J    j                  |       j                  |        y rD   )r   )r  r=  arg_indices	signatures     rP   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signatureY  s    S!s#rR   c                   |r?t               r	 | |       |j                  v r|j                     |j                  <   y y |j                  v sJ |r>t               r |  |j                               n	 | |       d|j                  <   y |r4t               r |  |j                               d |j                  <   y  | |       y )Nr|   r3   )r1   r|   )	r  r=  is_constexprequals_1equals_noner  r  r  r   s	        rP   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg]  s    13 %S#.88v% +1*:Ichh' & xx6)))57
 )l.IJ(c2*+Ichh' 57 )l.IJ*.Ichh'$S#.rR   r  T)r  )r  stable)experimentalNN)r|   api_typer2  r   )r|   bufferr   )r|   r  r   r  )r  )
size_dtyper)  argdefs)r)  )r  rc  r  r   restore_valuereset_to_zeror   c                N   t        | t        j                        rdg | j                  }|s| S |j	                  t
               |D ]+  }|v rt        j                  dt                     |<   - t        |       S t        | t              sJ t        j                  |       S )N)r   _launcher_s)r]   r^   r   r  sortr   r  r   r0   r   r_   )r  symbolsr  extra_launcher_argss      rP   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher  s    dEJJ/2 1 12G"#LLSL)& "55$38<<)#.A*B)CD4+C0 &d,?@@!$,,,}}T**rR   c                2    t        | d   j                        S r   r   r   s    rP   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>  s    3qt{{3C rR   r   )r   pythoncpppython_slow)	grid_typeprecomputed_gridsr"  r  zasync_compile.triton(z, '''ra  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r  z# Original path: r6  )FFF)r  r   r   r   )Mruntime.triton_heuristicsr  r  r  commonr  r  r  r  r	  rq   r
  r  r   rs  	arg_names
constexprsr]   r   TMADescriptorr6  r2  r)  rJ   r  rG   r#   r  r  r  r   r^   r_   r2   rE   rK   statically_known_equalsr=   r4   r%   r  get_current_device_or_throwr0  fromkeysr;   ro   r   setup_grid_as_argsr<  sympifyr   r   r   r  r:   r   rC  r  idr   extendr_  r*   r   unique_user_kernel_namesrx   updateinductor_meta_commonr   r   replacerk   inspectgetsourcelinesgetsourcefiler  r   )2r   r   r   r   restore_value_argsreset_to_zero_argsr   r  r  r  r  r  r  r	  r
  r  original_nameequal_to_1_argsr  r  r   r=  r  r2  r   r  r   triton_signaturerx  inductor_metaextra_launcher_call_argsr#  rs   r)  cfg	cache_keyr|   r   
kernel_srccurrent_devicer  linenosrcfiler  r  r  r  r  r"  r  s2      `                                        @@@@@@rP   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernel<  s   	
 	

	
 	
 	D)+	$&	!#%'	$"	/ "	/H "&"2"23 9	GHCf'''\s3$G& +Cc{"\s3Fc2#3#34 &c2+A+AB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY/!!$#&<<>"%--/  R%7%78 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs9	Gv -)/)9)9:AWQZ:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<OEOSZ3w<777 "#E7#)CT 
	c "(("0"5"5Ct$4"52UD!12':UD)9':	
 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}} *!#		23E3E'FG$$S)* 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~w78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::M ;j Ps   	['[c                    | d|j                    d}||d| z  }t        j                  |dd      }t        ||j                        }| j                  t        | |t        j                               |S )Nr  r  T)
is_integeris_positive)	rN  r^   r  r   r  rx   r<  r2   rE   )r   ra  treerO  sym_namer  r=  s          rP   generate_numel_exprz(PythonWrapperCodegen.generate_numel_exprH	  sq    !]!DKK=6!F8$Hll8$G c4::.*4agg>?
rR   c                j    | j                  |j                   dt        |j                                y r9  )rx   r   r  r   )r   r=  rE   s      rP   r?  z7PythonWrapperCodegen._generate_symbolic_call_arg_helperZ	  s)     	#))Ccnn(='>?@rR   c                   |j                         }t        | |      }|j                  t        j                  k(  r| j                  |       n1|j                  t        j                  k(  r2| j                  |       | j                  | j                  |             n|j                  t        j                  k(  r| j                  j                  |      }|rRt        |t              rt        |j                  t              sJ t        j                  |j                  |      |_        nV| j                  |       | j                  | j                  |             || j                  |<   nt        |j                        t         j"                  j$                  r| j&                  j                  t(        j+                  | ||j,                  |j.                  t0        j2                  j4                  j7                  |j8                        fd             |j                  t        j                  k7  r0| j&                  j                  t(        j                  | |             y y y )N)r3   )r  r  )rG   r  	zero_moder9   UNINITIALIZEDrx   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHrv  r   r]   rM   r8   maximumr  r   rq   rr   rz   r  make_allocationrc  r   r2   rE   rK   r  r   )r   wsr|   r{   priors        rP   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocation_	  s   {{}D"%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%6:JJ<   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rR   c                v    |j                   t        j                  k7  r| j                  t	        | |             y y rD   )rS  r9   rW  rx   r   )r   rZ  s     rP   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation	  s.    <<,;;;NN.tR89 <rR   c                $    | d| j                    S )Nz.zero_())r7  )r   r|   s     rP   rV  z%PythonWrapperCodegen.make_zero_buffer	  s    x}--rR   c                H    | ddj                  |       d| j                   S )Nr  r  r   )r   r7  )r   r|   rs  s      rP   rB  z%PythonWrapperCodegen.wrap_kernel_call	  s'    q9-.a}==rR   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)rQ  rx   r2   rE   graph_idr_  r   )r   rm  s     rP   ra  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call	  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rR   c                :    | j                   j                  d       y )Nzstart_graph())rQ  rx   r   s    rP   rb  z)PythonWrapperCodegen.generate_start_graph	  s    ##O4rR   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(r   )rQ  rx   r   profile_bandwidth_outputr   s    rP   rf  z'PythonWrapperCodegen.generate_end_graph	  s'    ##j1P1P0SST$UVrR   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            rQ  r   r$   r   r   s    rP   re  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags	  s2      ''8'A'A&B C	
rR   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        rg  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nrh  r   s    rP   rg  z5PythonWrapperCodegen.generate_save_uncompiled_kernels	  s4     	  ''8'A'A&B 	C	
rR   c                >    d }|D cg c]
  } ||       c}S c c}w )Nc                    t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S t        t        j                  j                  j                  |             S )Nr  )r]   r   r<   r   floatr   r   r  r2   rE   rK   rL   )r=  s    rP   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg	  s^    #s#*B3*GsYPSPC#udO!DE3xQWW--66s;<<rR   rd   )r   rs  rm  r=  s       rP   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_call	  s!    	= *33#333s   c                &    t        |t              rt        |t        j                        r.|j	                         j                         } j                  |   }n\ j                  j                  |      r|} j                  |   }n/|J d       d j                   }|} xj                  dz  c_        |
J d|        t        d |j                         D              }t        d t        j                  j                  |      D              }t        d |j                         D              }|j                         }	|j!                         }
t        j                  j"                  j%                  |j'                         j(                  t*        j,                        }d	| d
| d|	 d|
 d
| d
| d} j.                  j1                  | d|        t        |t        j                        r5 j3                  |d      }|} j.                  j1                  | d|        |S t5        |t6        j8                        st        |t:              rt        |t<              r| j>                  v r|S |y|}t        |t:              r|j@                  }|t        j                  j"                  jB                  v r't        j                  j"                  jB                  |   }t=        t        j                  j"                  jE                  |t*        j,                              S t        |t<        tF        tH        tJ        f      rt=        |      S t        |tL              rdd
jO                   fd|D               dS tQ        dtS        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r3   z Failed to find a buffer for arg c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  Nr2   rE   rK   r  r   unbacked_symint_fallbackre   r  s     rP   rg   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s@      
 	   ;;#<< <    AAc              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywrr  rs  ru  s     rP   rg   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s@      $
 	   ;;#<< < $rv  c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywrr  rs  ru  s     rP   rg   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s@      
 	   ;;#<< < rv  r  zgenerate_example_value(r  z, 'z', r   r   T)r,  r-  r  r,  c              3  T   K   | ]  }j                  |t        |             ! y wrD   ri   )re   ar   s     rP   rg   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>
  s#      ZQR!@!@DG!L Zrm   r-  zUnsupported type )*r]   torch_dtyper   r.  
get_tensorrG   rL  r   rU  ro   r  r2   rE   get_allocation_sizer  rZ  rJ   rK   r  
get_layoutr  r   rt  rz   rx   r7  r.  r^   Basicr   r   rs  r   r  r  r   rl  r   r   r   r  rk   )r   r=  arg_typeraw_argrH  r  r  allocation_sizer  rc  r   r  r   s   `            rP   rj   z/PythonWrapperCodegen.generate_example_arg_value	  sF   h,'2#3#34"--/88:**3/%%))#.**3/* X* &d&F&F%GH00A50?L&Fse$LL? 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#34 :: %) ;  **44zUG5LMO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFrR   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )Nr,  r  c              3  @   K   | ]  }j                  |        y wrD   )_grid_dim_str)re   r`   r   s     rP   rg   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>$
  s     RT 2 24 8Rs   r-  )r]   r   r   r  )r   grid_per_dims   ` rP   r  z"PythonWrapperCodegen._grid_dim_str!
  s<    lD)diiR\RRRUXX &&rR   )rc  rq   rw  rt  ru  rx  r   r  c                  | j                   j                  |D ci c]2  }t        |t              r |t        j
                  j                  |      4 c}       |xs t        j
                  j                         }| j                  ||
       | j                  t        | ||||||||t        j
                  j                  |	             yc c}w )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        )
ra  rs  rt  ru  rw  rq   rx  rc  ry  r   N)rL  r7  r]   r   r2   rE   try_get_bufferr0  r  rx   rr  r|   )r   ra  rs  rc  rq   rw  rt  ru  rx  r   r  r=  s               rP   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_call)
  s    . 	## %c3' QWW++C00	
 @177>>@**;E'#!!#'77<<%9	
s   7C
rD  )rc  rq   rw  rt  ru  rx  ry  r   c          
     "
    |xs t         j                  j                         }|s|j                  dk7  r~|j                  dk(  r" j	                   j                  |             y |j                  dk(  r% j	                   j                   d|             y t        d|j                   d       j                  |      }dj                  |      }t        j                   |j                  |	      }|s$d| d	} j	                   d
 d| d| d	       y  j                          t        j                  j                  rN j                   vr?|t#        |      t#        |      k(  sJ d       d |
rDt         j                  j$                  r*t         j                  j$                  j'                  |
d       d fd} fd}g }|(|J d       d gt#        |      z  }d gt#        |      z  }nt#        |      t#        |      k(  sJ d       i }t)        t+        ||||            D ]P  \  }\  }}}}d }t-        |t.              r!dt/        |      v r|j1                  d      \  }}d }r|v r j3                  |         }|rB|}t-        |t4              st7        |t8        j:                        st-        |t<              r|||<   n|dk(  r |||||      r||   }nt-        |t4              r_t?        j@                  d|      r|}n4| jB                  vr jE                  |||      }n jB                  |   d   }|f jB                  |<   n jE                  |||      }|jG                  ||n| d|        S  jH                  j	                  dt         j                  jJ                  jM                  |j                         d        jH                  jO                           jH                  j	                   ddj                  |       d| d	        jH                  jQ                           jH                  j	                  tS        d|d              j                   jU                         t         j                  jV                  ry t         j                  jX                  jZ                  }|j]                  ||d        |5   j	                   d| d| d	       d d d         j                          y # 1 sw Y   xY w)Ncudar  mpsz.generated_kernelzdevice z nyir  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                     j                   j                         D  cg c]  \  } }|k(  r|  }} }|rddj                  |       dS yc c}} w )a  After all the autotune kernel calls have been written (i.e.
                self.kernel_autotune_example_args is complete), returns a deletion call
                for all autotune example tensors that are unnecessary after kernel_name
                is called.del r  r4  rD  )rT  rC  r   )r)  kntensors_to_deletera  r   s      rP   get_autotune_deletion_callzUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call
  se     '+&G&G&N&N&P%"[( %! %
 %!$)),=">!?rBB%s   Ac                j   ||   }||v ryt        t        | |            D ]  \  }\  }}||k(  st        |t              sd}r|v rj	                  |         }|dk(  rA	 |j                         }	t        |	j                        D ]  \  }
}||k(  s| d|
 d||<     y  y# t        $ r Y w xY w)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
                This is particularly useful for jagged cases, where the dimension is often
                being passed in as an input.TrD  z.shape[r-  F)rs  r   r]   r"   rt  r~  r  r  )rt  ru  r  reused_args
target_argr3  raw_keyr  triton_inputr  r#  r  autotune_argsr   s               rP   infer_arg_by_inputszNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs
  s    
 &c]
,-6s8X7N-O !)A)Cxz'6'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&< ,FC J=IN'RUQVVW:XJ 7'+,!, 	 / ! !!s   #0B&B&"B&&	B21B2zkeys are not None but args arez#call_args and raw_args do not matchr  rD  z^(workspace|semaphore)r   r5  r6  z.run(z	, stream=z
<del_call>r  )/r2   rE   r0  rk   rx   rB  r  rn  r   r  rp  r  r  r   rq   rr   ry   r   autotuning_mappingr   rs  r   r]   r   splitrt  r{  r.  r^   r  r   rematchrT  rj   r   rz   r9  r<  r  r+  r'   r   r7  r  rz  r  )r   ra  rs  rc  rq   rw  rt  ru  rx  ry  r   call_args_strstream_name
stream_ptrr  r  all_argsr  r3  r=  r  r  r  r   r  arg_strr  r  s   ``                         @rP   r{  z1PythonWrapperCodegen._generate_kernel_call_helperZ
  s    @177>>@&++/{{e#t44[)LM  %))[M9J*KYW
  #WV[[M#>?? 77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($!B H'I)II' 6C	N2 6C	N28}I6 96 K8AIy(H=9 )P44C7G c3'C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<"8U[[9%c?;/6G,]':h;(
 *'2G+6 xx 93?"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUG3;se1WI<NOS)PX &&00**77EFaH &&002&&00-uTYYx%8$9;-qQ &&224&&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW" 	XNNk]%i}TUVW	X%%'	X 	Xs   TTc                :    | j                   j                  |       y rD   )rX  r   r   r{   s     rP   rx   zPythonWrapperCodegen.writeline  s    

$rR   c                4    |D ]  }| j                  |        y rD   rU  )r   rX  r{   s      rP   r  zPythonWrapperCodegen.writelines  s     	!DNN4 	!rR   c                L    | j                   j                  t        |             y rD   )rX  r   r-   )r   ctxs     rP   r_  z"PythonWrapperCodegen.enter_context  s    

+c*+rR   c                (    ddl m}  |       rdd l}t        |t              rt        |j                  j                        S t        |t        j                        rt        |      S t        |t        t        f      rAt        j                   G d d             t         t        |       fd|D                    S t        |t         j"                  j$                        rt'        |      S t        |t(        j*                  t(        j,                  t.        f      r|j1                         S  |       r+t        |j2                  j4                        rt        |      S t        |t(        j6                        r|j1                         S t        |      S )Nr   )has_triton_packagec                      e Zd ZU ded<   d Zy)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S rD   )r  r   s    rP   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__+  s    88OrR   N)r   r   r   r   r  rd   rR   rP   Shimr  '  s    $rR   r  c              3  V   K   | ]   } t         j                  |             " y wrD   r   )re   rz  r  r   s     rP   rg   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>0  s$     Vq1@@qIJVs   &))torch.utils._tritonr  rq   r]   r   r  rM   r  r^   r   ro   r   r  	dataclassr  rk   r  _ops
OpOverloadr   r   r  
MutableBoxr#   r_  languager   r  )r   r  type_r  rq   r  s   `    @rP   r!  z#PythonWrapperCodegen.val_to_arg_str  s%   :a"%%5::&8OE4=)""$ $ #$ QVTUVV  5::001&q))BIIr}}oFG&&((!jFOO4I4I&J7N2,,-&&((7NrR   c           	     `   |j                         }|j                         }t        |j                               }t        t        j
                  j                  |            }t        |j                               }|j                         }| j                  |j                         ||||||      S rD   )rZ  rJ   ro   r  r2   rE   r}  r  get_is_pinnedrY  rG   )r   r  rc  r   r  allocation_shaper  	is_pinneds           rP   r  z+PythonWrapperCodegen.make_buffer_allocation>  s    ""$  "foo'( !<!<V!DEv((*+((*	##OOvueV=My
 	
rR   c                x    d}t         j                  j                  s| j                  j	                  |d       y y )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r2   rE   r7  rG  r   r  s     rP   "write_memory_track_allocation_oncez7PythonWrapperCodegen.write_memory_track_allocation_onceI  s4    
 ww""LL
$7 #rR   c                   ||}| j                  |      }| j                  |      }	| j                  |      }
t        j                  j                  j                  j
                  r| d|	 d|
 d| d|j                   d| d}ne|j                  dk(  r|r| d|	 d|
 d| d	}nE|j                  d
v r| d|j                   d|	 d|
 d| d	
}n| d|	 d|
 d|j                   d| d	
}||	k7  r|d| d|
 d	z   }|S )Nz = tracked_empty_strided(r  z, dtype=r  z	', name='r  r  z = empty_strided_cpu_pinned(r   )r  r  xpumtiaz = empty_strided_r  z = empty_strided(r  z.as_strided()rp   r  r  r   r  r  rk   )r   r|   rc  r   r  r  r  r  r  codegen_allocation_shape_tuplecodegen_stride_tupler  s               rP   rY  z$PythonWrapperCodegen.make_allocationQ  s    #$"==eD)-)H)H*
&  $>>vF??!!..EE&112"'( )  !;;- (b"  [[E!i&412"'('  [[:: &)&++a12"'('  &)12"'( )!;;-yq:  "@@,':&;2>R=SSTUUC
rR   c                8    | j                  t        |             y rD   )rx   r   r  s     rP   make_commentz!PythonWrapperCodegen.make_comment  s    {4()rR   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr      )r6  r7  rZ  )r   new_nameold_namerZ  s       rP   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^rR   c                (    d|j                          S )Nr  )rG   )r   r  s     rP   rm  z%PythonWrapperCodegen.make_buffer_free  s    foo'())rR   c                8    ddj                  d |D               S )Nr  r  c              3      K   | ]  }|  y wrD   rd   )re   r|   s     rP   rg   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s     >>s   )r   )r   names_to_dels     rP   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names  s    dii>>>?@@rR   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr   r   reuse)rY  r7  rZ  )r   r  r  del_lines       rP   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrR   c                P    |$| j                  | j                   d| d|        y y )Nz [Provenance debug handles] r6  )rx   rZ  )r   ra  r  s      rP   r  z2PythonWrapperCodegen.write_provenance_debug_handle  s4    
 #NN<<. <[M<.Y $rR   c                r   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r4|j                         |j                         k(  r| j                  |||      S | j                  ||j                         |j                         d| j                  j                        }| j                   | d| | d| j                   dS )N;z; r   r   r  r  )rJ   rG   r2   rE   r  rm  r  r  r  r  rQ  rx   r6  rZ  )r   rW  rV  r  r  r  r  reinterpret_views           rP   r  z&PythonWrapperCodegen.make_buffer_reuse  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228XxPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rR   c                    | j                  t        || j                   | d|j                          | j                   d| j
                   d             y )Nr   r  z alias)rx   r6   r6  r_  r7  rZ  )r   r|   r  s      rP   r  z0PythonWrapperCodegen.codegen_deferred_allocation  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rR   c                   |j                         }|t        j                  j                  v s8|| j                  v s*t        |t        j                  t        j                  f      ry | j                  j                  |       t        |j                         t        j                  t        j                  f      r|j                         sy |j                         }t        |t        j                        ry t        |t        j                         ry t        |t        j"                        rt        |j$                  t        j&                        s*J dt)        |j$                         d|j$                          |j$                  j*                  }t        |t        j,                        sJ t)        |             |j*                  }t        |t        j.                        sJ t)        |             | j1                  |       | j3                  t5        | |||             y t        |t        j6                        r| j3                  t9        | |             y | j3                  t;        | |             y )Nzunexpected r   )rG   r2   rE   rl  rl  r]   r   DonatedBufferSubgraphBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVEr  r  r  r#   rk   r  
StorageBoxr  codegen_allocationrx   r  r  r  r  )r   r  r|   r  boxinput_buffers         rP   r  z'PythonWrapperCodegen.codegen_allocation  s     AGG+++t~~%&2#3#3R5F5F"GH4 &&(%%r~~6 **,'')fb;;<fbmm,fb001fkk2+=+=> d6;;/06;;-@> ++""Cc2==1<49<188LlBII6AS	A6##L1NN?4vvNOfb112NN1$?@|D&12rR   c                   |j                         }t        |t        j                  t        j                  f      r| j                  t        | |             y t        |j                         t        j                        r| j                  t        | |             y | j                  |      sy | j                  j                  |       | j                  t        | |             y rD   )rG   r]   r   InputBufferr  rx   ri  r  r  r   	can_reuserm  r   r   )r   r  r|   s      rP   codegen_freez!PythonWrapperCodegen.codegen_free  s      fr~~r/A/ABCNN8D&12f,,.0C0CD NN-dF;<~~f%

t*489rR   c                   |j                         }|t        j                  j                  v xs |t        j                  j                  v xr6 t        t        j                  j                  |   t        j                         xsh |t        j                  j                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S rD   )rG   r2   rE   rl  r  r]   graph_inputs_originalr   r  r  r  never_reuse_buffersrm  )r   r  output_bufferr|   s       rP   r  zPythonWrapperCodegen.can_reuse  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" tzz!
 	
rR   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S rD   )rG   rn  )r   r  reused_buffers      rP   	did_reusezPythonWrapperCodegen.did_reuse   sC     OO, KFOO-.-2H2H2JJ	
rR   c                t   t        ||      sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y rD   )	rZ   r  rm  r   rG   rl  rn  rx   r  )r   r  r  s      rP   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse  s    $\=AAA-

|,,./=11340<0E0E0GM**,-y|]CDrR   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S rD   )r   r`  r   r6  )r   r   r|   s      rP   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_decl  sC    6{4---K &&**40<<$&&rR   c                &   t        t        j                  j                  j                  |      }|sy |j                         D ]I  \  }dfdfd}| j                  | j                  |       d |        | j                          K y )Nc                   |dk(  r| S t        |      dk\  r_t        |d   t              rLt        |d   t        j                        r/ |  d|d   j
                   d|d   j                   d|dd        S t        |d   t              r |  d|d   j
                   d|dd        S t        |d   t        j                        rYt        j                  j                  r  d	|d   j                   d
|  d|dd        S  |  d|d   j                   d|dd        S t        |d   t              r |  d|d   j                   d|dd        S t        d|       )Nrd   r   r   r3   r   r  r   r  z	std::get<z>(r,  r-  z.__floordiv__(r  )r   r]   r   pytreeSequenceKeyr|   r  r2   rE   r7  r   r  r  )r  r  gos     rP   r  zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go0  s   b=K LA%"71:}="71:v/A/AB&'!*//!2!GAJNN3C1Ewqr{   
M:a
'8;WQR[II
F,>,>? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K8 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrR   c                    t         j                  j                  rt              dk(  rZd   }  d   j	                         t        | t        j                        r!t        | j                        dk7  r	dd        S       S t        d   t        j                        sJ  d   j                     j	                         dd        S        S )Nr3   r   )r2   rE   r7  r   rG   r]   r   r  r)  r  r  r  )r  r  r  r  rK  s    rP   go_outerzOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outerN  s    77&&
 7|q(%aj  "#AJ//1)#r~~>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIII!''!*.."9"B"B"DgabkRRk733rR   r   )r  r   r  zpytree.KeyPath)	r   r2   rE   rK   	shape_envri  rx   r  r7  )r   r  rK  unbacked_bindingsr  r  r  r  s    ``   @@rP   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs  s     6GG&&(9
 ! ,113 <	JAw
L<4. NN44Q78HJ<}Uu<	rR   c                     fd} fd}	  j                  j                          j                   j                   dj                           |        t
        j                  }t        j                  j                        5  j                  j                  |       d d d         |         j                          y # 1 sw Y   !xY w#  j                          w xY w)Nc                    t        j                  j                        t              k(  sJ t        j                  j                        D ]3  \  } }j	                  j
                   |  d| j                          5 y r9  )r   rE   r  r   rx   r6  r7  )inner_inputouter_inputouter_inputsr   subgraphs     rP   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefixw  sy    x~~223s<7HHHH,/++\- ([ ||n[M[M$++OrR   c                    t        j                  j                        t              k(  sJ t        j                  j                        D ]5  \  } }j	                  | d| j                          j                          7 y r9  )r   rE   r  r   rx   r_  r7  )inner_outputouter_outputouter_outputsr   r  s     rP   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  s{    x~~334M8JJJJ.1,,m/ *l #nC(F(F(H'I$++WrR    subgraph: )parent_graph)	r  rE   rx   rZ  r|   r2   set_graph_handlercodegen_subgraphr*  )r   r  r  r  r  r  r
  s   ````   rP   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inliningi  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4 //!- 0  %&$$&  $$&s$   A;C C,C CC C*c           	        |j                   }|j                  }t        |j                               |j                  D cg c]  }|j
                   c}z   }dj                  |      t        |      dk(  rdndz   }|D cg c]  }|j                          }	}dj                  |	      t        |      dk(  rdndz   }
| j                  d| d| d       |j                         D cg c]
  \  }}|s	| }}}|r#| j                  ddj                  |              | j                  d	|
 d
| d| d       | j                  d| d       yc c}w c c}w c c}}w )z'Generate code to call a graph partitionr  r3   r  rD  	partition	_args = [r-  r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr   r  symbol_inputsr|   r   r   rG   rx   ri  )r   partition_idr  r  r  symbol_inputr  r=  rM   output_namesrK  r|   
deallocater  s                 rP   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call  sj    2DD+88-22452F2T2T9
".L9
 
 ;'#k2Ba2G3RP4@ADAA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
%T:zD
 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E?E'
E2Ec                P    t        |      D cg c]  }d| 	 c}| _        y c c}w )N
partition_)r  r  )r   num_partitionsr  s      rP   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_names  s$    BGBW#X3j$6#X #Xs   #c           	     p   dj                  |      t        |      dk(  rdndz   }dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       | j                  d| d|j                  j                   d|j                  j                   d	       y )
Nr  r3   r  rD  r  r-  r  z) = r  )r   r   rx   rE   r|   )r   r  r  outer_flattened_outputsouter_output_namesouter_input_namess         rP   ,codegen_subgraph_call_with_flattened_outputszAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rR   c                v   dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       t
        j                  j                  j                          | j                  | d|j                  j                   d|j                  j                   d	       y )
Nr  r3   r  rD  r  r-  r   r  r  )r   r   rx   rE   r|   r2   r  free_buffers)r   r  r  outer_buffer_namer"  s        rP   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rR   c                   | j                  |j                         | j                  d       | j                  | j                   d|j                          t
        j                  }|j                  |j                  _        |j                  j                  | j                  vrt        j                  |j                        5  t        j                  dd      5  |j                  j                         \  }}d d d        d d d        | j                  j                  |j                  j                         | j                  j                         y y # 1 sw Y   ]xY w# 1 sw Y   axY w)NrD  r	  r  F)r  rE   rx   rZ  r|   r2   r7  ru  r  r   patchr  r   r   r   )r   r  r
  subgraph_coder  s        rP   codegen_subgraph_commonz,PythonWrapperCodegen.codegen_subgraph_common  s   !!(..1r$,,{8==/BCww%1%=%=">>d&F&FF $$X^^4 @\\"3U; @'/~~'='='?$M1@@
 ,,001D1DE,,]-@-@A G
@ @@ @s$   E E8E E	E  E)c                L    | j                  |       | j                  |||       y rD   )r+  r#  )r   r  r  r   s       rP   'codegen_subgraph_with_flattened_outputsz<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputs  s(     	$$X.99l$;	
rR   c                L    | j                  |       | j                  |||       y rD   )r+  r'  )r   r  r  r&  s       rP   r  z%PythonWrapperCodegen.codegen_subgraph  s%     	$$X.""8\;LMrR   c                   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j                  |j                  ||       y | j                  |j                  ||       y c c}w c c}w )N = [None] * r,  r-  )rG   rx   r   rK  r=  r_  r2   rE   r8  r  r  r  r  )r   invoke_subgraphr|   r  r  r3  r  s          rP   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraph  s    '')$|C0G0G,H+IJK;J;Q;QRC--/RR77(-c/2I2I.J(K#$4&!AM  --((, !!/":":L$O Ss   C(C-c                   |j                         }|j                  D cg c]  }|j                          }}|j                  j                         }t	        |j                  t
        j                        s| d}| j                  | dt        |j                                | j                  d| d       | j                  t        | |j                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j                  ||       n| j#                  |j                  ||       | j                  t%        |              | j                  d       | j                  t        | |j&                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j&                  ||       n| j#                  |j&                  ||       | j                  t%        |              y c c}w c c}w c c}w )Nr  r0  r   r6  r,  r-  zelse:)rG   operandsr_  	predicater]   r   ShapeAsConstantBufferrx   r   rK  r  true_subgraphrE   r2   r8  r  r  r  r&  false_subgraph)r   conditionalr|   r  r  r5  r3  r  s           rP   codegen_conditionalz(PythonWrapperCodegen.codegen_conditional  s   ##%;F;O;OPC--/PP));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOP775:3{?R?R;S5TUvQqc^UMU--))< !!+";";\4P'-.w({/I/I/O/OPQ775:3{?R?R;S5TUvQqc^UMU--**L- !!+"<"<lDQ'-.9 Q V Vs   I)"I.I3c                P	     fd}|j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}t	        |      } j                  | dt	        |              |r  j                  | dt	        |       d       t        |      D ]  \  }	}
 j                  | d|	 d|
          g t        t	        |            D 	cg c]
  }	| d|	 d c}	|}| dg}t        |      }|d	t	        |       } ||j                  ||        j                  d
|d            j                  d       |rt        t        ||j                              D ]k  \  }	\  }} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      m nt        t        ||j                              D ]k  \  }	\  }} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      m  j                  d        j                  t         |j                  j                                ||j                  ||        j                  t                      |r j                  t         |j                  j                               t        t	        |            D ]"  }	 j                  | d|	|z    d| d|	 d       $  j                  t                       j                  t         |j                  j                                ||j                  ||        j                  t                       j                  d|d           |r j                  d       t        t	        |            D ]  }	 j                  d| d|	|z    d        j                  t         |j                  j                                j                  | d|	 d| d|	|z    d        j                  t                       y	y	c c}w c c}w c c}	w )z1while_loop is codegened as a host side while_loopc                    t         j                  j                  rj                  | ||       yj	                  | ||       y)z3Helper method to deduplicate subgraph codegen logicN)r2   rE   r8  r  r-  )r  r  r  r   s      rP   r  zAPythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraph2  s7    ww11(L-X<<lMrR   r0  z.extend([[] for _ in range(z)])r,  z] = r-  _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()r  zwhile should_loop:z	].append(r  z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rG   carried_inputsr_  additional_inputsr   rx   rs  r  r   cond_subgraphr   r  body_subgraphrE   r&  )r   
while_loopstack_outputr  r|   r  outer_carried_inputsouter_additional_inputs
ckp_offsetr3  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputcarried_bufs   `                rP   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loop/  s   	 ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 -.
$|C0D,E+FGHNN&3C8L4M3NcR   45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK$$&79K	
 	(:1(='>?@,-3<(**C*CD4 7//M; 0z7O7O7U7UVW$q4>STU/567 4=(**C*CD4 7//M; 0z7O7O7U7UVW$q4hGH/567 	+,(z/G/G/M/MNO$$&79K	
 	'-. NN,T:3K3K3Q3QRS3345 P$qZ(8	$q2NOPNN+D12 	(z/G/G/M/MNO$$&79K	
 	'-.+,>q,A+BCD NNBC3345 7aJ/?wGH0z7O7O7U7UVWfAaS 0aJ7GyQ /567 Y 
#
  Hs   RR4R#c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }||S t        |      S # t        $ r Y y w xY w)Nr  )r  r]   r   r2   rE   
_shape_env_maybe_evaluate_staticr~  )r   r  s     rP   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none  sf    	q.$/ !S!''$$;;A>C{
s8O 		s!   A A ,A 
A 	A&%A&c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S rD   )r  rR  r   )lstr  r   nums       rP   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none  sA     	A&CCAFC{MM#		
 rR   c                0    t         j                  |       d uS rD   )r  rV  )rT  s    rP    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints  s     !FFsKSWW	
rR   c                H    t         j                  | j                               S rD   )r  rV  r  r  s    rP   r  z4PythonWrapperCodegen.static_shape_for_buffer_or_none  s    #IIOO
 	
rR   c                0    t         j                  |       d uS rD   )r  r  rZ  s    rP   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKSWWWrR   rD   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]r  )r|   r   r|  r   r   r  )rM  r   )r  TritonMetaParamsr   r   r   rv  r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  rv  r  )r0  r   ry  r   r   r   r  )r0  r   r   r  )r  rv  r   r  r  r*   r   r  )rM   zir.FallbackKernelr   r  )rM   rI  )rM   rS  r   r  )r   r   r  r   r  r   rN  rv  rc  r   r  r1  r   r  )F)rH  r   r>  r   rI  zCallable[[], Sequence[str]]rJ  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]ru  r(  rK  zSequence[ir.Buffer]r   r  )rV  Callable[..., None]r   zIterator[Callable[..., None]])rP  r   )r|   r   r   r  r  r  )r  r   )r   r   rL   r   r   r   )r   r   r   r   )r1  r   r|   r   r  r   r   r   )r  zSequence[Expr]r   r   )r   ztuple[str, list[str]])rx   rd  r   r   )r  zUnion[bool, str])rM   zir.MultiOutput)NTN)
ra  r   r  r   r  r   r  r   r  r   )ra  r   r  r   r  r   )r  r   )r   z"list[list[Union[int, sympy.Expr]]])ra  r   rO  r   )r=  r   rE   r@   r   r  )rZ  r8   )ra  r   r  r1  )ra  r   )r  r  )NF)rD  )r  rj  )r  rv  )r  r   r  r   r  r   )r  r1  )rW  r  rV  r  r  r   )r|   r   r  zir.ReinterpretViewr   r  r  r  )r  r  r  r  )r  r   rK  r   r  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   r  )r  r   r  zir.GraphPartitionSignature)r  r   )r   r   r   r:  supports_cachingr   r$  r  rb  rj  re  r  rg  r&   r  r  r  r  r  r  r  r  r  r  r  r  r  r  rf  r  rp  r  r  r*  r  r(  r  r  r  r  r  r  r  r  r  rM  r  r^  r/  r3  r7  r:  r@  rF  rL  rQ  rS  r   contextmanagerrX  r[  rN  rh  r  r  rc  r  r  r  rj  r  r  r  r/  rp   r  r  r  r  r  r  r  r  rk  r  r  r  r   rJ  rP  r?  r\  r^  rV  rB  ra  rb  rf  re  rg  rn  rj   r  r  r{  rx   r  r_  r!  r  r  rY  r  r  rm  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r'  r+  r-  r  r2  r:  rN  rR  rV  rX  r  r\  r  r  s   @rP   r  r    s#    ]#~ 
 FJ	&&$& 7& C	& &'<@AD
   "	B + +	! 
 

$	G$
%S$	(
.8):6-.10J
/+7&5
5$:,:8 8 
8 '+;; ;  	;
 ; ; $; 
;",U
&<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,O
b&SP:
 
D%(K(K (K -	(KT'?RD @D W CG +.&'6	,  ' 
:>S
I4*[Yz, #'(,

 
  	

 
 &
& FJ'*6C  #'(,!! !  	!
 ! &!.2J; 2J;X$A"A+8A	A
%N:.>85W

4
4SGj' !&*/
/
 $/
l !u(u(n !, F	
 8 8 TY.`*_*Au '+ $` 
(3T:(
 
E'NN N H	N
 
N`+'Z<< 9<BY
$
"B&
NP /Ba7F     
 

 
 

 X XrR   r  c                       e Zd ZdZ	 d	 	 	 	 	 d fdZddZddZd Zd Zd Z	ddZ
dd	Zdd
ZddZ	 	 ddZddZddZd fdZedd       Zedd       Z xZS )r~  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    c                L    || _         || _        || _        t        |           y rD   )r  r  r  r   r   )r   r  r  r  r   s       rP   r   z%SubgraphPythonWrapperCodegen.__init__  s(     +,$8!rR   c                &    | j                   | _        y rD   )r  ra  r   s    rP   rb  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2 2rR   c                     y rD   rd   r   s    rP   re  z)SubgraphPythonWrapperCodegen.write_header  r  rR   c                     y rD   rd   r  s     rP   rk  z2SubgraphPythonWrapperCodegen.add_benchmark_harness  r  rR   c                     y rD   rd   r  s     rP   r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r  rR   c                     y rD   rd   r   s    rP   r  z5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  rR   c                6    | j                   j                         S rD   )r  r  r   s    rP   r  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rR   c                     y rD   rd   r  s     rP   r  z2SubgraphPythonWrapperCodegen.generate_after_suffix  r  rR   c                \    | j                   j                  d| j                   d       d}|S )Nz
            def z(args):
            r3   )rN  r   ra  r  s     rP   r  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rR   c                     yr   rd   r   s    rP   rS  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rR   c                    | j                   x}r3|j                  |j                  D ci c]  }t        |      | c}z  }|S t        j
                  j                  }|S c c}w rD   )r  input_nodesr  r   r2   rE   r  )r   r  r  r=  s       rP   r  z-SubgraphPythonWrapperCodegen.get_graph_inputs  sh     11191**#,#:#:.A	. F
  WW))F.s   Ac                    | j                   x}rJt        |j                  j                               |j                  D cg c]  }|j
                   c}z   }|S t        j                  j                  }|S c c}w rD   )	r  r   rt  r  r  r|   r2   rE   r  )r   r  r  namess       rP   r  z2SubgraphPythonWrapperCodegen.get_graph_input_names  sr    11191..33566?6M6M:&2!!: E
  GG--E:s   A5c                r    | j                   x}r|j                  }|S t        j                  j                  }|S rD   )r  r  r2   rE   r  )r   r  rK  s      rP   r  z.SubgraphPythonWrapperCodegen.get_graph_outputs  s;    11191,,G  gg++GrR   c                ~    |j                         }| j                  x}r||j                  v ry t        |   |       y rD   )rG   r  rt  r   r  )r   r  r|   r  r   s       rP   r  z/SubgraphPythonWrapperCodegen.codegen_allocation  s?     222I2	@U@U8U "6*rR   c                8    | j                   j                          y rD   )r  r  r   s    rP   r  z5SubgraphPythonWrapperCodegen.write_triton_header_once  s     	446rR   c                8    | j                   j                          y rD   )r  r  r   s    rP   r  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_once%  s     	<<>rR   rD   )r  r   r  r  r  r^  r  r  rc  r  ra  r`  rb  re  )r   r   r   r:  r   rb  re  rk  r  r  r  r  r  rS  r  r  r  r  r&   r  r  r  r  s   @rP   r~  r~    s     FJ	 - C	3
8		G	+ 7 7 ? ?rR   r~  )rM   r  r   r   )rV   r  rW   r  )NN)r|   r   r   zlist[triton.Config]r   zlist[TritonGrid]rl   r]  r   r   r   ztuple[str, str]r  )
__future__r   r   r   r  r   r  r:  rv  r  r  r  rx  	itertoolsr   r   typingr   r   r   r	   r
   r^   r   r  
torch._opstorch.utils._pytreeutils_pytreer  r   r{  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   rD  r   r   r   	codecacher    r|  r!   r"   r#   r   r$   runtime.hintsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   virtualizedr2   r+  r4   r5   r6   r7   r8   r9   	cpp_utilsr:   triton_utilsr;   r<   r=   collections.abcr>   r?   rq   rE   r@   wrapper_fxirrA   	getLoggerr   logdoprintr  ro   rc  r   r   r   r  r  r  rQ   rZ   r0  r   r_  r   r   r   r  r   r   rB   r  r   r&  r/  rB  rH  rR  ri  rr  r  r  r  r  r   r  r  r  r  r  r   r%  r<  rk  Liner  r~  rd   rR   rP   <module>r     s   "    
      	  " @ @     $ $ & 6 C A ;  . / 9 : ( ( ' ; ( ' ,       P P 2%) g! u{{C56299l*+
]OT12 @ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.k&
k& k& k& ,	k&
 (k& k&\S&l   * **X X
 2 2 2 ++ + + 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 !5+ !5 !5H 	({ 	( 	( /[ / /> 5; 5 5* ; ; ;2%
 %
P 3,% 3, 3,l 6, 6 6> /( / /& )" ) ).(! (
 ![ ! !: )8^ )8 )8X 4 4 4 #0k #0 #0L 	5+ 	5 	5 
,-Z(X7 Z(XzPp?#7 p?rR   