
    pi^                   :   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZ d dl m Z  d dl	m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dl8mDZDmEZE ddgZFd dlGmHZHmIZImJZJmKZK e+rVd dlmLZLmMZMmNZN d dl7mOZOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZf ddlgmhZhmiZi g dZj e,d      Zkej                  d d       Zmd d lnmoZo d d!lpmqZq d d"lrmsZs d d#ltmuZu d d$lvmwZw d d%lxmyZy d d&lzm{Z{m|Z|m}Z}m~Z~mZ d d'lmZmZ d d(lmZmZ dd)lmZ dd*lmZ ej                  d+k(  Z ej                  e      Ze7j                  j!                  ed,      Z e,d-      Zee6j(                  e6j(                  f   Ze)e-e7j,                  ee7j                  f      Zd.d/d0Zd1Zd1Zd1Zd2Zd3Zeedz
  z  d k(  red4k\  sJ d5       d!d6Zd"d7Z G d8 d9e6jB                        Z ejF                  d:;       G d< d=             Zd#d$d>Z	 d#	 	 	 	 	 	 	 d$d?Zej                  d%d@       Zd&dAZd'dBZd(dCZd)dDZ	 	 	 	 	 	 d*dEZd+dFZ	 	 	 	 d,dGZd-dHZ	 	 	 	 d.dIZd/dJZdK f	 	 	 	 	 d0dLZ	 	 	 	 	 	 	 	 d1dNZd2d3dOZ	 	 d4	 	 	 	 	 	 	 	 	 d5dPZ	 	 	 	 	 d6	 	 	 	 	 	 	 	 	 	 	 	 	 d7dQZd8dRZd9dSZd:dTZd;dUZd<dVZ e1dW      Z e,dXd:Y      Ze$e/e#ef   ef   Z G dZ d[e*e&eef         Zd=d\Zd>d]Z	 	 	 	 d?d^Z	 	 	 	 d@d_Z	 	 	 	 	 	 dAd`Z	 	 	 	 	 	 dBdaZ	 dC	 	 	 	 	 dDdbZ	 	 	 	 	 	 dEdcZƐdFddZǐdGdeZȐdHdfZɐdIdgZʐdJdhZːdKdiZ̐dLdjZ͐dMdkZΐdNdlZ	 	 	 	 dOdmZАdPdnZѐdQdoZd dlZӐdRdpZg ZdMedq<   dSdrZאdRdsZej                  	 	 	 dT	 	 	 	 	 	 	 dUdt       ZeZeZeZݐdVduZ	 	 	 	 	 	 dWdvZ ej                  d4      dXdw       Z G dx dye(      ZejF                   G dz d{             Z G d| d}      Z G d~ de      Zej                  dYd       Z G d d      Z G d de      Zej                  dZd[d       Zej                  d\d       Zej                  d%d       Zd\dZ	 dC	 	 	 	 	 	 	 d]dZ	 	 	 	 	 	 d^dZd_dZd_dZddd:d	 	 	 	 	 	 	 	 	 d`dZdddadZdddadZdbdZdcdZe-ee6j(                  f   Zded<   ej                  ddd       Zej                  ddd       Zej                  ded       Zej                  dfd       Zej                  dgd       ZdhdZdbdZdbdZdhdZdhdZ 	 	 	 	 	 	 	 	 didZ	 	 	 	 dj	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dkdZd%dZ G d d      Z	 	 	 	 	 	 	 	 dldZ	 	 	 	 	 	 	 	 dldZdmdZdndZdodZ		 	 	 	 	 	 	 	 dodZ
	 	 	 	 	 	 	 	 dpdZej                  	 	 	 	 	 	 dqd       Z	 dC	 	 	 	 	 drdZdsdZdtdZdudZdudZdvdZdwdZej                  dxd       Zd\dZej                  d\d       Zej                  dyd       Zej                  d\d       Zd\dZdzdZd{dZd%dZd%dZd|dZdNdZ G d dej@                        Z!	 	 	 	 	 	 	 	 	 	 d}dÄZ"d~dĄZ#	 	 	 	 d~dńZ$	 dC	 	 	 	 	 ddƄZ%ddǄZ&ddȄZ'ddɄZ(	 	 	 	 	 	 ddʄZ)	 	 	 	 	 	 	 	 dd˄Z*d̄ f	 	 	 	 	 	 	 	 	 	 	 dd̈́Z+d΄ f	 	 	 	 	 	 	 	 	 	 	 ddτZ,ddЄZ-ddфZ.ejF                   G d҄ dӫ             Z/ej                  ddԄ       Z0ddՄZ1ddքZ2ddׄZ3dd؄Z4	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddلZ5ddڄZ6ddۄZ7dd܄Z8dd݄Z9	 	 	 	 	 	 	 	 ddބZ:dd߄Z;	 	 	 	 	 	 	 	 ddZ<ddZ=	 dC	 	 	 	 	 	 	 ddZ>	 	 	 	 	 	 ddZ?ddZ@	 	 	 	 	 	 ddZAd%dZBddZCddddddddZDeDj                         D  ci c]  \  } }|| 
 c}} ZF ej                  d      ZHddZIddZJddZKddZLej                  dd       ZMejF                   G d d             ZNi ZOded<   	 	 	 	 	 	 	 	 ddZP eC       ZQded<   ddZRd dZSddZT e,d      ZU e,d       ZV G d deeUeVf         ZW e0d:      dCd:d;dd       ZXddZY G d dej@                        ZZej                  dd       Z[d%d	Z\dd
Z]d dZ^ddZ_d%dZ`ddZadZbddZcddZdddZe	 	 d	 	 	 	 	 	 	 	 	 ddZfddZgd%dZhddZi	 	 d	 	 	 	 	 	 	 ddZjddZk ejF                  d:;       G d d             Zle$de#f   Zme$emelgemf   Zn G d d      Zo eo       ZpddZqddZryc c}} w (      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr3   r   rC   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      W/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerS   i   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rC   rE         @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr3   )ALIGN_BYTES)nbytess    rR   _alignrp      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrn   )vs    rR   ry   ry      sQ    !eii+,3{AFF+,,aK599Q#<#KKrq   c                  *    e Zd ZdZdZdZedd       Zy)r{   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr3   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)rs   intrt   Integerrp   ry   )clsvalues     rR   evalz
align.eval   s6    ec5==12#e*%%uL rq   N)r   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodr    rq   rR   r{   r{      s!    FEJ rq   r{   Tfrozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__r   rq   rR   r   r      s$    
 	G -,-- rq   r   c           
         |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]A  }|j6                  t8        j                  k(  r"t;        j<                  d|j>                        |C c}      }|r"|tA        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   3xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArC   dtypedeviceTenable_timing   r3   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  4   K   | ]  }|j                     y wr   device_time_total.0events     rR   	<genexpr>zfp8_bench.<locals>.<genexpr>	  s     QE33Q        @@profiling results: %s ms)!rK   rC   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestablerW   eventsdevice_typerV   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rR   	fp8_benchr      sT    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
!!Z__4HH0%**=I	 	
	O OOQQQ	

 II(#.JO RO 
 H
 
*	
sE   "'P'PA9P2=PAP2P,9P2AP?P)$P22P<c                L    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )r   r   rC   r   Tr   r   r3   r   Nr   r   r   r   zContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wr   r   r   s     rR   r   z+do_bench_using_profiling.<locals>.<genexpr>b  s     A%e%%Ar   r   r   )rK   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   r   r   rV   r   rM   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   num_event_per_groupactual_eventsr   s                    rR   do_bench_using_profilingr     s    D	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
JJ			NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   6AN$5N(N 
Nc                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rK   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrJ   opsImportErrorr   str)r   r   s     rR   has_torchvision_roi_alignr   g  s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rK   r   r   rs   r   typer   rT   Workercurrent_devicer   device_interfaces     rR   decode_devicer  w  s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrq   c                |    t        j                  t        j                  | t        j
                  j                        S r   )	functoolsreduceoperatormulrt   SOne)its    rR   sympy_productr    s#    HLL"eggkk::rq   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr   r   )r   abs      rR   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rM   rt   expandr   r   )seq1seq2s     rR   	sympy_dotr    s8    t9D	!!!<<>c$o>>??rq   c                \    | D ci c]  }t        |      | c}j                         S c c}w r   )r   values)r  rO   s     rR   uniquer    s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )rs   rt   ExprrZ   sympifyr   r   runtime_ceildiv)numberdenoms     rR   re   re     s     &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))rq   c                f   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d'||    S c c}w )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r   splitupdatelistr  rs   )key	dtype_strtysr}   s       rR   _type_ofrN    sR   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<01112S#&3@aI/?,@@ 2s   
B.c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rt   r  lstr   s     rR   convert_shape_to_inductorrR    s!     '**EMM!***s   $c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r3   VN)hint)
virtualizedrU  rs   r   rt   r   graphsizevars	shape_envcreate_symintnode)r   rU  s     rR   convert_to_symintr\    se      a 	

 !U]]+ F	 !!++==ad=Krq   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )r\  rP  s     rR   convert_shape_to_symintr^    s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr   )
alias_infor   r  s     rR   r   zis_view.<locals>.<genexpr>  s     FAq||4'Fs   )any_schema	argumentsops    rR   is_viewrh    s     F1E1EFFFrq   c                     yNFr   )r   s    rR   <lambda>rk        rq   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wr   )is_pointwise_use)r   uis_pointwise_fns     rR   r   z#is_pointwise_use.<locals>.<genexpr>  s     KA#A7Ks   )rg  rs   targetrK   _ops
OpOverloadr  getitemr   rh  rw   usersTag	pointwisetags)userr  rs  s    ` rR   rp  rp    s     66_$3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHrq   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrM   )r  g
graph_argss    rR   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg  s,    #}}s3z?"3455rq   r3   r   Tensor)r  torch.Tensorr   r2   )rK   fxGraphrn  r#   r  rM   rd  returnsr   r   outputr1   )rs  rz   kwargsr  nodegmr  r  s         @@rR   gen_gm_and_inputsr    s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rq   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr   )rT   rL   r   r  s     rR   r   r      s4    /7$$&$$& 'rq   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rK   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rR   timedr  (  sr     	d				B5\ 'F 
			B7Nrq   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rK   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rR   print_performancer  :  sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   r   )r  s   rR   rk  z#precompute_method.<locals>.<lambda>M  s     rq   N)rJ   setattr)objmethodr  s     @rR   precompute_methodr  J  s     !WS&!#FC(rq   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rR   precompute_methodsr  P  s     '#v&'rq   c                <    t        | |kD        t        | |k        z
  S r   )r   )r  r  s     rR   cmpr  V  s    q1u:AE
""rq   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr3   r   )rs   r   rM   r   )rO   sizes     rR   pad_listliker  Z  sC    !SsTz
1v{tAw!v%%Hrq   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr3   )rA   )rs   r   	schedulerrA   get_name)elemrA   s     rR   	sort_funcztuple_sorted.<locals>.sort_funcg  s1    dC K0$ 1222}}rq   rK  )r  rh   r   r   )rM   sorted)rO   r  s     rR   tuple_sortedr  c  s&    
1v{	 !##rq   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r   r   )r   s    rR   clear_cachezCachedMethod.clear_cachey  s    ),rq   c                     y r   r   selfrz   r  s      rR   __call__zCachedMethod.__call__|  rl  rq   N)r   r   r   None)rz   P.argsr  P.kwargsr   r  )r   r   r   staticmethodr  r  r   rq   rR   r  r  x  s    , ,Drq   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y r   r   delattrr  rK  s    rR   r  z"cache_on_self.<locals>.clear_cache  s    4D# rq   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rK  s        @rR   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNrq   c                    t        |       S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  )r   s    rR   cache_property_on_selfr    s    
 rq   c                     	 	 	 	 d fd}|S )Nc           	         d d| j                    dd| i}t        d d d dj                         |        t        j                  |       |d	         }dfd
}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                8    t        |       rt        |        y y r   r  r  s    rR   r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!c" "rq   r  r  )r   r  r  r  rK  
class_names       @rR   r  z'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, $	#CL1	# (rq   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rR   cache_on_self_and_argsr    s     
$$	$L Nrq   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr3   irr  ) r  rs   rJ  r  r  r  or_r   r  originsr!   r:   )node_scheduler  r  s      rR   aggregate_originsr    s     -&LL *4(TYY 		!!
 L
 	
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rq|D cg c]Q  }|j                  dk(  r@d|j                  v r2|j                  d   #|j                  d   j                  j                  S }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenrn  rK   source_fn_stackr   r3   inductor_noder   fused)r  rg  r   _overloadpacketr   r  r!   rs   r   r  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         rR   get_fused_kernel_namer    sm    $M2KO+ &
yyO+6;;.O,8	 KK(88AA
 
 G,-	g	%! 	:FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89	: G,-	o	-&1
"VYY/5QFKK
 
 "!G88WI'((5
(
s   AE(%E-:E-c                	    t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }dt        |      rt        d |D              }t        |      dk(  r_|d   j                  t        d      s/t        j                        D 	ci c]  \  }}	|	|
 }
}}	|
_        |j                  fd       |D ]  }d	|j                  v rO|j                  d	   @t        |j                  d	   j                        }||   j!                  |j"                         d
|j                  v so|j                  d
   d   j"                  }||   j!                  |j"                          dnd}|j$                   d| ddj'                  |j)                                ddj'                  |j)                                d}|j$                   dg}t+        |j-                               D ]@  \  }}|j!                  |j$                   d| ddj'                  t+        |                    B dddlm |j!                  |j$                   d       t               }g }t3        | j4                        sddlm} 	 	 	 	 	 	 d'fd}d(d d) fd}| D ]  }	t        |	d      r|	j:                  t        |	j:                  d      r|	j:                  j<                  |	j:                  j<                  D ]  }|j"                  |v r|j?                  |j"                         |j                  jA                  |j"                        }|U |||j"                        \  }}|j!                  |j$                   d| d ||       d| d        t        |	j:                  d       s|	j:                  jB                  )|	j:                  jB                  D ]T  }|j                  jA                  |j"                        }|+ |||j"                        \  }}|j!                  d!|z          V  |D ]2  }|j!                  |j$                   d|jE                  d"#              4 |j!                  |j$                   d$d%j'                  |              |d&j'                  |      fS c c}w c c}	}w )*aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    rn  Nc              3  4   K   | ]  }|j                     y wr   )rX  )r   ns     rR   r   z&get_kernel_metadata.<locals>.<genexpr>)  s     "Cq177"Cr   r3   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S r   )r  )r  single_graphs    rR   rk  z%get_kernel_metadata.<locals>.<lambda>1  s    lTTUVW rq   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:rT  c                >   t        | j                        rAt        | j                  j                        r!| j                  j                  j                  }n| j                  }||}n|j
                  }	 | j                         }||fS # t        $ r d }Y ||fS w xY wr   )rs   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer  r   layoutr  s        rR   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_infoR  s     fbll3
KK9 #)++"2"2">">K"("4"4K&"D&++D"#..0F V|# + "!FV|#"s   7B BBc           	     d    ddj                  | D cg c]  }t        |       c}       dS c c}w )N[r  r  )r  r   )shaperO   s     rR   stringify_shapez,get_kernel_metadata.<locals>.stringify_shapef  s-    499e%<c!f%<=>a@@%<s   -
c                    | y | j                          } | j                         }| j                   }dt        | j                      | | | dS )Nr  ")r  strider   r    r   )r
  shape_annotationstride_annotationdevice_annotationr  s       rR   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layouti  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?rq   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r	  r   r   ztuple[str, ir.Layout | None])r  zIterable[int]r   r   )r
  zir.Layout | Noner   r   )#r  rg  collectionsdefaultdictrJ  rM   r!   rX  r   r   nodesr  sortr   r   r  r  r   commentr  keysr  itemsr  r  rs   r:   rW  rU  r  r  addtry_get_bufferr  format_node)!r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_mapr  rK  sort_strmetadatadetailed_metadataoriginal_noder   	all_reads
all_writesrU  r  r  rr  
input_namer
  woutput_namer   r  r  r  s!                                 @@@rR   get_kernel_metadatar8    s   $ $M2K+6W&)):VfWNW ,,T2N$006
 L
>""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     2dii'DIIo,F,Rdii0@@ACs#**4995$))#))K(+00C3&&tyy12 *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF%/\	 "
-9&$J$UX$-$(A
 # =q-0AMM4I1=='2q}}7J7J7V]]00 66Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ AMM84,,8]]11 =!"!7!7!?!>$)8)HQ"))#*;<=-=< # 	D$$??#3t'7'7PT'7'U&VW	
 	  GOO#4Jsxx
?S>T!UVTYY0111w X #Ws   SS Sc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)rJ  r!   rN   rw  r%  r  )initial_queueskip_filterdominated_setr  users        rR   dominated_nodesr>    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  rq   c                4  	 ddl m d	fd	t        |      \  }}|D cg c]  } 	|      s|j                   }}t        |       \  }}|D cg c]  } 	|      s|j                   }}t	        t        j                  g ||       S c c}w c c}w )Nr3   r  c                F   t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr9 t        | j
                  j                  j                  j                  f       S r   )	rs   r  r  r  r;   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rR   rE  z*gather_origins.<locals>.is_unrealized_node  s    a&%aff--a'%aff--!RYY' 

!!!!	1
 -
 	
rq   )r  r;   r   r"  )r  r  r"   r  r!   	itertoolschain)
rz   r  kwargs_flattenr   valkwargs_originsargs_flattenargs_originsr  rE  s
           @@rR   gather_originsrM    s     
" %V,NA-;Wc?QRU?VckkWNW"4(OL!+7SC;Mc;RCKKSLSiooE|EnEFF XSs   BBB Bc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )rs   rt   MulrM   rz   )exprs    rR   is_neg_leadzsympy_str.<locals>.is_neg_lead  s:    tUYY'VC		Na,?VDIIaLTVDV	
rq   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )NrP  r3   r   z - z + )rs   rt   ru   rM   rz   r  rx   )rR  rS  sympy_str_muls    rR   sympy_str_addz sympy_str.<locals>.sympy_str_add  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&rq   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r3   z * )rs   rt   rQ  rz   r  rx   )rR  rS  sympy_str_atoms    rR   rU  z sympy_str.<locals>.sympy_str_mul  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''rq   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r  )rs   rt   Symbolr   ru   rQ  r^   r[   r\   r]   funcr   r  rx   	sympy_strrz   r   )rR  rV  s    rR   rY  z!sympy_str.<locals>.sympy_str_atom  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9rq   )rR  r   r   r"  rR  r   r   r   r   )rR  rS  rV  rY  rU  s    @@@@rR   r_  r_    s$    

	'	( rq   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr3   rT  current_node
index_expr)
rW  rU  rd   compute_all_boundsrJ   interpreterrs  ra   rb   unknown)r   rU  fx_nodes      rR   get_bounds_index_exprrh    sN     	!!~tDDWDNNl*5!!""$$rq   c                    | d   dk(  S )Nr   r4  r   )prefixs    rR   prefix_is_reductionrk    s    !9rq   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)r`   SIZEr_   )rj  r,  s     rR   sympy_index_symbol_with_prefixrr    s)     TYY vsDdCCrq   c                N    | xs t         j                  xr t         j                  S r   )rd   debug_index_assertsassert_indirect_indexing)checks    rR   generate_assertrw    s    /V//TV5T5TTrq   c                F    | d   dk7  sJ t        j                  | dd      S )rm  r   r   Trn  )rt   r]  r   s    rR   sympy_index_symbolrz    s)     7c>> <<d==rq   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nrn  )rs   rt   r  r   r]  r   is_nonnegative)replacedreplacements     rR   	to_symbolzsympy_subs.<locals>.to_symbol1  sP     (EJJ///k3'<< ++$33  rq   )r~  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)rt   r  xreplacer$  )rR  replacementsr  kr}   s        rR   
sympy_subsr  +  sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wr   is_symbolicr   rO   s     rR   r   zis_symbolic.<locals>.<genexpr>G  s     N1AN   )	rs   rK   r/   r  rc  rF  rG  r  r  )r  s    rR   r  r  D  sS    a& 1ell# 	ON	!((*(MNNrq   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wr   r  rb  s     rR   r   z"any_is_symbolic.<locals>.<genexpr>L  s     ,!{1~,r  rc  )rz   s    rR   any_is_symbolicr  K  s    ,t,,,rq   c                T   ddl m} t        g d      }t        j                         r|j                  d       | j                  j                  D ]  }t        |j                        |v r|c S t        j                  j                  j                  slt        |j                  t        j                  j                        r>t        j                   j"                  j$                  |j                  j&                  v r|c S |j(                  j+                  d      x} ||      s|c S  y )Nr   )r'   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outrI  )%torch.fx.experimental.symbolic_shapesr'   r!   rK   $are_deterministic_algorithms_enabledrI  rX  r   r   rs  	_inductorrd   graph_partitionrs   rt  ru  r   rx  cudagraph_unsaferz  r   get)r  r'   forbidden_setr  rI  s        rR   %get_first_incompatible_cudagraph_noder  O  s     L	
M  113	
"  t{{},K &&664;;

(=(=>--1A1AA
 K99==''C49Ns9SK" rq   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )nextiterreversedrX  r   rg  )r  	last_nodes     rR   output_noder    s6    T(288>>234I<<8###rq   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nr  rf  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywrI  N)rs   r   r  rK   r  r   )r   r  s     rR   r   z"get_all_devices.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywr  )rs   rK   r  r2   r   r  r  r   )r   r  s     rR   r   z"get_all_devices.<locals>.<genexpr>  sS      7c588==)sxx||E*ELL9 	7s   A:A<)rX  
find_nodesr!   r  rz   rs   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rR   get_all_devicesr    s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&rq   c                    t        t        j                  j                               D ]'  } | j	                  d      st        j                  |    }|j
                  j                         D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | = * dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rJ  sysmodulesr#  
startswith__dict__rJ   rs   rK   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r   driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rR   unload_xpu_triton_pydsr    sR   CKK,,./ %%%&NOKK$* 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY #MM--1199;<	< KK$!%& #++-kk12""(()2JJ#JJLrq   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  s    rR   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jrq   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rR   clear_cachesr    s     " rq   c              #    K   t                ddlm}  |t        j                  |            	 t
        j                  j                  t        j                  di      5  t        j                  d        |t        j                  j                  d            }t
        j                  j                  t        j                  d|i      5  d t        | t              rt        |       dk(  sJ d	       t        j                  j!                  |      rtt        j"                  |      }| j%                  |D ci c]D  }d
|vr>|t        j                  j'                  t        j                  j                  ||            F c}       ddd       ddd       |rUt)               r(t*        j,                  j/                         r
t1                t3        j4                  t)               fd       t                yc c}w # 1 sw Y   xxY w# 1 sw Y   |xY w# t6        $ r t        j9                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r^  pathr  inductor_cache_dirs      rR   rk  zfresh_cache.<locals>.<lambda>  s    S[[@&% 6A 6 rq   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictosenvironr   r   r  r  rs   rM   existslistdirrI  getsize
is_windowsrK   rE   rL   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rR   fresh_cacher    s     ND1(2B2Bs2KL)ZZ__JJ24FG
 	 II35GH7/:  .@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	( |		 6 6 8&(MM" )l  	5 	 	H  >@RS 	sn   -I0H !A-HA-H;A	G=HHAH 2I=HH	HHH !H;;H> >I

Ic           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)rK  reverse)__getitem__r   rM   rJ  r  r  )seqgettera_rs      rR   argsortr    s1    __F
C/C>?@@rq   c           	     2    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |            }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)rs   r"  evaluate_expr)rR  rZ  s    rR   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate,  s(    $%**4*EErq   r   r3   r   )rR  z%Union[bool, torch.SymInt, sympy.Expr]r   r"  r   )r  r  a_idxa_valb_idxb_valr  rZ  s          rR   r  zargsort_sym.<locals>.cmp(  sT    uu	F
 EEM"EEM"
 5=5=rq   r  )r  tuple[int, sympy.Expr]r  r  r   r   )	r   rs   rK   r/   r  rR  r  r  
cmp_to_key)rZ  r  r  r,  r   exprsr   r  s   `       rR   argsort_symr  %  s    4  nC 
Z5<<8affkka@E  5i22378E %&fc1c&F&M
 's   >B<Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nrl   r   r   )rK   rF  r   element_sizer  s    rR   get_dtype_sizer  I  s-     ;;r'4466rq   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   rq   rR   r
  r
  R  s    Lrq   r
  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr   r   zlist[tuple[int, LineContext]]line_mapNr  r   rq   rR   r  r  V  s    J++rq   r  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
ddZddZdd	Zdd
ZddZ	 	 	 	 ddZdd dZdd!dZdd!dZ	 d"	 	 	 	 	 d#dZd$dZddZd%dZd&dZy)'IndentedBuffer   c                     g | _         || _        y r   )_lines_indent)r  initial_indents     rR   __init__zIndentedBuffer.__init___  s    GI%rq   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr   )tabwidth)r  r  prevs      rR   set_tabwidthzIndentedBuffer.set_tabwidthc  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr3   r  )r   r  rs   DeferredLineBaser
  r  r  r   writecountr  getvalue)r  bufr   linemaplilines         rR   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemapl  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88rq   c                6    | j                         j                  S r   )r%  r   r  s    rR   r   zIndentedBuffer.getvalue  s    '')///rq   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r  )	r   r  rs   r  r
  r   endswithr  r   )r  r!  r#  r$  s       rR   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~rq   c                8    | j                   j                          y r   )r  clearr'  s    rR   r-  zIndentedBuffer.clear  s    rq   c                ,    t        | j                        S r   )r"  r  r'  s    rR   __bool__zIndentedBuffer.__bool__  s    DKK  rq   c                :    d| j                   | j                  z  z  S )Nr   )r  r  r'  s    rR   rj  zIndentedBuffer.prefix  s    dllT]]233rq   c                &    | j                  d       y )Nr  	writeliner'  s    rR   newlinezIndentedBuffer.newline  s    trq   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )rs   r
  r  r  r  with_prefixrj  stripr  r$  s     rR   r3  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rq   c                4    |D ]  }| j                  |        y r   r2  )r  linesr$  s      rR   
writelineszIndentedBuffer.writelines  s      	!DNN4 	!rq   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr   r  )offsetr  s   rR   r  z"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r@  r  s   `` rR   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' urq   c                .    | xj                   |z  c_         y r   r?  r  r@  s     rR   	do_indentzIndentedBuffer.do_indent      rq   c                .    | xj                   |z  c_         y r   r?  rG  s     rR   do_unindentzIndentedBuffer.do_unindent  rI  rq   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r  )rs   r  floatr  r
  minrM   r  mathisinfr  r3  r   textwrapdedentrstriprH  )r  
other_coder8  rS  r$  r   s         rR   splicezIndentedBuffer.splice  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"rq   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r  )r  r  r  )r  r^  r   r$  s       rR   rx   zIndentedBuffer.map  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nr[  r\  )r   r   r'  s    rR   __repr__zIndentedBuffer.__repr__  s     t*Qt}}/q11rq   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S rX  )r  r  r<  r  )r  otherr   s      rR   __add__zIndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
rq   c                    || j                   v S r   )r  )r  new_lines     rR   containszIndentedBuffer.contains  s    4;;&&rq   Nr   )r  r   r   r  )r  r   r   rB  )r   r  r   r   r   r  r   r"  )r$  z)Union[LineContext, DeferredLineBase, str]r   r  )r;  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )r@  r   r   'contextlib.AbstractContextManager[None])r@  r   r   r  )F)rU  zUnion[IndentedBuffer, str]r8  r"  r   r  )r^  zCallable[[Any], Any]r   r  )r\  r   r   r  )r_  z)Union[DeferredLineBase, LineContext, str]r   r"  )r   r   r   r  r  rC  rD  r  r%  r   r+  r-  r/  rj  r4  r3  r<  rE  rH  rK  rV  rx   rZ  r]  r`  r   rq   rR   r  r  \  s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"2
2'rq   r  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y r   )superr  )r  	__class__s    rR   r  zFakeIndentedBuffer.__init__  s    rq   c                V    |dk(  rt         j                  | |      S t        d| d      )Nrj  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rR   rm  z#FakeIndentedBuffer.__getattribute__  s;    ;**466!$ (= =
 	
rq   rc  )r   r   r   r   )r   r   r   r  rm  __classcell__rj  s   @rR   rg  rg    s    
rq   rg  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rR   restore_stdout_stderrru     s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y r6  )r8  r$  r9  s     rR   r  zDeferredLineBase.__init__  s    zz|D	rq   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r'  s    rR   r  zDeferredLineBase.__call__      !!rq   c                    t         )z3Returns a new deferred line with the same conditionry  r9  s     rR   	_new_linezDeferredLineBase._new_line  rz  rq   c                @    | j                  | | j                         S r   r|  r$  )r  rj  s     rR   r7  zDeferredLineBase.with_prefix  s    ~~455rq   c                T    | j                  | j                  j                               S r   )r|  r$  r  r'  s    rR   r  zDeferredLineBase.lstrip  s    ~~dii..011rq   c                >    | j                  | j                  |         S r   r~  )r  r   s     rR   r  zDeferredLineBase.__getitem__  s    ~~dii.//rq   c                ,    t        | j                        S r   )r"  r$  r'  s    rR   r/  zDeferredLineBase.__bool__"  s    DIIrq   c                ,    t        | j                        S r   )rM   r$  r'  s    rR   __len__zDeferredLineBase.__len__%  s    499~rq   N)r$  r   )r   zUnion[str, None])r$  r   r   r   )rj  r   r   r   )r   r   )r   zUnion[int, slice]r   r   rd  r   r   )r   r   r   r   r  r  r|  r7  r  r  r/  r  r   rq   rR   r  r  	  s-    8
""620rq   r  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y r   )ri  r  rK  value_fn)r  rK  r  r$  rj  s       rR   r  zDelayReplaceLine.__init__,  s     rq   c                j    | j                   j                  | j                  | j                               S r   )r$  replacerK  r  r'  s    rR   r  zDelayReplaceLine.__call__1  s#    yy  4==?;;rq   c                D    t        | j                  | j                  |      S r   )r  rK  r  r9  s     rR   r|  zDelayReplaceLine._new_line4  s    $-->>rq   )rK  r   r  zCallable[[], str]r$  r   rb  )r$  r   r   r  )r   r   r   r   r  r  r|  rn  ro  s   @rR   r  r  )  s    @!
<?rq   r  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrE   ri   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rs   rK   r   rS   r   createversionhipmajorr   r  r   multi_processor_count)index_or_devicer   propr  r  s        rR   
is_big_gpur  8  s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 rq   c                     t         j                  j                         r(t         j                  j                         j                  S t         j
                  j                  d      j                  S )NrC   )rK   rE   rL   get_device_propertiesgpu_subslice_countrC   r  r   rq   rR   get_max_num_smsr  U  sF    yyyy..0CCC::++F3IIIrq   c                     t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rK   rC   rL   r  r  r  )device_propertiess    rR   
using_b200r  \  sJ     ::""$

889R9R9TU""b((rq   c                     t         j                  j                         r
t               S t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rK   rE   rL   r  r   _get_sm_carveout_experimental)carveouts    rR   get_num_smsr  f  sJ     yy  xx557HH,@HHaHHrq   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r3   )r4   WorkspaceZeroModeF)r  	zero_moder   
outer_name)codegen.commonr4   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr4   r  r  r  s          rR   get_tma_workspace_argr  o  sZ     @"}!++E2I--0CCD+<++-	 rq   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r   r  )r
  allowed_layout_dtypess     rR   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%rq   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w Nr  )upperrd   max_autotune_gemm_backendsrH  r8  backendrO   s     rR   _use_autotune_backendr    M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r  rd   max_autotune_conv_backendsrH  r8  r  s     rR   _use_conv_autotune_backendr    r  r  F)enable_int32enable_float8check_max_autotunec                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrS t         j"                  xs t         j$                  xs | xr* t'        d      xr  || j                  |j(                        S )Nr3   )BackendFeaturehas_backend_featurer   TRITON)r  r  r  rK   r   r2  r4  r<  extendr,  r-  r  r   r   r  r   rd   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r
  r  r  r  r  r  layout_dtypess          rR   use_triton_templater    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOrq   )
add_guardsc                     ddl m} ddlm d	fdd
 fdd
fd |       xr t	        fd|D              S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer3   rT  c                X    j                   j                  j                  | t              S r   )rX  rY  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrU  s    rR   _alignedzcan_use_tma.<locals>._aligned  s     ww<<ZWWrq   c                   | j                         }| j                         }t        |      }| j                         }|j                  }|dk  s|dkD  ry|t
        j                  t
        j                  t
        j                  fvry| j                         j                  j                  v ryrKj                  j                  j                  |      }j                  j                  j                  |      }nd|D cg c]'  }j                  j                  j                  |      ) }}|D 	cg c]'  }	j                  j                  j                  |	      ) }}	t        fd|D              ryt!        |      D 
	cg c]-  \  }
}	j                  j                  j#                  |	d      r|
/ }}
}	t        |      dk7  ry|d   }t!        |      D ]  \  }
}	|
|k(  r |	|z        r y ||   } ||z        sy|t
        j                  k(  r'j                  j                  j%                  |d      syyc c}w c c}	w c c}	}
w )	NrP  r   Fc              3  l   K   | ]+  }j                   j                  j                  |d         - ywrP  N)rX  rY  statically_known_geq)r   r   rU  s     rR   r   zBcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>  s+     P1177##88A>>Ps   14r3   r       T)get_size
get_striderM   	get_dtypeitemsizerK   r   r2  r,  r  rX  unaligned_buffersrY  guard_int_seqsymbolic_hintrc  r   statically_known_equalsr  )rO   sizesstridesrankr   r  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrU  r  r  s                 rR   _is_tma_compatible_defaultz/can_use_tma.<locals>._is_tma_compatible_default  s   

,,.5z>> !8tax 8K8KLL ::<177444gg&&44U;G((66w?IBGHQqww''55a8HGHFMN))77;NIN PPP
 #9-
2ww77A> 
 

 u:?!H	 y) 	EArI~BM*		 I&		H,- E'''0@0@0U0Ur1
 G IN
s   >,H<0,I2Ic                D   | j                         }|D cg c]'  }j                  j                  j                  |      ) }}t	        |      D cg c]-  \  }}j                  j                  j                  |d      r|/ }}}t        |      dk7  ryyc c}w c c}}w )Nr3   FT)r  rX  rY  r  r   r  rM   )rO   r  r  r  r   r  rU  s         rR   _is_tma_compatible_xpuz+can_use_tma.<locals>._is_tma_compatible_xpu  s    ,,.BIJBQWW%%33B7J	J #9-
2ww77A> 
 

 u:? K
s   ,B2Bc              3     K   | ]5  }|j                         xj                  dk7  r |      n |       7 y w)NrE   )
get_devicer   )r   r  r  r  m_devices     rR   r   zcan_use_tma.<locals>.<genexpr>  sK      +  &H/8==E3I 	#1%#A&	'+s   ;>)r  Union[int, sympy.Expr]r   r"  rO   r;   r   r"  )torch.utils._tritonr  rW  rU  rw   )r  matricesr  rU  r  r  r  r  s   `  @@@@@rR   can_use_tmar    sD     :X:x !" s + 	+ ( rq   c                x    t        d |D              xr' t        |d| ixr t        j                  j                  S )Nc              3  T   K   | ]   }t        |j                               d k(   " ywr  )rM   r  )r   r  s     rR   r   z*use_triton_tma_template.<locals>.<genexpr>%  s      5qC

"5s   &(r  )rw   r  rd   r  enable_persistent_tma_matmul)r  r  s     rR   use_triton_tma_templater  #  s9    5H55 	79j9	7MM66rq   c                <   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  g}t!        | |      xr/ t
        j"                  xs t
        j$                  xr t'        d      }|r6 |       s/t(        j+                  d	t
        j                  j,                         y|S )
Nr3   rT  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rW  rU  rX  rY  	size_hintrd   rC   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rK   r  r  r   r2  r<  r  r  r  r  r   r  cutlass_dir)	r
  r  r  r  rU  	gemm_sizer  r  r   s	            rR   use_cutlass_templater  +  s      **1q519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4 ''	 Jrq   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTr  )rd   rC   cutlass_enabled_opsr  rH  r8  )op_nameenabled_opsrO   s      rR   _use_cutlass_for_opr
  L  sU    ++11779Ke==?+2C2CC2HIQqwwyIIIIs   A,r   _IntLikec           
        ddl m} t        j                  j                  }t
        j                  j                   xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                   S )Nr   rT  )torch._inductor.virtualizedrU  rd   r  decompose_k_thresholdrK   r  r  rX  rY  statically_known_truert   AndGeaot_modecpp_wrapper)r  r  r  rU  r  s        rR   use_decompose_k_choicer  W  s    -"MM?? MM 	$GG22II1A561A56
	$    	$ ###
rq   c           
        t         j                  j                  }ddlm} t        t        j                  j                        xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                    S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rT  )rd   rocmcontiguous_thresholdr  rU  r"  rK   r  r  rX  rY  r  rt   r  r  r  r  )r  r  r  r  rU  s        rR   use_contiguousr  j  s     ";;;; . 	U]] 	$GG22II01450145
	$    	$ ###
rq   c                   t         j                  j                  }g d}t        |t        j
                        r|j                  s|S |dk(  rg S t        | t        j
                        r| j                  r&t        |t        j
                        r|j                  sd}nt        || z  ||z        }d}t	        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }}
}	|D ]Z  }||z  }|dk  r||dz
  z  dk(  r|dk\  r|	j                  |       0|dz  dk(  r|
j                  |       J|j                  |       \ t         j                  dk(  r|	|
z   |z   S |	|
z   |z   }|d | S c c}w )	N)ri   r  rk   rj      r   r  rP  rj   r3   r  
EXHAUSTIVE)rd   r  num_decompose_k_splitsrs   rt   r  	is_numberrO  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rR   get_k_splitsr+    s    ]]99N .!UZZ 	1		1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF#&88>IK''=s   
E,c                T    t         j                  j                  |       j                  S r   )rK   rC   r  gcnArchNamer   s    rR   _rocm_native_device_arch_namer/    s    ::++F3???rq   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S r   r   r   rq   rR   r1  z*try_import_ck_lib.<locals>.gen_ops_library      Irq   c                     g S r   r   r   rq   rR   r2  z.try_import_ck_lib.<locals>.gen_ops_preselected  r5  rq   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   rq   rR   r3  r8    s    rq   r3  )r   r|  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr1  r2  ck4inductor.universal_gemm.opr3  r  r  dirname__file__r   )r9  r1  r2  r3  package_dirnames        rR   try_import_ck_libr?    sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                   t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk(  syt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       yt        j*                         r|t         j                  _        t         j                  j,                  st&        j)                  d       y|t         j                  j,                  k7  rt&        j)                  d       yyc c}w c c}w )	NFrC   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rd   r  r  rK   r  r  r   r   r/  r  archrH  r#  ck_supported_archr   r   r2  r4  r?  r   r  	is_fbcodeck_dir)r
  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rR   use_ck_templaterJ    s   6#;#;====' 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   G-,G2c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr3   rT  CKr   r  r   rW  rU  r  rJ  rX  rY  r  r
  r  r  r  rU  s        rR   use_ck_gemm_templaterO    sR     	d# 	CF#	CGG&&q1uqy2&>Brq   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr3   rT  CKTILEr   r  r   rM  rN  s        rR   use_ck_tile_gemm_templaterR    sR     	h' 	CF#	CGG&&q1uqy2&>Brq   c                2    t        d      xr t        |       S )NrL  )r  rJ  r
  s    rR   use_ck_conv_templaterU     s    %d+G0GGrq   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )rd   r  r  r   r   rT  s    rR   _use_template_for_cpurW  $  s2    7v77&
--


%&rq   c                    ddl m} t        |j                  |      sJ t	        | ||d      xr |j                  j                         S )Nr3   )r<   F)require_constant_mat2)r  r<   rs   r
  use_cpp_gemm_templateis_contiguous)r
  mat1mat2r<   s       rR   use_cpp_bmm_templater^  *  sE     dkk6*** 	fdDN 	(KK%%'rq   c                `   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr3   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r3   )freeze_layoutr  rO   s    rR   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1j  s"    	||~b!Q&&rq   r  )r  r  codegen.cpp_micro_gemmr`  codegen.cpp_utilsra  kernel.mm_commonrb  rW  r  rd   cppweight_prepackr  rK   rC  r8  r4  r2  halfr   has_free_symbolsrs   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r
  r\  r]  re  rY  is_woq_int4rm  r  r`  ra  rb  	int8_gemmr  r  r  r  rj  r   rg  rq  s                       rR   rZ  rZ  7  s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Arq   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)rd   r  r  r  r   rq   rR   use_aten_gemm_kernelsr  w  s-    7v77 '	v	&'rq   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y r   )r  r  counterr   r'  s    rR   r  zDebugDirManager.__init__  s    ../rq   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rK   _dynamord   debug_dir_rootr  r   new_namer'  s    rR   	__enter__zDebugDirManager.__enter__  sM    $}}33BB//0dggY?.2mm+rq   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r   )r  r  r  r  rK   r  rd   r  )r  rz   s     rR   __exit__zDebugDirManager.__exit__  s*    dmm$.2.B.B+rq   Nrc  )rz   r   r   r  )
r   r   r   rF  r  r  r   r  r  r  r   rq   rR   r  r  }  s(    iooa G0<
Crq   r  c                    ddl m} g dfd}t        j                  j	                  |d|      5  t
        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr3   r7   c                (    j                  |        y r   r  codesource_codess    rR   save_output_codez*run_and_get_code.<locals>.save_output_code      D!rq   r  r  r   r   r  rX  r8   r   r  rl  rK   r  reset)r   rz   r  r8   r  r  r  s         @rR   run_and_get_coder    su    
 % L" 
		=*<>N	O %T$V$% <% <s   'A$$A0c                    t        | g|i |\  }}g }|D ]6  }|j                  t        j                  d|t        j                               8 ||fS )Nz	'''.*?''')r  r  r   findallDOTALL)r   rz   r  r  r  kernelsr  s          rR   run_and_get_kernelsr    sZ     ,B@@@FLG Brzz,bii@AB7?rq   c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S r   )r   backward)r  r   s    rR   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backward  s!    

rq   )r   r   )r  )r   r  s   ` rR   run_fw_bw_and_get_coder    s    
 -..rq   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r3   r7   c                (    j                  |        y r   r  r  s    rR   r  z"get_code.<locals>.save_output_code  r  rq   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y r   r   r'  s    rR   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rq   c                     y r   r   r  s      rR   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rq   Nrc  rz   r   r  r   r   r  )r   r   r   r   r  r  r   rq   rR   DummyModuler    s    Frq   r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rR   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module  s]    	 	 04/?/?D))+T\\^ 	"k 	++,[../}rq   compile_to_moduler  Nr  )r  r8   r   r   r  )r   rz   r  r8   r  r   r  r  s         @@rR   get_coder    s    $ L", 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr3   rP  z%expected one or two code outputs got r   )r  rM   )r   rz   r  r  s       rR   get_triton_coder    sQ    B000LL!&Q& 
/L0A/BC& ?rq   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r  )r  rM   )r   rz   r  r   r  s        rR   run_and_get_triton_coder    sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?rq   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r7   r?   c                 ^     | i | | d   }t        |      sJ j                  |       y )NrP  )rs   r  )rz   r  rX  r8   graph_lowerings	real_inits      rR   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init  s7    4"6"Q%///u%rq   r  r  )torch._inductor.graphr8   torch._inductor.output_coder@   r  r   r  rl  )	r   rz   r  r@   r  r  r8   r  r  s	         @@@rR   run_and_get_graph_loweringr    sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)aten_opoverride_fnr  orig_fns       rR   override_loweringr  	  s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r   r   )r  r   outr  post_fnpre_fns      rR   r  z(add_scheduler_init_hook.<locals>.wrapper$	  s+    y% i'Iu%
rq   r  )r  r   r   r   r   r   )torch._inductor.schedulerr  r  unittestr   r  rl  )r  r  r  r  r  s   ``  @rR   add_scheduler_init_hookr  	  s9     4  G ==%%iWEErq   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rd   developer_warningsr   r  info)msgs    rR   developer_warningr  .	  s$       Crq   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr3   r   rX  z--only=N)r  argvr   rM   
ValueErrorr  )r,  r  s     rR   get_benchmark_namer  :	  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr3   Nr   r  s     rR   r   zis_ones.<locals>.<genexpr>\	       %!qAv%   rw   r$  s    rR   is_onesr  [	      %u%%%rq   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r  s     rR   r   zis_zeros.<locals>.<genexpr>`	  r  r  r  r  s    rR   is_zerosr  _	  r  rq   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)rs   rK   r  r   )r   r   s     rR   r   z is_cpu_device.<locals>.<genexpr>d	  s8      dELL) 	u||E**s   AAr  )inputss    rR   is_cpu_devicer  c	  s       rq   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rs   rt   r  r   rK   r>  r6  )rI  s    rR   get_sympy_Expr_dtyper  k	  s=    c5::& B& ~~{{}}rq   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr   )rK   r   r   )should_profilerz   r  r   s       rR   maybe_profiler  u	  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr3   )rd   ru  threadsrK   get_num_threads)r  s    rR   r{  r{  ~	  s+    jj  G{'')Nrq   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr3   )get_backend_options
num_stagesrP     )runtime.triton_helpersr  r  rK   r  r  )r  optionss     rR   get_backend_num_stagesr  	  s2    ;!#G;;|%--*;*;QCCCCrq   c                   t        | t        j                  j                  j                  j
                        }||S ddlm}m} t        j                  j                         xr! t        j                  j                         dk\  }| t        j                  t        j                  t        j                  fv sJ t        j                  |      j                   j#                  d      rddlm}  |       }| t        j                  t        j                  fv r|r	 || |      S t        j                  j                  j                  j
                  r |t        j                  |      S  |t        j                  |      S | t        j                  t        j                  fv r
|r ||       S t        j                  j                  j                  j
                  r |t        j                        S  |t        j                        S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rl   r   
clock_rate)max_clock_rate)r   rK   backendsrC   matmul
allow_tf32triton.testingr   r  rL   get_device_capabilityr   r2  r4  inspect	signature
parametersr  torch._utils_internalr  )r   ds_topsr   r  SM80OrLaterr  sm_clocks          rR   get_device_tflopsr  	  sk    UENN,?,?,F,F,Q,QRGM**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rq   c                     ddl m}   |        S )Nr   get_dram_gbps)r  r  r  s    rR   get_gpu_dram_gbpsr  	  s    ,?rq   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  r  s    rR   get_gpu_shared_memoryr  	  s.    %==44Q7;;<LaPPrq   c                $    | j                  d      S )Nwelford)r  reduction_types    rR   is_welford_reductionr  	  s    $$Y//rq   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducerP  r3   )r  r  s    rR   reduction_num_outputsr!  	  s    N+	2	2rq   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rq   rR   is_linuxr&  	  s    ??''rq   c                 (    t         j                  dk(  S )Nrf   )r  r$  r   rq   rR   r  r  	  s    <<7""rq   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr   )rs   rt   r  r  r  s     rR   r   z#has_free_symbols.<locals>.<genexpr>	  s)     Jz!UZZ(<_<Js   35r  )itrs    rR   rx  rx  	  s    JcJJJrq   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr3   r  r   Tzunexpected type for is_dynamic F)r  r  rs   r  r  ry  rA  r9   rx  maybe_get_sizemaybe_get_strider;   	TypeErrorr   )rz   r  ts      rR   
is_dynamicr0  	  s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI rq   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r3  r4  r   rq   rR   r2  r2  	  s      K *rq   r2  c                   ddl m} t        j                  ddd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |	       t        |j                  |	       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d
|j                   |	       t        |j                  |	       |j!                         |j!                         k(  }
t"        j%                  d||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr3   )stable_topological_sortr6  zutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr6  r  NamedTemporaryFileior   rY   rU   	propagater  rX  r
   nowrX   lint	recompiler   r   r  r   )r^  r  inpr  r6  r  	before_ioafter_io
start_timetime_elapsedr/  s              rR   pass_execution_and_saverG  	  sX    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
	 	
 
s%   BF4<F(CF4(F1	-F44F=c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r3   r  )r  r  rs   CppTemplateBufferr
  MultiOutputLayout	input_bufr  s     rR   is_multi_outputs_templaterM  
  s9     i!5!56 :"..< rq   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r3   r  r   )r  r  rs   MultiOutputrM   r  rM  rK  s     rR   #is_output_of_multi_outputs_templaterP  )
  sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:rq   c                (   | yddl m} t        | |j                        xr- t        | |j                         xr |d u xs | j
                  |u xsB t        |       |j                  k(  xr' t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  S )NFr3   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rs   _CollectiveKernel_WaitKernelop_overloadr   FallbackKernelr   rK   r   torchrecrR  defaultrS  rT  r  rg  r  s      rR   is_collectiver\  8
  sG    | 	4--. 	3400	34Z14++r1 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/rq   c                >    ddl m} t        |       |j                  k(  S Nr3   r  )r  r  r   rV  r  r  s     rR   is_waitr`  ^
  s    :''rq   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wr   )contains_collectiver  s     rR   r   z&contains_collective.<locals>.<genexpr>h
  s     @a&q)@r  )r  rc  rs   rc  snodesr\  r  snoderc  s     rR   re  re  d
  s4    >%-.@5<<@@@$$rq   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   rb  c              3  2   K   | ]  }t        |        y wr   )contains_waitr  s     rR   r   z contains_wait.<locals>.<genexpr>q
  s     :=#:r  )r  rc  rs   rc  rf  r`  r  rg  s     rR   rk  rk  m
  s4    >%-.:U\\:::uzz""rq   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r^  )r  r  rs   rK   rt  ru  rX  rW  r[  s      rR   is_fallback_oprm  v
  sE     "ejj++,TdB--.I43C3Cr3IIrq   c                B    |||    j                   j                            S r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rR   buf_name_to_fused_snoders  
  s#     k(3??HHJKKrq   c                     yrj  r   rh  s    rR   rk  rk  
  rl  rq   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r%  unmet_dependenciesrs  r   find_recursive_deps_of_node)rh  collected_node_setrq  rr  rx  depdefining_op_for_deps          rR   rz  rz  
  sn     55!'' 
5HHk#5
 "44##	

rq   c                     yrj  r   ru  s    rR   rk  rk  
  rl  rq   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTrw  )r%  get_outputsrw  r  r  find_recursive_users_of_node)rh  r{  rq  rr  rx  or=  user_ops           rR   r  r  
  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	rq   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rP  r   )rK   
_functorchrd   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rR   num_fw_fixed_argumentsr  
  s6     $$::   "669SSSrq   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_statery  rp  s    rR   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor
  sH    aff$ .!&&(.!/.  qvv-		
rq   r   r  r3   )rO   r2   r   r"  )rX  r   rg  r  rJ  r   rM   )fx_gr  	arg_countstatic_arg_idxsr  s        rR   count_tangentsr  
  s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????rq   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr"  r   c                    | j                   S r   )r   r'  s    rR   r/  zBoxedBool.__bool__
  s    zzrq   c                6    t        | t              r	d| _        | S yrj  )rs   r  r   r  s    rR   disablezBoxedBool.disable
  s    c9%CIJrq   Nrd  )r  r   r   zUnion[BoxedBool, bool])r   r   r   r   r/  r  r  r   rq   rR   r  r  
  s     K  rq   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr3   r5   c                @    j                  |        | |||||      S r   r  )r  kernel_namer  r/  gpucpp_definitionkernel_listorig_define_kernels         rR   define_kernelz.collect_defined_kernels.<locals>.define_kernel
  s-     	;'!+{Hc>
 	
rq   r  )NTN)r  r6   r  r   r  r   r/  Optional[str]r  r"  r  r  r   r   )codegen.wrapperr6   r  r   r  rl  )r  r6   r  r  s   `  @rR   collect_defined_kernelsr  
  s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   ry  s    rR    get_cloned_parameter_buffer_namer    s    .  rq   c                    | t         v S r   )rI   r.  s    rR   r  r    s    Yrq   c                &    | dk7  xr t        |       S )NrD   )r  r.  s    rR   device_need_guardr    s    U?-vf~-rq   c                d   t        j                         rc| t        j                  k(  rPt        j                  j                         r2t        j                  j                         dk\  rt         j                  ry| t        t        j                  t        j                  t        j                  g      v S )N)r  r   F)rd   rD  rK   r2  rC   rL   r  bfloat16_atomic_adds_enabledr!   r>  r"  r  s    rR   ,needs_fallback_due_to_atomic_add_limitationsr    sp    
 	U^^#JJ##%JJ,,.&8//
EKKU^^#LMMMrq   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFr%  r   r   r3   )overloadpacketrK   r   atenscatter_reduce_scatter_reducescatter_r  r  rd   ru  fallback_scatter_reduce_sumdynamic_threadsr{  r"  r>  r  )rW  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rR   use_scatter_fallbackr  #  sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!rq   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr   3rA  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rM   r   rs   is_reductionr  r  reduction_hintr  r  r  r   r   )r  r  r  r  r,  r  is_redr|  s           rR   dump_node_scheduler  J  s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hrq   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )r  )r  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  s     rR   tensor_is_alignedr  i  s:     L 				 >&,,#?	??RVWW rq   c                |    t        | j                  j                        syt        j                  xs t        |       S rj  )r  r   r   rd   assume_aligned_inputsr  )example_inputs    rR   should_assume_input_alignedr  w  s2     -&&++,''K+<]+KKrq   c                 6   t         j                  j                  j                         } | st	        j
                         S | j                  r| j                  j                  st	        j
                         S | j                  j                  }|j                         S r   )	rK   _guardsTracingContexttry_getrC  nullcontextr9  rZ  suppress_guards)tracing_contextrZ  s     rR   #maybe_get_suppress_shape_guards_ctxr    sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&rq   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r  r   r  rl  rd   rK   r  r  r=  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr   removeHandler)r   rz   r  r=  r  log_capture_stringchr  
prev_levelr  r   s              rR   run_and_get_cpp_coder    s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                    t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y r   )rU   rZ  rs   rK   r/   r  )r  r9  inputs      rR   shape_env_from_inputsr    sT     (I """  (eU\\*::'''(
 rq   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S r   )copy_misaligned_inputsrM   rK   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr  inputs_to_checkr  mutated_input_idxss       rR   r  z)align_inputs_from_check_idxs.<locals>.run  sE    #9);$
 [ J {  k:
rq   )r  list[InputType]r   r   )rM   )r  r  r  r  s   ``` rR   align_inputs_from_check_idxsr    s#    
 ?q  Jrq   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr  r   )r   r  r  s      rR   r   z)clone_preserve_strides.<locals>.<genexpr>  s     Tf$Tr  r3   r   )r  r   r   r  rK   
as_stridedclone)rO   needed_sizer  s      rR   clone_preserve_stridesr    s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99rq   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rs   rK   r  r   data_ptr	ALIGNMENTr  r  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rR   r  r    s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##rq   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rs   rK   r  r  r  r  rM   )r  static_input_idxsaligned_static_input_idxsr,  r  s        rR   remove_unaligned_input_idxsr    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((rq   c                   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }|j                  j                  j                  | |k        ry|j                  r)|j                  j                  j                  | dk        ry ||       xr  ||       |k  S )Nr3   rT  Tg@xDF)rW  rU  rK   iinfor<  r   rX  rY  r  rZ  has_hintr  aot_compilation)r   rU  int_maxr  r	  s        rR   expr_fits_within_32bitr    s    kk%++&**G  **Iww))22H 	ww--a7l; 	 7711!d(;  A;29Q<722rq   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrZ  s    rR   map_exprz4set_tracing_context_output_strides.<locals>.map_exprE  s7     ("1v((<<Q??$55a88rq   c              3  .   K   | ]  } |        y wr   r   )r   r   r  s     rR   r   z5set_tracing_context_output_strides.<locals>.<genexpr>M  s     5!(1+5s   )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rK   r  r  r  output_stridesrM   r  r  r  r  )r  compiled_graphr  r  r  r  r  rZ  s        @@@rR   "set_tracing_context_output_stridesr  4  s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Brq   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rd   fx_graph_remote_cacherD  rK   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rR    should_use_remote_fx_graph_cacher   Q  s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r   subry  s    rR   normalize_namer#  d  s    66"C..rq   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer"  r   _triton_type_mappingr  )r   triton_type_names     rR   triton_typer)  z  s.    &**5#e*=##$46FGGrq   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr%  r  )_torch_triton_mappingr  r  rJ   rK   rs   r   )r   adjusted_type	type_namerd  s       rR   triton_type_to_torchr.    sL    )--eU;M%%eR0Iy)Ii---rq   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S r   )	is_mkldnnr  r  r   r   untyped_storager  r  r  r   s     rR   is_same_tensorr3    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rq   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S r   )r0  r  r   r   rK   r   mkldnnr  r2  s     rR   is_same_mkldnn_tensorr6    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrq   c                      y)N)rQ  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rq   rR   boolean_opsrD    s    rq   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler0   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr  r   rq   rR   rF  rF    s    8800rq   rF  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y r   )rF  rJ  )r   rG  rI  s      rR   #register_op_dtype_propagation_rulesrL    s    
 (32(t$rq   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y r   )rM  r%  ry  s    rR   #register_op_requires_libdevice_fp64rO    s    ""4(rq   c                     ddl m}  | j                  j                         j                  }|dk(  rt
        j                  S |dk(  ryt
        j                  S )Nr   rT  r   rD   )r  rU  rX  get_current_device_or_throwr   rd   cpu_backendcuda_backend)rU  
device_strs     rR   get_current_backendrU    sH    -446;;JU!!!	u	"""rq   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rK   r   r2  rd   r  codegen_upcast_to_fp32rU  r4  r  s    rR   upcast_compute_typerX    s@     	%--00MM00!X-}}Lrq   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y r   original_dict	new_items)r  r_  s     rR   r  zScopedDict.__init__  s    *13rq   c                Z    || j                   v r| j                   |   S | j                  |   S r   r`  r_  r  s     rR   r  zScopedDict.__getitem__  s.    $.. >>#&&!!#&&rq   c                "    || j                   |<   y r   )r`  )r  rK  r   s      rR   __setitem__zScopedDict.__setitem__  s    #srq   c                >    || j                   v xs || j                  v S r   rb  r  s     rR   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AArq   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S r   )r`  r_  r  )r  rK  rZ  s      rR   r  zScopedDict.get  s6    $.. >>#&&!!%%c733rq   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S r  )rM   r_  r`  )r  r  r  s      rR   r  zScopedDict.__len__  sC    ""# 	A***Q	 rq   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wr   r^  )r  r  s     rR   __iter__zScopedDict.__iter__
  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S r   )r"  r_  r`  r'  s    rR   r/  zScopedDict.__bool__  s    D&&8$..99rq   c                    t         r   ry  r  s     rR   __delitem__zScopedDict.__delitem__  s    !!rq   )r_  Mapping[KeyType, ValType])rK  rY  r   rZ  )rK  rY  r   rZ  r   r  )rK  rl  r   r"  r   )rK  rY  rZ  Optional[ValType]r   ro  r  )r   zIterator[KeyType]rd  )rK  rY  r   r  )r   r   r   r   r  r  rd  rf  r  r  rj  r/  rm  r   rq   rR   r\  r\    s5    4'
$B4
:"rq   r\  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                    t         j                  dk\  rt        j                  | d      S t        j                  |       S )N)r  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   s    rR   wrapzir_dataclass.<locals>.wrap  s;    w&((d6JJ ((V<<rq   )r   rh   r   rh   r   )r   r   rw  s    ` rR   ir_dataclassrx    s    = {9rq   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y r   )rK   r  r  r  fw_metadatabw_donated_idxs)r  s    rR   get_donated_idxsr|  &  s=    mm22::<O"'B'B**:::rq   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r3   rP  r  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   rq   rR   r~  r~  -  s     LKK	  Grq   r~  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)	importlibutil	find_specr~  r  triton.backends.compilertriton.compiler.compilerr   r  compilerr  r  r  )r  s    rR   #get_triton_attrs_descriptor_versionr  7  s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333rq   c                 8    t               t        j                  k(  S r   )r  r~  r  r   rq   rR   triton_version_uses_attrs_dictr  Q  s    .04P4X4XXXrq   c                   ddl m} t        | |j                        syt        | j                  t
        j                  j                        r;t
        j                  j                  j                  | j                  j                  v ryy)zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r3   r  FT)r  r  rs   rX  rW  rK   rt  ru  r   rx  r  rz  r_  s     rR   is_cudagraph_unsafe_opr  U  s^    
 dB--. 	4##UZZ%:%:;HHLL))T-=-=-B-BBrq   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r  rd   rD  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_paths       rR   get_ld_library_pathr  h  sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKrq   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  rs   partition_signatures)r  r  s     rR   #is_codegen_graph_partition_subgraphr  u  s*    L 	789 	5((4rq   c                     t         j                  j                  j                  j                  xs t
        j                  d uxr$ t         j                  j                  j                  S r   )rK   r  rd   r  
cudagraphs&_unstable_customized_partition_wrapperr  r  r   rq   rR   is_using_cudagraph_partitionr  ~  sN    %%00 	F199E1 //
 
 
0
01rq   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr3   rT  l        i   )	rW  rU  rX  rY  statically_known_ltr  rK   r<  r>  )r  rU  s     rR   dtype_from_sizer    sP    ww++e
''


/
/h
?{{{{rq   )r   rE   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r   rE   TF)rK   r   r5  _is_mkldnn_bf16_supportedr   s    rR   is_mkldnn_bf16_supportedr    3     eyy99;;	+	rq   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r   rE   TF)rK   r   r5  _is_mkldnn_fp16_supportedr  s    rR   is_mkldnn_fp16_supportedr    r  rq   c           
     n   |D cg c]  }t        t        |             }}| D ]R  }t        |      t        |      k(  sJ t        |      D ])  \  }}t        ||   t        t        |                  ||<   + T g }|j	                  dj                  d t        ||      D                     t        |      t        |      dz  z   t        |      dz
  z   }|j	                  d|z         | D ]3  }|j	                  dj                  d t        ||      D                     5 dj                  |      S c c}w )N|c              3  6   K   | ]  \  }}d || dd   ywr   r  Nr   )r   hr6  s      rR   r   ztabulate_2d.<locals>.<genexpr>  s$     H41aAa0tWA,H   rP  r3   rX  c              3  6   K   | ]  \  }}d || dd   ywr  r   )r   r   r6  s      rR   r   ztabulate_2d.<locals>.<genexpr>  s$     Htq!!QCp4lHr  r  )rM   r   r   r   r  r  r   r   )elementsheadersr   widthsrowr   r;  total_widths           rR   tabulate_2dr    s   #*+ac#a&k+F+ 43x3w<'''cN 	4DAqF1Is3q6{3F1I	44 E	LLH3w3GHHIf+Vq1S[1_EK	LL{"# JSXXHs37GHHIJ99U ,s   D2c              #     K   t        | j                               t        |j                               z  }|D ]3  }| j                  |      }|j                  |      }|||n|||n|f 5 yw)a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r!   r#  r  )dict1dict2
d1_default
d2_defaultall_keysrK  value1value2s           rR   	zip_dictsr    sv     ( %**,'*UZZ\*BBH  	
33 (Fj(Fj
 	
	
s   A-A/c                   	 	 	 	 	 	 	 	 d	d}| j                  dt        j                  j                        }| j	                         } |rA || dd        || dd        || dt
        j                  j                           || dd       | S )
a1  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    c                    | j                  |t        t        |            }||| |<   y |s||k7  rt        d| d| d      y y )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rJ   rd   r   )config_patchesconfig_nameconfig_valuer   s       rR   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s]     "";0LM=*6N;'5L0";-q>lm  1rq   zaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_model)r  dict[str, Any]r  r   r  r   r   r  )r  rd   aot_inductorcompile_standalonecopyrK   r  r  )r  r  r  s      rR   maybe_aoti_standalone_configr    s    "	&	58	HK			 (++)6+>+>+Q+Q $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 rq   c                     ddl m}  | j                  j                  }|yt	        |t
              st        d      |dk(  ryt        j                  d|      st        d      y)zL
    Validates if a model name is suitable for use in code generation.

    r   rc   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rd   r  model_name_for_generated_filesrs   r   r  r   r   )rd   
model_names     rR   is_valid_aoti_model_namer    sh    
 '$$CCJj#&OPPR 88/<d
 	
 rq   c                2    |rt        |       S t        |       S r   )r'   r&   )rO   unbacked_onlys     rR   get_free_symbolsr  *  s    $Q''Arq   c                    t         j                  j                  sy| |  }|rE|j                  x}r7|j	                         x}r%|j
                  j                  dd      x}r| d| }t        j                  |       y)z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rd   r  r  r  get_origin_noder   r  perf_hint_logr  )r  rj  r  warning_msgir_noderg  r  s          rR   maybe_log_cudagraph_partitionr  1  s     ==##HSE"K 			!W!//11W1#LL,,]DAA[A$%7}E+&rq   c                    i t         j                  dt         j                  j                  dt         j                  j	                  t
        j                              i} t        j                         rt        j                  d      | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r  r  r  r  r  r  r  rd   rD  	sysconfigget_path)envs    rR   python_subprocess_envr  J  sl    

** 	bjjnn%rzzsxx'@
	C  %..v6LJrq   c                  &    e Zd ZU dZded<   ded<   y)CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   r   rq   rR   r  r  e  s      rq   r  .c                      e Zd ZU dZded<   y)CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   r   rq   rR   r  r  |  s    .2G+2rq   r  c                    | t         _        y r   )r  r  )r  s    rR   !set_customized_partition_wrappersr    s    5<*2rq   c                8   | j                   j                  }| j                   j                  g || j                   j                  | j                   j                        }| j                   j                  }t        j                  ||f      \  }}dd}|D cg c]7  } ||      r+t        j                  j                  j                  |d      n|9 }}dddfd}|D cg c]
  } ||       }}t        j                  ||      \  }}||fS c c}w c c}w )	Nc                    t        | t        j                  j                  j                        xr/ t        | t        j                  j                  j
                         S r   )rs   rK   r  r  r;   GeneratorStaterp  s    rR   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  sH    !U__//667 

u!!00A
 =
 	
rq   F)guard_shapec                2    t        j                  | ||      S )Nr   )rK   r   )r  r   r   s      rR   _tensorz"snode_args_kwargs.<locals>._tensor  s    {{4uV<<rq   c                    t        | t        j                        s| S  | j                         | j                  | j
                        }|S r   )rs   rK   r  r  r   r   )r   r  r  s     rR   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor  s7    !U\\*Haffh2
rq   rd  )r   r  )r   r   r   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer"   rK   r  r  ir_node_to_tensortree_unflatten)	rh  rz   r  	flat_argsflat_args_pytree_specr  r  r  r  s	           @rR   snode_args_kwargsr    s   ::D::,,*$*))*

D ZZF'-':':D&>'J$I$
 	   	,,QE,B	I = -66q"6I6((4IJLD&<%  7s   <D$Drb  )ro   r   r   r   )r}   r   r   r"  )   d   )r   zCallable[[], Any]r   r   r   r   r   rN  rd  )r   z"Union[Optional[torch.device], str]r   torch.device)r  zIterable[sympy.Expr]r   r   )r  Sequence[sympy.Expr]r  r  r   r   )r  zIterable[_T]r   zValuesView[_T])r  r  r  r  r   r  )rK  rH  r   r   )rQ  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])r   r  r   zUnion[int, torch.SymInt])rQ  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])rg  torch._ops.OpOverloadr   r"  )r{  r2   rr  z'Callable[[torch._ops.OpOverload], bool]r   r"  )rs  r   rz   r|  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rC   )r   r   r   r  )r3   rC   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r   r   rN  )r   r  r  g      ?rC   )r  r  r  r  r   r   r  r   r  rN  r   r   r   rN  )r  r   r  r   r   r  )r  r   r  r   r   r  )r  r   r  r   r   r   )rO   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rO   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   CachedMethod[P, RV])r   zCallable[P, RV]r   r  )r  r   r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r   )r  r	  r  r6   r   ztuple[str, str]r   )r:  zIterable[torch.fx.Node]r;  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])rz   zSequence[IRNode]r  zdict[str, IRNode]r   r  r`  )r   r   r   zValueRanges[Any])rj  r   r   r"  )rj  r`   r,  r   r   r  )rv  r"  r   r"  )r   r   r   r  )rR  r   r  zdict[sympy.Expr, Any]r   r   )r  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rz   r   r   r"  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r  r   r2   )r  r  r   zOrderedSet[torch.device]rc  )r  r   r   r   )NNT)r  zOptional[dict[str, Any]]r  r  r  r"  r   rB  )r  r  r   	list[int])rZ  r)   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r   r  )r   torch.dtyper   r   rA  ra  )r  zUnion[int, torch.device]r   r"  r  )r  r   r   r  r  Optional[int]r   r4   )r
  r<   r  zlist[torch.dtype]r   r"  )r  r   r   r"  )
r
  r<   r  r"  r  r"  r  r"  r   r"  )r  r;   r  r"  r   r"  )
r
  r<   r  r   r  r   r  r   r   r"  )r  r   r   r"  )r  r  r  r  r  r  r   r"  )r  r  r  r  r  r  r   r  )r   r   r   r   )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r
  r<   r   r"  )r
  r<   r\  zUnion[ReinterpretView, Buffer]r]  r;   r   r"  )FTFN)r
  r<   r\  r;   r]  r;   re  r"  rY  r"  r}  r"  rm  r  r   r"  )r   Callable[P, _T]rz   r  r  r  r   ztuple[_T, list[str]])r   r  r   ztuple[Any, list[str]])r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   ztuple[Any, list[GraphLowering]])r  r  r  r  r   rB  )r  r  r  zOptional[Callable[..., Any]]r   r   )r  r   r   r  )r   r  )r$  r  r   r"  )r  zSequence[torch.Tensor]r   r"  )rI  r   r   r  )r  r"  rz   r   r  r   r   zIterator[Any])r   r  r   rN  )r  r   r   r"  )r  r   r   r   )r*  zIterable[Any]r   r"  )
r^  r  r  r1   rB  r  r  r   r   r  )rL  z"Optional[Union[Buffer, Operation]]r   r"  )r  z Optional[Union[Node, Operation]]rg  z!Optional[torch._ops.OperatorBase]r   r"  )r  z"Optional[Union[IRNode, Operation]]r   r"  )rh  rA   r   r"  )r  zOptional[Operation]rg  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r"  )rp  r   rq  r  rr  r  r   r   )rh  rA   r{  zMutableSet[BaseSchedulerNode]rq  zdict[str, SchedulerBuffer]rr  zdict[str, BaseSchedulerNode]rx  zCallable[[Any], bool]r   r  )r  r   r  r   r   r   )r  r  r   r   )r  r   r   rB  )r   r   r   r   )r   r  r   r"  )r   r   r   r"  )r   r  r   r"  )rW  r  r  r  r  r  r  r  r  r   r  r"  r   r"  )r  r
  r   r  )r   r  r   r"  )r  r  r   r"  )r   re  )r   r  rz   r  r  r  r   ztuple[_T, str])r  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r  r  r  zOrderedSet[int]r   r  )rO   r  r   r  )r  r  r  r  r   zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  r  r  r  r   r  )r   r   r   r"  )r  r  r  r@   r   r  )r   r  r   r   )r   r   r   r  )r  r  r   r  r   r"  )r   ztuple[str, ...])r   r   rG  r0   rI  rH  r   r  )r   r   r   r  )r   r  r   r  )r   zOptional[type[Any]]r   r"  r   r   )r   zOptional[list[int]])r   r~  )r  r=   r   r"  )r  r6   r   r"  )r  r   r   r  )r   r   r   r"  )r  zSequence[Sequence[T]]r  zSequence[T]r   r   )NN)
r  rn  r  rn  r  ValType | Noner  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )rO   r(   r  r"  r   zOrderedSet[sympy.Symbol])zcudagraph partition due to N)r  r   rj  r  r  zOptional[BaseSchedulerNode]r   r  )r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )rh  rA   r   z tuple[list[Any], dict[str, Any]](s  
__future__r   r  rC  ru  enumr  r  r	  r=  rF  r  rP  r  r  r$  r   r  r   r  r  r  rR  r  r  collections.abcr   r   r   r   r   r	   r
   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   rt   rK   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRADr  r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   torch._prims_commonr0   torch.fxr1   torch.fx.noder2   r  r4   r  r6   rX  r8   r  r9   r:   r;   r<   r=   r>   output_coder@   r  rA   rB   rI   rG   r   rS   torch._dynamo.device_interfacerT   torch._dynamo.utilsrU   torch.autogradrV   torch.autograd.profiler_utilrW   (torch.fx.passes.graph_transform_observerrX   torch.fx.passes.shape_proprY   torch.utils._sympy.functionsrZ   r[   r\   r]   r^   torch.utils._sympy.symbolr_   r`   torch.utils._sympy.value_rangesra   rb   r  rd   runtime.runtime_utilsre   r  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerr  rh   r  r  	VarRangesr  r   	InputTypeGPU_KERNEL_BIN_EXTSr  r  r  r  rn   rp   ry   Functionr{   rv  r   r   r   r   r  r  r  r  rN  rR  r\  r^  rh  rp  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  r8  r>  rM  r_  rh  rk  rr  rw  rz  r  r  r  r  r  r  r  r  r  r   r  r  rD  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r
  r  r  rg  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r+  r/  r?  rJ  rO  rR  rU  rW  r^  rZ  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r{  r  r  r  r  r  r!  r&  r  rx  r0  Enumr2  rG  rM  rP  r\  r`  re  rk  rm  rs  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r#  r'  r$  r+  compiler&  r)  r.  r3  r6  rD  rF  rJ  rL  rM  rO  rU  rX  rY  rZ  r\  rx  r|  r~  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyper  r  r  r  r  )r  r}   s   00rR   <module>r@     s   "        	     	  	   
                $ $ ? : 0 / ; ($ 
  >>//C$",5$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%g!00<H T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  d#  $"GV 9<SS#&S25S
Sl  ;@
+*"*+A**#AL+	+++"/	)/#/G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:++/+\C*!).!)O!) 	!)HN2CN2!N2 N2f 48*0 (G
G$5GG:,^%	DU	>2-888v'& 
: !# I "	 .29+9	9 9 	9 9z !5 $ " A!!L!!H Q7 7*  , , ,
R' R'j
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
: 7< dN CH BJ CO,) ,  $  . 5( 5(p @ @ R R:+\H&

8
@F
	
" ""&"&==
= = 	=
  = =  = 
=@'C C"      	 $ &2:/(V &2:## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D %6 %6P  Q0(#K(*$)) *!

!
"-!
4A!
HK!
	!
H1	" -1#
*#)# 
#L(%#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.N $&$!$ $ 	$
 $ $ 
$NH>L'  &2:2(*" ( %	0	: 37$$$$ 0$ 3	$<$ $ 3F!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)	# )

)
-" 01 -"` D)t  *499  4 42Y&
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
F.b6 :(,'	'' &' 
	'26 d#  $ 38$./@ 3 3 *:); &= } Hs   l