
    piy                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZmZ d dlZd dl Zd dl!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 erd dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ dZA e3eBd      ZC G d deD      ZE G d d      ZF G d d      ZGee&j                  e&j                  f   ZJej                   G d d             ZLej                   G d d             ZM G d  d!eM      ZN G d" d#      ZO G d$ d%      ZP G d& d'eM      ZQ G d( d)eOeQ      ZR G d* d+ePeQ      ZS G d, d-eOeM      ZT G d. d/ePeM      ZU G d0 d1eOeM      ZVej                  d4d2       ZX	 	 	 	 d5d3ZYy)6    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet)
ModuleType)PartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorN__name__
__module____qualname__     b/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr)   r)   ;       r/   r)   c                      e Zd ZdZedd       Zedd       Zedd       ZddZd Z	ddZ
ddZddd	Zddd
ZddZddZddZy)TuningProcesszF
    Class to launch and interact with a benchmarking subprocess.
    c                     t         j                  dt        j                         t        j                  j                  t                      fd}	  |        y# t        $ r Y yw xY w)z4
        Entry point for the child process.
        z3Started autotune subprocess %s. Visible devices: %sc                     	 t         j                        } | y 	  |        }t         j                  |       7# t        $ r}|}Y d }~'d }~ww xY wN)r3   recv	Exceptionsend)jobresulte	read_pipe
write_pipes      r0   workloopz,TuningProcess.process_main.<locals>.workloopO   sX    #((3; UF ""6:6  ! Fs   : 	AA

AN)autotuning_logdebugosgetpidenvirongetr&   EOFError)r=   r>   r?   s   `` r0   process_mainzTuningProcess.process_mainD   sQ    
 	AIIKJJNN/0	

	7	J 		s   A 	A('A(c                P    t        j                  | |       |j                          y r6   )pickledumpflush)objr>   s     r0   r9   zTuningProcess.senda   s    C$r/   c                ,    t        j                  |       S r6   )rI   load)r=   s    r0   r7   zTuningProcess.recvf   s    {{9%%r/   c                2    || _         | j                          y r6   )devicestart)selfrP   s     r0   __init__zTuningProcess.__init__j   s    

r/   c                   t         j                  j                  t         j                  j                  t              d      }t        j
                         \  }}t        j
                         \  }}t        j                  |d      | _        t        j                  |d      | _        t        j                         | _        | j                  j                  | j                  t        j                         t        j                  |dt        j                           dt#        |       dt#        |       g}i t%               dt'               t(        j*                  rdndd	}| j,                  t#        | j,                        |t.        <   t1        j2                  ||||f      | _        t        j6                  |       t        j6                  |       d| _        y
)z4
        Start the benchmarking subprocess.
        z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)rB   pathjoindirname__file__pipefdopenr>   r=   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerC   strr   r   r#   /profile_bandwidth_with_do_bench_using_profilingrP   r&   
subprocessPopenprocesscloserunning)rR   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdr\   s           r0   rQ   zTuningProcess.startn   sw    RWW__X68NO$&GGI!$&GGI!!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%'78

 	!
!"r/   c                V    | j                   xr | j                  j                         du S )z:
        True if the subprocess is still running.
        N)rq   ro   pollrR   s    r0   alivezTuningProcess.alive   s%     ||; 1 1 3t ;;r/   c                    | j                         s| j                          t        j                  || j                         y)z8
        Push a work item to the child process.
        N)r{   rQ   r3   r9   r>   )rR   reqs     r0   putzTuningProcess.put   s*     zz|JJL30r/   c                   	 | j                   j                  |      s"t        d| j                  j                         t
        j                  | j                        }t        |t              r||S # t        $ r | j                           t        $ r | j                           t        $ r< t        j                  d| j                  j                         | j                           w xY w)z
        Get a response from the child process. Raises TimeoutError on timeout;
        raises EOFError if the subprocess crashes.
        zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rf   selectTimeoutErrorro   pidr3   r7   r=   killrF   rp   r8   r@   	exception
isinstance)rR   timeoutr;   s      r0   rE   zTuningProcess.get   s    
	==''0"%DT\\EUEUDV#WXX"''7F fi(L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   AA2 2A7C)c                    | j                         r t        j                  d| j                         |r| j	                          yy)zC
        Signal the child process to shut down gracefully.
        N)r{   r3   r9   r>   wait)rR   r   s     r0   shutdownzTuningProcess.shutdown   s2     ::<tT__5IIK r/   c                x    | j                         r| j                  j                          | j                          y)z5
        Wait for the child process to exit.
        N)r{   ro   r   rp   rz   s    r0   r   zTuningProcess.wait   s&     ::<LL

r/   c                    | j                   j                          | j                  j                          | j                  j                          d| _        y)z"
        Close resources.
        FN)rf   rp   r=   r>   rq   rz   s    r0   rp   zTuningProcess.close   s;     	r/   c                    | j                         rDt        j                  d| j                  j                         | j                  j                          | j                          y)z6
        Send a SIGKILL to the child process.
        z)Sending SIGKILL to autotune subprocess %dN)r{   r@   errorro   r   r   rp   rz   s    r0   r   zTuningProcess.kill   sF     ::<  ;   LL

r/   N)r=   	IO[bytes]r>   r   returnNone)rL   r   r>   r   r   r   )r=   r   r   r   )rP   Optional[int])r   bool)r}   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r+   r,   r-   __doc__staticmethodrG   r9   r7   rS   rQ   r{   r~   rE   r   r   rp   r   r.   r/   r0   r3   r3   ?   sq      8   & &+Z<16
r/   r3   c                  J    e Zd ZdZddZed	d       ZddZd
dZ	 	 	 	 ddZ	y)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    c                V   | j                         }t        j                  d|       |D cg c]  }t        |       c}| _        t        j                         | _        | j                  D ]  }| j                  j                  |        t        t        |            | _        yc c}w )z,
        Start the child processes.
        z$Sub-process autotune device list: %s)rP   )max_workersN)get_device_listr@   rA   r3   	processesqueueQueueprocess_queuer~   r   lenexecutor)rR   devicesrP   ps       r0   rS   zTuningProcessPool.__init__   s     &&(CWM FMM6-v6M9> 	&A""1%	& +s7|D Ns   B&c                 l   t         j                  sdgS t               } t        |       }|j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r#   autotune_multi_devicer   r   device_countr&   rB   rD   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r0   r   z!TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   7B1c                    | j                   j                          | j                  D ]  }|j                  d        | j                  D ]  }|j                           y)z5
        Signal all child processes to exit.
        F)r   N)r   r   r   r   )rR   r   s     r0   r   zTuningProcessPool.shutdown  sQ     	  	#AJJEJ"	# 	AFFH	r/   c                   |j                   J | j                  j                         }|j                  |j                   j                         	 |j                  t
        j                        | j                  j                  |       S # t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S w xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice ')bmreqr   rE   r~   	benchmarkr#   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   r8   )rR   choicero   s      r0   targetzTuningProcessPool.target%  s     ||'''$$((*FLL**+	,;;BB$ ""7+  	 MM1& :W W
 < ""7+  	 MM.vh 7W W
 <""7+	  ""7+s0   B	 	-D6D  ,D?D  DD   D=c           	     x    t        t        || j                  j                  | j                  |                  }|S )z>
        Benchmark each choice in a separate process.
        )dictzipr   mapr   )rR   choicesresultss      r0   r   zTuningProcessPool.benchmarkD  s/     s7DMM$5$5dkk7$KLMr/   Nr   )r   zSequence[Optional[int]])r   r!   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])
r+   r,   r-   r   rS   r   r   r   r   r   r.   r/   r0   r   r      sB    E& " "(	,>+ 
+r/   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicerP   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
     :   t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J |j                         }|J t        ||t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                         |j+                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wr6   )r   r   .0xs     r0   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>e  s     AQz!Z0A   fake)r   layout)fallback)rP   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r%   graphsizevars
size_hintsget_sizer#   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r;   noder   rP   s          r0   r   zTensorMeta.from_irnodes_  sU    gx(>E F!1!1!!4 FF FA&AAAAMdBII&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)rP   r   
extra_size)r   r   r   rP   r   r   rz   s    r0   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r/   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)r+   r,   r-   __annotations__r   classmethodr   r   r.   r/   r0   r   r   V  sQ    ((++KD-!
E!
	,!
 !
F
r/   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)BenchmarkRequesta1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                    || _         t        |t              r|g}|| _        t        t        t
        f      r)t              dkD  rt        fdD              sJ d   | _        || _	        y )Nr"   c              3  d   K   | ]'  }d D ]   }t        d   |      t        ||      k(   " ) yw))rP   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r0   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sG       Q  .q148GAt<LLLs   -0r   )
kernel_namer   r   input_tensor_metatupler   r   r   r   
extra_args)rR   r   r   r   r   s      ` r0   rS   zBenchmarkRequest.__init__  s     ''4!2 3!2(5$-8%&* /   
 "4A!6"4$r/   c                   t         r6   NotImplementedErrorrR   outinput_tensorss      r0   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r/   c                     y r6   r.   rz   s    r0   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r/   Nr   c                   t         r6   r   rR   fnr   r   s       r0   do_benchzBenchmarkRequest.do_bench  s
     "!r/   c                   t         j                  t        j                        }|rt	        j                         }|Ft        |      dk(  sJ t        d | j                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         } | j                  |g|| }|r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)Nr   c              3  <   K   | ]  }|j                           y wr6   )r   r   s     r0   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)r@   isEnabledForloggingDEBUGtimer   r   r   r   r   r   r)   infor   r  rA   rk   r   )
rR   r   r   rA   start_tscreate_tensor_elapser  load_elapseresbench_elapses
             r0   r   zBenchmarkRequest.benchmark  sP   
 ++GMM:yy{H ;}%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!!=:c:B ))+0Kyy{HdmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   0E )E=<E=)
r   rk   r   r   r   r   r   Iterable[Any]r   r   r   r   r   r   r   zCallable[[], None]r   r   r   r   zOptional[torch.Tensor]r   r   )	r+   r,   r-   r   rS   r   r   r  r   r.   r/   r0   r   r     s    %% ?% @	%
 "% 
%6"*"1="	"
 '+	" %" $	"
 
" '+)$) $) 
	)r/   r   c                  N    e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)	_TestBenchmarkRequestz
    Supports unit testing. Defined in this file instead of the test file so the
    TuningProcess sub-process can unpickle these objects.
    Nc                J    || _         || _        || _        || _        || _        y r6   )r;   rP   sleepexccrash)rR   r;   rP   r  r  r  s         r0   rS   z_TestBenchmarkRequest.__init__  s'     

r/   r   c               r   | j                   <t        j                  j                  t        d       t        | j                         k(  sJ | j                  rt        j                  | j                         | j                  r| j                  | j                  rt        j                  d       | j                  S )Nr"   )rP   rB   rD   rE   r&   rk   r  r	  r  r  ri   exitr;   r   s      r0   r   z_TestBenchmarkRequest.benchmark  sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r/   )g        NNNF)
r;   r   rP   r   r  zOptional[float]r  zOptional[Exception]r  r   r  )r+   r,   r-   r   rS   r   r.   r/   r0   r  r    sq      $!%#'  	
 !  KO*1G	r/   r  c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkMixinNr   c                  t        d g ||D              }t        |      dk  s
J d|        t        d |D        d      }t        |      }t        |      dk(  rt        t	        |            }n|j                         }|j                  |      5  t        j                  |      }|j                          d d d        |S # 1 sw Y   S xY w)Nc              3     K   | ]i  }t        |t        j                        rMt        |j                  j
                        r.|j                  j                  |j                  j                   k y wr6   )r   torchTensorr   rP   typeindexr   tensors     r0   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>  sR      $
&%,,/v}}))*##/	 MM$
s   A/A1r"   zCan not mix devices c              3     K   | ]9  }t        |j                  j                        r|j                  j                   ; y wr6   )r   rP   r!  r#  s     r0   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>   s4      &--,,- ""s   ?Acuda)
r   r   nextr   itercurrent_devicerP   r$   benchmark_gpusynchronize)	rR   r  r   r   device_idx_setdevice_typer   
device_idxr  s	            r0   r  z GPUDeviceBenchmarkMixin.do_bench  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0 	+++B/C((*	+ 
		+ 
s   &CCr  r+   r,   r-   r  r.   r/   r0   r  r    s*    
 '+	 % $	
 
r/   r  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkMixinNr   c               ,    t        j                  |      S r6   )r$   benchmark_cpur  s       r0   r  z CPUDeviceBenchmarkMixin.do_bench4  s     ((,,r/   r  r/  r.   r/   r0   r1  r1  3  s*    
 '+	- %- $	-
 
-r/   r1  c                       e Zd Z	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd ZddZ xZS )	TritonBenchmarkRequestc                    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        y r6   )superrS   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpack)rR   r   r   r   r   r8  r9  r:  r;  r<  r=  r>  r?  r@  	__class__s                 r0   rS   zTritonBenchmarkRequest.__init__@  s_      	&79KZX& 0$"#6 %:"$8!(
r/   c               ~   t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        | j                        }d|j                  _        i }dd l}d|j                  |      j                  v rd|d<   |j                   j"                  dk(  rd}nP|j                   j"                  }	t%        |	      }
|
j'                  | j(                  j                   j*                        }t-        t        || j                        t.        j0                  j2                  j4                  j6                        r!t9        j:                  |g|||i |d|iS t9        j:                  |g|||i ||ddS )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)rE  benchmark_run)r   load_by_key_pathr9  r8  r@   rA   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersrP   r!  r   get_raw_streamr   r"  r   r  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rR   r   r   mod
run_methodr   
warmup_argrK  rE  r-  r   s              r0   r   z"TritonBenchmarkRequest.make_run_fn[  s    **4+@+@$BRBRS0!!	
 S$"2"2377
$//*
27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 $$  	
    $$  	
  " r/   c                    t        j                  | j                  | j                        }t	        || j
                        j                          y r6   )r   rG  r9  r8  r   r   
precompile)rR   rU  s     r0   rY  z!TritonBenchmarkRequest.precompile  s9    **4+@+@$BRBRST%%&113r/   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r8  r9  rz   s    r0   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr/   )r   r   r   r   r   )r   rk   r   r   r   r   r   r  r8  rk   r9  rk   r:  r   r;  r   r<  r   r=  r   r>  r   r?  r   r@  r   r   r   r  r   rk   )r+   r,   r-   rS   r   rY  r\  __classcell__rA  s   @r0   r5  r5  =  s     $%%&$% ? @	
 "     !  # "   
64*41=4	4l4Ur/   r5  c                      e Zd Zy)TritonGPUBenchmarkRequestNr*   r.   r/   r0   ra  ra    r1   r/   ra  c                      e Zd Zy)TritonCPUBenchmarkRequestNr*   r.   r/   r0   rc  rc    r1   r/   rc  c                  t     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd Z	 	 	 	 	 	 d
dZddZd ZddZ	ddZ
 xZS )CUDABenchmarkRequestae  
    A class to handle CUDA (CUTLASS) benchmark requests. This class is for
    managing the lifecycle of a CUDA kernel benchmark, including compiling
    the source code, managing workspace memory, and executing the kernel.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    c                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr   F so)r7  rS   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerR   r   r   r   r   ri  rA  s         r0   rS   zCUDABenchmarkRequest.__init__  sr     	&79KZX&#$15)-',$ "*7*=*=d>N>NPT*U't'r/   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y)z
        Precompile the CUDA source code to populate the CUDACodeCache.
        This may happen in a separate thread pool.
        Precompiling %srh  Done precompiling %sN)r@   rA   r   compileri  rz   s    r0   rY  zCUDABenchmarkRequest.precompile  s<    
 	.5d..53T:r/   c          	        | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        t        j                  j                         j                        }t!        | j                  | j                        }t        d      }| j"                  dkD  rht        j$                  | j"                  dz   dz  t        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d|| }	  |        |S c c}w # t0        $ r,}	t3        |	      fd}
| j5                          |
cY d}	~	S d}	~	ww xY w)zc
        Create a function to run the CUDA kernel with the given input and output tensors.
        zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   rP   Nc                     t               r6   )RuntimeError)err_msgs   r0   raise_runtime_errorz=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r/   )ensure_dll_loadedupdate_workspace_sizer   r	   data_ptrr@   rA   r   ro  rn  rl  r   r  r&  current_streamcuda_streamr   rj  zerosfloat64rP   rk  rS  rT  rz  rk   r   )rR   r   r   r$  args
stream_ptrrV  workspace_ptrretr<   r|  r{  s              @r0   r   z CUDABenchmarkRequest.make_run_fn  s    	 ""$:>}:MQTPU:UV*+VVMMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
W WD  	'!fG, !&&	's#    F'F, ,	G!5!GG!G!c           
        | j                   ry | j                          t        t        j	                  d | j
                  D                    }t        |dz         D cg c]  }t        d        }}t        t        j                  j                         j                        }t        | j                  | j                        }t               } |g || j                   t#        |      d |  t        j                  j%                          |j&                  | _        t*        j-                  d| j(                  | j                  | j.                  | j0                  | j                  || j                          d| _         y c c}w )Nc              3  4   K   | ]  }|j                     y wr6   )r   )r   metas     r0   r   z=CUDABenchmarkRequest.update_workspace_size.<locals>.<genexpr>  s     G$))Gs   r"   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)rm  r}  r   r   fromkeysr   r   r	   r  r&  r  r  r   rl  r   r   r   r   r+  valuerj  r@   rA   ro  rn  )rR   unique_input_count_r  r  rV  c_workspace_sizes          r0   r~  z*CUDABenchmarkRequest.update_workspace_size  sU   ''  MMG0F0FGG
 )..@1.D(EF1FFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$7 Gs   E>c                    | j                   4t        j                  | j                  d      \  | _         | _        | _        y y )Nrh  )rl  r   rN   ri  rn  ro  rz   s    r0   r}  z&CUDABenchmarkRequest.ensure_dll_loaded   s:    888E8J8J  $95DHdmT%5 r/   c                l    | j                   !| j                   j                          d | _         d | _        y r6   )rl  rp   rk  rz   s    r0   r   z#CUDABenchmarkRequest.cleanup_run_fn&  s(    88HHNNDHr/   c                T    d| j                   d| j                  d| j                  S )Nr[  z, self.source_file=z, self.hash_key=)r   ro  rn  rz   s    r0   r\  zCUDABenchmarkRequest.__str__,  s0    #$""$$8t'7'7&99JDMM;KLLr/   r   rk   r   r   r   r   r   r  ri  rk   r   r   r  r   r]  )r+   r,   r-   r   rS   rY  r   r~  r}  r   r\  r^  r_  s   @r0   re  re    s    VV ?V @	V
 "V V 
V$;4*41=4	4l",HMr/   re  c                  b     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZd	dZ xZS )
CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r6   )r7  rS   ri  r   rn  rl  rq  s         r0   rS   zCppBenchmarkRequest.__init__4  s5     	&79KZX& -6:r/   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )Nrs  rD  r-  rt  )r@   rA   r   rN   ri  rz   s    r0   rY  zCppBenchmarkRequest.precompileA  s<     	.5$**>3T:r/   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )NrD  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr6   )r   ctypesc_ulonglong)r   args     r0   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>W  s     R3:c6#5#56Rs   $&)r   rN   ri  rl  r   r  r@   rA   r   r   r   r   r  r  r   argtypesrS  rT  )rR   r   r   r$  r  rV  s         r0   r   zCppBenchmarkRequest.make_run_fnH  s     $$T%5%55I04]0Cse0KLf!LLXHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ms   D)c                    | j                   3	 t        | j                   d      r| j                   j                          y y y )Nrp   )rl  hasattrrp   rz   s    r0   r   z"CppBenchmarkRequest.cleanup_run_fnc  s9    88 txx)  *	  r/   c                     d| j                   S )Nr[  )r   rz   s    r0   r\  zCppBenchmarkRequest.__str__k  s    #$""$%%r/   r  r  r   r]  )	r+   r,   r-   rS   rY  r   r   r\  r^  r_  s   @r0   r  r  0  so    ;; ?; @	;
 "; ; 
;;
*
1=
	
6!&r/   r  c                  X     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZddZ xZS )CuteDSLBenchmarkRequestz;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                    t         |   ||||       |j                         }t        j                  |      \  | _        | _        y r6   )r7  rS   finalize_allr   rp  r9  r8  )rR   r   r   r   r   ri  finalized_coderA  s          r0   rS   z CuteDSLBenchmarkRequest.__init__r  sC     	&79KZX$1132=2C2CN2S/t/r/   c          	     T  	 t        j                  | j                  | j                        }ddlm} | j                   d| }t        ||      s?t        |      D cg c]  }t        t        ||            s| }}t        d| d|       t        ||      		fd}|S c c}w )z
        Create a function to run the CuteDSL kernel with the given input and output tensors.
        Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
        r"   )MAIN_SUFFIXr  z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~    t        d      } | j                  j                  j                        } g d|iS )Nr&  rE  )r   rN  rP   r"  )r   rE  r   kernel_funcr   s     r0   
run_kernelz7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernel  s@    7?%44SZZ5E5EFFBBsB6BBr/   )r   rG  r9  r8  codegen.cutedsl.cutedsl_kernelr  r   r  dircallabler   rz  )
rR   r   r   rU  r  main_func_namer   	availabler  r  s
    ``      @r0   r   z#CuteDSLBenchmarkRequest.make_run_fn  s     **4+@+@$BRBRS 	@ ,,-Q{m<sN+*-c(S$hwsD?Q6RSIS??OOghqgrs  c>2	C
  Ts   B%9B%c                     y)z*Clean up any resources used by the kernel.Nr.   rz   s    r0   r   z&CuteDSLBenchmarkRequest.cleanup_run_fn  s    r/   )r   rk   r   r   r   r   r   ztuple[Any, ...]ri  r    r   r   r  r   )r+   r,   r-   r   rS   r   r   r^  r_  s   @r0   r  r  o  so    ETT ?T @	T
 $T #T 
T*1=	:9r/   r  c                 X    t               } t        j                  | j                         | S r6   )r   atexitrg   r   )pools    r0   get_tuning_process_poolr    s    D
OODMM"Kr/   c                4    t               j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )r  r   )r   s    r0   benchmark_in_sub_processr    s     #$..w77r/   )r   r   r   )Z
__future__r   r  r  dataclassesrS  r  rB   rI   r   rd   rm   ri   r	  r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r   r  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   r   torch._loggingr   torch.utils._ordered_setr   typesr    torch._inductor.select_algorithmr    r!   rg  r#   runtime.benchmarkingr$   virtualizedr%   r&   r+   r@   r8   r)   r3   r   r   r   LayoutOrBuffer	dataclassr   r   r  r  r1  r5  ra  rc  re  r  r  cacher  r  r.   r/   r0   <module>r     s   "      	     
   . 1 2 2 D D  $ C .    - /  T  -  . "8\:		 	i iXe eP ryy"))+, 3
 3
 3
l ] ] ]@, D   F- -YU- YUx	 79O 		 79O 	LM24D LM^<&13C <&~.957G .9b  8'8&8r/   