
    pi&7                       d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZmZ d dlmZ dd	lmZ e	rd d
lmZmZ ej4                  j7                  ed      Zej4                  j7                  ed      Zeee
e ejB                  f         Z"eee   ge"f   Z# ejH                  d       G d d             Z% ejH                  d       G d d             Z& ejH                  d       G d d             Z'	 	 	 	 d-dZ(d.dZ)d/dZ*d0dZ+d1dZ,	 	 	 	 	 	 d2dZ-	 	 	 	 	 	 	 	 d3dZ.d4dZ/	 	 	 	 d5dZ0	 	 	 	 d5dZ1d6dZ2ejH                   G d  d!             Z3	 	 	 	 	 	 	 	 	 	 d7d"Z4d8d#Z5 G d$ d%e      Z6	 	 	 	 	 	 	 	 	 	 	 	 d9d&Z7	 	 	 	 	 	 d:d'Z8 ejH                  d       G d( d)             Z9 ejH                  d       G d* d+             Z:	 	 	 	 	 	 d;d,Z;y)<    )annotationsN)Enum)AnyCallableOptionalTYPE_CHECKINGUnion)countersget_metrics_context)GraphPartitionMap	InputType)
OrderedSet   )is_using_cudagraph_partition)SequenceSet
perf_hintscudagraph_static_inputsT)frozenc                      e Zd ZU dZded<   y)
FunctionIDz9Unique counter of a function wrapped in cudagraphify_implintidN__name__
__module____qualname____doc____annotations__     a/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_utils.pyr   r      s
    ?Gr!   r   c                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
PlaceholderInfoz
    A serializable version of torch.fx.Node that contains information
    pertinent to placeholder stack traces. We use these in logging and error messages
    related to cudagraphs, and will cache these results.
    strnameOptional[str]stack_tracelist[PlaceholderInfo]usersmutating_use_stack_traceNr   r    r!   r"   r$   r$   %   s      I  ++r!   r$   c                  N    e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   y)WrappedFunctionz
    Represents a function that you want to record for CUDA graph replay,
    with a little more metadata so we can identify if we have an applicable
    CUDA graph in our CUDA graph tree for it.
    zCallable[..., Any]modelSequence[int]static_input_idxsr   r   ztuple[torch.Tensor, ...]	constantsSequence[PlaceholderInfo]placeholdersmutated_input_idxsNr   r    r!   r"   r-   r-   4   s,     $$N''++%%r!   r-   c                   t        | j                        dk(  r8t        t        | j                              j                  j                  dd       S | j                  D ]`  }|j                  t        j                  j                  j                  j                  k(  s?|j                  j                  dd       x}s^|c S  y )Nr   r(   )lenr*   nextitermetagettargettorchopsatencopy_default)placeholder_nodeuser(   s      r"   &get_mutating_use_stack_trace_from_noderC   D   s     !!"a'D)//0166::=$OO%% #::--555!hhll=$??{?""#
 r!   c                    | j                   S N)r+   )placeholder_infos    r"   get_mutating_use_stack_tracerG   S   s    444r!   c                    | j                   }| j                  j                  dd       }g }d }| j                  dk(  r-| j                  D cg c]  }t        |       }}t        |       }t        ||||      S c c}w )Nr(   placeholder)r&   r9   r:   opr*   to_placeholder_inforC   r$   )rA   r&   r(   r*   r+   is         r"   rK   rK   W   s      D"''++M4@KE#m+1A1G1GHA$Q'HH#I$
  4e5MNN Is   
A7c                r    | j                   D cg c]  }|j                  dk(  st        |       c}S c c}w )NrI   )nodesrJ   rK   )graphnodes     r"   get_placeholder_inforQ   f   s4    .3kk&*TWW=UD!  s   44c                    d|  S )Nzskipping cudagraphs due to r    )reasons    r"   format_default_skip_messagerT   l   s    (11r!   c                    d}|D ]  }| |   }t        |      x}s n t        dt        |       d      }|r| d| S |S )N zmutated inputs (z instances). Found from : 
 )rG   rT   r6   )r3   mutation_indicesr(   idxrI   msgs         r"   get_mutation_stack_tracer[   p   sp     "$K "3'6{CC;C
 &
3/01=C (66Jr!   c                   t         j                  j                  j                  j                  r3| j
                  D cg c]  }|| j                  v s |||         s| }}n| j
                  }t        j                  d| j                         t        j                  d|       |rt        | j                  |      S d S c c}w )Nz'check mutation static input indices: %sz#check mutation mutation indices: %s)r<   	_inductorconfigtritoncudagraph_treesr4   r0   static_inputs_logdebugr[   r3   )funcinputsis_cuda_graph_recorded_tensorrY   rX   s        r"   check_for_mutationrf      s     $$44 ..+
t---0=	 +
 +
  22143I3I ACST  	!!2!24DE !+
s   "B>c                j    | j                   D ]$  }|j                  j                  dd       x}s"|c S  y )Nr(   )r*   r9   r:   )rP   rB   r(   s      r"   _get_use_stack_tracerh      s:    zz ((,,}d;;;; r!   c                J   | j                  t        j                  d      d        t               r%| j                  t        j                  d      d        | j	                  t        j                  d            x}r8d|j
                   d}t        |      x}rt        | d|       S t        |      S t        |       dk(  r0t        t        | j                                     j                  dk(  ry d | j                         D        }t        d	d
j                  |             S )Nr9   cpuzcpu device ()rW   r   cudac              3  2   K   | ]  }t        |        y wrE   )repr).0keys     r"   	<genexpr>z:check_multiple_devices_or_any_cpu_nodes.<locals>.<genexpr>   s     AscAs   zmultiple devices: z, )popr<   devicer   r:   r&   rh   rT   r6   r7   r8   keystypejoin)device_node_mappingcpu_noderZ   r(   	keys_reprs        r"   'check_multiple_devices_or_any_cpu_nodesrz      s    ELL0$7 $%U 3T:&**5<<+>??x?X]]O1-.x88;8.#6H/VWW*3// 	 A%)..012776AA&9&>&>&@AI&);DIIi<P;Q'RSSr!   c                    t        |       S rE   )rz   )rw   s    r"    check_lowering_disable_cudagraphr|      s     33FGGr!   c                &   t         j                  |        t        d   dxx   dz  cc<   t        j                  j
                  j                  j                  rt        |       t               }|j                         r|j                  d| d       y y )Ninductorcudagraph_skipsr   cudagraph_skip_reasonT)	overwrite)perf_hint_logwarningr
   r<   r]   r^   r_   cudagraph_or_errorRuntimeErrorr   in_progressset)rZ   metrics_contexts     r"   #log_cudagraph_skip_and_bump_counterr      sy    #Z*+q0+$$773)+O""$3SDI %r!   c                       e Zd ZU ded<   ddZy)BoxedDeviceIndexOptional[int]valuec                :    |t        |t              sJ || _        y rE   )
isinstancer   r   )self
device_idxs     r"   r   zBoxedDeviceIndex.set   s    !Z
C%@@@
r!   N)r   r   returnNone)r   r   r   r   r   r    r!   r"   r   r      s     r!   r   c                H   t        d      }t        j                  j                  j                  j
                  rQt        |      }|D cg c]	  }||vs| }}t        |      dk7  }|sy t        | j                        }	t        |	|      S t        |      dk7  }|sd S |S c c}w )Nzmutated inputsr   )rT   r<   r]   r^   r_   r`   r   r6   rQ   rO   r[   )
gmmutated_inputsr4   r0   default_msgunique_idxsrY   rX   has_mutationr3   s
             r"   3check_for_mutation_ignore_cuda_graph_managed_tensorr      s     ..>?K $$44 !23+=XCKAWCXX+,1+BHH5'6FGG >*a/'t8[8 Ys   		BBc                    | j                   r| j                   S | j                  D ]  }|j                   s|j                   c S  y)zM
    Gets the first non-empty stack trace of a placeholder or its users.
    N)r(   r*   )rI   users     r"   get_placeholder_stack_tracer      sH     &&&!! $###$ r!   c                  $    e Zd ZdZdZdZdZddZy)CheckInvariantStatusr            c                    | j                   dk(  ry| j                   dk(  ry| j                   dk(  ry| j                    d| j                   S )NCudagraphManagedIdxMismatchz-cudagraph managed tensor data pointer changedStaticInputIdxMismatchz!static input data pointer changed&ExpectedDeadIndicesBeforeGraphMismatchz+expected dead indices before graph are livez: )r&   r   )r   s    r"   __str__zCheckInvariantStatus.__str__  sK    9955BYY226YYBB@ii[4::,//r!   Nr   r%   )r   r   r   SUCCESSr   r   r   r   r    r!   r"   r   r     s$    G #$  ./*0r!   r   c                   t        |      t        |      k(  rt        |      t        |       k(  sJ d       |D cg c]  }||   	 }}|D cg c]  }||   	 }}| d}t        t        ||            D ]t  \  }\  }	}
t        |	t        j
                        sJ ||   }|	j                         |
k7  s>| |   }| d|j                   d|
 d|	j                          dt        |       d
}v |S c c}w c c}w )z}
    Logs the mismatch between input data pointers and recorded data pointers.
    This checks only idxs in target_idxs.
    zClength mismatch between inputs, recorded_data_ptr, and placeholdersz.
zinput name: z. data pointer changed from z to z. input stack trace: 
)	r6   	enumeratezipr   r<   Tensordata_ptrr&   r   )r3   rd   recorded_data_ptrtarget_idxsmismatchrL   	t_tensorst_data_ptrs	error_msgtensorr   indexrI   s                r"   log_data_ptr_mismatchr     s'    v;#/00S[CDU5U MU %00q0I01<=A$Q'=K=*C I!*3y++F!G 	FH&%,,///A??(&u-K+\+*:*:); <--5Jd6??;L:M N&&A+&N%OrS 	  1=s   C-C2c                >   t        | j                               dz   dfd}t        j                  j                  j
                  j                  rLt        j                  j                  j
                  j                  kD  rt        j                   |              yy)Nr   c                     d  dS )NzCUDAGraph supports dynamic shapes by recording a new graph for each distinct input size. Recording too many CUDAGraphs may lead to extra overhead. We have observed a0   distinct sizes. Please consider the following options for better performance: a) padding inputs to a few fixed number of shapes; or b) set torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None to silence this warning.r    )num_cudagraphss   r"   warn_msgz4maybe_warning_due_to_dynamic_shape.<locals>.warn_msgD  s    00>/? @''		
r!   TFr   )	r6   rt   r<   r]   r^   r_   "cudagraph_dynamic_shape_warn_limitr   r   )fn_cachenew_int_keyr   r   s      @r"   "maybe_warning_due_to_dynamic_shaper   >  st     )A-N

 	%%HH
//
 
 
'
'
J
JK 	hj)r!   c                  0    e Zd ZU dZded<   ded<   ded<   y)	CudagraphCachedInfoz'
    Info needed to realign inputs
    r2   r3   list[Optional[str]]stack_tracesz	list[str]cudagraph_fail_reasonsNr   r    r!   r"   r   r   [  s     ,+%%%%r!   r   c                  D    e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   y)CudagraphMetadataz.
    Metadata for recording a CUDA graph.
    r2   r3   OrderedSet[int]r0   r4   r   r   zdict[str, torch.Tensor]r1   Nr   r    r!   r"   r   r   f  s'     ,+&&''%%&&r!   r   c                h   g }t               }t               }t        | j                        D ]  \  }}||j                  v r|j	                  |       ||j
                  v r|j	                  |       ||j                  |   }nt        d| j                   d| dg d      }|j                  |        g }| j                  D ]4  }	|	|j                  |j                  |	          $|j                  d       6 | j                  D 
ci c]  }
|
|j                  |
    }}
t        |||||      S c c}
w )z
    Convert the cudagraph metadata at the graph level to the graph partition level,
    given the graph partition info (i.e., mapping from partition input/output index
    to graph input/output index).
    N
partition__placeholder_)r&   r(   r*   r+   )r   r   input_index_mappingr0   addr4   r3   r$   r   appendoutput_index_mappingr   constant_namesr1   r   )partition_mapmetadatapartition_placeholderspartition_static_input_idxspartition_mutated_input_idxspartition_input_idxgraph_input_idxrI   partition_stack_tracesgraph_output_idxr&   partition_constantss               r"    get_partition_cudagraph_metadatar   s  sp     3=<4>L 09))1 3,_ h888'++,?@h999(,,-@A&"//@K *!-"2"2!3=AT@UV )-	K 	%%k2'3*  )>> 0'"))(*?*?@P*QR"))$/	0 4A3O3O+/h  &&  #$ 	s   D/)rA   torch.fx.Noder   r'   )rF   r$   r   r'   )rA   r   r   r$   )rO   ztorch.fx.Graphr   r)   )rS   r%   r   r%   )r3   r2   rX   z&Union[AbstractSet[int], Sequence[int]]r   r%   )rc   r-   rd   list[InputType]re   zCallable[[torch.Tensor], bool]r   r'   )rP   r   r   r'   )rw   z!dict[torch.device, torch.fx.Node]r   r'   )rZ   r%   r   r   )
r   ztorch.fx.GraphModuler   zOrderedSet[str]r4   r   r0   r/   r   r'   )rI   r$   r   r'   )r3   r2   rd   r   r   zSequence[Optional[int]]r   r/   r   r   r   r%   )r   z)dict[tuple[int, ...], Callable[..., Any]]r   r   r   bool)r   r   r   r   r   r   )<
__future__r   dataclassesenumr   typingr   r   r   r   r	   r<   torch._dynamo.utilsr
   r   torch._inductor.utilsr   r   torch.utils._ordered_setr   utilsr   collections.abcr   r   AbstractSet_logginggetArtifactLoggerr   r   ra   listr   r   
OutputType	ModelType	dataclassr   r$   r-   rC   rG   rK   rQ   rT   r[   rf   rh   rz   r|   r   r   r   r   r   r   r   r   r   r   r    r!   r"   <module>r      s   "   @ @  = > / / < 00<HNN44' 
 (5ell!2345
d9o&
23	 d#  $ d#, , $, d#& & $&#5O2+< 	(
 $B 	>T:TT8H:HH	J      99#9 (9 %	9
 9004 00+ / 	
 # 	>7 
: d#& & $& d#	' 	' $	'3$33 3r!   