
    pi                    <   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ ddlmZmZ dd	lmZ dd
lmZ e	rddlmZ ddlmZmZ  ej<                  e      Z ejB                   G d d             Z"ejB                   G d d             Z#ejB                   G d d             Z$ejB                   G d d             Z%	 	 	 	 	 	 d'dZ&	 	 	 	 d(dZ'	 	 	 	 	 	 d)dZ(	 	 	 	 	 	 	 	 	 	 d*dZ)ejB                   G d d             Z*	 	 	 	 	 	 	 	 d+dZ+	 	 	 	 	 	 	 	 d,dZ,ejB                   G d d             Z-	 	 	 	 	 	 	 	 d-dZ.	 	 	 	 	 	 	 	 	 	 d.d Z/d/d!Z0d/d"Z1d0d#Z2	 	 	 	 	 	 	 	 d1d$Z3	 	 	 	 	 	 	 	 	 	 	 	 d2d%Z4e/e0e1gf	 	 	 	 	 	 	 	 	 	 	 	 	 d3d&Z5y)4    )annotationsN)CallableOptionalTYPE_CHECKING	TypedDictUnion)	is_fbcode)signpost_event)
OrderedSet   )MultiOutputLayout
NoneLayout)get_dtype_size)V)Dep)BaseSchedulerNodeSchedulerBufferc                  ,    e Zd ZU ded<   ded<   ded<   y)PeakMemoryResultlist[BaseSchedulerNode]orderintpeak_memorystrmethodN__name__
__module____qualname____annotations__     X/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/memory.pyr   r      s    ""Kr"   r   c                  Z    e Zd ZU dZded<   dZded<    ej                  e      Z	ded<   y)	MemoryPlanningInfoForBufferr   r   
size_alloc	size_freedefault_factoryOrderedSet[BaseSchedulerNode]
succ_nodesN)
r   r   r   r&   r    r'   dataclassesfieldr   r+   r!   r"   r#   r%   r%   !   s3    JIs0A0A0A"1J- r"   r%   c                      e Zd ZU dZded<   dZded<    ej                  e      Z	ded<    ej                  e      Z
ded	<    ej                  e      Zded
<   y)MemoryPlanningInfoForNoder   r   indexsizer(   z7OrderedSet[Union[SchedulerBuffer, FreeableInputBuffer]]pred_buffersr*   
pred_nodesr+   N)r   r   r   r0   r    r1   r,   r-   r   r2   r3   r+   r!   r"   r#   r/   r/   *   sq    E3ND#M*5 I  1B0A0A"1J-  1B0A0A"1J- r"   r/   c                  X    e Zd ZU ded<    ej
                  e      Zded<   d	dZd
dZ	y)FreeableInputBufferr   namer(   r%   
mpi_bufferc                    | j                   S N)r6   selfs    r#   get_namezFreeableInputBuffer.get_name@   s    yyr"   c                ,    t        | j                        S r9   )hashr6   r:   s    r#   __hash__zFreeableInputBuffer.__hash__C   s    DIIr"   N)returnr   )r@   r   )
r   r   r   r    r,   r-   r%   r7   r<   r?   r!   r"   r#   r5   r5   9   s.    
I.?k.?.?3/J+ r"   r5   c                b   dd}t        j                  t              }t               }| D ]  }|j                  j
                  D ]  }|j                  |v s|j                  }t        j                  j                  r,|j                  t        j                  j                  dz         }|j                  d      rv||j                     j                  |        ||      ||j                  <     t               }|j                         D ]"  \  }}	t        |t        ||   |	            ||<   $ |S )z
    Create and keep track of all input buffers that can be freed during the program

    Returns:
        A dictionary containing all freeable input buffers, keyed by their names.
    c                @    t         j                  j                  |       S r9   )r   graphget_dep_size_hint)deps    r#   _dep_size_hintz.get_freeable_input_buf.<locals>._dep_size_hintR   s    ww((--r"   _)primals_argfwd_rng_statebwd_rng_state)r'   r+   )rE   r   r@   r   )collectionsdefaultdictr   dictread_writesreadsr6   r   rC   removeprefix
startswithadditemsr5   r%   )
nodesgraph_inputsrF   dep_name_to_succ_nodesdep_name_to_sizenoderE   dep_namename_to_freeable_input_bufr+   s
             r#   get_freeable_input_bufr\   G   s'   . 	
+  (,v E##)) 	ECxx<'88 77<<'44QWW\\C5GHH**I +388488>1?1D$SXX.	EE BF 6 < < > 
*/B'*840
"8,
 &%r"   c                    ddl m ddlm t	               	 d	 	 	 	 	 dfd| j                         D ]  }|j                         vs |        S )a=  
    Compute the size of each scheduler buffer, including (1) memory allocated when
    it is created and (2) memory deallocated when it is freed.

    We specially handle the case of MultiOutputLayout.
    Consider the following case:
        buf0 = some_ops_with_multi_outputs(...)
        buf1 = buf0[0] # assume 10 bytes
        buf2 = buf0[1] # assume 20 bytes
    In such cases,
        buf0: at creation, 30 bytes allocated, when deleted, 0 bytes freed
        buf1: at creation, 0 bytes allocated, when deleted, 10 bytes freed
        buf2: at creation, 0 bytes allocated, when deleted, 20 bytes freed

    When an operation mutates a buffer in-place, the scheduler creates a new buffer name
    to track the "before" and "after" states, even though they share the same memory.

    The mutated buffer represents a rename with zero allocation and deallocation cost.
    During dependency tracking, we transfer dependencies from the mutated name back to
    the original buffer, ensuring the original memory is only freed when all aliases
    are done.

    This handles cases where a buffer has multiple non-overlapping aliases - rather than
    trying to assign free costs to individual aliases, we forward all alias dependencies
    to the original buffer.

    Consider:
        buf0 = op0()
        buf1 = mutation_op_(buf0)
        del buf0
        ...
        op(buf1)
        del buf1

    The only memory events are the creation prior to op0, and the deletion following buf1.

    Returns:
        A dictionary mapping a scheduler buffer to a tuple of (size_alloc, size_free).
    r   )MultiOutput)
OutputNodec                ^   | j                         t        j                  j                  j                  v rd	| j                         <   yt        | j                  j                  t              rd	| j                         <   yt        | j                  j                  t              rd}| j                  D ][  }t        |j                        r|j                  j                         D ]%  }t        |j                        s| |d      z  }' ] |rdn|df	| j                         <   |S t        j                  j                  j                  | j                  j                         d      t        | j                  j!                               z  }|rdn||f	| j                         <   |S )N)r   r   r   T)fallback)r<   r   rC   	schedulermutation_real_name
isinstancerY   layoutr   r   usersget_outputssizevars	size_hint	get_numelr   	get_dtype)
	sched_bufuser_of_MultiOutputLayoutr&   userbufbuf_sizer^   r_   _compute_and_update_buf_sizesched_buf_to_sizes
         r#   rq   zGcompute_size_for_scheduler_buffer.<locals>._compute_and_update_buf_size   s    177#4#4#G#GG6<i0023	--z:6<i0023	--/@AJ! Ndii499002 NC!#((K8"&B3&MM
NN /J7i0023 ww''11((*Q 2 y~~779:;H /H7i0023 Or"   )F)rl   r   rm   boolr@   r   )irr^   rb   r_   rN   valuesr<   )name_to_bufrl   r^   r_   rq   rr   s     @@@@r#   !compute_size_for_scheduler_bufferrw   v   s|    T  %48F GL"?C	 @ !'') 4	 '88(3	4 r"   c                   t        |      }t        j                  t              }| D ]1  }|j                  D ]   }||j
                     j                  |       " 3 t        t        j                  j                  j                  j                               D ]  \  }}||xx   ||   z  cc<    |j                         D ]'  }t        ||   d   ||   d   ||         ||   _        ) y)z
    For each SchedulerBuffer, assign its size info and successor nodes.
    A buffer's successor nodes determines when a buffer can be freed.
    r   r   )r&   r'   r+   N)rw   rL   rM   r   unmet_dependenciesr6   rS   reversedr   rC   rb   rc   rT   keysr%   r7   )	rU   rv   rr   rW   rY   rE   mutating_buf_namereal_buf_namebuf_names	            r#   1assign_memory_planning_info_for_scheduler_buffersr      s    :+F
 	
+   7** 	7C"388,006	77
 -5	,,224- 
(= 	}-1G2
 	
-
  $$& 
+F(215'1!4-h7,
H(
r"   c                   t        j                  t              }i }t        j                  t              }| D ]  }t        d |j                         D              }|||<   |D ]  }	||	   j	                  |        |j                         D ]1  }
|
j
                  j                  D ]  }	||	   j	                  |
        3  |j                         D ]1  }|j
                  j                  D ]  }	||	   j	                  |        3 t        |       D ]l  \  }}t        d |j                         D              }||   }||   }|j                  |       |j                  |       t        ||||   ||   |      |_        n y)zL
    Assign to each scheduler node its predecessor and successor nodes.
    c              3  V   K   | ]!  }|j                   j                  D ]  }|  # y wr9   r7   r+   ).0buffer	succ_nodes      r#   	<genexpr>zBassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>  s9       
#..99 
   
 
s   ')c              3  H   K   | ]  }|j                   j                    y wr9   )r7   r&   )r   r   s     r#   r   zBassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>  s     W&**55W    ")r0   r1   r2   r3   r+   N)rL   rM   r   rg   rS   r7   r+   ru   	enumeratesumdiscardr/   mpi_node)rU   name_to_fused_noderv   r[   node_to_pred_nodesnode_to_succ_nodesnode_to_pred_buffersrY   r+   r   r   freeable_bufferr0   r&   r3   s                  r#   /assign_memory_planning_info_for_scheduler_nodesr      s    	
+  RT 	
+ 
  <  
**, 
 


 $.4  $ 	4Iy)--d3	4
 &&( 	<F#..99 <	$Y/33F;<	<<$ 6<<> A(33>> 	AI +//@	AA
 !' 
tWDDTDTDVWW
'-
'-
 	4 4 1-d3)$/!

r"   c                  @    e Zd ZU ded<   ded<   ded<   ded<   ded<   y)	
BufferInfoz+Union[SchedulerBuffer, FreeableInputBuffer]r   r   r&   r'   
start_stepend_stepNr   r!   r"   r#   r   r   1  s    77ONOMr"   r   c                   t        |       D ci c]  \  }}||
 c}}g }i }	 	 	 	 dfd}|j                         D ]e  \  }}	d}
||vr ||	      \  }
}|J |||	<   |j                  t        |	|	j                  j
                  |	j                  j
                  d|
             g t        |       D ]  \  }}|j                         D ]  }|j                         }d}
||vr! ||      \  }
}|
dk(  r|}
|||<   n	|J |||<   |j                  t        ||j                  j                  |j                  j
                  ||
               ||fS c c}}w )zj
    Compute buffer allocation and deallocation sizes and map their
    lifetime to the node schedule
    c                x    d}d }| j                   j                  }|r|D ]  }|   }||kD  s|}|} |J ||fS )Nr   )ro   max_stepmax_step_snoder+   r   stepnode_to_steps         r#   _get_end_step_and_snodez8compute_memory_timeline.<locals>._get_end_step_and_snodeT  sd     6:^^..
' /	#I.(?#H%.N	/
 "---''r"   r   r   )ro   z+Union[FreeableInputBuffer, SchedulerBuffer]r@   z'tuple[int, Optional[BaseSchedulerNode]])	r   rT   appendr   r7   r'   rg   r<   r&   )rU   r[   graph_outputsr   rY   buf_info_listbuf_to_snode_last_user   r~   	input_bufr   end_step_snoderl   r   s                @r#   compute_memory_timeliner   :  s   " &/u%52!tTd
2L
 ')M 	 (8(	0(   :??A 
)=('>y'I$Hn!---/=!),$$..$$..	

$  & 
d))+ 	I !))+HH},+B9+M(.r>#H7;))4)5557E))4  ((33((22	4 ,(===M2s   Ec                   t        | ||      \  }}}t        t        |       dz         D cg c]  }d }}|D ]G  }||j                  xx   |j                  z  cc<   ||j
                  dz   xx   |j                  z  cc<   I d}d}g }	t        t        |       dz         D ]'  }
|||
   z  }|	j                  |       t        ||      }) ||	fS c c}w )a  
    Given a list of nodes in their execution order, estimate the peak memory, by
    keeping track of the liveliness of SchedulerBuffers and FreeableInputBuffers.

    Returns:
        int: peak memory
        List[int]: memory usage at each node (or each step).
    r   r   )	r   rangelenr   r&   r   r'   r   max)rU   r[   r   r   rG   memorybuf_info
max_memory
cur_memorymemories_at_nodests              r#   estimate_peak_memoryr     s     2)=M1a
 s5zA~./Aa/F/ " <x""#x':'::#x  1$%););;%<
 JJ3u:>" 1fQi
  ,Z0
1
 )**! 0s   	Cc                  "    e Zd ZU ded<   ded<   y)SNodeMemoryr   r&   r'   Nr   r!   r"   r#   r   r     s    ONr"   r   c                |   t        | ||      \  }}}t        t        |             D cg c]  }t        dd       }}|D ]j  }||j                     xj
                  |j
                  z  c_        |j                  dk7  s?||j                     xj                  |j                  z  c_        l i }t        |       D ]  \  }	}
||	   ||
<    d}d}g }t        t        |             D ]M  }||   j
                  }||   j                  }||z  }|}t        ||      }||z  }|}|j                  ||f       O ||||fS c c}w )a  
    Alternative version of estimate_peak_memory, that respects the fact,
    that every SchedulerNode has multiple phases:
    1. alloc ( outputs )
    2. run_kernel
    3. dealloc last_use buffers
    estimate_peak_memory collapses memory into one value: size_alloc - size_free
    While peak memory happens after alloc.

    Duplicating the code to not migrate all callsites at once,
    In future usages of estimate_peak_memory will migrate to this version.
    r   r   )r   r   r   r   r   r&   r   r'   r   r   r   )rU   r[   r   r   rG   r   step_idx_allocfreer   snodes_allocfreeirY   r   r   snodes_curr_memoryr   allocfree
post_alloc	post_frees                      r#   estimate_peak_memory_allocfreer     s   . /F)=/+M1+
 6;3u:5FG+a+GG " R8../::h>Q>QQ:"x001;;x?Q?QQ;R
 U# 74!3A!67 JJ3u: ;"1%00!!$..e

Z0
d
	!!:y"9:; 		 3 Hs   D9c                    G d dt               } G d dt               }t               t               }t               }| D ]D  }t        |j                  j
                        dd|<   |   d   dk(  s4|j                  |       F t        |j                               t        |j                               z   D ]=  }	dt        |	j                  j                        |	j                         |v rd	ndz   i||	<   ? t        d
 |j                         D              d}
|D ]D  }||v r|
||   j                  j                  z  }
$||v s)|
||   j                  j                  z  }
F t        |
      | D ]  }|j                  j                  D ]2  }	||	   d   d	k(  s|   dxx   |	j                  j                  z  cc<   4 |j!                         D ]2  }	||	   d   dk(  s|   dxx   |	j                  j                  z  cc<   4  g }d}|t        |       k  rV|rSt#        |fd      }|j%                  |       |j'                  |       |d	z  }|j                  j(                  z  t              |   d   z  |j                  j                  D ]<  }|   d   dkD  sJ |   dxx   d	z  cc<   |   d   dk(  s,|j                  |       > |j                  j                  D ]j  }	||	   d   dkD  sJ ||	   dxx   d	z  cc<   ||	   d   d	k(  s,|	j                  j                  D ]&  }|   dxx   |	j                  j                  z  cc<   ( l |t        |       k  r|rS|t        |       kD  rt+        d      |S )a  
    A bfs-based greedy topological order. LPMF stands for "Least Peak Memory First".

    The idea is from this paper:
    Buffer memory optimization for video codec application modeled in Simulink
    https://www.cs.york.ac.uk/rts/docs/DAC-1964-2006/PAPERS/2006/DAC06/PDFFILES/P0689.PDF

    The algorithm maintains the max memory so far.
    At every iteration, for each scheduleable node, it computes:
        - how much memory needs to be allocated for the output buffers of this node;
        - how much memory can be freed as a result of executing this node.
    This gives us two values for each node:
        (1) mem1: memory during the execution of the node;
        (2) mem2: memory after executing the node, after some input buffers are freed.
    The greedy approach select as follows:
        (i) if there are nodes whose mem1 values are below the max memory so far,
            then pick the node with the lowest mem2 value;
        (ii) otherwise, pick the one with the lowest mem1 value.
    c                  "    e Zd ZU ded<   ded<   y)'topological_sort_lpmf.<locals>.NodeInfor   indegreememory_to_freeNr   r!   r"   r#   NodeInfor     s    r"   r   c                      e Zd ZU ded<   y))topological_sort_lpmf.<locals>.BufferInfor   	outdegreeNr   r!   r"   r#   r   r     s    r"   r   r   )r   r   r   r   r   c              3  H   K   | ]  }|j                   j                    y wr9   r7   r'   )r   r   s     r#   r   z(topological_sort_lpmf.<locals>.<genexpr>3  s$       	&&r   r   c                    t        | j                  j                  z         | j                  j                  |    d   z
  | j                  j                  fS )Nr   )r   r   r1   r0   )rY   live_memoryr   	node_infos    r#   <lambda>z'topological_sort_lpmf.<locals>.<lambda>U  sL    K$--"4"44jA""Yt_5E%FF## r"   keyz4Failed to schedule, while loop ran too long for lpmf)r   rN   r   r   r   r3   rS   listru   r7   r+   r<   r   r'   r   r2   rg   minremover   r1   RuntimeError)rU   r[   rv   r   r   r   r   nodes_to_schedulerY   ro   output_memoryr~   schedule	num_itersselected_noder   r   r   r   s                   @@@r#   topological_sort_lpmfr     s   49 Y  486INRfH 8B| (DMM445
	$ T?:&!+!!$'( K&&()D1K1R1R1T,UU 
S^^667LLNm3q<

  3::< K M! W{"[2==GGGM337ALLVVVM	W
 [-0J  N==-- 	NC}[)Q.$ 01S^^5M5MM1	N ##% 	NC}[)Q.$ 01S^^5M5MM1	NN )+HI
c%j
 %6
 	  /&Q	 	}--222[1
y/0@AA '//:: 	1IY'
3a777i ,1,#J/14!%%i0		1 !))66 	WCC=-111SM+&!+&}[)Q.!$!:!: WIi()9:cnn>V>VV:W		W7 c%j
 %6D 3u:QRROr"   c           	       
  G d dt               }t               
t        j                   G d d             }d
fd}g }| D ]V  }t	        |j
                  j                        dd
|<   
|   d   d	k(  s4t        j                  | | ||      |             X g }d	}|t	        |       k  r|rt        j                  |      j                  }t	        |      
|   d
<   |j                  |       |dz  }|j
                  j                  D ]N  }	
|	   d   d	kD  sJ 
|	   dxx   dz  cc<   
|	   d   d	k(  s,t        j                  | | ||	      |	             P |t	        |       k  r|r|t	        |       kD  rt        d      |S )a  
    A BFS topological sort that selects nodes whose dependencies are executed the
    earliest. This follows a FIFO idea. Specifically, at every iteration, for each node
    that is schedulable, we gather the order in which its predecessor nodes are executed,
    and this sorted list of execution orders of predecessor nodes defines the priority.
    We select the node whose predecessors nodes are executed the earliest. The FIFO
    idea aims to reduce the liveness duration of buffers created.
    c                  "    e Zd ZU ded<   ded<   y)&topological_sort_bfs.<locals>.NodeInfor   r   r   Nr   r!   r"   r#   r   r     s    
r"   r   c                  *    e Zd ZU ded<   ded<   ddZy).topological_sort_bfs.<locals>.NodeWithPriority	list[int]priorityr   rY   c                    | j                   |j                   k(  rA| j                  j                  j                  |j                  j                  j                  k  S | j                   |j                   k  S r9   )r   rY   r   r0   )r;   others     r#   __lt__z5topological_sort_bfs.<locals>.NodeWithPriority.__lt__  sP    }}.yy))//%**2E2E2K2KKK==5>>11r"   N)r   NodeWithPriorityr@   rs   )r   r   r   r    r   r!   r"   r#   r   r     s    	2r"   r   c                    |    d   dk(  sJ t        t        fd| j                  j                  D                    }|S )Nr   r   c              3  .   K   | ]  }|   d      yw)r   Nr!   )r   	pred_noder   s     r#   r   z?topological_sort_bfs.<locals>._node_priority.<locals>.<genexpr>  s       2;	)$W-s   )sortedr   r   r3   )rY   exec_ordersr   s     r#   _node_priorityz,topological_sort_bfs.<locals>._node_priority  sK    z*a/// ?C}}?W?W 

 r"   r   )r   r   r   r   r   r   z3Failed to schedule, while loop ran too long for bfs)rY   r   r@   r   )r   rN   r,   	dataclassr   r   r3   heapqheappushheappoprY   r   r+   r   )rU   r   r   r   r   rY   r   r   r   r   r   s             @r#   topological_sort_bfsr   y  s   9  486I2 2 2 13 '*4==+C+C'DrR	$T?:&!+NN!#3N44H$#O )+HI
c%j
 %6&78==,/M	- )&Q	 '//:: 	IY'
3a777i ,1,#J/14%$^I%>	J		 c%j
 %6" 3u:PQQOr"   c                n   t               t               g t               dfd| D ]  }|j                         D ]  }||<   	  | D ]B  }|j                  j                  t        d |j                  j                  D              z   |<   D t        | fd      D ]
  } |        S )a  
    This is a DFS topological sort. The setup is similar to `topological_sort_schedule`
    in scheduler.py. The difference is the order nodes are visited in the outer loop.
    In `topological_sort_schedule`, nodes are visited in their original order.
    In this function, nodes are visited based on their priority -- for each node, we
    compute the total memory of all buffers it reads from or writes to, and we visit
    the nodes in ascending order of this priority.
    c                    | vrtj                  |        | j                  D cg c]  }|j                  v r|j                     ! }}t        |fd      D ]
  } |        j	                  |        y y c c}w )Nc                :    |    | j                   j                  fS r9   r   r0   nsize_with_readss    r#   r   z5topological_sort_dfs.<locals>.visit.<locals>.<lambda>  s    /!*<ajj>N>N)O r"   r   )rS   ry   r6   r   r   )	r   rE   	dep_nodesrY   name_to_noderesultseenr   visits	       r#   r   z#topological_sort_dfs.<locals>.visit  s    D=HHQK //88|+ SXX&I 
 O  d MM! s   $A;c              3  H   K   | ]  }|j                   j                    y wr9   r   )r   pred_bufs     r#   r   z'topological_sort_dfs.<locals>.<genexpr>  s!      9
.6H))9
r   c                :    |    | j                   j                  fS r9   r   r   s    r#   r   z&topological_sort_dfs.<locals>.<lambda>  s    _Q-?AQAQ,R r"   r   )r   r   r@   None)r   rN   get_buffer_namesr   r1   r   r2   r   )rU   rY   r6   r   r   r   r   r   s      @@@@@r#   topological_sort_dfsr     s     +5,D15L&(F48FO   &))+ 	&D!%L	&&  
 $ 2 2S 9
:>--:T:T9
 6
 !

 u"RS d Mr"   c                    d\  }t         j                  | |      g dfd| D ]  }|   |k(  s |        y)z
    Validate that the graph is acyclic by checking predecessor relationships.

    Raises:
        RuntimeError: If a cycle is detected in the graph
    )r   r      c                v   |    k(  ry |    k(  rMj                  |        dj                  D  cg c]  } | j                          c}       }t        d| d      | <   j                  |        | j                  j
                  D ]  }|| k7  sJ  |        j                          | <   y c c} w )Nz -> z_Cycle detected in memory planning graphPath containing cycle (i -> j: j is a dependency of i): zG This indicates invalid dependency relationships in the scheduler graph)r   joinr<   r   r   r3   pop)rY   	path_infor   BLACKGRAYcolor	dfs_visitpaths      r#   r	  z)validate_graph_acyclic.<locals>.dfs_visit  s    ;%;$KK$FT]]_$FGIKKT+ VYZ  dD11 	!I$$$i 	! 	
d! %Gs   B6N)rY   r   r@   r   )rN   fromkeys)rU   WHITErY   r  r  r  r	  r
  s      @@@@@r#   validate_graph_acyclicr    sT     !E4MM%'E$&D 2  ;%dOr"   c                h   | D ]  }|j                         D ]  }|j                         }||vrt        | d|j                          d      ||   |k7  r7t        d| d| d|j                          d||   j                          d	      ||v szt        d| d	|j                          d
        y)aV  
    Validate that for each node's output buffer, the name_to_buf mapping is correct.
    For each output buffer buf, we should have name_to_buf[buf.get_name()] == buf.
    Also validate that no buffer names overlap with freeable input buffer names.

    Raises:
        RuntimeError: If buffer name mapping is incorrect or names overlap
    z from zN is not found in name_to_buf mapping. This indicates a missing buffer mapping.z&Buffer name mapping is incorrect for 'z'.Expected name_to_buf['z	'] to be zbut got z/This indicates some buffers share the same namez Buffer name conflict detected: 'z' from node z/ is also used as a freeable input buffer name. N)rg   r<   r   	debug_str)rU   rv   r[   rY   ro   r~   s         r#   validate_unique_buffer_namesr    s      ##% 	C||~H {*"jt}}&7 8@ A  8$+"<XJ G--5Ji?P{84>>@AEG  55"6xjT]]_L] ^E F +	r"   c                v    t        | |      }t        | |       t        | |||       t        | ||      \  }}||fS )a  
    Prepare planning info. As nodes are scheduled one at a time, these help
    keep track of when a buffer can be freed, and when a node can be scheduled

    Returns:
        int: peak memory estimation
        dict[str, FreeableInputBuffer]: name to freeable input buffer
    )r\   r   r   r   )rU   rv   r   rV   r   r[   estimated_peak_memoryrG   s           r#   prepare_planning_infor  B  sV     "8|!L5e[I3!;0J
  4)= 1 !"<<<r"   c           
        t         j                  dt        |              t        | ||||      \  }}	 t	        |        t        | ||       g }	|	j                  t        | |d             t         j                  d|       |D ]  }
	 |
t        k(  r |
| |||      }n |
|       }t        |      t        |       k(  sJ t        |||      \  }}|	j                  t        |||
j                               t         j                  d|
j                  |        t        dd	d
|	D ci c]  }|j                   |j"                   c}i       t%        |	d       }|j&                  S # t        $ r,}t         j                  d|       t               s Y d}~Fd}~ww xY w# t        $ r7}t         j                  d|
j                  |       t               s Y d}~Sd}~ww xY wc c}w )z
    Try a few heuristics based topological sort algorithms, and pick the one whose
    resulting topological order has the lowest peak memory estimation.
    z&Reordering for peak memory -- %d nodesz%Memory planning validation failed: %sNbaselinezBaseline peak memory: %dz%s peak memory: %dzFailed to reorder for %s: %sinductorr   orm)categoryr6   
parametersc                    | j                   S r9   )r   )xs    r#   r   z)reorder_for_peak_memory.<locals>.<lambda>  s
    amm r"   r   )	torch_loginfor   r  r  r  r   errorr	   r   r   r   r   r   	Exceptionr
   r   r   r   r   )rU   rv   r   rV   r   methodsr  r[   epeak_memory_diff_methodsr   r   r   rG   elembest_results                   r#   reorder_for_peak_memoryr%  _  s   " NN;SZH8M955u%$UK9ST 8:## 5zB NN-/DE  	..5{M uu:U+++11=NK %++ V__E NN/+N* >VWdDKK!1!11W
 .4KLK[  ?C{ :  	OO:FOOQO; 	 Xs6   E! BF G!	F*!FF	G",GG)rU   r   rV   OrderedSet[str]r@   dict[str, FreeableInputBuffer])rv   dict[str, SchedulerBuffer]r@   zdict[str, tuple[int, int]])rU   r   rv   r(  r@   r   )
rU   r   r   dict[str, BaseSchedulerNode]rv   r(  r[   r'  r@   r   )rU   r   r[   r'  r   r&  r@   z{tuple[list[BufferInfo], dict[BaseSchedulerNode, int], dict[Union[FreeableInputBuffer, SchedulerBuffer], BaseSchedulerNode]])rU   r   r[   r'  r   r&  r@   ztuple[int, list[int]])rU   r   r[   r'  r   r&  r@   ztuple[int, list[tuple[int, int]], dict[BaseSchedulerNode, SNodeMemory], dict[Union[FreeableInputBuffer, SchedulerBuffer], BaseSchedulerNode]])
rU   r   r[   r'  rv   r(  r   r&  r@   r   )rU   r   r@   r   )rU   r   r@   r   )rU   r   rv   r(  r[   r'  r@   r   )rU   r   rv   r(  r   r)  rV   r&  r   r&  r@   z*tuple[int, dict[str, FreeableInputBuffer]])rU   r   rv   r(  r   r)  rV   r&  r   r&  r   z,list[Callable[..., list[BaseSchedulerNode]]]r@   r   )6
__future__r   rL   r,   r   loggingtypingr   r   r   r   r   torch._environmentr	   torch._utils_internalr
   torch.utils._ordered_setr   rt   r   r   utilsr   virtualizedr   dependenciesr   rb   r   r   	getLoggerr   r  r   r   r%   r/   r5   r\   rw   r   r   r   r   r   r   r   r   r   r   r  r  r  r%  r!   r"   r#   <module>r4     s)   "     F F ( 0 / - !  != Gh'	          
 
 
,&",&!,& $,&^U+UUp#
"#
+#
 
#
L9
"9
49
 ,9
 !?	9

 
9
z   V>"V> >V> #V>	V>r#+"#+ >#+ ##+ 	#+L   
:": >: #:	:zz"z >z ,z #	z
 zzEP'T+\&"&+& !?& 
	&R="=+= 5= "	=
 #= 0=H 	=L"L+L 5L "	L
 #L :L Lr"   