
    pi;(                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlZddl	m
Z
 ddlmZmZmZ ddlmZ  ej"                  e      Z G d d	e      Z G d
 de      Ze j,                  defd       Zde
j0                  defdZde
j0                  defdZde
j0                  defdZ G d de      Z G d de      Z G d de      ZdgdggZ dgdggdgdggdgdgggZ!g dg dg dgZ"dee#   fd Z$de
j0                  de#fd!Z%y)"    N)IntEnum)Optional   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                       e Zd ZdZdZdZdZy)	NCCL_COLLr   r         N)__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALL     _/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/comm_analysis.pyr   r      s    JJNJr   r   c                       e Zd ZdZdZdZy)NVIDIA_GPU_TYPEr   r   r   N)r   r   r   VOLTAAMPEREHOPPERr   r   r   r   r      s    EFFr   r   returnc                  8   t         j                  j                  j                  t         j                  j                  j                        xs d} d| v rt
        j                  S d| v rt
        j                  S d| v rt
        j                  S t
        j                  S )N V100A100H100)	torchutilscollect_envget_gpu_inforunr   r   r   r   )gpu_infos    r   get_gpu_typer*       s|    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%% %%%r   nodec                 .   t        | t        j                        st        d|        | j                  }|J d|v rt
        j                  S d|v rt
        j                  S d|v rt
        j                  S d|v rt
        j                  S t        d|       )Nz!node is not a collective kernel: 
all_reduce
all_gatherreduce_scatterz-torch.ops._dtensor.shard_dim_alltoall.defaultzUnsupported collective kernel: )

isinstancer   _CollectiveKernel
ValueErrorpython_kernel_namer   r   r   r   r   )r+   kernel_names     r   get_collective_typer5   .   s    dB001<TFCDD))K"""{"###		$###	[	('''	8K	G###:;-HIIr   c                 V   d}| j                   D ]  }t        |j                  j                        }t	        |t
        j                        rt        |      }n+t        j                  j                  j                  |d      }||t        |j                  j                        z  z  } |S )Nr   )fallback)inputsr	   layoutsizer0   sympyIntegerintr
   graphsizevars	size_hintr   dtype)r+   sz_bytesinpnumels       r   get_collective_input_size_bytesrE   @   s    H{{ =cjjoo.eU]]+JEGG$$..uq.AEEN3::+;+;<<<= Or   c                     t        | t        j                        r5t        | t        j                        sddlm}  || j                  d         S t        d|        )Nr   )_get_group_size_by_namezUnsupported collective type: )r0   r   r1   _WaitKernel"torch.distributed.distributed_c10drG   constant_args	TypeError)r+   rG   s     r   get_collective_group_sizerM   M   sK    $,,-jr~~6VN&t'9'9"'=>>7v>??r   c                       e Zd ZdZdZdZy)NCCL_HWr   r   r   N)r   r   r   NVLINKPCINETr   r   r   rO   rO   [   s    F
C
Cr   rO   c                       e Zd ZdZdZy)	NCCL_ALGOr   r   N)r   r   r   TREERINGr   r   r   rT   rT   a   s    DDr   rT   c                       e Zd ZdZy)
NCCL_PROTOr   N)r   r   r   LLr   r   r   rX   rX   f   s	     
Br   rX   g333333@gffffff@g333333?      ?g      @g@)     C@r[   gffffff4@)gU@g     6@g      3@c                    | j                   }|J t        |dd      }d|v sd|v sy ddlm} |j                  d   } ||      }t
        j                  j                  |      }t        j                  d|       }t        |      }t        |       \  }	}
d	|v r|	d
d  |	d   z   }		 t
        j                  j                  ||      5 } ||	i |
}t
        j                  j                  j                  j                  |       d d d        j&                  }|dk  ry |dz  }|S # 1 sw Y   "xY w# t         $ r}t"        j%                  |       Y d }~y d }~ww xY w)Nr3   r    r.   r/   r   )_resolve_process_grouprH   zcuda:all_gather_into_tensor_outr   )groupdevice     @@)r+   getattrrJ   r]   rK   r$   distributedget_rankr`   evalr   _time_estimatorops_c10d_functionalwait_tensordefault	Exceptionloginfoestimated_time)snodekernelpy_kernel_namer]   pg_namepgrankr`   fnargskwargstime_estimatorweest_time_usest_time_mss                   r   /estimate_nccl_collective_runtime_nccl_estimatorr}      sh   ZZFV%92>NN*.>..PI""2&G		(B!!**2.D \\E$.)F	n	B$U+LD& $~5ABx$q'!	..V / 
 	>D#F#AII&&22::1=		> !//K Q#K!	> 	>
  s0   $!D. <D"D. "D+'D. .	E7EEc                    t        |       }|dz  dz  dz  }d}t        |       }t        j                  ||z        }|}|dk  ryt        j
                  }t        j                  }t        |       }	t        j                  j                  j                  }
t        j                  j                  j                  }t               }|dk  r|dz
  nd}|dk(  r|nd}t        |   |   }|dk(  r|
n|}d}||z  }t!        |||dkD  s|	t"        j$                  k(  rdndz        }|	t"        j$                  k(  r	d|dz
  z  }nC|	t"        j&                  k(  r	d|dz
  z  }n'|	t"        j(                  t"        j*                  fv r|dz
  }d|z  z  }||z  }|d	z  }t,        j.                  }|	t"        j$                  k(  r|dkD  rd|z  }n9d}n6|	t"        j(                  t"        j*                  t"        j&                  fv r|dz
  }t0        |   |   }t2        |   |   |   }t2        t,        j4                     |   |   }d
}|dkD  rd}t7        ||      }||z
  |z  ||z  z   z  }|dz  }||z  }||z   }|dz  } | S )a:  
    Returns estimated NCCL collective runtime in milliseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r   g      ?gUUUUUU?rZ   g    eAg        ra   g    .A)rE   rM   mathceilrT   rV   rX   rY   r5   r$   	_inductorconfigintra_node_bwinter_node_bwr*   llMaxBwsminr   r   r   r   r   rO   rP   baseLathwLatrR   max)!r+   tensor_storage_size_bytestensor_storage_size_GBnum_gpus_per_node
group_sizenNodesnRanks	nccl_algo
nccl_protocollbwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss!                                    r    estimate_nccl_collective_runtimer      s    !@ E6=DtK *40JYYz$556FF{ IJt$D
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	%%	%fqj!	)**I,@,@A	A! 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@)BVBVW	Wqj i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L	
	"B	cBIr   )&	functoolsloggingr   enumr   typingr   r;   r$   r    r   r%   r   r   r	   virtualizedr
   	getLoggerr   rl   r   r   	lru_cacher*   IRNoder5   r=   rE   rM   rO   rT   rX   r   r   r   floatr}   r   r   r   r   <module>r      sx           C C  g! g  
&o 
& 
&Jbii JI J$
")) 
 
@BII @# @g  
  	
 		  
	 
	 
		,,)huo )Xf299 f fr   