
    pi                     $   d dl Z d dlZd dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZE 	 d dlFZF eeFj                        ZHdZI ej                  eL      ZMej                  j                  ZOej                  j                  ZP e6deDej                  j                  eHdk\  rdnddd      ZS e6d eEd!"      ZTd#ZUd$ZVd%ZW e6d&eEeWeUz   eVz   "      ZXe j                  d'        ZZ e4ej                  d(eOj                  j                  )      Z] e4ej                  d*eOj                  j                  )      Z_ e4ej                  d+eOj                  j                  )      Za e4ej                  d,deOj                  j                  -      Zd e4ej                  d.eOj                  j                  )      Zfd/ Zgdddd0d1ZhdOd3Zi e4ehd      Zjd4 Zk G d5 d6e%      Zl el       Zm G d7 d8e%      Znd9 Zod: Zp end;d<eo      Zq end=d>ep      Zr e1eOj                  d?      dd@dA       Zs e1eOj                  d?      dd@dB       Zt e1eOj                  d?      ddddCdD       Zu e1eOj                  d?      dddEdF       Zv e/eOj                  j                  e0        e1eOj                  j                  d?      	 	 	 	 	 dPdG       Zwe j                  dHeex   d2eyfdI       ZzdJ Z{	 	 dQdKeex   fdLZ|dM Z}dN Z~y# eJ$ r  ed      ZHdZIY w xY w)R    N)AnyOptional)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)gen_best_config)V)make_fx)TorchVersion   )config)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphChoiceCallerSubgraphTemplate)BufferChoiceCallerFlexibleLayout	is_tritonLayout)MMKernelInputs)add_layout_constraintconstrain_to_fx_stridesregister_lowering)autotune_select_algorithmExternKernelChoicerealize_inputsTritonTemplate)	_use_cutlass_for_opuse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choiceuse_triton_templateuse_triton_tma_template   )_is_static_problemmm_argsmm_gridpersistent_mm_gridTz0.0.0Fmmz3.3.0aK
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and (M >= BLOCK_M and K > 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and (N >= BLOCK_N and K > 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}

        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
a
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmaa  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    {%- if TMA_EXPERIMENTAL_API %}
    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    {%- else %}
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K] if A_ROW_MAJOR else [K, M],
        strides=[stride_am, 1] if A_ROW_MAJOR else [stride_ak, 1],
        block_shape=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[K, N] if B_ROW_MAJOR else [N, K],
        strides=[stride_bk, 1] if B_ROW_MAJOR else [stride_bn, 1],
        block_shape=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        {%- else %}
        a = tl.load_tensor_descriptor(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
        )
        b = tl.load_tensor_descriptor(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
        )
        {%- endif %}
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

)r4   r5   r6   a  
@triton.jit
def load_scales(a_scale_ptr, b_scale_ptr, SCALING_ROWWISE: tl.constexpr):
    if SCALING_ROWWISE:
        # For row-wise scaling, we'll return the pointers
        return a_scale_ptr, b_scale_ptr
    else:
        # For per-tensor scaling, we'll load the scalar values
        a_scale = tl.load(a_scale_ptr)
        b_scale = tl.load(b_scale_ptr)
        return a_scale, b_scale
a'  
@triton.jit
def apply_scaling(
    accumulator,
    a_scale,
    b_scale,
    SCALING_ROWWISE: tl.constexpr,
    offs_cm,
    offs_cn,
    M,
    N,
    stride_a_scale_m,
    stride_b_scale_n,
):
    if SCALING_ROWWISE:
        # For row-wise scaling, we need to load the scales for each row/column
        a_scales = tl.load(
            a_scale + (offs_cm * stride_a_scale_m),
            mask=offs_cm < M,
            other=0.0,
        )
        b_scales = tl.load(
            b_scale + (offs_cn * stride_b_scale_n),
            mask=offs_cn < N,
            other=0.0,
        )
        acc_scale = a_scales[:, None] * b_scales[None, :]
    else:
        # For per-tensor scaling, we can directly use the loaded scalar values
        acc_scale = a_scale * b_scale

    return accumulator * acc_scale
a  
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    if SCALING_ROWWISE:
        stride_a_scale_m = 1
        stride_b_scale_n = 1
    else:
        stride_a_scale_m = 0
        stride_b_scale_n = 0

    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

    {%- if TMA_EXPERIMENTAL_API %}
    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K],
        global_size=[M, K],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_N, BLOCK_K],
        global_size=[N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    {%- else %}
    stride_am = {{stride("A", 0)}}
    stride_bn = {{stride("B", 1)}}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K],
        strides=[stride_am, 1],
        block_shape=[BLOCK_M, BLOCK_K],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[N, K],
        strides=[stride_bn, 1],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    pid_m = 0
    pid_n = 0
    offs_am = 0
    offs_bn = 0

    num_pid_in_group = GROUP_M * num_pid_n
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    a_scale, b_scale = load_scales(A_inverse_scale, B_inverse_scale, SCALING_ROWWISE)

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            group_id = tile_id // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (tile_id % group_size_m)
            pid_n = (tile_id % num_pid_in_group) // group_size_m

            offs_am = pid_m * BLOCK_M
            offs_bn = pid_n * BLOCK_N

        offs_k = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr, [offs_am, offs_k], [BLOCK_M, BLOCK_K],  A.dtype.element_ty
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr, [offs_bn, offs_k], [BLOCK_N, BLOCK_K],  B.dtype.element_ty
        )
        {%- else %}
        a = tl.load_tensor_descriptor(a_desc, [offs_am, offs_k])
        b = tl.load_tensor_descriptor(b_desc, [offs_bn, offs_k])
        {%- endif %}
        if USE_FAST_ACCUM:
            accumulator = tl.dot(a, b.T, accumulator)
        else:
            accumulator += tl.dot(a, b.T)

        if ki == k_tiles - 1:
            # Apply inverse scaling
            offs_cm = offs_am + tl.arange(0, BLOCK_M)
            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
            # Apply scaling
            accumulator = apply_scaling(
                accumulator,
                a_scale,
                b_scale,
                SCALING_ROWWISE,
                offs_cm,
                offs_cn,
                M,
                N,
                stride_a_scale_m,
                stride_b_scale_n,
            )

            idx_m = offs_cm[:, None]
            idx_n = offs_cn[None, :]
            mask = (idx_m < M) & (idx_n < N)
            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
scaled_mm_device_tmac                     t        |       S N)r"   )fns    [/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicer?   &  s    b!!    z
at::mm_out)op_overloadzat::addmm_outzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantrA   zat::_scaled_mm_outc                 b    | j                         t        j                  t        j                  fv S r<   )	get_dtypetorchint8uint8)mats    r>   _is_int8_matrI   A  s     ==?uzz5;;777r@   outalphabetac                    | j                  d      dk(  r| j                  d      dk7  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r.   rJ   )stridesizerE   addmm)inpmat1mat2rK   rL   rM   s         r>   
bias_addmmrU   E  sh     	

1sxx{a/CHHQK14D{{3q643e$OO;;sD$Cu4HHr@   returnc                 X    dt         fd}dt         fd}dt         fd}t        j                   | j                               xs  | j	                                fd       t        j                   |j                               xs  |j	                               fd       y )NrV   c                 \    t         j                  j                  j                  | d   d      S )Nr.   r   graphsizevarsstatically_known_equalsrO   s    r>   is_row_majorz.check_supported_striding.<locals>.is_row_majorQ  #    ww77q	1EEr@   c                 \    t         j                  j                  j                  | d   d      S Nr   r.   rY   r]   s    r>   is_col_majorz.check_supported_striding.<locals>.is_col_majorT  r_   r@   c                     t        t        j                  j                  j	                  | d   d      xs- t        j                  j                  j	                  | d   d            S ra   )boolr   rZ   r[   r\   )rP   s    r>   has_zero_dimz.check_supported_striding.<locals>.has_zero_dimW  sQ    GG44T!Wa@ Dww77QC
 	
r@   c                  *    d j                          S )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   r>   <lambda>z*check_supported_striding.<locals>.<lambda>`      6u7G7G7I6JK r@   c                  *    d j                          S )Nz$mat_b must be col_major, got stride rg   )mat_bs   r>   rj   z*check_supported_striding.<locals>.<lambda>f  rk   r@   )rd   rE   _checkrh   get_size)ri   rm   r^   rb   re   s   ``   r>   check_supported_stridingrp   P  s    F FF F
d 
 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKr@   c                    | j                   d   }|j                   d   }| j                   d   }||z  }|}t        j                  | j                  |||      d      }|j                  |||      }	t        j                  ||	t        j
                        }
t        j                  |
d      }|j                  | j                        S )Nr   r.   )r.   r   r   	out_dtype)	shaperE   permutereshapebmmfloat32sumtodtype)abk_splitsmnkk_partsB
a_reshaped
b_reshapedresultreduced_bufs               r>   
decomposeKr   m  s    	
A	
A	
A8mGAqyyAw7CJ1gq)JYYz:GF))FA&K>>!''""r@   c                   @     e Zd Z fdZdee   dededef fdZ	 xZ
S )DecomposeKSugraphTemplatec                 &    t         |   d       y )Ndecompose_kr4   )super__init__)self	__class__s    r>   r   z"DecomposeKSugraphTemplate.__init__|  s     	 	
r@   input_nodeslayoutk_splitrV   c                     ddl m} ddlm} d| d}d|} |       5   |       }t	        t        j                  t        |      |      }	t        
| %  ||||	|	      cd d d        S # 1 sw Y   y xY w)
Nr   enable_python_dispatcherr   select_decomp_tabledecompose_k_mm__splitzk_split=)r~   r4   r   r   make_fx_graphdescription)
torch._dispatch.pythonr   decompositionr   r   	functoolspartialr   r   generate)r   r   r   r   r   r   r4   r   decompositionsr=   r   s             r>   r   z"DecomposeKSugraphTemplate.generate  s     	D7 	0!
m%' 	02N!!*w?B
 7#' ' $ 	 	 	s   A A**A3)__name__
__module____qualname__r   listr   r   intr   r   __classcell__r   s   @r>   r   r   {  s<    

&\  	
 
 r@   r   c                   J     e Zd Zdededef fdZdee   dede	f fdZ
 xZS )	ContiguousTemplater4   r   r=   c                 P    || _         || _        || _        t        |   |       y )Nr   )r4   r   r=   r   r   )r   r4   r   r=   r   s       r>   r   zContiguousTemplate.__init__  s.    	& 	 	
r@   r   r   rV   c                     ddl m} ddlm}  |       5   |       }t	        | j
                  |      }t        |   | j                  |||| j                        cd d d        S # 1 sw Y   y xY w)Nr   r   r   r   r   )
r   r   r   r   r   r=   r   r   r4   r   )r   r   r   r   r   r   r=   r   s          r>   r   zContiguousTemplate.generate  sp    
 	D7%' 	02NB
 7#YY'  ,, $ 	 	 	s   AA$$A-)r   r   r   strr   r   r   r   r   r   r   r   r   s   @r>   r   r     sG    
S 
s 
 
&\  
	 r@   r   c                 J    t        j                  | |j                               S r<   )rE   r3   
contiguous)r|   r}   s     r>   contiguous_mmr     s    88Aq||~&&r@   c                 L    t        j                  | ||j                               S r<   )rE   rQ   r   )rR   r|   r}   s      r>   contiguous_addmmr     s    ;;sAq||~..r@   r   zcontiguous mmr   zcontiguous addmm)type_promotion_kindr   c                t	   t        | ||      \  }}}}} }t        |      \  }}d}t        | |g      }	t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       |}
t        j                  s<t        j                  s,t        |j                  |j                  |j                        }
g }t               r6|j                  t         j"                  j%                  |	|
t&        gd             t        |      \  }}|rt)        |d	
      r|j                  t         j"                  j%                  |	|t*        gd             t-        | |      r6|j                  t         j"                  j%                  |	|t.        gd             t1        |||      r6|j                  t         j"                  j%                  |	|t2        gd             |j                  t         j"                  j%                  |	|t4        gd             |r>t7        ||||      r0t9        d      r%t;        j<                  |||	j?                                |r3tA        ||||      r%tC        jD                  |||	j?                                |r3tG        ||||      r%tI        jJ                  |||	j?                                tM        || |      r%tO        jJ                  |||	j?                                | |g}|rt)        |      rtP        jR                  jT                  jW                  |      rtY        |       rg }t               r|j[                  d       t]        |      }|j                  t         j"                  j%                  |	|t*        gd             t_        | |||||||ta               dd|      }tP        jR                  jT                  jc                  |      s*|#t]        |      dkD  r|D cg c]	  }||v s| }}n|d| }t        jd                  D ]:  }|j[                  tg        |      ji                  |	j?                         |             < d}tP        jR                  jT                  jj                  rtm        | |      }to        |||	j?                         ||      S c c}w )z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r3   aten_mm_infozaten.mm__r.   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sdevicer{   rP   Fcheck_max_autotune	extern_mmzmm-ahN
   )top_kalways_includedr   )best_config_future)8r0   r/   r   r   loginforD   inductor_configmax_autotunemax_autotune_gemmr   r   r{   rP   r&   extendr   choicesget_mm_configsaten_mmr,   mm_templater-   persistent_tma_mm_templater+   decompose_k_subgraph_templatemm_contiguous_subgraph_templater*   r%   r   add_cutlass_gemm_choicesnodesr'   r   add_ck_gemm_choicesr(   r   add_choicesr)   r   rE   	_inductorr   run_autoheuristicr   appendlenmm_autoheuristicr
   collect_autoheuristicexternal_matmulr?   bindremote_gemm_autotune_cacher   r!   )rS   rT   r   r   r   r   static_shape
is_nonzeror4   kernel_inputsaten_layoutr   r   r    num_choices_before_extra_configs
ah_choiceschoicer   s                     r>   tuned_mmr     sE    #*$V"DAq!VT41&9L*D #D$<0M ^xs!A3as3494HHY			 K((O,M,M$==6;;
 #%GII$$]K'DQ	
  2&9L*)&UKII$$]F[M4P	
 #4.NN		((!6,F+G "!Q*NN		((!6,I+JD
 	II$$v(G'H$	
 	 Aq1%66V]002	
 *61a;**7FM<O<O<QR/1a@&&w8K8K8MNVT40##!	
 ,K'OO""44T:dO """;/+.w<(II$$ 
	
 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD,, 
'*//0C0C0EvN	


  88,T48$-  Qs   	R5R5c          	      
   t        | ||t        j                        \  }}}}} }d}t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       t        |      \  }}|xr |xr t        ||||      }	g }
t        | |g      }t               r6|
j                  t        j                  j                  ||t        g|             |	r3t!        |      r(t#        j$                  |
||j'                         dd	       |rDt)        |dd
      r6|
j                  t        j                  j                  ||t*        g|             t-        ||
|j'                         |      S )Nr   rs   int_mmr   zaten._int_mm_r   r.   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sTfuseablenon_fuseableF)enable_int32r   )r0   rE   int32r   r   r   rD   r/   r*   r   r&   r   r   r   r   aten__int_mmr%   r   r   r   r,   r   r!   )rS   rT   r   r   r   r   r4   r   r   use_cutlassr   r   s               r>   tuned_int_mmr   k  s    #*d6U[[#Aq!VT4 D^}QCq1QC89Q>9HH^			  2&9L*W:W2FvqRSUV2WK"$G #D$<0MII$$		
 *4066V]002TPT	
 )Te 	II$$]F[M4P	
 %T7M4G4G4I6RRr@   )rL   rM   r   c          	         t        ||| |      \  }}}}}}}	t        |      \  }
}d}t        |	||gt        ||            }g }t        d   d| d| d| xx   dz  cc<   t
        j                  d	||||j                         |j                         |       |}|r t        j                  st        j                  sd
dlm}m} t        ||      r) ||j                  |j                   |j"                        }t        | ||gt        ||            }|j%                  t&        j(                  j+                  ||t,        g|             t/        |||j1                         |      S t3               rl|j%                  t&        j(                  j+                  ||t4        g|             |j%                  t&        j(                  j+                  ||t,        g|             |rt7        |d      r|j%                  t&        j(                  j+                  ||t8        g|             t;        ||      r6|j%                  t&        j(                  j+                  ||t<        g|             |j%                  t&        j(                  j+                  ||t>        gd             |rEtA        ||||      r7tC        |      r,tE        jF                  |||j1                  g d      ||       |r=tI        ||||      r/tK        jL                  |||j1                  g d      ||g d       tO        |||      r)tQ        jR                  |||j1                         ||d       t/        |||j1                         |      S )zb
    Lowering for autotuning aten.addmm with different backends (Aten, Triton, CUTLASS, etc.)
    r   rQ   )rL   rM   )scalarsr   zaten.addmm_r   r.   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   r   Fr   )r.   r   r   )reorder)r   r   r.   )rL   rM   input_reorderT)rL   rM   has_bias)*r0   r/   r   dictr   r   r   rD   r   r   r   torch._inductor.irr   r   
isinstancer   r{   rP   r   r   r   r   
aten_addmmr!   r   r&   aten_bias_addmmr,   r   r-   r   "addmm_contiguous_subgraph_templater*   r%   r   r   r'   r   r   r)   r   r   )rR   rS   rT   rL   rM   r   r   r   r   inp_expandedr   r   r4   r   r   r   r   r   s                     r>   tuned_addmmr     sJ    18dCPV0W-Aq!VT41&9L*D"	tT"Du4,HM #%G ^{1#Qqc1#671<7HH\			 K))_-N-N 	Cfk*(}}FLLv{{K '$t%d'C
 	II$$		
 )w8K8K8MvVVII$$ !		
 	II$$		
 )&UK 	II$$		
 #4.NN		((!/0	 	II$$34		
 	 Aq1%66 	2	
 *61a;** 	2#		
 VT40##!	
 %T7M4G4G4I6RRr@   )rs   r   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r6t        ||||      r(t        d      rt!        j"                  ||| ||gd	d	
       t%        d|| ||f|      S )Nr   )r#   r   )r   r.   z,out_dtype is ignored if layout is specified.rr   sparse_semi_structured_mmTr   ) torch._inductor.select_algorithmr#   ro   r   rZ   r[   check_equals_and_simplifyr   r   
get_devicerD   r&   aten__sparse_semi_structured_mmr   r*   r%   r   r   r!   )rS   	mat1_metarT   rs   r   r#   m1k1m2r   k2r   r   r   r   r   s                   r>   tuned_sparse_semi_structured_mmr
  -  su    @ +4DAD)T]]_FB EBMMOEB	222r:A	221r62>A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	
A
 Aq1 ;<66VdD)4tRV	
 %#WtY.Ev r@   c	                 ,   t        | |||      \  }	}
}}} }t        d   d|	 d|
 d| xx   dz  cc<   t        j                  d|	|
|| j	                         |j	                         |       d}t        | |       t        ||      \  }}|s| |||g}nt        |      }| ||||g}t        |dd	      }g }t               rS|j                  t        j                  j                  ||t        g|t        j                  t        ||
      i             |j                   t"        j$                  k7  rt'        ||||      S t)        |      \  }}|rt+        |dd      rt        |      }t-        | |      rJ|sH|j                  t        j                  j                  ||t.        g|t.        j                  |i             |j                  t        j                  j                  ||t0        g|t0        j                  |i             |r@t3        ||	|
|      r2t5        |      r't7        j8                  |||j;                         |       |r3t=        ||	|
|      r%t?        j@                  |||j;                                t'        |||j;                         |      S )a9  
    Performs an optimized matrix multiplication where scaling factors are applied
    to the inputs and/or output.

    Args:
        mat1 (Tensor): First input matrix
        mat2 (Tensor): Second input matrix
        scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
        scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
        bias (Tensor, optional): Optional bias tensor to add to the result
        layout: Layout hint for optimization

    Returns:
        Tensor: The result of the scaled matrix multiplication
    r   r   zaten._scaled_mm.default_r   r.   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s	scaled_mmr   )mat1_idxmat2_idx)rs   use_fast_accum)kwarg_overridesTF)enable_float8r   )USE_FAST_ACCUM)r  )!r0   r   r   r   rD   rp   r#   r   r&   r   r   r   r   aten__fp8_mmuidr   r{   rE   rx   r!   r/   r,   r-   scaled_mm_device_tma_templater   r*   r%   r   r   r   r'   r   r   )ri   rm   scale_ascale_bbiasscale_resultrs   r  r   r   r   r   r4   scale_a_realscale_b_realr   	bias_realr   r   r   r   
overriderss                         r>   tuned_scaled_mmr  a  s   8 %,uVy%!Aq!VUE ^7s!A3asCDIDHHi			 DUE*!/!AL, e\<@"4(	e\<K #;QGM"$GII$$ $$d"+N'! % 
	
 }}%(wVLL&v.MAz)du 8
 #5%0NN		((!23%B%F%F
$S )  	II$$!,* = % 	
 	 Aq1%66!)		
 *61a;**7FM<O<O<QR$T7M4G4G4I6RRr@   indexc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )rE   cudaget_device_propertiesmajor)r  propss     r>   _is_sm7x_or_older_gpur&    s)    JJ,,UZa8E;;!r@   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wr<   )r   r   ).0dims     r>   	<genexpr>zdims_are_int.<locals>.<genexpr>  s     4z#s#4s   )all)dimss    r>   dims_are_intr.    s    4t444r@   r   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )Nr   r   r   
mat1_dtypeT)is_categorical
mat2_dtyperS   rT   mat1_iscontigmat2_iscontigr3   )r   add_featurer   r{   r   is_contiguousr	   )	r   r   r   rS   rT   mat1_stridemat2_stridecontextr4   s	           r>   get_contextz%mm_autoheuristic.<locals>.get_context  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>r@   c                       y r<    r=  r@   r>   fallbackz"mm_autoheuristic.<locals>.fallback  s    r@   )r>  r   r   r:  r4   augment_contextprecondition)r   )get_size_hintsr.  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rS   rT   r   r   r   r   r4   r   opsr@  r   r   r8  r9  r;  r>  r:  autoheuristics         `           r>   r   r     s     T4Aq1GAq!Aq	"5dDAK& !Q4{KHG0!M 55? 6 
 	
 **,,r@   c                    t        |t              rt        |t              s^t        j                  j                  j                  | j                         t        j                  j                  j                        \  }}t        |t              rt        |t              s^t        j                  j                  j                  |j                         t        j                  j                  j                        \  }}|||fS )Nr>  )r   r   r   rZ   r[   
size_hintsro   rE   r   r   unbacked_symint_fallback)rS   rT   r   r   r   s        r>   rA  rA  '  s    aZ3%7!!,,MMO__++DD - 
A
 aZ3%7!!,,MMO__++DD - 
A a7Nr@   c                 d   | j                   j                  }|j                   j                  }||g}g }|D ]p  }t        |t              sMt        j
                  j                  j                  |t        j                  j                  j                        }|j                  |       r |d   |d   fS )NrH  r   r.   )r   rO   r   r   r   rZ   r[   rI  rE   r   r   rJ  r   )rS   rT   r8  r9  stridesstrides_hintsrO   s          r>   rB  rB  6  s    ++$$K++$$KK(GM %&#&WW%%00//HH 1 F 	V$% ]1---r@   )rV   N)NNNFN)NN)r   loggingtypingr   r   rE   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r	   r
   )torch._inductor.codegen.cpp_gemm_templater   *torch._inductor.remote_gemm_autotune_cacher   torch._inductor.virtualizedr   "torch.fx.experimental.proxy_tensorr   torch.torch_versionr    r   r   codegen.cuda.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   irr   r   r   r   r   r   r   loweringr   r   r    select_algorithmr!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   	mm_commonr/   r0   r1   r2   triton__version__triton_version
has_tritonImportError	getLoggerr   r   rE  atenprimsversionhipr   r   load_scalesapply_scaling
device_tmar  cacher?   r3   rK   r   rQ   r   _int_mmr   _sparse_semi_structured_mmdefaultr  
_scaled_mmr  rI   rU   rp   r   r   r   r   r   r   r   r   r   r   r   r   r
  r  r   rd   r&  r.  r   rA  rB  r=  r@   r>   <module>rt     s        ( T  F F ) 6 , ( U M D E H H * X X 
 
 
 P O!&"4"45NJ
 g!yy~~				T MM%.G*CQG	XFP (,"sZx ,		@D L FJ
Z !/		#m3!  " " UXX|
M	KKdjjnn
 "	MM$$,,2B2B #5	$$$//77	#  "	*8K8K
8 (,11 I4 %Z6#  0  F !: ; ) D'/ #5_m#  &8*,<& "
 4775#' T 6Tn 4<<T:'+ .S ;.Sb 4::48*+!D LS 9LS^ 422M(,T- N-` doo--/F G 4??**E 
{S F{S| # 4  
5  :- C=:-z.A(  !'*NJs   :M< <NN