
    pio                     V   d dl Z d dlmZmZ d dlZd dlmc mZ d dl	m
Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZmZm Z  ddlm!Z!m"Z" ddl#m$Z$m%Z%m&Z& dejN                  dej(                  dej(                  dej(                  dej(                  de(e)eej(                  ejT                  f   eeej(                  ejT                  f      f   fdZ+de)de%de%dee%   dee%   dee%   dee%   de%fdZ,	 	 	 	 d#dede-e   de-e   fd Z.d!e._/        d" Z0y)$    N)OptionalUnion)mm_args   )configir)CppGemmTemplate)CppGroupedGemmTemplatecreate_epilogue_with_attr)	TensorBox)addadd_needs_realized_inputsatenpermuteregister_loweringto_dtypeview)autotune_select_algorithmChoiceCallerExternKernelChoice)use_aten_gemm_kernelsuse_cpp_gemm_template)opsOpsValueVW_tensorpacked_weightx_scalex_zpw_scalereturnc                 R   d }t        d |||fD              }|r#t        j                  j                  |j	                            t        j                  j                  |j	                            z  }t        j                  j                  ||j	                         dz         }t        j                  | j                  t        j                        d      }t        j                  j                  |j	                            }	||z  |	z  }t        j                  j                  ||j	                         dz         }
nft        j                  | j                  t        j                        d      }t        j                  j                  ||j	                         dz         }
||
|fS )Nc              3   8  K   | ]  }t        |t        j                        xrr |j                         t        j
                  j                  v xrF t        |j                  d       xr. t        |j                  j                  t        j                          yw)dataN)

isinstancer   r   get_namer   graph	constantshasattrr%   ConstantBuffer).0items     b/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/torch/_inductor/mkldnn_lowerings.py	<genexpr>z+create_int8_compensation.<locals>.<genexpr>,   sy      *
 	 	4& 	:MMOqww000	:DIIv&	: tyy~~r'8'89	:*s   BB_x_w_compensnamer   )dim_BMatrixCompens)
allr   r(   r)   r'   add_tensor_constanttorchsumtofloat)r   r   r   r    r!   	x_w_scaleuse_int8_fast_compensation_pathx_w_scale_tensorweight_compens_tensorx_zp_tensorweight_compenss              r.   create_int8_compensationrA       s|    JNI&) *
 dG,* '# 'GGg..01gg 0 0 234 	 GG//'')N: 0 
	 !&		(++ekk*B Jgg''8 58H H; V44!''),== 5 

 !&		(++ekk*B J44!''),== 5 

 	(     r<   input_weight_compo_x_scale_x_zp_w_scale
_x_w_scalec                 Z   | r,t        j                  t        j                  ||      |      }|S t        j                  t        j                  ||      |      }t        j                  |t        j                  t        j                  t        j                  ||      |      |            }|S N)r   submul)r<   rC   rD   rE   rF   rG   rH   temps           r.   'codegen_int8_gemm_template_compensationrN   P   s     'wwGG 
H K9 wwGG 
 wwGGGG    	
 KrB   xwbc           	         | j                         }t        |      dkD  rt        | d|d   g      } t        |      }t        j                  st        j
                  sJ |D 	cg c]%  }	|	|	nt        j                  j                  |	      ' }}	g }
t        | t        |d   ddg      |      ^ }}} }|D 	cg c]  }	|	d u c}	dd t        j                  t        |      |       d}| g|}|j                  |D 	cg c]  }	|	|		 c}	       t        j                   |
||fi | t        |
      dk7  sJ t#        d|
||      }|j$                  j$                  }t        |      D cg c]   }t        j&                  ||t(        |fg      " }}t        j*                  |d   j-                         	      |_        ||_        t        |      D cg c]$  }t        j2                  j5                  ||         & }}t        |      dkD  r>t        |      D ]0  }t        ||   g |d d ||   j                         d         ||<   2 |S c c}	w c c}	w c c}	w c c}w c c}w )
N   r   r   layoutT)has_biastrans_wepilogue_creatoract_mappinggrouped_gemm)device)get_sizelenr   r   max_autotunemax_autotune_gemmr   ExternKernelrealize_inputr   r   dictfromkeysrangeextendr
   add_choicesr   r%   MultiOutputlistMultiOutputLayout
get_devicerV   outputsr   create)rO   rP   rQ   attrscalars	algorithmrV   x_sizenum_gemmbiaschoices_kwargsinput_nodesresulttemplate_bufgemm_idxreturn_bufsreturn_tensorss                      r.   grouped_gemm_loweringr}      su    ZZ\F
6{QR$%1vH&":":::STU42??#@#@#F	FUAU"$Gq'!A$A"7GQ1 344$T%4 }}U8_a8	F 'q'K?d.>?@&& 	 w<1&	F ;;##L h 	v|tX.>-?@K  ..k!n6O6O6QRL&LCH?7?K12N  6{Qh 	H'+x(G&"+G~h7@@B2FG(N8$	
 [ 	V 5 @"s$   !*I5I
9II%%I)ITc            !         t         j                  j                  rYddlm t        t         j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  ddj                  j                        t        t         j                  j                  j                  j                  ddj                   j                        t         j                  j                  j"                  t         j                  j                  j$                  t         j                  j                  j&                  t         j                  j                  j                  t(        j*                  j,                  t         j                  j                  j.                  g} t1        t         j                  j                  j"                        dt2        dt2        d	t2        ffd
       }t1        t         j                  j                  j"                  j                        dt2        dt2        dt2        d	t2        ffd       }t1        t         j                  j                  j$                  j                        dt2        dt2        dt2        d	t2        ffd       }t1        t         j                  j                  j                        	 d3dt2        dt2        dt2        ffd       }t1        t         j                  j                  j                  j                        	 d3dt2        dt2        dt2        dt2        ffd       }t1        t         j                  j                  j&                        dt2        dt2        d	t2        ffd       }t1        t(        j*                  j,                        dt2        dt2        dt2        dt2        dt2        dt2        dt2        dt4        dt6        t8           dt8        dt8        dt8        dt4        d t4        d!t4        d"t4        f fd#       }t1        t         j                  j                  j.                  d $      dt2        d%t2        d&t2        d't2        d	t2        f
fd(       }t1        t         j                  j                  j:                  j                  d $      t1        t         j                  j                  j:                  j<                  d $      dt2        d%t2        d&t2        d't2        d)t2        d	t2        ffd*              }	t1        t         j                  j                  j                  d $      	 d3dt2        d%t2        d&t2        d't2        d	t2        f
fd+       }
t1        t         j                  j                  j                  j                  d $      t1        t         j                  j                  j                  j<                  d $      	 d3dt2        d%t2        d&t2        d't2        d,t2        d	t2        ffd-              }t         j                  j>                  rt        t         j                  j@                  jB                  d.djD                  j                        | jG                  t         j                  j@                  jB                         t1        t         j                  j@                  jB                        d d/dt2        d0t2        d1t2        dtH        t2           ffd2       }tK        |        y y )4Nr   )	mkldnn_irzmkldnn::_linear_pointwiseF)has_out_variantkernel_creatorzonednn::qlinear_pointwiserO   weightrs   c
                 r    t        j                  
j                  j                  | |||||||||	
            S rJ   )r   rm   ConvolutionUnary)rO   r   rs   paddingstridedilationgroupsrn   ro   rp   r   s             r.   convolution_unaryz5register_onednn_fusion_ops.<locals>.convolution_unary   sJ     ##**11 rB   otherc                 x    t        j                  j                  j                  | |||||||||	|
||            S rJ   )r   rm   ConvolutionBinaryrO   r   r   rs   r   r   r   r   binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmr   s                r.   convolution_binaryz6register_onednn_fusion_ops.<locals>.convolution_binary  sS      ##++22 !# rB   c                 x    t        j                  j                  j                  | |||||||||	|
||            S rJ   )r   rm   ConvolutionBinaryInplacer   s                r.   convolution_binary_inplacez>register_onednn_fusion_ops.<locals>.convolution_binary_inplace'  sS      ##2299 !# rB   rP   rQ   c                 z   | j                         }t        |      dkD  rt        | d|d   g      } |t        j                  j                  |      }g }t        j                  st        j                  rnt        |ddg      }	t        | |	|      ^ }
}} }	t        || |	      r@fd}|d uddk(  rd n|d	}|g d
|d<   t        j                  |||| |gn| ||gfi | t        |      dk(  s
t               rAt              }|d |d<   |j!                   j"                  || |gn| ||g|fi |       |j%                         t&        j(                  j*                  v sJ dd i}t-        d||| |gn| ||g||      }t        |      dkD  r%t        |g |d d |j                         d         }|S )NrS   rT   r   r   rU   c                 "    t        |       S )Nro   rp   r   )bufrp   rn   ro   s    r.   rY   zJregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.epilogue_creator_  s    8w)  rB   TnonerW   rX   rY   )rS   r   r   input_indices)rn   ro   rp   Bc                 X    t         j                  j                  | j                            S rJ   r   r(   r)   r'   rO   s    r.   <lambda>zBregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.<lambda>      QWW..qzz|< rB   linear_unaryinput_gen_fnsr]   r^   r   r   ra   rb   r   r_   r`   r   r   r   r	   rg   r   rc   appendbindr'   r   r(   r)   r   )rO   rP   rQ   rn   ro   rp   rV   rq   rt   transposed_wru   rY   rv   r   rx   aten_mkldnn_linear_unarys      ```         r.   r   z0register_onednn_fusion_ops.<locals>.linear_unaryI  s    ZZ\F6{QR,-}OO11!4*,G""f&>&>&q1a&1.5af.U+FA|(LA %&TM#'$(FND8H	F }2;/#//"#)A!Q !	 7|q $9$;4IN9"&F3K1,11"#)A!Q ! ::<177#4#4444<M /)A!Q+F 6{Qf&Ks&KV__5Fr5J&KLMrB   yc                    | j                         }t        |      dkD  rt        | d|d   g      } j                         }t        |      dkD  rt        d|d   g      |t        j                  j                  |      }g }t        j                  st        j                  rnt        |ddg      }	t        | |	|      ^ }
}} }	t        || |	      r>fd}|d ud|d}|g d	ng d
|d<   t        j                  |||| |gn| ||gfi | t        |      dk(  s
t               rAt              }|d |d<   |j!                   j"                  || |gn| ||g|fi |       |j%                         t&        j(                  j*                  v sJ dd i}t-        d||| |gn| ||g||      }t        |      dkD  r%t        |g |d d |j                         d         }|S )NrS   rT   r   r   rU   c                      t        |       S )N)r   r   )r   rn   r   s    r.   rY   zKregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.epilogue_creator  s    8d!LLrB   Tr   )r   rS   r   )   r   rS   r   r   )rn   r   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.<lambda>  r   rB   linear_binaryr   r   )rO   r   rP   rQ   rn   rV   rq   y_sizert   r   ru   rY   rv   r   rx   aten_mkldnn_linear_binarys    `  `          r.   r   z1register_onednn_fusion_ops.<locals>.linear_binary  s,    ZZ\F6{QR,-ZZ\F6{QR,-}OO11!4*,G""f&>&>&q1a&118|Qv2.FA|Q )LAM %&TM#',<F <=9i,F?+#//%&YAq	Q1aL !	 7|q $9$;49"&F3K2-22%&YAq	Q1aL ! ::<177#4#4444<M /YAq	Q1aL+F 6{Qf&Ks&KV__5Fr5J&KLMrB   c                 t    t        j                  j                  j                  | |||||||||	|
            S rJ   )r   rm   ConvolutionTransposeUnary)rO   r   rs   r   output_paddingr   r   r   rn   ro   rp   r   s              r.   convolution_transpose_unaryz?register_onednn_fusion_ops.<locals>.convolution_transpose_unary  sM     ##33::" rB   w0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                     t        j                  t        j                  j                  j                  | |||||||||	|
|||||            S rJ   )pytreetree_mapr   rm   MkldnnRnnLayer)rO   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                   r.   mkldnn_rnn_layerz4register_onednn_fusion_ops.<locals>.mkldnn_rnn_layer  sc    & ??  ((//!! rB   )type_promotion_kindr   r!   w_zpc                    t        |      t        k(  sJ t        j                  j	                  t        j                  |t
        j                        d      }t        |      t        k(  sJ t        j                  j	                  t        j                  |t
        j                        d      }t        j                  j                  j                  | |||||||||	|
||||||            S )Ndtyper   r1   r    )typer:   r   r(   r6   r7   tensorfloat32intint32r   rm   QConvPointWisePT2E)rO   r   r    r   r!   r   rs   r   r   r   r   o_inv_scaleo_zero_pointoutput_dtypern   ro   rp   r   s                    r.   qconvolution_unaryz6register_onednn_fusion_ops.<locals>.qconvolution_unary  s    * =E)))gg11WEMM: 2 G :$$$77..T5F / D ##,,33!  # rB   accumc                    t        |      t        k(  sJ t        j                  j	                  t        j                  |t
        j                        d      }t        |      t        k(  sJ t        j                  j	                  t        j                  |t
        j                        d      }|dk(  rq|t
        j                  t
        j                  fv rO|j                         t
        j                  t
        j                  fv r|j                         |k7  rt        ||      }t        j                  j                  j                  | |||||||||	|
|||||||||||            S )Nr   r   r1   r    r8   )r   r:   r   r(   r6   r7   r   r   r   r   bfloat16	get_dtyper   r   rm   QConvPointWiseBinaryPT2E)rO   r   r    r   r!   r   r   rs   r   r   r   r   r   r   r   accum_scaleaccum_zpr   alphar   r   unary_algorithmmr   s                         r.   qconvolution_binaryz7register_onednn_fusion_ops.<locals>.qconvolution_binaryG  sF   > =E)))gg11WEMM: 2 G :$$$77..T5F / D
 u$ U]]ENN$CCOO%%--)HHOO%5 !5##2299!  !$- rB   c                 |  	
 |j                         t        j                  t        j                  fv sJ d       | j	                         }t        |      dkD  rt        | d|d   g      } t        t        j                        sYt              t        k(  sJ t        j                  j                  t        j                  t        j                         d      n^j#                          t%        d j	                         D              rt        g       t        j	                               dv sJ d	       Dt        j                  j                  t        j                  d
t        j&                        d      t        t        j                        sYt              t(        k(  sJ t        j                  j                  t        j                  t        j&                        d      nj#                          j+                         dk(  sJ d       |Dt        j                  j                  t        j                  d
t        j&                        d      }j#                          |j#                          |j                         t        j&                  k7  rt        t        j,                  j/                  |      t        j0                        rt        j                  j2                  |j5                            j7                  t        j&                        }t        j                  j                  t        j                  |t        j&                        |j5                               }d nj                         g }t8        j:                  st8        j<                  rt?        | ||	      ^ }}} }t        t        j,                  j/                  |      t        j0                        rNt        j@                  t        jB                  t        j                  j2                  |j5                                  t        j                  j2                  |j5                                  rtE        || |      rt        j                  j2                  |j5                            jG                         }tI        ||      \  
	fd}| j                         t        jJ                  t        j                  fv sJ tM        jN                  ||| ||gn| ||gd u|g dng d       t        |      d
k(  s
tQ               rLtS        	
      }d |d<   |jU                   jV                  | ||fn| ||f|fi |       |j5                         t        j                  j2                  v sJ d d d d d}t        t        j,                  j/                        t        j0                        rd |d<   t        t        j,                  j/                        t        j0                        rd |d<   tY        d|| ||gn| ||g||      }t        |      dkD  r%t        |g |d d |j	                         d         }|S )Nz=Only int8 and e4m3fn weights are supported by oneDNN qlinear.rS   rT   r   r   r1   c              3   &   K   | ]	  }|d k(    ywr   N r,   r3   s     r.   r/   zDregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<genexpr>       >Csax>   r   r   x_scale must be 0D or 1Dr   r    r   z(x_zp is incompatible with oneDNN qlinearr   rV   	out_dtypec                 Z  	
 t         j                  t         j                  t         j                  t         j                  fv sJ | j                         j                         d rJ j                         j                         j                         
j                         d j                         
f
d}t        j                  | j                         t         j                  || j                               }dk7  rt        |      }t         j                  k(  rM|j                         fd}t        j                  |j                         ||j                               }|S t         j                  t         j                  fv rzddlm |j                         		fd}t        j                  |j                         t        j                  |t!              t#              	      |j                               }|S )
Nc           	        
  |       }t        j                  |t        j                        }| d   f}d }d }d }s d      } d      } |      } |      }d }rJ  |      }t	        ||||||      }
y |      }	t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |	t        j                        }	t        j                  ||	      }|S NrT   r   r   r   r7   r   rN   r   r   )indexrC   weight_compens_indexrE   rF   rG   rD   rH   rM   _biasrs   
bias_dtypebias_loaderinput_loaderr<   w_scale_loaderweight_compens_loaderx_scale_loaderx_w_scale_loaderx_zp_loaders             r.   inner_fnz]register_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn  s   $0$7E %(LL$FE49"I<0'+H$(E'+H#B+9"+=(3B+9:N+O,ABV,WM)-J>'7'C C'C-=>R-S
#J ? % - ( % ( *$D  $/(34H(I'1emmU^^5T'T T'T#-#?,/LL,NE'*wwtU';#'KrB   r\   r   r   rangesr   r   c                 @     |       }t        j                  |      S rJ   r   r   r   rC   output_cast_loaderr   s     r.   inner_fn_cast_output_to_bf16zqregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16T      (:5(A'*||E<'H HrB   r   _create_constantsc                     |       } 	d|z  |t         j                        \  }}t        j                  ||z        |z   }
t         j                  k(  r 	ddt         j                        \  }}n 	ddt         j                        \  }}t        j
                  t        j                  ||      |      }t        j                  |
      S Ng      ?r   r      i   r7   r   r   rounduint8minimummaximumr   r   scale
zero_pointrC   	inv_scalevalqminqmaxclampedr	  r   requant_input_loaders            r.   inner_fn_requantzeregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_requantc  s    (<U(C8I$'%K5==9" 5	: '*ii	0A&BZ&O#/5;;#>1B()3emm2&JD$ 2C(,c2&JD$ +.++ckk#t6Ld*S'*||G\'J JrB   r  r  r7   r   r   r  int8make_loaderr   	Pointwiserk   r]   r   get_device_or_errorloweringr	  	functoolspartialr:   r   )input_bufferr   
output_bufr  r  r	  r   r   r  r  r   r   r   r   r   rp   rn   rs   r   o_scaler   r   ro   r<   r!   r@   r   r;   r    s        @@@@@@@@@@r.   rY   zKregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator  s   +!MM!NN!KK!JJ	0      (4'?'?'A0>0J0J0L-+/(:#,#88#8/8/D/D/F,)0)<)<)>)0)<)<)>&*&6&6&8&*+*.*:*:*<K'( '(R &(\\#/#:#:#<"'--%-#/#8#8#:	&
  6>)B *D'Y*J
 (5>>91;1G1G1I.I *,'1'E'E'G&2)E'1':':'<	*JN  *)C *ekk5::-FFC3=3I3I3K0K" *,'1'E'E'G&2)2):):$4*/./2</@*"
 (2':':'<	*J  *)rB   )r   r   r   rS         )   r   r   r   rS   r)  r*  rW   rY   r   )output_scaleoutput_zero_pointr   post_op_namepost_op_argspost_op_algorithmrs   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r   rB   )r   r)  r*  r+  c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>      QWW->->qzz|-L rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>  r7  rB   qlinear_unaryr   )-r   r7   r  float8_e4m3fnr]   r^   r   r&   r   r   r   r:   r   r(   r6   r   r   realizer5   r   r   	get_numelInputsKernelunwrap_storage_for_inputr+   r)   r'   r9   r   r_   r`   r   equal
zeros_liker   to_denserA   r  r	   rg   r   rc   r   r   r   )rO   r   r    r   r!   r   rs   r(  r   r   rn   ro   rp   rV   rq   w_zp_tensorrt   ru   r   rY   rv   r   rx   r   r<   r@   r;   aten_mkldnn_qlinear_unarys    `` ` ```````          @@@@r.   r9  z1register_onednn_fusion_ops.<locals>.qlinear_unary  s   " !**,U=P=P0QQ OQ ZZ\F6{QR,-gr||4G}---''55LL>Y 6  !>7+;+;+=>> #7B/G7++-.&8T:TT8|
 ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  >>#q(T*TT(
 | ww22LL%++6V 3  OOLLN~~5;;.:88>!!4
  gg//@CCEKKPww22LLEKK@t}} 3  "&4>>3CJ*,G""f&>&>/6}V|0,FA} @@F)) (():):4==?)KL))$--/: ,FA}E ww001G1G1IJSSUH 1 %	7&!{* {* {*z ;;=U[[%**,EEEE#//< GT='4H$wdS!%T!1)9< '92 7|q $9$;!(&2!-!%!(&/ <%)F6N2-22< GT='4H$wdS	
 ! !))+qww/@/@@@@<<<<	M 88A!!
 $Ma 88>!! $Ma .< GT='4@$wdK+F 6{Qf&Ks&KV__5Fr5J&KLMrB   x2c                 F  	
 ! | j                         }j                         }t        |      t        |      k(  sJ t        |      dkD  r'|dk(  r"t        | d|d   g      } t        d|d   g      t        t        j
                        sYt              t        k(  sJ t        j                  j                  t        j                  t        j                        d      n^j                          t        d j                         D              rt        g       t        j                               dv sJ d	       Dt        j                  j                  t        j                  d
t        j                         d      |Dt        j                  j                  t        j                  d
t        j                         d      }t        t        j
                        sYt              t"        k(  sJ t        j                  j                  t        j                  t        j                         d      nj                          j                          |j                          |j%                         t        j                   k7  rt        t        j&                  j)                  |      t        j*                        rt        j                  j,                  |j/                            j1                  t        j                         }t        j                  j                  t        j                  |t        j                         |j/                               }|dk(  r
t        j                  t        j2                  fv rPj%                         t        j                  t        j2                  fv r j%                         
k7  r't5        
      nj%                         
k(  sJ d       j%                          j%                         nd g }t6        j8                  st6        j:                  r|dk(  rt=        | ||
      ^ }}} }t        t        j&                  j)                        t        j*                        rt        j?                         j@                        d
k(  rZt        t        j&                  j)                  |      t        j*                        r"t        jB                  t        jD                  t        j                  j,                  |j/                                  t        j                  j,                  |j/                                  rtG        || |      rt        j                  j,                  |j/                            }|jI                         }tK        ||      \  !	
 !fd}tM        jN                  ||	| ||gn	| ||gd u|g dng d       t        |      d
k(  s
tQ               rRtS        	
||||
      }d |d<   |jU                   "jV                  	| ||fn	| ||f|fi |       |j/                         t        j                  j,                  v sJ d d d d}d |d<   tY        d|	| ||gn	| ||g||      }t        |      dkD  r*|dk(  r%t        |g |d d |j                         d         }|S )NrS   r   rT   r   r   r1   c              3   &   K   | ]	  }|d k(    ywr   r   r   s     r.   r/   zEregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<genexpr>  r   r   r   r   r   r    r   r8   zCdtype of accum for qlinear post op sum should be the same as outputr   c                   	
 t         j                  t         j                  t         j                  t         j                  fv sJ | j                         j                         j                         d rJ j                         j                         j                         
j                         d j                         
fd}t        j                  | j                         t         j                  || j                               }dk7  rt        |      }t         j                  k(  rM|j                         fd}t        j                  |j                         ||j                               }|S t         j                  t         j                  fv rddlm |j                         		fd}t        j                  |j                         t         j                  t        j                  |t!              t#              	      |j                               }|S )
Nc           	          |       } |       }d }d }d }| d   f}s d      } d      } |      }t        j                  |t        j                        } |      }d }rJ  |      }t	        ||||||      }	y |      }
t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |
t        j                        }
t        j                  |	|
      }	t        j                  t        j
                  fv sJ t        j
                  k(  r$t        j                  |t        j                        }t        j                  |	|      }	|	S r   r   )r   rC   _x2rE   rF   rG   r   rD   rH   rM   r   rs   r   r   r   r<   r   r   x2_dtype	x2_loaderr   r   r   s              r.   r   z^register_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_  s`   $0$7E"+E"2C'+H$(E'+H49"I<0#B+9"+=(3B+9:N+O$'LL$FE,ABV,WM)-J>'7'C C'C-=>R-S
#J ? % - ( % ( *$D  $/(34H(I'1emmU^^5T'T T'T#-#?,/LL,NE'*wwtU'; $,u~~/N#NN#N'5>>9&)ll3&F#&774#5D#'KrB   r   r   r   c                 @     |       }t        j                  |      S rJ   r  r  s     r.   r  zrregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16  r  rB   r   r  c                     |       } 	d|z  |t         j                        \  }}t        j                  ||z        |z   }
t         j                  k(  r 	ddt         j                        \  }}n 	ddt         j                        \  }}t        j
                  t        j                  ||      |      }t        j                  |t         j                        S r  r  r  s            r.   r  zfregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_requant  s    (<U(C8I$'%K5==9" 5	: '*ii	0A&BZ&O#/5;;#>1B()3emm2&JD$ 2C(,c2&JD$ +.++ckk#t6Ld*S'*||GU[['I IrB   r  r  ) r&  r   r'  r  r  r	  r   r   r  r  r   r   rK  r   r   r   rs   r   r(  r   r   r   r   r   r<   r!   r@   rD  rJ  r   r;   r    s         @@@@@@@@@@@r.   rY   zLregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creatorG  s	   +!MM!NN!KK!JJ	0      (4'?'?'A$&NN$4	0>0J0J0L-+/(:#,#88#8/8/D/D/F,)0)<)<)>)0)<)<)>&*&6&6&8&*+*.*:*:*<K-( -(^ &(\\#/#:#:#<"'--%-#/#8#8#:	&
 &/)B * *(5*:	*J (5>>91;1G1G1I.I *,'1'E'E'G&2)E'1':':'<	*JN  *)C *ekk5::-FFC3=3I3I3K0J" *,'1'E'E'G&+kk)2):):$4*/./2</@*"
 (2':':'<	*J  *)rB   )r   r   r   rS   r)  r*  r+  )   r   r   r   rS   r)  r*  r+  r,  )
r-  r.  r   other_scaleother_zpbinary_post_opr   unary_post_opunary_post_op_argsunary_post_op_algorithmrs   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r   rB   )r   r)  r*  c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>  r7  rB   rN  qlinear_binaryr   )-r]   r^   r   r&   r   r   r   r:   r   r(   r6   r7   r   r   r;  r5   r   r   r   r=  r>  r+   r)   r'   r9   r   r   r   r_   r`   r   
get_layoutsizer?  r@  r   rA  rA   r	   rg   r   rc   r   r   r   )#rO   r   r    r   r!   r   rD  rs   r(  r   r   x2_scalex2_zpr   r   r   r   r   rV   rq   x2_sizerB  rt   ru   r   rY   rv   r   rx   r   r<   r@   rJ  r;   aten_mkldnn_qlinear_binarys#    `` ` `````    ```           @@@@@r.   rY  z2register_onednn_fusion_ops.<locals>.qlinear_binary  s=   6 ZZ\FkkmGv;#g,...6{Q;%#7R,-"r72;/0gr||4G}---''55LL>Y 6  !>7+;+;+=>> #7B/G7++-.&8T:TT8|ww22LL%++6V 3  |ww22LL%++6V 3  dBLL1DzS(((ww22LLU[[9 3  
 OOLLN~~5;;.:88>!!4  gg//@CCEKKPww22LLEKK@t}} 3  e#MMNN$  lln(GG||~5
 &b,7<<>\9 ]9 ||~H-1-=)4J*,G##v'?'?&3:}b<40FA}b @@F)) DOO-223q8"@@F)) (():):4==?)KL))$--/: .faG ww001G1G1IJH'002H
 1 %	7&!F* F* F*P $//< GT='4L$wbRVW!%T!1)9  < '<5 7|q $9$;!(&2!- ("#.!&",'4,< <%)F6N3.33< GT='4L$wbRVW	
 ! !))+qww/@/@@@@<<<M
 #La . < GT='4D$wb$O+F 6{Q;%#7f&Ks&KV__5Fr5J&KLMrB   zmkl::_mkl_linearrU   packed_worig_wc                d   g }t         j                  st         j                  rMt        |ddg      }t	        | ||      ^ }}} }t        || |      rt        j                  ||| ||gdddg       t        |      dk(  s
t               r'|j                  j                  | ||f|d |             |j                         t        j                  j                  v sJ |j                         t        j                  j                  v sJ d d	 d
}	t!        d|| ||g||	      }
|t#        |
|      }
|
S )Nr   r   rU   TrS   )rX   r   )r   
batch_sizec                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>8      !2!21::<!@ rB   c                 X    t         j                  j                  | j                            S rJ   r   r   s    r.   r   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>9  re  rB   )r   rS   packed_linearr   )r   r_   r`   r   r   r   r	   rg   r^   r   r   r   r'   r   r(   r)   r   r   )rO   r`  ra  rQ   rc  rV   rt   r   ru   r   rx   aten_mkl_linears              r.   mkl_packed_linearz5register_onednn_fusion_ops.<locals>.mkl_packed_linear  sI    /1&&&*B*B#*6Aq6#:L29<3/Q< -VQE'33#"&1$(+,a& w<1$(=(?NN',,&16Tj -   ((*agg.?.????(AGG,=,==== A@! %>#&)"/% = ^FrB   rJ   )&r7   _C_has_mkldnn r   r   r   mkldnn_linear_pointwiseLinearUnaryrm   binaryLinearBinaryonednnqlinear_pointwiseQLinearPointwisePT2EQLinearPointwiseBinaryPT2E_convolution_pointwise_convolution_pointwise_ _convolution_transpose_pointwiser   r   defaultqconv_pointwiser   r   boolri   r   qconv2d_pointwisebinary_tensorhas_mklmkl_mkl_linearMKLPackedLinearr   r   r   )cpu_needs_realized_inputsr   r   r   r   r   r   r   r   r   r9  rY  ri  rh  r   r   r_  rC  r   s                @@@@@@r.   register_onednn_fusion_opsr     s   xx#5II..'!$0077	$
  %7II..55'!$1188	%
! %7II..'!$99@@	%
! &8II..55'!$??FF	&
" II33II44II==II..!!))II,,%
! 
599++BB	C			 	 
D	6 
599++BBII	J			 	 		 
K	B 
599++CCJJ	K			 	 		 
L	B 
599++==	> A	A	A	 A	 
?A	F 
599++==DD	EQU<	<	&<	+4<	9B<	 
F<	| 
599++LL	M			 	 
N	: 
40088	9&	&	&	 &	 	&	
 &	 &	 &	 &	 c&	 &	 &	 &	 &	  &	 &	  !&	 
:&	P 
599++;;QU	V1	1	 %	1	
 1	 1	 1	 
W1	f 
II..554

 
II..<<RV

F	F	 %	F	
 F	 F	 F	 F	



F	P 
599++==SW	X j	j	 %	j	
 j	 j	 j	 
Yj	X	 
II..554

 
II..<<RV

, '@	@	 %	@	
 @	 @	 @	 @	



@	D
 880		))" %(88??	O &,,UYY]]-F-FGuyy}}889 00#0 "0 I&	0 :0d 	"";<rB   )NNNN)1r$  typingr   r   r7   torch.utils._pytreeutils_pytreer    torch._inductor.kernel.mm_commonr   rl  r   r   codegen.cpp_gemm_templater	   !codegen.cpp_grouped_gemm_templater
   codegen.cpp_utilsr   r   r#  r   r   r   r   r   r   r   select_algorithmr   r   r   r   r   virtualizedr   r   r   Tensortupler{  ShapeAsConstantBufferrA   rN   ri   r}   _inductor_lowering_functionr  r   rB   r.   <module>r     s    "  $ $ 4  6 E 8    
 @ ) )-ll-<<- \\- ,,	-
 \\- 	",,00
01U2<<!9!99:;=-`.%).. . x 	.
 H. x . ". .j 
==I= I=@ 59  1DrB   