
    piǍ                     $   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlmZ ddlmZ  ej"                  d      Z	 dd	ZddZddZddZddZd Zi d
ddi ddgdfdZd Zd Z	 	 	 	 	 	 	 	 d dZi d
ddddddddddgfdZy)!zWeightOnly for onnxrt adaptor.    N)numpy_helper)np_dtype_to_tensor_dtype   )	ONNXModel)simple_progress_barneural_compressorc	           	      
   ||z  dz  }	t        j                  |j                  d   |	fd      }
| j                  d   d|d|z   }| j                  d   |g}g }i }d}|d	k(  r1|d
d
d
d
df   |d
d
dd
df   d	z  z  }|d
d
d
|	f   |
d
d
d
d
f<   n!|dk(  r|}
nt        j                  d| d       t        j                  |
d||	f      }
t        j                  |d|f      }|j                  t         j                  k(  s|j                  t         j                  k(  sJ t        j                  j                  | j                  d   dz   t        |j                        |j                  |j                         d      }|j                  |j                          |j                  |       |~|dk(  r|j#                  d      }n|d	k(  rt        j$                  |j                  d   dz   dz  dd      }t        j&                  |j                  d   |z  |z        j                  d      }|d
d
d   }|dd
d   }||dz     dz  ||   j)                         z  ||dz  <   ||dz     dz  ||   j)                         d	z  z  ||dz  <   nt+        d| d      t        j                  ||d   df      }t        j                  j                  | j                  d   dz   d|j                  |j                         d      }|j                  |j                          |j                  |       |d   |d<   |d   |d<   ||d<   ||d<   |dkD  r||d<   t        j                  j                  |d|
j                  |
j                         d      }|j                  |       t        j                  j,                  |f|| j.                  | j                   r| j                   dz   t1        |      z   ndt1        |      z   dd|}||fS )aB  Build MatMulNBits node.

    Args:
        node: original matmul node
        weight_shape: original weight shape
        num_bits (int): num_bits
        group_size (int): how many elements share one scale/zp
        k_blocks (int): block number
        q_weight (array): quantized weight
        scale (array): scale
        zero_point (array): zero point
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).

    Returns:
        matmul_weight_only_node: MatMulNBits node
        new_inits: initializers of the new node
       r   uint8dtyper   _QGMatMulNBits   N   z8MatMulNBits does not have kernel support for num_bits = ._scaleTname	data_typedimsvalsraw         _zpKNbits
block_sizeaccuracy_levelzcom.microsoft)inputsoutputsr   domain)npzerosshapeinputloggererrorreshaper   float32float16onnxhelpermake_tensorr   tobytesappendr   astypefullarangeravel
ValueError	make_nodeoutputstr)nodeweight_shapenum_bits
group_sizek_blocksq_weightscale
zero_pointr$   	blob_sizepackedq_weight_nameinput_names	new_initskwargsop_typeq_weight_pairsscale_tensor	packed_zpidxeven_idxodd_idx	zp_tensorq_weight_tensormatmul_weight_only_nodes                            x/opt/services/ai/voice_agent/venv/lib/python3.12/site-packages/onnxruntime/quantization/neural_compressor/weight_only.pymake_matmul_weight_only_noderW   ,   s   8 X%*IXXx~~a()4GDFJJqMbAj^$DDM::a=-0KIFG 1}!!SqS&)HQ1W,=,BB%a)m4q!t	QOPXzYZ[\ZZXy 9:F JJur8n-E;;"**$rzz(AAA;;**ZZ]X%*5;;7[[]]_ + L |(()\" q="))'2I]!1!1!!4q!8Q >7SI))J,,Q/8;hFGOOPRSC3Q3xH!$Q$iG(1(a-(@4(G:V^K_KeKeKg'gIh!m$'0A'>'E*U\J]JcJcJeijJj&kIgl#WX`WaabcddJJy<?B*?@	KK++A&!)//PYPaPaPcim , 
	 	9>>*# q/F3Kq/F3KF6N%F<#1 kk--\\^^ . O _%"kk3315TYYH-s8}@T  #I--    r       asymc           	      .   t        j                  | d|f      } |dk(  s|dk(  rd|z  dz
  }d}n'|dk(  r"|dk7  rd|dz
  z  dz
  nd}|dk7  r	d|dz
  z   nd}t        j                  | dd	      |z  }t        j                  | dd	      |z  }	|dk(  rt        j                  t        j
                  |      t        j
                  |	            }
t        j                  |	j                        }|
dkD  }|
|   d
z  j                  t         j                        z
  z  ||<   |dk(  rt        j                  |j                        n)t        j                  |	j                  d      d|dz
  z  z  }nt        j                  |	j                        }t        j                  |	|z
  ||	k7     j                         j                         D cg c]  }t        |      z
  z   c}      |||	k7  <   |dk(  r3t        j                  |j                        |z
  |z  j                         nit        j                  dt        j                   t        j                  |j                        |z
  |z  j                                     j                  d      }t        j"                  | |j$                        }t        j&                  | ||       t        j(                  |||       t        j                  ||       t        j*                  ||       |||fS c c}w )a	  Quantize tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   rZ   uintr   r   r   symTaxiskeepdimsg       @intr   r   out)r(   r.   minmaxmaximumabsonesr*   r6   float64r)   arrayflattentolistfloatroundminimum
empty_liker   divideaddclip)datar@   rA   schemer   ratiomaxqminqrminrmax	max_rangerD   maskrE   irC   s                   rV   quant_tensorr~      s     ::dR,-D5F?({Q	5*2a-qX\"Q&Q)1Qx!|$%B66$Q.6D66$Q.6DJJrvvd|RVVD\:	

#1} ,44RZZ@D4KPd%*e^BHHU[[!SZ9[_`empqeq_r9s 	 

# hh04tTT\/J/R/R/T/[/[/]^!U1X%^
ddl
 ~ hhu{{#d*e3::<Arzz$"((5;;2G$2NRW1W0^0^0`abiijqr 	 }}T5HIIdEx(FF8ZX.HHX8$GGHdDh/UJ&& _s   Lc                    t        j                  | d|f      j                  t         j                        } d|z  dz
  }d}t        j                  | dz  dd      }t        j
                  ||z        }t        j                  |t        j                  |             }t        j                  | dd      }t        j                  | dd      }	t        j                  |dd      }
t        j                  || z  dd      }t        j                  |	j                  | j                        }||	k7  }||z
  |	|   ||   z
  z  ||<   d|z  }t        j                  t        j                  || |z
  z        ||      }||z  |z   | z
  }t        j                  ||dz  z  dd      }d}d	}d}t        |      D ]  }t        j                  |	j                  | j                        }t        j                   |||z  z   |z   |z
  g      j                  | j                        d   }||	k7  }||	|   ||   z
  z  ||<   t        j                  t        j                  || |z
  z        ||      }||z  }t        j                  |dd      }t        j                  ||z  dd      }t        j                  || z  dd      }t        j"                  |
|z  |dz        }|
|z  ||z  z
  |z  }||z  ||z  z
  |z  }||z  |z   | z
  }t        j                  ||dz  z  dd      } t        j                   |       }!t        j                   |      }"t        j$                  |!|"k        d   }#||#d
d
f   ||#d
d
f<   | |#   ||#<   ||#   ||#<   ||#   ||#<    t        j                  | |z  j                         d|      j                  d      }$|j                  t         j&                        }t        j(                  | |j                        }%t        j*                  | ||%       t        j                  |%|$|%       t        j                  |%|%       t        j                  |%|||%       |%||$fS )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   r   r   r   Tr^   r      皙?Nr   rb   )r(   r.   r6   r/   sumsqrtrr   rg   rd   re   rh   r*   r   rs   rn   rangerj   subtractwhereri   rp   rq   )&rt   r@   rA   rw   rx   sum_x2av_xweightsry   rz   sum_wsum_xiscaler|   rD   
quant_datadiffbest_madnsteprdeltarrminis_
iscale_newfactorquant_data_newmul_weights_quant_data_newsum_lsum_l2sum_xlD
this_scalethis_minmadmad_1
best_mad_1idx_to_replacerE   rC   s&                                         rV   quant_tensor_k_quant_cpur      s    ::dR,-44RZZ@Dh;?DDVVD!G!d3F776J&'DffT266$<(G66$Q.D66$Q.DFF7T2EFF7T>D9EWWTZZtzz2F4<D4KDJd$;<F4LJE&D4K"894FJ:$t+Dvvga'a$?HEFEU| 8WWTZZtzz:
56C</$6=>?FFtzzRSTUt|!T$Z$t*%<=
4*t*D!EtTR%,~%="1DI2^C!VZ[2T9DQKKq1fnuu}49
UNUV^3q8N*X5<ffWtQw&Q>XXh'
%*"45a8(6~q7H(I
>1$%#&~#6  *> :n'7^184 D5E/002At<CCGLJLL$E}}T5HIIdEx(FF8ZX.HHX8$GGHdDh/UJ&&rX   c                 &	   	 ddl }ddl}|j                  j                         r|j	                  |       } | j                  d|f      j                  |j                        } d|z  dz
  }d}|j                  | dz  dd      }|j                  ||z        }|j                  ||j                  |             }	|j                  | dd      }
|j                  | dd      }|j                  |	dd      }|j                  |	| z  dd      }|j                  |j                  | j                         }|
|k7  }||z
  ||   |
|   z
  z  ||<   d|z  }|j#                  |j%                  || |
z
  z        ||      }||z  |
z   | z
  }|j                  |	|dz  z  dd      }d	}d
}d}t'        |      D ]  }|j                  |j                  | j                         }|j)                  |||z  z   |z   |z
  g      j                  | j                         d   }|
|k7  }|||   |
|   z
  z  ||<   |j#                  |j%                  || |
z
  z        ||      }|	|z  }|j                  |dd      }|j                  ||z  dd      }|j                  || z  dd      }|j+                  ||z  |dz        }||z  ||z  z
  |z  } ||z  ||z  z
  |z  }!| |z  |!z   | z
  }|j                  |	|dz  z  dd      }"|j)                  |"      }#|j)                  |      }$|j-                  |#|$k        d   }%||%ddf   ||%ddf<   |"|%   ||%<   | |%   ||%<   |!|%   |
|%<    |j#                  |
 |z  j%                         d|      j                  d      }&|j                  |j.                        }|j1                  | |j                         }'|j3                  | ||'       |j                  |'|&|'       |j%                  |'|'       |j#                  |'|||'       |'j5                         |j5                         |&j5                         fS t6        j9                  d       t;        | ||      S # t<        $ r% t6        j?                  d       t;        | ||      cY S w xY w)a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   Nr   r   r   Tr^   r   r   r   r   rb   zqTry to use k-quant quantization on CUDA. However, CUDA is not available.Fall back to k-quant quantization on CPU.zNow we are using k-quant quantization on cpu, which is time consuming.Please consider install cupy to speed up on CUDA. See https://cupy.dev/Please also install torch to check CUDA availability.) cupytorchcudais_availableasarrayr.   r6   r/   r   r   rr   rg   rd   re   rh   r*   r   rs   rn   r   rj   r   r   ri   rp   rq   getr,   warningr   ImportErrorinfo)(rt   r@   rA   cpr   rw   rx   r   r   r   ry   rz   r   r   r   r|   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rE   rC   s(                                           rV   quant_tensor_k_quant_cudar     s   ID::""$::d#D<<Z 0188DDh;?DDVVD!G!dV;F776J./DffT266$<0G66$Q66D66$Q66DFF7TF:EFF7T>DFAEWWTZZtzzW:F4<D 4KDJd,CDF4LJE&D4K*@!A4NJ:%,t3Dvvga/a$vGHEFEU| @WWTZZtzzWB
56C<#7$#>#E"FGNNtzzZ[\]t|#)T$Z$t*-D#E
4 !#*t2L)MtUY!Z-4~-E*9DQ :^ KRS^bc :T ATXYKKq9#fnuu}<A
"UNUV^;q@!N2X=DffWtQw.QfFXXh/
!#%**<!=a!@0>~q?P0Q
>1,-+.~+>((2>(Bn%'/'?^$1@4 D5E/!8!8!:AtDKKGTJLL,E}}T}=HIIdExI0FF8ZXF6HHX8H,GGHdDhG7<<>599;
0@@@NN< ,D(JGG DD	

 (h
CCDs   P=Q"  !Q" "+RRc                 z    | j                   }t        | |||||      \  }}}	t        j                  |||	z
  z  |      S )a  Quant dequant tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quant-dequant weight
    )r*   r~   r(   r.   )
rt   r@   rA   ru   r   rv   	org_shapeweightrD   zps
             rV   
qdq_tensorr   y  sC     

I$T8ZPUVFE2::ev{+Y77rX   c                     |dk(  r| S | j                   }||z  }||d   z
  }|dkD  rt        j                  | d|fdfd      } | S )a  Pad tensor rowi so that it can be is divisible by group_size.

    Args:
        weight (array): weight
        group_size (int): how many elements share one scale/zp
        k_blocks (int): the number of block

    Returns:
        weight: paded weight
    r   r   )r   r   constant)r*   r(   pad)r   rA   rB   org_w_shapepadded_rowspad_lens         rV   
pad_tensorr     sY     R,,KZ'KKN*G{!Wv 6
CMrX   CPUExecutionProviderk_quantc	                 t   t        |       } | j                  )t        j                  j	                  | j                        nd}	g }
g }t        | j                         D cg c]  }|j                  dv s| c}      }d}| j                         D ]n  }|j                  dv r|dz  }t        ||       |j                  dv s2| j                  |j                  d         Q|j                  |j                  i       dk7  sq| j                  |j                  d         }t        j                  ||	      j                         }t        |j                         dk7  r|j"                  }|j                  |v r6||j                     d	   }||j                     d
   }||j                     d   }|j                   }|dk7  r|n|d   }|d   dz
  |z  dz   }| j%                  |j                  d         }t'        |||      }|dk(  xs |dk(  }|r|dk(  rt)        |j*                  ||      \  }}}n;t-        |j*                  |||d|j                  |j                  d   d            \  }}}t/        ||||||j1                  d      |j1                  |      |dk(  s|dk(  r|nd|	      \  }}| j3                  |       |j5                  |       |
j5                  |       n	t7        |j*                  |||d|j                  |j                  d   d            }t9        j:                  ||d   df      }t9        j<                  |      }|d|d   ddf   j1                  |      }t>        j@                  jC                  |j                  d   d|d|z   tE        |      |j                   |jG                         d      }| jI                  |       |j                  |j                  d<   |dk(  s^| jK                  |       q | jM                  |
       | jO                  |       | jQ                          | S c c}w )a  Quant the model with round to nearst method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'RTN'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        ratios (dict, optional): percentile of clip. Defaults to {}.
        accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    N MatMulr   r   fp32)base_dirr   r"   rA   ru   r   r   r
   r   r\   r   rZ   	r>   r?   r@   rA   rB   rC   rD   rE   r$   ra   r   r   Tr   ))r   
model_pathospathdirnamelennodesrL   r   get_initializerr+   r   r   r   to_arraycopyr*   r   get_initializer_share_numr   r   Tr~   rW   r6   add_initializersr5   r   r(   r.   	transposer1   r2   r3   r   r4   add_initializerremove_initializer	add_nodesremove_nodestopological_sort)modelweight_configr@   rA   ru   ratiosr$   	providers	algorithmr   	new_nodesr   r}   	total_numcurr_idr>   weight_tensorr   r   r   rB   init_share_numsatisfy_MatMulNBits_conditionrC   rD   r   q_matmul_noderJ   rT   s                                rV   rtn_quantizer     s   H eE494D4D4Prwwu//0VXHILI1j1HQIJIG E8<<:%qLG	73LLJ&%%djjm4@!!$))R0F:!11$**Q-@M!**=8LQQSF6<< A%LLEyyM)(3F;*4995lC
&tyy1(; ,,K'1R'7[^J#A*z9A=H"<<TZZ]KN
H=F,4M,JX]),	)*CFHHhXb*c'HeR*6(J

SWS]S]^_S`bcHd+'HeR ,H!,%)%%__W5,,u-%+v%5i9OrUY#1
,(y &&y1##D)  /%fhh*feU[U_U_`d`j`jkl`mopUqr::hQ0DE<<1#$4k!n$4a$78??F"&++"9"9A2h\:.)II6u=!))+ #: # %%o6 / 4 4

1"((7KE8N 
OOI	|$	LY Js   P52P5c           
      0   | j                   }|dk7  rt        j                  | d|f      n| } t        j                  t        j                  t        j                  |       t        j
                  t        j                  |       dd      z  |      d      }|S )zGet the scale of weight.r   r   Tr^   r   r_   )r*   r(   r.   meanrg   re   )r   rA   r   rD   s       rV   get_weight_scaler     sn    I5?25ERZZZ 016FGGBJJrvvf~rvvf~AX\0]]_hipqrELrX   c                    ddl m} ddlm} t	        j
                         }t        j                  dk  r$ |d      rddlm	} |j                   |              | j                  r1t        j                  | j                  | j                  dz   d	d	d
       | j                  s0t	        j                   | j                  j#                         ||      n$t	        j                   | j                  dz   ||      }|j%                         D 	cg c]  }	|	j&                   }
}	~g }t)        |      D ]~  \  }	}|dk7  r|	dz   |j*                  z  |kD  r ||fS t-        |
      dk7  st/        |d   t0              r<t-        |d         t-        |
      k(  s"J dt-        |
       dt-        |d                 t/        |d   t0              rG|j3                  t1        |d   j5                         D cg c]  \  }}| ||      f c}}             t/        |d   t6        j8                        rA|j3                  t1        t;        |
|d   gd
      D cg c]	  \  }}||f c}}             ;|j3                  t1        t;        |
|d   d
      D cg c]  \  }}| ||      f c}}              ||fS c c}	w c c}}w c c}}w c c}}w )as  Prepare inputs for weight only quantization.

    Args:
        model (ModelProto or ONNXModel): onnx model
        n_samples (int, optional): calibration sample number. -1 means all samples.
        dataloader (object): dataloader for calibration.
        providers (list): providers to use

    Returns:
        inputs: prepared inputs.
        so: session options
    r   )	find_specr   )to_numpy)      onnxruntime_extensions)get_library_path_augment.onnxTFsave_as_external_dataall_tensors_to_one_fileconvert_attributer   r   zInput number mismatch, require z	 but get strict)importlib.utilr   utilr   ortSessionOptionssysversion_infor   r   register_custom_ops_libraryis_large_modelr1   
save_modelr   r   InferenceSessionSerializeToString
get_inputsr   	enumerate
batch_sizer   
isinstancedictr5   itemsr(   ndarrayzip)r   	n_samples
dataloaderr   r   r   sor   sessionr}   inputs_namesr%   rt   r   inp_datainps                   rV   prepare_inputsr  #  s~    )				B
'!i0H&I;
&&'7'9:KK."&$(#	
 ## 	U[[::<bIV!!%"2"2_"DbT]^ 
 %,$6$6$89qAFF9L9FZ( s4?Q**?*? ?9L 2: |!ZQ%>tAw<3|#44 1#l2C1DIcRVWXRYl^\4 d1gt$MM$QUVWQXQ^Q^Q`a~tXx'9 :abcQ,MM$SPTUVPWyaf=gh	cshijMM$s<Y]^_Y`inGop)$x} 5pqrs 2:% :  bhps   J1'J6
J<
K   {Gz?FTc
                 |  	#$%& d|z  dz
  $d#d%d&#$%&	fd}
| j                   } |
|       \  }}t        j                  |      dk(  }d|||f<   d| |ddf<   |rGt        j                  t        j                  |            ddd	   }| |ddf   } ||ddf   dd|f   }t        j                  |       }t        j                  |       }|t        j
                  t        j                  |            z  }t        j                  |d         }|||fxx   |z  cc<   t        j                  j                  t        j                  j                  |            j                  }|}t        d|d   |      D ]  }t        ||z   |d         }||z
  }t        j                  | ||ddf         }t        j                  |      }t        j                  |      }t        j                  |      }|||||f   }t        |      D ]   }||ddf   }|||f   }|d	k7  r(||z   |z  dk(  r |
| ||z   ||z   |z   ddf         \  }}|t        j                  t        j                   |ddt        j"                  f   |z        |z   d$      |z
  z  j%                         } | ||ddf<   || z
  dz  |dz  z  ||ddf<   || z
  |z  }!||dddfxx   t        j&                  t        j(                  ||d|f   d
      t        j(                  |!d
            z  cc<   |!||ddf<   # ||||ddf<   |dz  |||ddf<   | |dddfxx   t        j&                  ||d||f   |      z  cc<    |rt        j                        }"||"ddf   }t        j*                  || j                         }~ |S )a  Quant the weight with GPTQ method.

    Args:
        W (array): weight.
        H (array): Hessian matrix.
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        blocksize (int, optional): blocksize to quantize weight.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.

    Returns:
        Q: fake quantized weight
    r   r   d   g?g333333@c                    | j                   }s%t        j                  | j                         d      } t        j                  | j                   d         }t        j
                  t        j                  | d      |      }t        j                  t        j                  | d      |      }dk(  rLt        j                  t        j                  |      |      }|dk  }t        j                  |      r	||    ||<   |dk(  |dk(  z  }d||<   d||<   ||z
  z  }dk(  r)t        j                  |j                         dz   z  dz  }nt        j                  | |z        }r,t        j                  | j                   d   g      t        d      z  }t        t        z              D ]  }d|z  z
  }	|	|z  }
|	|z  }||
z
  z  }dk7  rt        j                  |
 |z        n|}t        j                   t        j                  | |z        |z   d      }|| z  }t        j"                  t        j                  |            }t        j$                  |d      }||k  }t        j                  |      s||   ||<   ||   ||<   ||   ||<    s1|d   }t        j&                  ||      }t        j&                  ||      }dgdgt)        |      dz
  z  z   }t        j*                  ||      }t        j*                  ||      }||fS )Nr   r   r   r]   r   r   inf)r*   r(   expand_dimsrk   r)   ro   rd   rf   re   rg   anyrh   rn   rm   r   ra   rs   powerr   repeatr   r.   )r   r   tmpxminxmaxrD   zerobestr}   pxmin1xmax1scale1zero1qerrr*   gridrw   	maxshrinkmsenorm
perchannelru   s                    rV   find_paramszgptq.<locals>.find_params~  s   LL	^^FNN$41=Fhhv||A'zz"&&a0#6zz"&&a0#6U?::bffTlD1D(Cvvc{!#YJS	qyTQY'S	S	$U?775;;'4!84q8D88TEEM*D77FLLO,-e<D3y4/01 +DLDD%-4/5;u_%&1$GGBHHVf_5=q$GVHHRVVAY-ffQlDj66#; #CDI!'E#J %c
DI+ A,CIIeS)E99T3'Dsc)nq011

5%(zz$&d{rX   r   Nr   r   )r*   r(   diagargsort
zeros_liker   r8   linalgcholeskyinvr   r   rd   r   deepcopyrs   rn   newaxisrk   matmulr  r.   )'WHr@   rA   ru   	blocksizepercdampactorderr)  r+  r,  r*   rD   r   deadpermLossesQdampr-  Hinvi1i2countW1Q1Err1Losses1Hinv1r}   wdr%  err1invpermr'  rw   r(  r*  s'       `   ``                         @@@@rV   gptqrM  \  s   : h;?DDID. .` GGEAIE2771:?DAdDjMAdAgJ zz"''!*%dd+dAgJdAgJq$w]]1F
aAbggbggaj))D99U1XDdDjMTM
		299==+,..ADAuQx+ 7iq*R]]1RUAX;']]2}}R --#RUBrE\"u 	A1a4AadARFj(A- +ArAv"q&::M.NPQ.Q,R SIE2"''"((1Q

]+;e+C"Dr"I1dSVXXYbbdABq!tHUqL1a4/GAqDMEQ;Dqr1uI2>>%A,Q#GX\cdIeffIDAJ	  "R%("Q;r"uax	"#q&	RYYtBCBJ/66	;7> **T"gqjM


1aggA	HrX   c                    t        |       } | j                  )t        j                  j	                  | j                        nd}t        | |||      \  }}~t        j                  | j                  j                  j                        }| j                  |D cg c]  }|j                   c}       g }| j                         D ]  }|j                  dv s|j                  |j                  i       dk7  s2|j                  |j                  i       j                  dd      dk(  sb|j!                  |j"                  d           t%        t'        |            }| j)                  |       | j*                  r1t-        j.                  | j                  | j                  dz   d	d	d
       | j*                  s0t1        j2                  | j                  j5                         ||      n$t1        j2                  | j                  dz   ||      }t7        |      D ]  \  }}t9        t;        |      |dz          g }g }| j<                  |   D ].  }|j                  dv s|j                  |j                  i       dk7  s3|j                  |j                  i       j                  dd      dk(  sc| j?                  |j"                  d         tA        jB                  | j?                  | jE                  |j                        j"                  d         |      j                         }t;        |jF                        dk7  r|j!                  |       |j!                  | jE                  |j                               1 t;        |      dk(  rt|D cg c]3  }tI        jJ                  |jF                  d   |jF                  d   f      5 }}d}|D ]  }|jM                  |g|      d   }|jF                  d   }tI        jN                  |d|jF                  d   f      }|D cg c]  }||||z   z  z   }}||z  }tI        jP                  d|z        |z  }|D cg c]%  }|tI        jR                  |jT                  |      z   ' }} tW        |||d
      D ]%  \  }}} |j                  |v r6||j                     d   }||j                     d   }||j                     d   }|dk7  r|n|jF                  d   }|jX                  }!t[        || ||||||	|
|
      }"| j?                  |j"                  d         }#| j]                  |j"                  d         }$|dk(  }%|%r|jF                  }&|&d   |z   dz
  |z  }'t_        |"||'      }"ta        |"jT                  |||d      \  }"}(})tc        ||&|||'|"je                  d      |(je                  |!      |dk(  r|)nd|	      \  }*}+| jg                  |+       | ji                  |       | jk                  |*       nt,        jl                  jo                  |j"                  d   d|d|z   tq        |!      |"jF                  |"je                  |!      js                         d	      },| ju                  |,       |,j                  |j"                  d<   |$dk(  s| jw                  |#       (  | j                  |       | j                  j                  j                  jy                  |       | j{                          | j*                  rCddl>m?}-  |-| j                  t        j                  j                  | j                        d          | S c c}w c c}w c c}w c c}w )a  Quant the model with GPTQ method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        dataloader (object): dataloader for calibration.
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'GPTQ'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        n_samples (int, optional): calibration sample number.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        blocksize (int, optional): blocksize to quantize weight.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    Nr   r   r   r   GPTQr   r   TFr   r   r   r   r   r   r"   rA   ru   )r@   rA   ru   r8  r9  r:  r)  r+  r   r\   r   rZ   r   r   r   r   )load_external_data_for_model)Ar   r   r   r   r   r  r   r3  r   graphr<   remove_tensors_from_outputsr   r   rL   r   r5   r+   listsetadd_tensors_to_outputsr   r1   r   r   r   r   r  r   r   input_name_to_nodesr   r   r   get_noder*   r(   r)   runr.   r   r5  r   r  r   rM  r   r   r~   rW   r6   r   remove_nodeadd_noder2   r3   r   r4   r   r   	MergeFromr   onnx.external_data_helperrP  split).r   r
  r   r@   rA   ru   r	  r9  r8  r:  r)  r+  r$   r   r   r%   r  
org_outputr}   output_namesr>   r  rP   
input_name	node_listr   r   Hsnsamplesrt   r  r  r7  r   rC   r   r   r   r   rB   rD   r   r   rJ   rT   rP  s.                                                 rV   gptq_quantizerd    s   ^ eE494D4D4Prwwu//0VXHy*iHJFBu{{00778J	%%z&B!qvv&BCL /LLJ&!!$))R0F:!!$))R044[&IVS

1./ L)*L	  .KK."&$(#	
 ## 	U[[::<bIV!!%"2"2_"DbT]^  %\2 `8ZC-sQw7	--j9 	<D
*!%%dii4>!%%dii488fMQWW))$**Q-8D%..))%..*C*I*I!*LMx$&  v||$)v&  		!:;	<  w<1:ABQbhh
AGGAJ/0BB 	9D++zlD1!4C))A,C**S2syy}"56C=?@!x8c>23@B@OH''!h,'#-C578!biis++8B8	9 GR6	=	8 
yyM)(3F;*4995lC
&tyy1(;'1R'7V\\!_JLLE!%#!!%H "11$**Q-@M"<<TZZ]KN,4M),"LL	%aL:59jH%h
HE&28::xU[]c&d#%+G!*%)%%__W5,,u-%+v%5r4#1
,(y &&y1!!$'}-"&++"9"9A2h\:.)II6u=!!/779 #: # %%o6 / 4 4

1"((7{=	8G`8D 
%%l3	KK&&z2	 J$U[["''--@P@P2QRS2TULO 'Cd C A 9s   ]8]!]&*]+)r   )r   rY   rZ   ra   g      ?)r   rY   )r   rY   rZ   r  r  FFT)__doc__r   loggingr   r   numpyr(   r1   r   onnx.helperr   onnxruntimer   
onnx_modelr   r   r   	getLoggerr,   rW   r~   r   r   r   r   r   r   r  rM  rd   rX   rV   <module>rm     s   0 %   	 
    0  ! %			.	/ p.f3'lF'RXDv8&4 %&tn6x I^ %&|rX   