
    `ijK                     ,   S SK r S SKrS SKrS SKrS SKrS SKJr   / SQrSS jrS r	S r
SS jrS rS	 rS
 rS r   SS jrS rS rS r\S:X  aB  \" 5       r\" S\5        S SKJr  \" \R2                  5        \" \5      r\ H  r\" \5        M     gg)    N)TensorProto)ScanLoopIfc           	         [         R                  " 5       nUR                  SSS[        SS9  UR                  SSS[        S	S
S9  UR                  SSS[        SSS9  UR                  SS[        S	SS9  UR                  SS[        S	SS9  UR                  SS[        SSS9  UR                  SS[
        SSS9  UR                  SS[        SSS9  UR                  SS[        S SS9  UR                  SS[        S S S9  UR                  S!S[        S S"S9  UR                  S#SS$/ S%QS&S'9  UR                  S(S)SS*S+S,9  UR                  SS-9  UR                  S.SS*S/S,9  UR                  SS09  UR                  S1SS*S2S,9  UR                  SS39  UR                  S4S5SS*S69  UR                  SS79  UR                  U 5      $ )8Nz-mz--modelTzonnx model path)requiredtypehelpz-bz--batch_sizeF   zbatch size of input)r   r	   defaultr
   z-sz--sequence_length    zsequence length of inputz--past_sequence_lengthzpast sequence length for gpt2z--global_lengthz&number of global tokens for longformerz	--samplesi  z\number of samples to test. Set it large enough to reduce the variance of performance result.z--thresholdg{Gz?zfThreshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.z--thread_numznumber of threads to usez--input_ids_namez"input name for input IDs, for bertz--segment_ids_namez$input name for segment IDs, for bertz--input_mask_namez'input name for attention mask, for bertz--dummy_inputsr   )bertgpt2
longformerr   zEType of model inputs. The default will create dummy inputs with ones.)r   r   choicesr
   z-gz	--use_gpu
store_truezuse GPU)r   actionr
   )use_gpuz--basic_optimizationz_Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime)basic_optimizationz--kernel_time_onlyz.Only include the kernel time and no fence time)kernel_time_onlyz-vz	--verbose)r   r   )verbose)argparseArgumentParseradd_argumentstrintfloatset_defaults
parse_args)argvparsers     f/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/onnxruntime_tools/transformers/profiler.pyparse_argumentsr$      sx   $$&F
i$SGXY
nu3PQXmn
+!&  "7  9 0!&  !<	  > )!&  !E	  G k  m u  w S"Smn
*!&  $A	  C
 ,!&  $C	  E
 +!&  $F	  H (!& ) Id	  f kE,U^_
&
n	  p
 51
,!&+M  O /
kE,O
&T""    c                     SSK Jn  U" U UU(       + USS9nU H  nUR                  S U5      nM     UR                  5       n	U	$ )Nr   )create_onnxruntime_sessionT)enable_all_optimizationnum_threadsenable_profiling)benchmark_helperr'   runend_profiling)
onnx_model_pathr   r   
thread_num
all_inputsr'   sessioninputs_profile_files
             r#   run_profiler5   e   sS    ;()0EWAW5?:>	@G KKf%  ((*Lr%   c                     [        SU  S35        [        U S5       n[        R                  " U5      nS S S 5        [	        W[
        5      (       d   eU$ ! , (       d  f       N'= f)Nzloading profile output z ...r)printopenjsonload
isinstancelist)r4   opened_file	sess_times      r#   load_profile_jsonr@   u   sW    	#L>
67	lC	 KIIk*	 
! i&&&&	 
!	 s   A
A"c                    / n0 n0 n0 nSnU  H  nUS   S:X  d  M  SU;   d  M  SU;   d  M  SUS   ;   d  M)  US   R                  SS	5      R                  S
S	5      R                  SS	5      n	SUS   ;   a%  US   S   S:X  a  SOSn
X;  a  XU	'   OXi   U
:X  d   eO	U(       a  M  US   S   nU[        ;   a  M  X;   a  XI==   US   -  ss'   XY==   S-  ss'   OUS   XI'   SXY'   UR                  U	5        XxS   -  nM     / SQnSnU H\  n	XI   nXY   nU[        U5      -  nX-  S-  nX;   a  Xi   OS	nUR                  US SUS SUS SSU-
  S SU SU SU	 35        UU-  nM^     UR                  SUS S35        UR                  S5        UR                  S5        [	        UR                  5       S SS9 HO  u  pX-  nUU:  a  M  XY   nU[        U5      -  nX;   a  Xi   OS	nUR                  US SUS-  S SU SU	 35        MQ     U$ ) a  Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

Args:
    sess_time (List[Dict]): profile data
    kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
    threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

Returns:
    List[str]: lines of string for output.
r   catNodedurargsop_namename_kernel_time _fence_before_fence_afterproviderCPUExecutionProviderCPUCUDAr   )zResults:@----------------------------------------------------------------uT   Duration(μs)	Percentage	Before(Exclusive)	After(Inclusive)	Calls	Provider	Node_Nameg              Y@.1f	5.2fz5.1fz$
Top expensive nodes with threshold=z.2f:rP   u&   Duration(μs)	Percentage	Provider	Namec                     U S   $ Nr    xs    r#   <lambda>'parse_profile_results.<locals>.<lambda>   s    qtr%   Tkeyreverse)replaceNODES_TYPE_CONTAINING_SUBGRAPHappendr   sorteditems)r?   r   	thresholdnode_name_list	node_time	node_freqnode_providertotalitem	node_namedevicerF   linesbefore_percentagedurationcallsavg_time
percentagerL   ratios                       r#   parse_profile_resultsru      s    NIIME;& Ud]v~)W[\bWcJcV,,^R@HHIKMMTWUcegMh  T&\)"&v,z":>T"TZ`1/5),(3v===!6l9-G88%$U3$$)$'+E{	$'(	$%%i0%[ E5 :E #	'$eEl*&%/
/8/I=+rnBz$/r2CD1IETeLefjKkkmnsmttvw  wA  AC  DM  CN  O	
 	Z' $ 
LL83qIJ	LL	LL<=%ioo&7^UYZ	 9$eEl*/8/I=+r~Rd';2hZr)UV  [ Lr%   c                 2   0 n0 n0 n0 nSnU  H  nUS   S:X  d  M  SU;   d  M  SU;   d  M  SUS   ;   d  M)  U(       a  SUS   ;  a  M;  US   S   n	U	[         ;   a  MO  X;   a  X9==   US   -  ss'   XI==   S-  ss'   OUS   X9'   SXI'   XxS   -  nSUS   ;   =(       a    US   S   S	:H  n
U
(       d  M  X;   a  XY==   US   -  ss'   Xi==   S-  ss'   M  US   XY'   SXi'   M     U(       a  S
/nOS/n[        UR                  5       S SS9 H  u  pX-  nXI   nX;   a  XY   OSnX;   a  Xi   OSnU[        U5      -  nU(       a,  UR	                  US SU SUS-  S SU SU SU SU	 35        Md  UR	                  US SU SUS-  S SU SU	 3	5        M     U$ )zGroup results by operator name.

Args:
    sess_time (List[Dict]): profile data
    kernel_time_only (bool): Only include items for kernel time.
    use_gpu (bool): GPU is used in profiling or not.

Returns:
    List[str]: lines of string for output.
r   rB   rC   rD   rE   rF   rL   r   rM   uJ   Average(μs)	Total(μs)	Total_Percentage	Calls	Cpu_Duration	Cpu_Calls	Nameu3   Average(μs)	Total(μs)	Total_Percentage	Calls	Namec                     U S   $ rW   rX   rY   s    r#   r[   'group_profile_results.<locals>.<lambda>   s    1Q4r%   Tr]   rR   rS   rQ   rT   )ra   rc   rd   r   rb   )r?   r   r   op_time
op_recordsop_cpu_timeop_cpu_recordsrj   rk   rF   is_cpurn   rp   rt   rq   cpu_time	cpu_callsrr   s                     r#   group_profile_resultsr      s    GJKNE;& Ud]v~)W[\bWcJcJd6l$B6l9-G88! DK/ #q(##'; &'
#%[ E4</fDL4LPf4fFv)(DK7("+q0++/;K(./N+5 8 cdJK#GMMOQUV #+2+A;'q/6/HN+a	eEl*LLC.8*Buu}T.B"UG2hZWYZcYddfgnfopr LLHS>H:Rd7K2eWTVW^V_`a W Lr%   c                 ~    [        U R                  S5      5      [        :X  a  [        X R                  S5      5      $ S $ )Nvalue)r	   
WhichOneofr   getattr)dims    r#   get_dim_from_type_protor     s2    489P4QUX4X73w/0b^bbr%   c                     U R                   R                  R                   Vs/ s H  n[        U5      PM     sn$ s  snf N)tensor_typeshaper   r   )
type_protods     r#   get_shape_from_type_protor     s4    0:0F0F0L0L0P0PQ0P1#A&0PQQQs   ;c                 8   0 nU R                  5        GHc  n[        UR                  5      n/ n[        U5       H-  u  p[	        U	[
        5      (       d  M  UR                  U5        M/     [        U5      S:  a    g[        U5      S:  a  XUS   '   [        U5      S:  a  X&US   '   UR                  R                  R                  n
U
[        R                  [        R                  [        R                  4;   d   eU
[        R                  :X  a  [        R                  O3U
[        R                  :X  a  [        R                   O[        R"                  n[        R$                  " XkS9nXUR&                  '   GMf     [)        U5       Vs/ s H  oPM     nnU$ s  snf )zCreate dummy inputs for ONNX model.

Args:
    onnx_model (OnnxModel): ONNX model
    batch_size (int): batch size
    sequence_length (int): sequence length
    samples (int): number of samples

Returns:
    List[Dict]: list of inputs
   Nr   r   dtype)'get_graph_inputs_excluding_initializersr   r	   	enumerater<   r   rb   lenr   	elem_typer   FLOATINT32INT64numpyfloat32int64int32onesrG   range)
onnx_model
batch_sizesequence_lengthsamplesdummy_inputsgraph_inputr   symbol_dimsir   r   	data_typedatar3   r0   s                  r#   create_dummy_inputsr     sT    L!IIK)+*:*:;&FA#s##""1% '
 {a{a$.+a.!{a$3+a.!$$00::	[..0A0A;CTCTUUUU%.+2C2C%CEMM$(9(99EKKu{{ 	zz%1)-[%%&) L, ).g71,J7 8s   Fc                 J    SSK JnJn  U" XXV5      u  pnU" UUUSSU	U
USS9	nU$ )a  Create dummy inputs for BERT model.

Args:
    onnx_model (OnnxModel): ONNX model
    batch_size (int): batch size
    sequence_length (int): sequence length
    samples (int): number of samples
    input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
    segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
    input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

Returns:
    List[Dict]: list of inputs
r   )find_bert_inputsgenerate_test_data{   F)
test_casesseedr   	input_idssegment_ids
input_maskrandom_mask_length)bert_test_datar   r   )r   r   r   r   input_ids_namesegment_ids_nameinput_mask_namer   r   r   r   r   r0   s                r#   create_bert_inputsr   =  sF    * D)9*Vf)x&IJ#J$3/6),,1.70;/97<>J r%   c                    UUUX#-   S.n0 nU R                  5        GH8  n[        UR                  5      n[        U5       HA  u  p[	        U
[
        5      (       a!  XR                  5       ;  a  [        SU
 35      eXZ   X'   MC     UR                  R                  R                  nU[        R                  [        R                  [        R                  4;   d   eU[        R                  :X  a  [        R                  O3U[        R                  :X  a  [        R                   O[        R"                  n[        R$                  " XS9nXUR&                  '   GM;     [)        U5       Vs/ s H  oPM     nnU$ s  snf )a  Create dummy inputs for GPT-2 model.

Args:
    onnx_model (OnnxModel): ONNX model
    batch_size (int): batch size
    sequence_length (int): sequence length
    past_sequence_length (int): past sequence length
    samples (int): number of samples

Raises:
    RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.

Returns:
    List[Dict]: list of inputs
)r   seq_lenpast_seq_lentotal_seq_lensymbol is not supported: r   )r   r   r	   r   r<   r   keysRuntimeErrorr   r   r   r   r   r   r   r   r   r   r   rG   r   )r   r   r   past_sequence_lengthr   symbolsr   r   r   r   r   r   r   r   r3   r0   s                   r#   create_gpt2_inputsr   a  s>   $ !",(?	G L!IIK)+*:*:;&FA#s##<<>(A"%>se#DEE"<	 '  $$00::	[..0A0A;CTCTUUUU%.+2C2C%CEMM$(9(99EKKu{{ 	zz%1)-[%%& L ).g71,J7 8s   &E5c                 J   XS.n0 nU R                  5        GHh  n[        UR                  5      n[        U5       HA  u  p[	        U
[
        5      (       a!  XR                  5       ;  a  [        SU
 35      eXZ   X'   MC     UR                  R                  R                  nU[        R                  [        R                  [        R                  4;   d   eU[        R                  :X  a  [        R                  O3U[        R                  :X  a  [        R                   O[        R"                  nSUR$                  ;   a   [        R&                  " XS9nSUSS2SU24'   O[        R(                  " XS9nXUR$                  '   GMk     [+        U5       Vs/ s H  oPM     nnU$ s  snf )a  Create dummy inputs for Longformer model.

Args:
    onnx_model (OnnxModel): ONNX model
    batch_size (int): batch size
    sequence_length (int): sequence length
    global_length (int): number of global tokens
    samples (int): number of samples

Raises:
    RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.

Returns:
    List[Dict]: list of inputs
)r   r   r   globalr   r   N)r   r   r	   r   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   zerosr   r   )r   r   r   global_lengthr   r   r   r   r   r   r   r   r   r   r3   r0   s                   r#   create_longformer_inputsr     s_     (LGL!IIK)+*:*:;&FA#s##<<>(A"%>se#DEE"<	 '  $$00::	[..0A0A;CTCTUUUU%.+2C2C%CEMM$(9(99EKKu{{ 	 {''';;u6D&'DN]N"#::e5D)-[%%&% L( ).g71,J7 8s   F c           	         U R                   S:  a  U R                   O[        R                  " SS9nS[        R                  ;  a  [        U5      [        R                  S'   SSKJn  SSKJ	n  U" U" U R                  5      5      nS nU R                  S:X  aM  [        X@R                  U R                  U R                  U R                   U R"                  U R$                  5      nOU R                  S:X  a7  ['        X@R                  U R                  U R(                  U R                  5      nOrU R                  S	:X  a7  [+        X@R                  U R                  U R,                  U R                  5      nO+[/        X@R                  U R                  U R                  5      n[1        U R                  U R2                  U R4                  U R                   U5      n[7        U5      n[9        XpR:                  U R<                  5      nUR?                  S
5        UR?                  S5        U[A        XpR:                  U R2                  5      -  nU$ )Nr   F)logicalOMP_NUM_THREADS)r;   )	OnnxModelr   r   r   z
Grouped by operator type:rP   )!r/   psutil	cpu_countosenvironr   onnxr;   r   r   modelr   r   r   r   r   r   r   r   r   r   r   r   r   r5   r   r   r@   ru   r   re   rb   r   )	rE   r)   r;   r   r   r0   r4   profile_recordsrn   s	            r#   r,   r,     s   %)__q%8$//f>N>NW\>]K 

*(+K(8

$%$4

+,JJF"'
OOTEYEY[_[g[g(,(;(;T=R=RTXThThj
			f	$'
OOTEYEY[_[t[t(,6
			l	*-j//4K_K_aeasas.2ll<
 )__dFZFZ\`\h\hi
tzz4<<9P9PRVRaRacmnL'5O!/3H3H$..YE	LL./	LL	"?4I4I4<<XXELr%   __main__	Arguments)setup_loggerr   )Fr   )NNN)r   r   r:   r   r   r   r   ra   r$   r5   r@   ru   r   r   r   r   r   r   r   r,   __name__	argumentsr8   r+   r   r   resultslinerX   r%   r#   <module>r      s    	      "8 R#j K\>BcR$V '+(,'+!H)X(V"J z!I	+y!-""#)nGd  r%   