
    i(                         S SK r S SKrS SKrS SKJr  S SKJs  Jr  S SKJ	r	  S SK
JrJrJr  S SKJr   " S S\R                   5      rS
S jrS r\S	:X  a  \" 5         gg)    N)XnnpackPartitioner)EdgeCompileConfigEdgeProgramManagerto_edge_transform_and_lower)Dimc                   
  ^  \ rS rSrSr         SS\S\S\S\S\S\S	\S
\S\SS4U 4S jjjrS\	R                  4S\S\S\S\	R                  S\	R                  4
S jjrS\	R                  S\	R                  4S jrSrU =r$ )WhisperAudioProcessor   a~  
Computes Mel spectrograms from mono audio input.
Same as HuggingFace WhisperFeatureExtractor, but implemented in PyTorch

Args:
    feature_size (`int`, defaults to 80):
        The feature dimension of the extracted features.
    sampling_rate (`int`, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    hop_length (`int`, defaults to 160):
        Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
    chunk_length (`int`, defaults to 30):
        The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
        sequences.
    n_fft (`int`, defaults to 400):
        Size of the Fourier transform.
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
feature_sizesampling_rate
hop_lengthchunk_lengthn_fftpadding_valuemax_audio_lenstack_output	streamingreturnNc
                 Z  > [         T
U ]  5         Xl        X l        X`l        XPl        X0l        X@l        XB-  U l        U R                  U-  U l	        X l        U R                  X%US9U l        Xpl        Xl        Xl        U R                  (       a  U R                  (       a  [        S5      eg g )N)n_melsz}--streaming and --stack_output are mutually exclusive. stack_output assumes 30-second chunk padding which streaming disables.)super__init__r   r   r   r   r   r   	n_samplesnb_max_framesget_mel_filtersmel_filtersr   r   r   
ValueError)selfr   r   r   r   r   r   r   r   r   	__class__s             i/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/executorch/extension/audio/mel_spectrogram.pyr   WhisperAudioProcessor.__init__,   s     	(**
$(%5!^^z9*// 0 
 +(">>d//Y  0>       srr   dtypec                 l   [        U5      n[        R                  " U[        SUS-  -   5      4US9n[        R                  R	                  USU-  US9nSnSn[        R
                  " XxUS-   US9n	Sn
SnXU	-  -   nS	nX-
  U-  n[        R                  " [        R                  " S
US95      S-  nX:  nU[        R                  " XU   U-
  -  5      -  UU'   Un[        R                  " U5      n[        R                  " UR                  S5      UR                  S5      5      n[        U5       Hb  nUU   * UU   -  nUUS-      UUS-      -  n[        R                  " [        R                  " SUS9[        R                  " UU5      5      UU'   Md     SUSUS-    US U -
  -  nUUS S 2S 4   -  nU$ )N      r%   g      ?)ndr%           g%qF@gP@g     @@g@g      ;@r   g       @)inttorchzerosfftrfftfreqlinspacelogtensorexpdiffsubtract	unsqueezerangemaximumminimum)r   r$   r   r   r%   weightsfftfreqsmin_melmax_melmelsf_minf_spfreqs
min_log_hzmin_log_mellogsteplog_tmel_ffdifframpsilowerupperenorms                           r    r   %WhisperAudioProcessor.get_mel_filtersP   s    V++vs1uz>':;5I 99%%r%G $~~g
%H t# 
!)T1IIell3e45< 	
 #!EIIge{9R.S$TTe

5!uq183E3Ea3HIvA1XIa(E!a%L5Q</E S.eU0KGAJ  uQ!,uWf~=>5D>!r"   waveformc                    U R                   (       db  UR                  S   S-
  U R                  -  S-   n[        R                  " USU R                  U-  UR                  S   -
  4SU R
                  S9nSS[        R                  " S[        R                  -  [        R                  " SU R                  S-
  U R                  [        R                  S9-  U R                  -  5      -
  -  n[        R                  " UU R                  U R                  USSS	9n[        R                  " U5      S
SS24   S-  nU R                  U-  n[        R                   " [        R"                  " USS95      n[        R$                  " XwR'                  5       S-
  5      nUS-   S-  nU R(                  (       a;  UR+                  U R,                  SU R.                  5      nUR1                  SS5      nU$ UR3                  S5      $ )a  
Args:
    waveform (`torch.Tensor`): Mono waveform input, tensor of (dynamic) shape [num_samples],

Returns:
    torch.Tensor: Output of shape [1, feature_size, nb_max_frames * n_chunks]
    n_chunks is the number of chunks of `sampling_rate` samples in the input waveform.
    [1, 80, 3000] with default options and 1 chunk.
    In streaming mode, output shape is [1, feature_size, floor(N/hop_length)]
    with no chunk padding.
r   r'   constant)modevalueg      ?r(   r)   T)r   r   windowcenterreturn_complex.Ng|=)ming       @g      @)r   shaper   Fpadr   r.   cospir2   r   float32stftr   absr   log10clampr:   maxr   reshaper   r   	transposer8   )r   rP   n_chunksrU   r`   
magnitudesmel_speclog_specs           r    forwardWhisperAudioProcessor.forward   s    ~~ q)A-$..@1DHuuDNNX-q0AAB((	H ii((..DJJNDJJemmTU **
 zz**
 YYt_S#2#X.!3
##j0;;u{{8?@==<<>C+?@sNc)''(9(92t?Q?QRH))!Q/HO%%a((r"   )r   r   r   r   r   r   r   r   r   r   r   r   )	P   >          r,   X  FF)__name__
__module____qualname____firstlineno____doc__r-   floatboolr   r.   r_   r%   Tensorr   rk   __static_attributes____classcell__)r   s   @r    r	   r	      s    , "" """ " 	"
 " " " " " " 
" "J 255==33!3+.3=B[[3	3j7) 7) 7) 7)r"   r	   c           
         U c
  [        5       n U R                  (       a  SU R                  -  nOU R                  U R                  -  n[        R
                  " [        SU5      5      n[        R                  R                  5       nS[        R                  " US90XC'   [        R                  " 5          [        R                  R                  R                  R                  SS9   [        R                  R                  X4USS9n[         R"                  " U5        [%        U['        5       /[)        SS	9S
9n[         R"                  " UR+                  5       5        UR-                  5       n[/        US5       nUR1                  U5        S S S 5        [         R"                  " S5        S S S 5        S S S 5        g ! , (       d  f       N5= f! , (       d  f       N(= f! , (       d  f       g = f)Nr(   im r   )rd   T)backed_size_oblivious)dynamic_shapesstrictF)_check_ir_validity)partitionercompile_configwbDone)r	   r   r   r   r.   randnrY   exportShapesCollectionr   DYNAMICno_gradfxexperimental_configpatchloggingdebugr   r   r   exported_programto_executorchopenwrite_to_file)	modeloutput_filemax_samplesaudio_tensorshapes_collectionepedge	exec_progfiles	            r    export_processorr      sw   }%' %---))E,?,??;;s5+67L557'(#+++*F&G#	%((//77==" >  \\  ?3DT ! 
 	b $?+-.,#($
 	d++-. &&(	+t$##D) % 	f/( %$) s=   32G%BG	7F8	G	'G8
GG		
G	G
G(c                     [         R                  " SS9n U R                  S[        SSS9  U R                  S[        SS	S9  U R                  S
[        SSS9  U R                  S[        SSS9  U R                  S[        SSS9  U R                  S[        SSS9  U R                  S[        SSS9  U R                  SSSS9  U R                  SSSS9  U R                  5       n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9n[        X!R                   5        g ) Nz*Export WhisperAudioProcessor to ExecuTorch)descriptionz--feature_sizerm   z/The feature dimension of the extracted features)typedefaulthelpz--sampling_ratern   zAThe sampling rate at which audio files should be digitalized (Hz)z--hop_lengthro   z&Length of overlapping windows for STFTz--chunk_lengthrp   z1Maximum number of chunks of sampling_rate samplesz--n_fftrq   zSize of the Fourier transformz--output_filewhisper_preprocess.ptez'Output file path for the exported modelz--max_audio_lenrr   z3Max audio length that can be processed, in seconds.z--stack_output
store_truezWhether to stack output along the batch dimension, one per chunk. Used by models such as Voxtral, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/voxtral/processing_voxtral.py#L94 for more information.)actionr   z--streamingzStreaming mode: skip 30-second chunk padding, produce mel frames proportional to input length. For use with real-time audio input.)r   r   r   r   r   r   r   r   )argparseArgumentParseradd_argumentr-   str
parse_argsr	   r   r   r   r   r   r   r   r   r   r   )parserargsr   s      r    mainr      s   $$@F >	   P	   5	   @	   S/N   (6	   B	    {  
  R   D!&&((??&&jj((&&..	E U,,-r"   __main__)Nr   )r   r   r.   torch.nnnntorch.nn.functional
functionalr[   9executorch.backends.xnnpack.partition.xnnpack_partitionerr   executorch.exirr   r   r   torch.exportr   Moduler	   r   r   rs    r"   r    <module>r      s_          X 
 e)BII e)P$NC.L zF r"   