
    ?Ki                        S SK r S SKJr  S SKJr  SSKJr  SSKJrJ	r	  \	R                  " \5      r\" 5       (       a  S SKrS r   S"S\\   S	\S
   S\\   S\S\4   4S jjr   S"S\\   S	\S
   S\\   S\S\4   4S jjr   S"S\\   S	\S
   S\\   S\S\4   4S jjr S#S\S	S
S\\   S\S\4   4S jjr S#S\S	S
S\\   S\S\4   4S jjr S#S\S	S
S\\   S\S\4   4S jjr\\\\\\S.r  S$S\S\S\S\\   S\\   4
S jjrS#S\S\\   4S jjrS#S\S\\   4S jjrS#S\S\\   4S jjrS#S\S\\   4S jjrS#S\S\\   4S jjr S#S\S\\   4S  jjr!\\\\\ \!S.r"S#S\S\\   4S! jjr#g)%    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                 D   ^ ^^ S mS m[        T 5      UUU 4S j5       nU$ )aD  
Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
(i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

Args:
    rope_forward (Callable):
        The forward pass of the RoPE implementation.

Returns:
    The decorated forward pass.
c                    [         R                  " U5      S-   n[        U R                  S5      (       a  U R                  R                  nOU R                  R
                  nX4:  aR  [        U S5      (       d%  U R                  U R                  X$S-   S9u  U l        nU R                  SU R                  SS9  gU R                  R                  U5      U l	        U R                  SU R                  SS9  g)	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr   max_position_embeddingsrope_init_fnr   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r   _s         b/var/www/html/dynamic-report/venv/lib/python3.13/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update6dynamic_rope_update.<locals>.longrope_frequency_update+   s    ))L)A-4;; BCC/3{{/[/[,/3{{/R/R,5411(,(9(9KKTU1U ): )%"A   T-?-?E R &*%;%;%>%>v%FD"  T-C-CPU V    c                    [         R                  " U5      S-   nX0R                  :  a8  U R                  U R                  X#S9u  o@l        U R                  SUSS9  X0l        X0R                  :  ah  U R                  U R                  :  aM  U R                  R                  U5      U l        U R                  SU R                  SS9  U R                  U l        ggg)z
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
r   r   r   Fr   N)
r   r   max_seq_len_cachedr   r   attention_scalingr   original_max_seq_lenr   r   )r   r   r   r   r   s        r    dynamic_frequency_update5dynamic_rope_update.<locals>.dynamic_frequency_update>   s     ))L)A-,,,/3/@/@f/@/f,H,  X% H&-#...43J3JTMfMf3f &*%;%;%>%>v%FD"  T-C-CPU V&*&?&?D# 4g.r#   c                    > SU R                   ;   a  T" XUR                  S9  O!U R                   S:X  a  T" XUR                  S9  T" XU5      $ )Ndynamic)r   longrope)	rope_typer   )r   xr   r(   r!   rope_forwards      r    wrapper$dynamic_rope_update.<locals>.wrapperQ   sD    &$TI^^z)%dJD\22r#   r   )r/   r0   r(   r!   s   ` @@r    dynamic_rope_updater2      s/    W&@& <3 3 Nr#   r   r   ztorch.devicer   returnztorch.Tensorc           	      F   U R                   n[        U SS5      n[        U SS5      =(       d    U R                  U R                  -  n[	        XT-  5      nSnSU[
        R                  " SUS[
        R                  S9R                  U[
        R                  S9U-  -  -  nX4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
partial_rotary_factor      ?head_dimNr      dtyper   r:   )

rope_thetagetattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser5   r7   dimattention_factorr   s	            r     _compute_default_rope_parametersrG   \   s    > D#F,CSIvz40dF4F4F&JdJd4dH
h.
/C du||AsAU[[ILLTZbgbmbmLnqttuvH%%r#   c                 J    U R                   S   n[        XU5      u  pEXC-  nXE4$ )a  
Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
factor)rope_scalingrG   )r   r   r   rI   r   rF   s         r    '_compute_linear_scaling_rope_parametersrK      s:    >   *F "B&RY!ZH
 H%%r#   c           	      x   U R                   n[        U SS5      n[        U SU R                  U R                  -  5      n[	        XT-  5      nU R
                  nU R                  S   nSn	Uc  UnOi[        U[        R                  5      (       a?  [        R                  " U[        R                  " XrR                  UR                  S95      nO[        X'5      nX8U-  U-  US-
  -
  XfS-
  -  -  -  nSU[        R                  " SUS[        R                   S	9R#                  U[        R$                  S
9U-  -  -  n
X4$ )aP	  
Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
            inference time
        *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
            will be accessed. The value of `factor` is used to determine the new base frequency, along with the
            current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
            computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
            factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
            context window using an exponent derived from `dim`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
        max_position_embeddings, this value will be overridden by max_position_embeddings.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
r5   r6   r7   rI   r:   r   r   r8   r   r9   r;   )r<   r=   r>   r?   r@   r   rJ   
isinstancer   Tensormaximumtensorr:   r   r   rA   rB   r   rC   )r   r   r   rD   r5   r7   rE   r   rI   rF   r   s              r    _compute_dynamic_ntk_parametersrR      s5   T D#F,CSIvz6+=+=A[A[+[\H
h.
/C$<<  *F )	GU\\	*	*--LL0gnn]

 g7 W$'>>6A:NTWab[bTcddDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%r#   c                 ,  ^ U R                   n[        U SS5      n[        U SU R                  U R                  -  5      n[	        XT-  5      nU R
                  S   nU R
                  R                  S5      nU R
                  R                  S5      n	U R
                  R                  S5      n
U R
                  R                  S5      =(       d    U R                  nSS
 jnUc1  U	(       a"  U
(       a  [        U" Xy5      U" Xz5      -  5      nOU" U5      nU R
                  R                  S5      =(       d    SnU R
                  R                  S5      =(       d    S	nS mU4S jnS nU[        R                  " SUS5      R                  U[        R                  S9U-  -  nSU-  nSUU-  -  nU R
                  R                  SS5      nU" XXcUU5      u  nnS	U" UUUS-  5      R                  U[        R                  S9-
  nUS	U-
  -  UU-  -   nUU4$ )a  
Computes the inverse frequencies with NTK scaling. Please refer to the
[original paper](https://huggingface.co/papers/2309.00071)

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble.
            *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                (only) in the linear ramp function.
            *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                (only) in the linear ramp function.
            *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                `mscale_all_dim`, if provided.
            *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                calculated based on `factor` only.
            *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                will be calculated based on `factor` only.
            *   `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
                during pretraining. If not provided, the function falls back to `max_position_embeddings`.
            *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
r5   r6   r7   rI   rF   mscalemscale_all_dimr   r   c                 N    U S::  a  gSU-  [         R                  " U 5      -  S-   $ )Nr   r6   g?)mathlog)scalerT   s     r    
get_mscale,_compute_yarn_parameters.<locals>.get_mscale:  s(    A:V|dhhuo-33r#   	beta_fast    	beta_slowc                     U[         R                  " X0S-  [         R                  -  -  5      -  S[         R                  " U5      -  -  $ )zPInverse dimension formula to find the dimension based on the number of rotationsr8   )rW   rX   pi)num_rotationsrE   rD   r   s       r    find_correction_dim5_compute_yarn_parameters.<locals>.find_correction_dimL  s@    dhh6!:Kdgg:UVWW\]`d`h`him`n\noor#   c                    > T" XX45      nT" XX45      nU(       a,  [         R                  " U5      n[         R                  " U5      n[        US5      [	        XrS-
  5      4$ )z.Find dimension range bounds based on rotationsr   r   )rW   floorceilr   min)	low_rothigh_rotrE   rD   r   truncatelowhighrb   s	           r    find_correction_range7_compute_yarn_parameters.<locals>.find_correction_rangeP  sR    !'N"8$P**S/C99T?D3{CAg...r#   c                     X:X  a  US-  n[         R                  " U[         R                  S9U -
  X-
  -  n[         R                  " USS5      nU$ )NgMbP?r9   r   r   )r   rA   float32clamp)rg   r   rE   linear_func	ramp_funcs        r    linear_ramp_factor4_compute_yarn_parameters.<locals>.linear_ramp_factorY  sH    :5LC||Cu}}=C	RKKQ2	r#   r   r8   r;   rj   T)r   )r<   r=   r>   r?   r@   rJ   getr   rC   r   rA   r   )r   r   r   rD   r5   r7   rE   rI   rF   rT   rU   r   rZ   r\   r^   rm   rt   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrj   rk   rl   inv_freq_extrapolation_factorr   rb   s                            @r    _compute_yarn_parametersr{      s$   p D#F,CSIvz6+=+=A[A[+[\H
h.
/C  *F**../AB  $$X.F((,,-=>N BCevGeGe %4 n$Z%?*VBd%de)&1 ##''4:I##''49Ip/ aa03363UX[[\I 9_ FY$67""&&z48H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r#   c                 p   U R                   n[        U SS5      n[        U SU R                  U R                  -  5      n[	        XT-  5      nU R
                  S   nU R
                  S   nU R
                  R                  S5      n	U R
                  R                  S5      n
[        U SS	5      =n(       a  U R                  U-  n	OU R                  nU
cM  U	S::  a  Sn
OD[        R                  " S
[        R                  " U	5      [        R                  " U5      -  -   5      n
U(       a*  X+:  a%  [        R                  " U[        R                  US9nO$[        R                  " U[        R                  US9n[        R                  " SUS[        R                  US9R!                  5       U-  nSXU-  -  -  nX4$ )a9  
Computes the inverse frequencies with LongRoPE scaling. Please refer to the
[original implementation](https://github.com/microsoft/LongRoPE)

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
            pretraining. If not provided, defaults to `max_position_embeddings`.
        *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
            will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                the value of `factor`.
            *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                overridden s the ratio between those values.
            *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
            *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
r5   r6   r7   long_factorshort_factorrI   rF   r   Nr   rM   r   r8   )r<   r=   r>   r?   r@   rJ   rv   r   rW   sqrtrX   r   rQ   rp   rA   rB   rC   )r   r   r   rD   r5   r7   rE   r}   r~   rI   rF   r   ext_factorsinv_freq_shaper   s                  r    _compute_longrope_parametersr   s  s   ^ D#F,CSIvz6+=+=A[A[+[\H
h.
/C%%m4K&&~6L  $$X.F**../AB
 ,36;]_c+dd'd//2RR+1+I+I( S="#yyTXXf-=Ii@j-j)jk 7=ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\Nk.$889H%%r#   c                    [        XU5      u  p4U R                  S   nU R                  S   nU R                  S   nU R                  S   nX-  n	X-  n
S[        R                  -  U-  n[        R
                  " X:  X5-  U5      nX-  U-
  Xv-
  -  nSU-
  U-  U-  X-  -   nX:  ) X:  ) -  n[        R
                  " XU5      nX4$ )a	  
Computes the inverse frequencies for llama 3.1.

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                during smoothing.
            *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
            *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                the shift applied to the numerator and denominator of the smoothing factor.
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
            *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                during pretraining. If not provided, the function falls back to `max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
rI   low_freq_factorhigh_freq_factorr   r8   r   )rG   rJ   rW   r`   r   where)r   r   r   r   rF   rI   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                   r    _compute_llama3_parametersr     s    T "B&RY!ZH  *F))*;<O**+=>))*LMO&8':$''kH$G [[!;X=NPXYN$.@EUEghM]*n<vEHff238R6SSN[[NSN++r#   )defaultlinearr+   yarnr,   llama3r-   received_keysrequired_keysoptional_keysignore_keysc                     SU;   a  US1-  nUR                  S5        Ub  X-  nX!-
  nU(       a  [        SU  SU 35      eUb  X-
  U-
  nOX-
  nU(       a  [        R                  SU  SU 35        gg)zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper-   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r-   r   r   r   r   missing_keysunused_keyss          r    _check_received_keysr     s     &!+& $ 0LRS\R]]`am`nopp #3mC#3NykY\]h\ijk r#   c                     U R                   nUR                  SUR                  SS 5      5      nS1n[        UR                  5       5      n[	        X5XAS9  g )Nr-   r   r   )rJ   rv   setkeysr   )r   r   rJ   r-   r   r   s         r    !_validate_default_rope_parametersr   0  sP    &&L  l.>.>vt.LMI MM))+,M=Zr#   c                 &   U R                   nUR                  SUR                  SS 5      5      nSS1n[        UR                  5       5      n[	        X5XAS9  US   nUb  [        U[        5      (       a  US:  a  [        R                  SU 35        g g )Nr-   r   rI   r   r6   8`rope_scaling`'s factor field must be a float >= 1, got 	rJ   rv   r   r   r   rN   rC   r   r   )r   r   rJ   r-   r   r   rI   s          r    (_validate_linear_scaling_rope_parametersr   8  s    &&L  l.>.>vt.LMI (+M))+,M=Z(#F~Z66&3,QRXQYZ[ ;Gr#   c                 .   U R                   nUR                  SUR                  SS 5      5      nSS1nS1n[        UR                  5       5      n[	        X6XEUS9  US   nUb  [        U[        5      (       a  US:  a  [        R                  SU 35        g g )Nr-   r   rI   r   r   r6   r   r   )r   r   rJ   r-   r   r   r   rI   s           r    )_validate_dynamic_scaling_rope_parametersr   D  s    &&L  l.>.>vt.LMI (+M78M))+,M=]hi(#F~Z66&3,QRXQYZ[ ;Gr#   c           	         U R                   nUR                  SUR                  SS 5      5      nSS1n1 Skn[        UR                  5       5      n[	        X6XEUS9  US   nUb  [        U[        5      (       a  US:  a  [        R                  SU 35        UR                  S5      nUb3  [        U[        5      (       a  US	:  a  [        R                  S
U 35        UR                  S5      n	U	b-  [        U	[        5      (       d  [        R                  SU	 35        UR                  S5      n
U
b-  [        U
[        5      (       d  [        R                  SU
 35        U	=(       d    SU
=(       d    S:  a  [        R                  SU	 SU
 S35        U R                   R                  S5      nUb5  U R                  U-  nX:w  a   [        R                  SU SU SU S35        g g [        R                  S5        g )Nr-   r   rI   >   rT   rj   r\   r^   rU   rF   r   r   r6   r   rF   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got r\   z6`rope_scaling`'s beta_fast field must be a float, got r^   z6`rope_scaling`'s beta_slow field must be a float, got r]   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)rJ   rv   r   r   r   rN   rC   r   r   r   warning_once)r   r   rJ   r-   r   r   r   rI   rF   r\   r^   r   implicit_factors                r    _validate_yarn_parametersr   R  s   &&L  l.>.>vt.LMI (+MM ))+,M=]hi(#F~Z66&3,QRXQYZ[#''(:;#Z8H%-P-PTdghThZ[kZlm	
   -IZ	5%A%AOPY{[\  -IZ	5%A%AOPY{[\RIN+]^g]h i66?[@XZ	
 (.':':'>'>?a'b$'3 88;[[$Z[aZb cn ###A& Ju	u % 	_	
r#   c                    U R                   nUR                  SUR                  SS 5      5      n1 Skn1 Skn[        UR                  5       5      n[	        X6XEUS9  [        U SS5      n[        U SU R                  U R                  -  5      n[        X-  5      n	UR                  S	5      n
[        U
[        5      (       d/  [        S
 U
 5       5      (       a  [        R                  SU
 35        [        U
5      U	S-  :w  a'  [        R                  SU	S-   S[        U
5       35        UR                  S5      n[        U[        5      (       d/  [        S U 5       5      (       a  [        R                  SU 35        [        U5      U	S-  :w  a'  [        R                  SU	S-   S[        U5       35        [        U S5      (       a  [        R!                  S5        g UR                  S5      nUc  [        R                  S5        O3[        U["        5      (       a  US:  a  [        R                  SU 35        UR                  S5      nUb5  [        U["        5      (       a  US:  a  [        R                  SU 35        g g g )Nr-   r   >   r-   r}   r~   >   rI   rF   r   r   r5   r6   r7   r~   c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fNrN   r@   rC   .0r.   s     r    	<genexpr>0_validate_longrope_parameters.<locals>.<genexpr>  s!     1dWcRS*Qe2M2MWc   #%zC`rope_scaling`'s short_factor field must be a list of numbers, got r8   z5`rope_scaling`'s short_factor field must have length z, got r}   c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fr   r   r   s     r    r   r     s!     0bVaQRAU|1L1LVar   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.rI   z1Missing required keys in `rope_scaling`: 'factor'r   rF   g        r   )rJ   rv   r   r   r   r=   r>   r?   r@   rN   listallr   r   lenr   r   rC   )r   r   rJ   r-   r   r   r   r5   r7   rE   r~   r}   rI   rF   s                 r    _validate_longrope_parametersr     sN   &&L  l.>.>vt.LMI@MVM))+,M=]hi#F,CSIvz6+=+=A[A[+[\H
h.
/C##N3LlD))c1dWc1d.d.d\]i\jkl
<C1H$NsVWxjX^_bco_p^qrs""=1Kk4((S0bVa0b-b-b[\g[hij
;3!8#McUVhZW]^abm^n]opq
 v9::A	
 !!(+>NNNOFE**fslNNUV\U]^_'++,>?'.66:JS:Pbcsbtu ;Q (r#   c                    U R                   nUR                  SUR                  SS 5      5      n1 Skn[        UR                  5       5      n[	        X5XAS9  US   nUb  [        U[        5      (       a  US:  a  [        R                  SU 35        US   nUS	   nUb  [        U[        5      (       d  [        R                  S
U 35        Ub  [        U[        5      (       d  [        R                  SU 35        X::  a  [        R                  SU SU 35        US   n	U	b  [        U	[        5      (       d  [        R                  SU	 35        XR                  :  a&  [        R                  SU	 SU R                   35        g g )Nr-   r   >   rI   r-   r   r   r   r   rI   r6   r   r   r   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)rJ   rv   r   r   r   rN   rC   r   r   r@   r   )
r   r   rJ   r-   r   r   rI   r   r   r   s
             r    _validate_llama3_parametersr     s   &&L  l.>.>vt.LMIvM))+,M=Z(#F~Z66&3,QRXQYZ["#45O#$67j%&H&HUVeUfghz2BE'J'JVWgVhij*q  5o5FH	

 (44V'W$'/zBbdg7h7h^/02	
 (+I+IIu/00MfNlNlMmo	
 Jr#   c                     [        U SS5      nUc  gUR                  SUR                  SS5      5      n[        R                  U5      nUb  U" XS9  g[        R	                  SU S35        g)	zG
Validate the RoPE config arguments, given a `PretrainedConfig` object
rJ   Nr-   r   r   r   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r=   rv   ROPE_VALIDATION_FUNCTIONSr   r   )r   r   rJ   r-   validation_fns        r    rope_config_validationr     su     6>48L   l.>.>vy.QRI-11)<M f6bclbmmno	
r#   )NNNr   )NN)$rW   	functoolsr   typingr   configuration_utilsr   utilsr   r	   
get_logger__name__r   r   r2   r@   tuplerC   rG   rK   rR   r{   r   r   ROPE_INIT_FUNCTIONSstrr   r   r   r   r   r   r   r   r   r    r#   r    <module>r      s$      1 . 
		H	% ;~ *.'+!(&%&(&^$(& c](& >5 !	(&X *.'+!(&%&(&^$(& c](& >5 !	(&X *.'+!A&%&A&^$A& c]A& >5 !	A&J PTz&z&&4z&?G}z&
>5 !z&| PTO&O&&4O&?G}O&
>5 !O&f PT>,>,&4>,?G}>,
>5 !>,J 05.$,(  $(!%lll l C=	l
 #l:[.> [XVY] [	\5E 	\T\]`Ta 	\\6F \U]^aUb \?
&6 ?
Xc] ?
D/*: /RU /d!
(8 !
xPS} !
L 168%-) 
#3 
(3- 
r#   