
"""DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DebertaV2Config`."""

from transformers import PretrainedConfig

from modelscope.utils import logger as logging

logger = logging.get_logger()


class DebertaV2Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeBERTa
[microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Arguments:
    vocab_size (`int`, *optional*, defaults to 128100):
        Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`DebertaV2Model`].
    hidden_size (`int`, *optional*, defaults to 1536):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 24):
        Number of attention heads for each attention layer in the Transformer encoder.
    intermediate_size (`int`, *optional*, defaults to 6144):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are
        supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    max_position_embeddings (`int`, *optional*, defaults to 512):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    type_vocab_size (`int`, *optional*, defaults to 0):
        The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`] or [`TFDebertaV2Model`].
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-7):
        The epsilon used by the layer normalization layers.
    relative_attention (`bool`, *optional*, defaults to `True`):
        Whether to use relative position encoding.
    max_relative_positions (`int`, *optional*, defaults to -1):
        The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
        as `max_position_embeddings`; the default of -1 means the value of `max_position_embeddings` is used.
    pad_token_id (`int`, *optional*, defaults to 0):
        The value used to pad `input_ids`.
    position_biased_input (`bool`, *optional*, defaults to `False`):
        Whether to add absolute position embeddings to the content embeddings.
    pos_att_type (`List[str]`, *optional*):
        The type of relative position attention. It can be a combination of `"p2c"` and `"c2p"`, e.g. `["p2c"]`
        or `["p2c", "c2p"]`.

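Example (a minimal usage sketch; the import path is assumed from this module's location in the modelscope
package and only uses the class defined here):

```python
>>> from modelscope.models.nlp.deberta_v2.configuration import DebertaV2Config

>>> # Build a configuration with the default microsoft/deberta-v2-xlarge hyperparameters.
>>> configuration = DebertaV2Config()
>>> configuration.hidden_size
1536

>>> # `pos_att_type` may also be passed as a '|'-separated string, which the
>>> # constructor splits into a list of attention types.
>>> configuration = DebertaV2Config(hidden_size=768, pos_att_type='p2c|c2p')
>>> configuration.pos_att_type
['p2c', 'c2p']
```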
    """

    model_type = 'deberta_v2'

    def __init__(self,
                 vocab_size=128100,
                 hidden_size=1536,
                 num_hidden_layers=24,
                 num_attention_heads=24,
                 intermediate_size=6144,
                 hidden_act='gelu',
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=0,
                 initializer_range=0.02,
                 layer_norm_eps=1e-7,
                 relative_attention=False,
                 max_relative_positions=-1,
                 pad_token_id=0,
                 position_biased_input=True,
                 pos_att_type=None,
                 pooler_dropout=0,
                 pooler_hidden_act='gelu',
                 **kwargs):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.relative_attention = relative_attention
        self.max_relative_positions = max_relative_positions
        self.pad_token_id = pad_token_id
        self.position_biased_input = position_biased_input

        # `pos_att_type` may be given as a '|'-separated string such as
        # "p2c|c2p"; normalize it to a lower-cased list of attention types.
        if type(pos_att_type) == str:
            pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]

        self.pos_att_type = pos_att_type
        self.vocab_size = vocab_size
        self.layer_norm_eps = layer_norm_eps

        self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
        self.pooler_dropout = pooler_dropout
        self.pooler_hidden_act = pooler_hidden_act