Bases: PretrainedConfig
Source code in vllm/transformers_utils/configs/deepseek_v3.py
 __init__(
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_nextn_predict_layers=1,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    ep_size=1,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method="noaux_tc",
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    moe_layer_freq=1,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    scoring_func="sigmoid",
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
)
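
A minimal usage sketch, assuming the class is named `DeepseekV3Config` and is importable from the source file shown above (`vllm/transformers_utils/configs/deepseek_v3.py`); the specific override values here are illustrative only. All parameters default to the values listed in the signature, and any extra keyword arguments are passed through to `PretrainedConfig` via `**kwargs`.

```python
# Sketch: constructing the config with a few overridden hyperparameters.
# The import path and class name are assumed from the source file above.
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config

# Start from the defaults and override a handful of fields; anything not
# passed keeps the default shown in the __init__ signature.
config = DeepseekV3Config(
    hidden_size=7168,
    num_hidden_layers=61,
    n_routed_experts=256,
    num_experts_per_tok=8,
    max_position_embeddings=4096,
)

print(config.hidden_size, config.n_routed_experts)
```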