Inference-only QWen model compatible with HuggingFace weights.
 
QWenAttention

Bases: Module
Source code in vllm/model_executor/models/qwen.py
attn instance-attribute
 attn = Attention(
    num_heads,
    head_dim,
    scaling,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
)
c_attn instance-attribute
 c_attn = QKVParallelLinear(
    hidden_size,
    head_dim,
    total_num_heads,
    bias=True,
    quant_config=quant_config,
)
c_proj instance-attribute
 c_proj = RowParallelLinear(
    total_num_heads * head_dim,
    hidden_size,
    bias=False,
    quant_config=quant_config,
)
rotary_emb instance-attribute
 rotary_emb = get_rope(
    head_dim,
    rotary_dim=head_dim,
    max_position=max_position_embeddings,
    base=rope_theta,
    rope_scaling=rope_scaling,
)
 
 __init__(
    hidden_size: int,
    num_heads: int,
    max_position_embeddings: int,
    rope_theta: float = 10000,
    rope_scaling: dict[str, Any] | None = None,
    cache_config: CacheConfig | None = None,
    quant_config: QuantizationConfig | None = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/qwen.py

forward(
    positions: Tensor,
    hidden_states: Tensor,
) -> Tensor
Source code in vllm/model_executor/models/qwen.py
  
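To make the wiring above concrete, here is a minimal single-GPU sketch of the same data flow, using plain nn.Linear stand-ins for the tensor-parallel layers and eager attention in place of vLLM's paged Attention backend. The class name and shapes are illustrative assumptions; in the real module, rotary_emb rotates q and k using positions before the attention call, and KV caching is handled by the Attention backend.

import torch
import torch.nn as nn

class NaiveQWenAttention(nn.Module):
    # Illustrative stand-in for QWenAttention; not vLLM's implementation.
    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scaling = self.head_dim ** -0.5
        # c_attn: fused q/k/v projection (QKVParallelLinear in vLLM), bias=True.
        self.c_attn = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
        # c_proj: output projection (RowParallelLinear in vLLM), bias=False.
        self.c_proj = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # (num_tokens, hidden_size) -> fused qkv, split into q, k, v.
        q, k, v = self.c_attn(hidden_states).chunk(3, dim=-1)
        # rotary_emb would rotate q and k here; omitted in this sketch.
        t = hidden_states.shape[0]
        q, k, v = (x.view(t, self.num_heads, self.head_dim).transpose(0, 1)
                   for x in (q, k, v))
        # Scaled dot-product attention (no causal mask / KV cache here).
        scores = torch.softmax(q @ k.transpose(-1, -2) * self.scaling, dim=-1)
        out = (scores @ v).transpose(0, 1).reshape(t, -1)
        return self.c_proj(out)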
QWenBaseModel

Bases: Module
Source code in vllm/model_executor/models/qwen.py
lm_head instance-attribute
 lm_head = ParallelLMHead(
    vocab_size,
    hidden_size,
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "lm_head"),
)
transformer instance-attribute
 transformer = transformer_type(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "transformer"),
)
 
 __init__(
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    transformer_type: type[QWenModel] = QWenModel,
) -> None
Source code in vllm/model_executor/models/qwen.py
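The transformer_type hook lets a subclass swap in a different transformer implementation while reusing the language-model head and the rest of the base wiring. A hedged sketch of the pattern (the subclass names here are hypothetical):

from vllm.config import VllmConfig
from vllm.model_executor.models.qwen import QWenBaseModel, QWenModel

class MyQWenModel(QWenModel):
    """Hypothetical variant, e.g. overriding forward to inject extra embeddings."""

class MyQWenLMHeadModel(QWenBaseModel):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        # Pass the custom transformer class; QWenBaseModel instantiates it
        # under the "transformer" prefix, exactly as shown above.
        super().__init__(vllm_config=vllm_config, prefix=prefix,
                         transformer_type=MyQWenModel)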
  
QWenBlock

Bases: Module
Source code in vllm/model_executor/models/qwen.py
attn instance-attribute
 attn = QWenAttention(
    hidden_size,
    num_attention_heads,
    max_position_embeddings,
    rope_theta=rope_theta,
    rope_scaling=rope_scaling,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
)
mlp instance-attribute
 mlp = QWenMLP(
    hidden_size,
    intermediate_size // 2,
    quant_config=quant_config,
)
 
 __init__(
    config: PretrainedConfig,
    cache_config: CacheConfig | None = None,
    quant_config: QuantizationConfig | None = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/qwen.py
  
 forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Tensor | None,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/models/qwen.py
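forward returns both the block output and the running residual, which lets each norm fuse the residual add instead of materializing the sum separately. A plain-PyTorch sketch of that pre-norm pattern, assuming ln_1/ln_2-style norms around attn and mlp (illustrative, not vLLM's fused kernel):

import torch
import torch.nn as nn

def add_norm(norm: nn.Module, x: torch.Tensor,
             residual: torch.Tensor | None):
    # Fused residual-add + norm: returns (normed sum, un-normed sum).
    if residual is None:
        return norm(x), x
    x = x + residual
    return norm(x), x

def block_forward(block: nn.Module, positions: torch.Tensor,
                  hidden_states: torch.Tensor,
                  residual: torch.Tensor | None):
    hidden_states, residual = add_norm(block.ln_1, hidden_states, residual)
    hidden_states = block.attn(positions, hidden_states)
    hidden_states, residual = add_norm(block.ln_2, hidden_states, residual)
    hidden_states = block.mlp(hidden_states)
    # The un-normed residual is threaded into the next block unchanged.
    return hidden_states, residual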
  
QWenLMHeadModel

Bases: QWenBaseModel, SupportsPP, SupportsLoRA
Source code in vllm/model_executor/models/qwen.py
 __init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen.py
  
 forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
) -> Tensor | IntermediateTensors
Source code in vllm/model_executor/models/qwen.py
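This is the class vLLM instantiates for Qwen(-1) checkpoints, so end users typically reach it through the public API rather than directly. For example (Qwen checkpoints ship custom modeling code, hence trust_remote_code=True):

from vllm import LLM, SamplingParams

# Loads QWenLMHeadModel under the hood for Qwen(-1) checkpoints.
llm = LLM(model="Qwen/Qwen-7B-Chat", trust_remote_code=True)
outputs = llm.generate(["What is the capital of France?"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)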
  
QWenMLP

Bases: Module
MLP for the language component of the Qwen model, which uses a MergedColumnParallelLinear to produce two projections that are combined via the SiLU activation.
Source code in vllm/model_executor/models/qwen.py
c_proj instance-attribute
 c_proj = RowParallelLinear(
    intermediate_size,
    hidden_size,
    bias=False,
    quant_config=quant_config,
)
gate_up_proj instance-attribute
 gate_up_proj = MergedColumnParallelLinear(
    hidden_size,
    [intermediate_size] * 2,
    bias=False,
    quant_config=quant_config,
)
 
 __init__(
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str = "silu",
    quant_config: QuantizationConfig | None = None,
)
Source code in vllm/model_executor/models/qwen.py
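Note that QWenBlock constructs this module with intermediate_size // 2, since Qwen's HF config reports the combined width of both gated branches. A minimal plain-PyTorch sketch of the gated-SiLU computation described above, with nn.Linear standing in for the tensor-parallel layers:

import torch
import torch.nn as nn
import torch.nn.functional as F

class NaiveQWenMLP(nn.Module):
    # Illustrative stand-in for QWenMLP; not vLLM's implementation.
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # gate_up_proj stacks both branch projections into one matmul.
        self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size,
                                      bias=False)
        self.c_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, up = self.gate_up_proj(x).chunk(2, dim=-1)
        # SiLU-gated combination, then project back to hidden_size.
        return self.c_proj(F.silu(gate) * up)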
  
QWenModel

Bases: Module
Source code in vllm/model_executor/models/qwen.py
make_empty_intermediate_tensors instance-attribute
 make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors_factory(
        ["hidden_states", "residual"], hidden_size
    )
)
 
 __init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen.py
  
 forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None,
    inputs_embeds: Tensor | None = None,
) -> Tensor | IntermediateTensors
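
The return type reflects pipeline parallelism: non-final ranks hand off an IntermediateTensors bundle keyed by the names registered in make_empty_intermediate_tensors, and only the final rank returns finished hidden states. A hedged sketch of that dispatch (the wte/h/ln_f submodule names follow the HF Qwen layout and are assumptions here):

import torch
from vllm.sequence import IntermediateTensors

def qwen_model_forward(model, input_ids: torch.Tensor,
                       positions: torch.Tensor,
                       intermediate_tensors: IntermediateTensors | None,
                       inputs_embeds: torch.Tensor | None = None,
                       is_last_rank: bool = True):
    if intermediate_tensors is not None:
        # Later pipeline stage: resume from the previous rank's hand-off.
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]
    else:
        # First stage: embed token ids (or accept precomputed embeddings).
        hidden_states = (inputs_embeds if inputs_embeds is not None
                         else model.wte(input_ids))
        residual = None
    for layer in model.h:
        hidden_states, residual = layer(positions, hidden_states, residual)
    if not is_last_rank:
        # Keys must match the factory registration shown above.
        return IntermediateTensors({"hidden_states": hidden_states,
                                    "residual": residual})
    # Final norm fuses the last residual add (vLLM RMSNorm-style).
    hidden_states, _ = model.ln_f(hidden_states, residual)
    return hidden_states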