Inference-only FlexOlmo model compatible with HuggingFace weights.
 
  Bases: OlmoeAttention
Source code in vllm/model_executor/models/flex_olmo.py
  
 __init__(*, vllm_config: VllmConfig, prefix: str = "")
Source code in vllm/model_executor/models/flex_olmo.py
  
  Bases: Module
Source code in vllm/model_executor/models/flex_olmo.py
  instance-attribute  ¶
 post_attention_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
 instance-attribute  ¶
 post_feedforward_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
 instance-attribute  ¶
 self_attn = FlexOlmoAttention(
    vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
)
 
 __init__(
    *, vllm_config: VllmConfig, prefix: str = ""
) -> None
Source code in vllm/model_executor/models/flex_olmo.py
  
 forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Tensor | None,
) -> tuple[Tensor, Tensor | None]
Source code in vllm/model_executor/models/flex_olmo.py
  
  Bases: OlmoeForCausalLM
Source code in vllm/model_executor/models/flex_olmo.py
  class-attribute instance-attribute  ¶
   
 __init__(
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    layer_type: type[Module] = FlexOlmoDecoderLayer,
)
 
  Bases: Module
A tensor-parallel MoE implementation for FlexOlmo that shards each expert across all ranks.
Each expert's weights are sharded across all ranks; a fused MoE kernel computes the forward pass, and the outputs are then reduced across ranks.
Source code in vllm/model_executor/models/flex_olmo.py
  instance-attribute  ¶
 experts = FusedMoE(
    num_experts=num_experts,
    top_k=num_experts_per_tok,
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    reduce_results=True,
    renormalize=False,
    quant_config=None,
    tp_size=tp_size,
    prefix=f"{prefix}.experts",
)
 instance-attribute  ¶
 gate = ReplicatedLinear(
    hidden_size,
    num_experts,
    bias=False,
    return_bias=False,
    quant_config=None,
    prefix=f"{prefix}.gate",
)
 
 __init__(*, vllm_config: VllmConfig, prefix: str = "")