Bases: Enum
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
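Example (a minimal sketch; the enum documented here is presumably FlashinferMoeBackend, the return type of get_flashinfer_moe_backend below, and the snippet only lists whatever members it defines since the member names are not shown on this page):

    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        FlashinferMoeBackend,
    )

    # Enumerate the available FlashInfer MoE backends without assuming their names.
    for backend in FlashinferMoeBackend:
        print(backend.name, backend.value)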
   
 apply_flashinfer_per_tensor_scale_fp8(
    layer: Module,
    hidden_states: Tensor,
    router_logits: Tensor,
    routing_bias: Tensor | None,
    top_k: int,
    num_expert_group: int | None,
    topk_group: int | None,
    global_num_experts: int,
    apply_router_weight_on_input: bool,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
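Example (a hedged call sketch of how a quantization method's forward path might delegate to this helper; `layer` is assumed to be a fused-MoE module that already carries per-tensor FP8 weights and scales, and the concrete argument values are illustrative only):

    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        apply_flashinfer_per_tensor_scale_fp8,
    )

    def apply_moe(layer, hidden_states, router_logits):
        # `layer` is assumed to hold FP8 weights/scales set up at weight-loading time.
        return apply_flashinfer_per_tensor_scale_fp8(
            layer=layer,
            hidden_states=hidden_states,
            router_logits=router_logits,
            routing_bias=None,              # no routing bias in this sketch
            top_k=2,                        # illustrative value
            num_expert_group=None,          # no grouped routing
            topk_group=None,
            global_num_experts=8,           # illustrative value
            apply_router_weight_on_input=False,
        )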
  
 build_flashinfer_fp8_cutlass_moe_prepare_finalize(
    moe: FusedMoEConfig | None,
) -> FusedMoEPrepareAndFinalize
Create a FlashInfer CUTLASS fused-MoE prepare/finalize kernel.
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
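Example (a hedged sketch that builds just the prepare/finalize stage; where the FusedMoEConfig comes from is an assumption, and the full pairing with a GEMM experts implementation is sketched under select_cutlass_fp8_gemm_impl below):

    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        build_flashinfer_fp8_cutlass_moe_prepare_finalize,
    )

    def make_prepare_finalize(moe_config):
        # moe_config: FusedMoEConfig | None; the signature also allows passing None.
        return build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe_config)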
  
 flashinfer_cutlass_moe_fp8(
    hidden_states: Tensor,
    layer: Module,
    topk_weights: Tensor,
    topk_ids: Tensor,
    inplace: bool = False,
    activation: str = "silu",
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
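Example (a hedged call sketch; the routing step shown with torch.topk and the illustrative defaults are assumptions about how the inputs are produced, and `layer` is again assumed to hold the FP8 expert weights):

    import torch
    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        flashinfer_cutlass_moe_fp8,
    )

    def run_cutlass_fp8_moe(layer, hidden_states, router_logits, top_k=2):
        # Simple softmax + top-k routing; real callers may use a different router.
        topk_weights, topk_ids = torch.topk(
            torch.softmax(router_logits, dim=-1), k=top_k, dim=-1
        )
        return flashinfer_cutlass_moe_fp8(
            hidden_states,
            layer,
            topk_weights,
            topk_ids,
            inplace=False,
            activation="silu",
            global_num_experts=-1,          # documented default
            expert_map=None,                # no expert-parallel remapping in this sketch
            apply_router_weight_on_input=False,
        )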
  
 get_flashinfer_moe_backend() -> FlashinferMoeBackend
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
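Example (a small usage sketch; how the backend is selected internally is not covered by this page, so only the returned enum value is used, and the CUTLASS member name is an assumption):

    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        FlashinferMoeBackend,
        get_flashinfer_moe_backend,
    )

    backend = get_flashinfer_moe_backend()
    # Member name CUTLASS is an assumption; compare against whatever the enum defines.
    use_cutlass_path = backend == FlashinferMoeBackend.CUTLASS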
  
 get_moe_scaling_factors(
    input_scale: Tensor,
    gemm1_weights_scale: Tensor,
    activation_scale: Tensor,
    gemm2_weights_scale: Tensor,
) -> tuple[Tensor, Tensor, Tensor]
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
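Example (a hedged sketch with dummy per-tensor scales; the tensor shapes and the meaning of the three returned tensors are assumptions, real scales come from the loaded checkpoint):

    import torch
    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        get_moe_scaling_factors,
    )

    num_experts = 8
    # Dummy per-tensor scales; real values are loaded with the quantized weights.
    input_scale = torch.tensor(0.5)
    gemm1_weights_scale = torch.full((num_experts,), 0.02)
    activation_scale = torch.tensor(0.25)
    gemm2_weights_scale = torch.full((num_experts,), 0.03)

    scales = get_moe_scaling_factors(
        input_scale, gemm1_weights_scale, activation_scale, gemm2_weights_scale
    )
    # Three tensors, presumably the combined scaling factors the kernels consume;
    # the unpacked names below are illustrative only.
    gemm1_out_scale, gemm1_gate_scale, gemm2_out_scale = scales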
  
 is_flashinfer_supporting_global_sf(
    backend: FlashinferMoeBackend | None,
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
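Example (a small guard sketch; what the global scale factor is used for is not stated on this page, so the snippet only exercises the boolean check, including the None case the signature allows):

    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        get_flashinfer_moe_backend,
        is_flashinfer_supporting_global_sf,
    )

    backend = get_flashinfer_moe_backend()
    if is_flashinfer_supporting_global_sf(backend):
        # Backend can consume a global scale factor; wiring it up is out of scope here.
        pass
    # The argument is optional, so the check also works before a backend is resolved.
    assert isinstance(is_flashinfer_supporting_global_sf(None), bool)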
   
 register_moe_scaling_factors(layer: Module) -> None
Source code in vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
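Example (a hedged sketch of where this would typically be called: once, after the layer's weights and scales have been loaded; the hook name and the layer contents are assumptions):

    from torch import nn
    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        register_moe_scaling_factors,
    )

    def process_weights_after_loading(layer: nn.Module) -> None:
        # `layer` is assumed to already hold the FP8 input/weight scales;
        # this registers the derived scaling factors on the layer for later use.
        register_moe_scaling_factors(layer)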
  
 select_cutlass_fp8_gemm_impl(
    moe: FusedMoEConfig | None,
    quant_config: FusedMoEQuantConfig,
    out_dtype: dtype | None = None,
) -> FusedMoEPermuteExpertsUnpermute
Return a GEMM experts implementation for fused-MoE layers.
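Example (a hedged sketch pairing this with build_flashinfer_fp8_cutlass_moe_prepare_finalize above; the FusedMoEModularKernel wrapper and the way the configs are obtained are assumptions about the surrounding fused-MoE machinery, not something this page states):

    import torch
    from vllm.model_executor.layers.fused_moe.modular_kernel import (
        FusedMoEModularKernel,  # assumed wrapper combining the two stages
    )
    from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
        build_flashinfer_fp8_cutlass_moe_prepare_finalize,
        select_cutlass_fp8_gemm_impl,
    )

    def build_cutlass_fp8_kernel(moe_config, quant_config):
        # moe_config: FusedMoEConfig | None and quant_config: FusedMoEQuantConfig,
        # both assumed to come from the FusedMoE layer being quantized.
        prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe_config)
        experts = select_cutlass_fp8_gemm_impl(
            moe_config, quant_config, out_dtype=torch.bfloat16
        )
        return FusedMoEModularKernel(prepare_finalize, experts)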