Compatibility wrapper for FlashInfer API changes.
Users of vLLM should always import only these wrappers.
module-attribute
 FLASHINFER_CUBINS_REPOSITORY = get(
    "FLASHINFER_CUBINS_REPOSITORY",
    "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/",
)
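The default is read from the environment and falls back to NVIDIA's public artifactory. A minimal override sketch, assuming the variable is consulted before FlashInfer downloads any cubins; the mirror URL and the import ordering are illustrative, not a vLLM requirement:

import os

# Point cubin downloads at an internal mirror (hypothetical URL) before
# vLLM and FlashInfer are imported; the default above applies only when
# the variable is unset.
os.environ.setdefault(
    "FLASHINFER_CUBINS_REPOSITORY",
    "https://mirror.example.com/artifactory/sw-kernelinferencelibrary-public-generic-local/",
)

import vllm  # noqa: E402  (imported after the environment is configured)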
module-attribute
 __all__ = [
    "has_flashinfer",
    "flashinfer_trtllm_fp8_block_scale_moe",
    "flashinfer_cutlass_fused_moe",
    "flashinfer_fp4_quantize",
    "nvfp4_block_scale_interleave",
    "trtllm_fp4_block_scale_moe",
    "autotune",
    "has_flashinfer_moe",
    "has_flashinfer_comm",
    "has_flashinfer_all2all",
    "has_flashinfer_cutlass_fused_moe",
    "has_nvidia_artifactory",
    "supports_trtllm_attention",
    "can_use_trtllm_attention",
    "use_trtllm_attention",
    "flashinfer_disable_q_quantization",
    "flashinfer_scaled_fp4_mm",
    "flashinfer_scaled_fp8_mm",
]
module-attribute
 autotune = _lazy_import_wrapper(
    "flashinfer.autotuner",
    "autotune",
    fallback_fn=lambda *args, **kwargs: nullcontext(),
)
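Because the fallback is a nullcontext, call sites can always enter the returned context manager whether or not FlashInfer is installed. A usage sketch, assuming flashinfer.autotuner.autotune is used as a context manager around warmup work:

from vllm.utils.flashinfer import autotune

# Wrap warmup/profiling runs so FlashInfer can tune kernel configurations.
# When FlashInfer is missing, autotune() degrades to a no-op nullcontext,
# so no separate availability check is needed here.
with autotune():
    # ... run a few representative workloads here ...
    pass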
module-attribute
 flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
    "flashinfer.fused_moe", "cutlass_fused_moe"
)
module-attribute
 flashinfer_fp4_quantize = _lazy_import_wrapper(
    "flashinfer", "fp4_quantize"
)
module-attribute
 flashinfer_trtllm_fp8_block_scale_moe = (
    _lazy_import_wrapper(
        "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"
    )
)
module-attribute
 flashinfer_trtllm_fp8_per_tensor_scale_moe = (
    _lazy_import_wrapper(
        "flashinfer.fused_moe",
        "trtllm_fp8_per_tensor_scale_moe",
    )
)
module-attribute
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
    "flashinfer", "nvfp4_block_scale_interleave"
)
module-attribute
 trtllm_fp4_block_scale_moe = _lazy_import_wrapper(
    "flashinfer", "trtllm_fp4_block_scale_moe"
)
cached
Cache the env value for VLLM_USE_TRTLLM_ATTENTION.

Safely import a submodule and return it, or None if not available.
 _lazy_import_wrapper(
    module_name: str,
    attr_name: str,
    fallback_fn: Callable[..., Any] = _missing,
)
Create a lazy import wrapper for a specific function.
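A minimal sketch of the lazy-import pattern, assuming resolution happens at call time and any import or attribute failure routes to fallback_fn; this is illustrative, not the actual vLLM implementation:

import importlib
from typing import Any, Callable


def _unavailable(*args: Any, **kwargs: Any) -> Any:
    # Illustrative stand-in for a "FlashInfer not available" placeholder.
    raise RuntimeError("FlashInfer is not installed or lacks this API.")


def lazy_import_wrapper_sketch(
    module_name: str,
    attr_name: str,
    fallback_fn: Callable[..., Any] = _unavailable,
) -> Callable[..., Any]:
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            module = importlib.import_module(module_name)
            fn = getattr(module, attr_name)
        except (ImportError, AttributeError):
            fn = fallback_fn  # FlashInfer absent, or the attribute was renamed
        return fn(*args, **kwargs)

    return wrapper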
_missing
Placeholder for unavailable FlashInfer backend.
 bmm_fp8(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
 bmm_fp8_fake(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
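The *_fake variants (bmm_fp8_fake, flashinfer_mm_fp4_fake) return only correctly shaped and typed outputs; they serve as the meta ("fake") implementations when the real FlashInfer-backed kernels are registered as PyTorch custom ops, so torch.compile can trace graphs without executing FlashInfer. A generic sketch of that pattern; the op name demo::scaled_mm and its body are illustrative, not this module's actual registration code:

import torch


@torch.library.custom_op("demo::scaled_mm", mutates_args=())
def scaled_mm(a: torch.Tensor, b: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # A real op would dispatch to a FlashInfer kernel here.
    return (a @ b) * scale


@scaled_mm.register_fake
def _(a: torch.Tensor, b: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Fake implementation: shapes and dtypes only, no kernel execution,
    # which is all the compiler needs for tracing.
    return a.new_empty((a.shape[0], b.shape[1]))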
can_use_trtllm_attention
Check if the current configuration supports TRTLLM attention.
 flashinfer_mm_fp4(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    g_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
 flashinfer_mm_fp4_fake(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    g_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
 flashinfer_scaled_fp4_mm(
    a: Tensor,
    b: Tensor,
    block_scale_a: Tensor,
    block_scale_b: Tensor,
    alpha: Tensor,
    out_dtype: dtype,
    backend: str,
) -> Tensor
 flashinfer_scaled_fp8_mm(
    a: Tensor,
    b: Tensor,
    scale_a: Tensor,
    scale_b: Tensor,
    out_dtype: dtype,
    bias: Tensor | None = None,
) -> Tensor
 force_use_trtllm_attention() -> bool | None
Return None if VLLM_USE_TRTLLM_ATTENTION is not set, True if TRTLLM attention is forced on, and False if it is forced off.
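A small sketch of the tri-state convention, assuming conventional truthy/falsy spellings; the exact accepted values are defined by vLLM's environment handling, not here:

import os


def tri_state_env_sketch(name: str = "VLLM_USE_TRTLLM_ATTENTION") -> bool | None:
    # Illustrative parser: unset means "let the heuristics decide",
    # otherwise force TRTLLM attention on or off.
    raw = os.environ.get(name)
    if raw is None:
        return None
    return raw.strip().lower() in ("1", "true", "yes")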
cached
 has_flashinfer() -> bool
Return True if FlashInfer is available.
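The has_* probes are cached and intended as cheap guards before touching any of the lazy wrappers. A typical guard sketch; the fallback branch is left to the caller:

from vllm.utils.flashinfer import (
    flashinfer_cutlass_fused_moe,
    has_flashinfer,
    has_flashinfer_cutlass_fused_moe,
)

# Select a FlashInfer-backed MoE path only when the capability probes pass;
# otherwise the caller falls back to a non-FlashInfer implementation.
if has_flashinfer() and has_flashinfer_cutlass_fused_moe():
    moe_fn = flashinfer_cutlass_fused_moe  # resolved lazily on first call
else:
    moe_fn = None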
cached
 has_flashinfer_all2all() -> bool
Return True if FlashInfer mnnvl all2all is available.
cached
 has_flashinfer_comm() -> bool
Return True if FlashInfer comm module is available.
cached
 has_flashinfer_cutlass_fused_moe() -> bool
Return True if FlashInfer CUTLASS fused MoE is available.
cached
 has_flashinfer_moe() -> bool
Return True if FlashInfer MoE module is available.
cached
 has_nvidia_artifactory() -> bool
Return True if NVIDIA's artifactory is accessible.
This checks connectivity to the kernel inference library artifactory, which is required for downloading certain cubin kernels such as TRTLLM FMHA.
cached
 supports_trtllm_attention() -> bool
TRTLLM attention is supported if the platform is SM100 and the NVIDIA artifactory is accessible.
 use_trtllm_attention(
    num_qo_heads: int,
    num_kv_heads: int,
    num_tokens: int,
    max_seq_len: int,
    kv_cache_dtype: str,
    q_dtype: dtype,
    is_prefill: bool,
    has_sinks: bool = False,
    has_spec: bool = False,
) -> bool
Return True if TRTLLM attention should be used for the given configuration.
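A sketch of how the two checks compose: supports_trtllm_attention answers the cached platform question, while use_trtllm_attention also weighs per-batch properties such as shape, KV-cache dtype, query dtype, and prefill/decode phase. The argument values below are illustrative only:

import torch

from vllm.utils.flashinfer import (
    supports_trtllm_attention,
    use_trtllm_attention,
)

if supports_trtllm_attention():
    enable_trtllm = use_trtllm_attention(
        num_qo_heads=32,
        num_kv_heads=8,
        num_tokens=256,
        max_seq_len=8192,
        kv_cache_dtype="auto",
        q_dtype=torch.bfloat16,
        is_prefill=True,
    )
else:
    enable_trtllm = False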