#pragma once /** * __device__ datatypes vectorized by 4 */ // Include both AMD and NVIDIA fp8 types to avoid circular import #include #include namespace vllm { // Vectorization containers template struct __align__(vec_size * sizeof(scalar_t)) vec_n_t { scalar_t val[vec_size]; }; template struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t { static_assert(std::is_same_v || std::is_same_v || std::is_same_v); quant_type_t val[vec_size]; }; template using vec4_t = vec_n_t; template using q8x4_t = q8_n_t; } // namespace vllm