Skip to content

Commit

Permalink
[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344)
Browse files Browse the repository at this point in the history
  • Loading branch information
bigPYJ1151 authored Oct 17, 2024
1 parent 8e1cddc commit 5eda21e
Show file tree
Hide file tree
Showing 7 changed files with 452 additions and 96 deletions.
8 changes: 4 additions & 4 deletions .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ docker exec cpu-test bash -c "
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# Run compressed-tensor test
# docker exec cpu-test bash -c "
# pytest -s -v \
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test bash -c "
Expand Down
13 changes: 0 additions & 13 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt

# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm
Expand Down
40 changes: 34 additions & 6 deletions cmake/cpu_extension.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
include(FetchContent)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)

#
# Define environment variables for special configurations
Expand Down Expand Up @@ -82,15 +85,40 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()

#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.5.3
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)

set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
set(ONEDNN_BUILD_TESTS "OFF")
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ONEDNN_BUILD_GRAPH "OFF")
set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

FetchContent_MakeAvailable(oneDNN)

list(APPEND LIBS dnnl)
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

list(APPEND LIBS numa)

# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
if (AVX2_FOUND OR AVX512_FOUND)
list(APPEND LIBS dnnl)
endif()

#
# _C extension
#
Expand Down
41 changes: 39 additions & 2 deletions csrc/cpu/cpu_types_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};

#ifdef __AVX512F__
struct INT32Vec16: public Vec<INT32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
union AliasReg {
__m512i reg;
int32_t values[VEC_ELEM_NUM];
};

__m512i reg;

explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}

void save(int32_t* ptr) const {
_mm512_storeu_epi32(ptr, reg);
}

void save(int32_t* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
_mm512_mask_storeu_epi32(ptr, mask, reg);
}
};
#endif

#ifdef __AVX512F__
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
Expand All @@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

explicit FP32Vec16(__m512 data) : reg(data) {}

explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

explicit FP32Vec16(const FP32Vec4 &data)
: reg((__m512)_mm512_inserti32x4(
_mm512_inserti32x4(
Expand All @@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

explicit FP32Vec16(const INT32Vec16 &v)
: reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {}

FP32Vec16 operator*(const FP32Vec16 &b) const {
return FP32Vec16(_mm512_mul_ps(reg, b.reg));
}
Expand Down Expand Up @@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
}

FP32Vec16 min(const FP32Vec16& b) const {
return FP32Vec16(_mm512_min_ps(reg, b.reg));
}

FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
}

FP32Vec16 abs() const {
return FP32Vec16(_mm512_abs_ps(reg));
}
Expand All @@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

float reduce_max() const { return _mm512_reduce_max_ps(reg); }

float reduce_min() const { return _mm512_reduce_min_ps(reg); }

template <int group_size> float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
Expand Down
Loading

0 comments on commit 5eda21e

Please sign in to comment.