From 169e581d7c8be4741f080a669b94c5ec6f9b6e95 Mon Sep 17 00:00:00 2001
From: Liu Liu
Date: Fri, 9 Aug 2024 19:21:01 -0400
Subject: [PATCH] Also add laplacian_tests

---
 bin/nnc/laplacian_test.cpp    | 411 ++++++++++++++++++++++++++++++++++
 bin/nnc/makefile              |   5 +-
 lib/nnc/mfa/v2/GEMMKernel.cpp |   7 +-
 3 files changed, 418 insertions(+), 5 deletions(-)
 create mode 100644 bin/nnc/laplacian_test.cpp

diff --git a/bin/nnc/laplacian_test.cpp b/bin/nnc/laplacian_test.cpp
new file mode 100644
index 000000000..221e8f163
--- /dev/null
+++ b/bin/nnc/laplacian_test.cpp
@@ -0,0 +1,411 @@
+extern "C" {
+#include <ccv.h>
+#include <ccv_internal.h>
+#include <nnc/ccv_nnc.h>
+#include <nnc/ccv_nnc_easy.h>
+}
+#include "nnc/mfa/v2/ShaderCache.hpp"
+#include "nnc/mfa/v2/GEMMDescriptor.hpp"
+#include "nnc/mfa/v2/GEMMKernelDescriptor.hpp"
+#include "nnc/mfa/v2/GEMMKernel.hpp"
+#include "3rdparty/dsfmt/dSFMT.h"
+#include <iostream>
+
+ShaderCache shaderCache;
+
+std::pair<int, int> profileProblemSize(GEMMDescriptor descriptor)
+{
+  const int problemSize = descriptor.matrixDimensions[0];
+
+  // Allocate FP32 memory for the operands.
+  float* A = (float*)ccmalloc(sizeof(float) * problemSize * problemSize);
+  float* B = (float*)ccmalloc(sizeof(float) * problemSize * problemSize);
+  float* C = (float*)ccmalloc(sizeof(float) * problemSize * problemSize);
+  float* previousOperandC = (float*)ccmalloc(sizeof(float) * problemSize * problemSize);
+
+  // Initialize A as the 2nd-order periodic Laplacian: -2 on the diagonal,
+  // 1 on the two adjacent (wrapped) off-diagonals. For problemSize = 4:
+  //
+  //   [ -2  1  0  1 ]
+  //   [  1 -2  1  0 ]
+  //   [  0  1 -2  1 ]
+  //   [  1  0  1 -2 ]
+  int i, j;
+  for (i = 0; i < problemSize; i++)
+  {
+    for (j = 0; j < problemSize; j++)
+      A[i * problemSize + j] = 0;
+    const int diagonalAddress = i * problemSize + i;
+    A[diagonalAddress] = -2;
+
+    const int leftColumnID = (i + problemSize - 1) % problemSize;
+    const int leftSubDiagonalAddress = i * problemSize + leftColumnID;
+    A[leftSubDiagonalAddress] = 1;
+
+    const int rightColumnID = (i + problemSize + 1) % problemSize;
+    const int rightSubDiagonalAddress = i * problemSize + rightColumnID;
+    A[rightSubDiagonalAddress] = 1;
+  }
+
+  dsfmt_t dsfmt;
+  dsfmt_init_gen_rand(&dsfmt, 1);
+  // Initialize B to random numbers.
+  for (int rowID = 0; rowID < problemSize; rowID++)
+  {
+    for (int columnID = 0; columnID < problemSize; columnID++)
+    {
+      const int address = rowID * problemSize + columnID;
+      B[address] = dsfmt_genrand_open_close(&dsfmt);
+    }
+  }
+
+  // Initialize the previous contents of C to random numbers.
+  for (int rowID = 0; rowID < problemSize; rowID++)
+  {
+    for (int columnID = 0; columnID < problemSize; columnID++)
+    {
+      const int address = rowID * problemSize + columnID;
+      previousOperandC[address] = dsfmt_genrand_open_close(&dsfmt);
+    }
+  }
+
+  // Since the Laplacian is symmetric, we swap roles of the matrices to test
+  // transposition of the left-hand side.
+  //
+  // Note that the test cannot cover correctness of A and B transposition
+  // simultaneously. Instead, test the correctness in isolation
+  // (AB, AB^T, A^T B). Performance can be tested in all four permutations
+  // (AB, AB^T, A^T B, A^T B^T).
+  if (descriptor.transposeState[0])
+  {
+    float* t = A;
+    A = B;
+    B = t;
+  }
+
+  // Multiply A with B.
+  int maxGFLOPS = 0;
+  int occupancy = 0;
+  DeviceProperties dprops;
+  dprops.coreCount = 18;
+  NS::SharedPtr<MTL::Device> device = NS::TransferPtr(MTL::CreateSystemDefaultDevice());
+  NS::SharedPtr<MTL::CommandQueue> queue = NS::TransferPtr(device->newCommandQueue());
+  {
+    // Generate the kernel.
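+    // Note: findKernel compiles the Metal pipeline for this descriptor on
+    // first use and returns the cached PipelineValue on later calls, so
+    // shader compilation stays outside the timed loop below.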
+    auto pipelineValue = shaderCache.findKernel<GEMMDescriptor, GEMMKernelDescriptor, GEMMKernel>(descriptor, device.get(), dprops);
+    occupancy = pipelineValue->pipeline->maxTotalThreadsPerThreadgroup();
+    NS::SharedPtr<MTL::Buffer> bufferA = NS::TransferPtr(device->newBuffer(A, sizeof(float) * problemSize * problemSize, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    NS::SharedPtr<MTL::Buffer> bufferB = NS::TransferPtr(device->newBuffer(B, sizeof(float) * problemSize * problemSize, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+    NS::SharedPtr<MTL::Buffer> bufferC = NS::TransferPtr(device->newBuffer(previousOperandC, sizeof(float) * problemSize * problemSize, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
+
+    // load = directAccessCondition,
+    // store = false
+    // problemSize = 1488 | A B     |  832 threads/core | 8175 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 8712 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 8818 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 8972 GFLOPS
+    // problemSize = 1489 | A B     |  768 threads/core | 7888 GFLOPS
+    // problemSize = 1489 | A B^T   |  768 threads/core | 8256 GFLOPS
+    // problemSize = 1489 | A^T B   |  768 threads/core | 8026 GFLOPS
+    // problemSize = 1489 | A^T B^T |  832 threads/core | 8463 GFLOPS
+    //
+    // load = directAccessCondition
+    // store = directAccessCondition && (gid.y * M_group < M_edge) && (gid.x * N_group < N_edge)
+    // problemSize = 1488 | A B     |  832 threads/core | 8186 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 8709 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 8808 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 8984 GFLOPS
+    // problemSize = 1489 | A B     |  768 threads/core | 7902 GFLOPS
+    // problemSize = 1489 | A B^T   |  768 threads/core | 8249 GFLOPS
+    // problemSize = 1489 | A^T B   |  768 threads/core | 8034 GFLOPS
+    // problemSize = 1489 | A^T B^T |  832 threads/core | 8469 GFLOPS
+    //
+    // load = directAccessCondition && (gid.y * M_group < M_edge) && (gid.x * N_group < N_edge)
+    // store = directAccessCondition && (gid.y * M_group < M_edge) && (gid.x * N_group < N_edge)
+    // problemSize = 1488 | A B     |  832 threads/core | 8181 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 8710 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 8806 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 8979 GFLOPS
+    // problemSize = 1489 | A B     |  768 threads/core | 7892 GFLOPS
+    // problemSize = 1489 | A B^T   |  768 threads/core | 8242 GFLOPS
+    // problemSize = 1489 | A^T B   |  768 threads/core | 8034 GFLOPS
+    // problemSize = 1489 | A^T B^T |  832 threads/core | 8461 GFLOPS
+    //
+    // load previous C = false (M1 Max)
+    // problemSize = 1488 | A B     |  896 threads/core | 8358 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 8682 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 8803 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 9024 GFLOPS
+    // problemSize = 1489 | A B     |  768 threads/core | 8039 GFLOPS
+    // problemSize = 1489 | A B^T   |  832 threads/core | 8376 GFLOPS
+    // problemSize = 1489 | A^T B   |  832 threads/core | 8374 GFLOPS
+    // problemSize = 1489 | A^T B^T |  832 threads/core | 8654 GFLOPS
+    //
+    // load previous C = true (M1 Max)
+    // problemSize = 1488 | A B     |  896 threads/core | 8352 GFLOPS
+    // problemSize = 1488 | A B^T   |  896 threads/core | 8515 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 8760 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 9007 GFLOPS
+    // problemSize = 1489 | A B     |  768 threads/core | 7917 GFLOPS
+    // problemSize = 1489 | A B^T   |  768 threads/core | 7992 GFLOPS
+    // problemSize = 1489 | A^T B   |  832 threads/core | 8185 GFLOPS
+    // problemSize = 1489 | A^T B^T |  832 threads/core | 8583 GFLOPS
+    //
+    // load previous C = false (M4)
+    // problemSize = 1488 | A B     | 1024 threads/core | 3353 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 3324 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 3338 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 3289 GFLOPS
+    // problemSize = 1489 | A B     | 1024 threads/core | 3375 GFLOPS
+    // problemSize = 1489 | A B^T   | 1024 threads/core | 3317 GFLOPS
+    // problemSize = 1489 | A^T B   | 1024 threads/core | 3343 GFLOPS
+    // problemSize = 1489 | A^T B^T | 1024 threads/core | 3298 GFLOPS
+    //
+    // load previous C = true (M4)
+    // problemSize = 1488 | A B     | 1024 threads/core | 3374 GFLOPS
+    // problemSize = 1488 | A B^T   | 1024 threads/core | 3312 GFLOPS
+    // problemSize = 1488 | A^T B   | 1024 threads/core | 3321 GFLOPS
+    // problemSize = 1488 | A^T B^T | 1024 threads/core | 3249 GFLOPS
+    // problemSize = 1489 | A B     | 1024 threads/core | 3323 GFLOPS
+    // problemSize = 1489 | A B^T   | 1024 threads/core | 3280 GFLOPS
+    // problemSize = 1489 | A^T B   | 1024 threads/core | 3308 GFLOPS
+    // problemSize = 1489 | A^T B^T | 1024 threads/core | 3256 GFLOPS
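+
+    // Each command buffer below encodes 20 back-to-back GEMM dispatches so
+    // the GPUStartTime()/GPUEndTime() interval is long enough to measure;
+    // the best of 15 such command buffers is reported as maxGFLOPS.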
+    // Profile the latency of matrix multiplication.
+    for (int i = 0; i < 15; i++)
+    {
+      const int duplicatedCommandCount = 20;
+      NS::SharedPtr<MTL::CommandBuffer> commandBuffer = NS::TransferPtr(queue->commandBuffer());
+      NS::SharedPtr<MTL::ComputeCommandEncoder> encoder = NS::TransferPtr(commandBuffer->computeCommandEncoder());
+      encoder->setComputePipelineState(pipelineValue->pipeline.get());
+      encoder->setThreadgroupMemoryLength(pipelineValue->kernel->threadgroupMemoryAllocation, 0);
+      encoder->setBuffer(bufferA.get(), 0, 0);
+      encoder->setBuffer(bufferB.get(), 0, 1);
+      encoder->setBuffer(bufferC.get(), 0, 2);
+      for (int j = 0; j < duplicatedCommandCount; j++)
+      {
+        auto ceilDivide =
+          [=](int64_t target, uint16_t granularity) -> int64_t {
+            return (target + int64_t(granularity) - 1) / int64_t(granularity);
+          };
+        MTL::Size gridSize = MTL::Size(ceilDivide(problemSize, pipelineValue->kernel->blockDimensions[1]), ceilDivide(problemSize, pipelineValue->kernel->blockDimensions[0]), 1);
+        MTL::Size groupSize = MTL::Size(pipelineValue->kernel->threadgroupSize, 1, 1);
+        encoder->dispatchThreadgroups(gridSize, groupSize);
+      }
+      encoder->endEncoding();
+      commandBuffer->commit();
+      commandBuffer->waitUntilCompleted();
+      auto start = commandBuffer->GPUStartTime();
+      auto end = commandBuffer->GPUEndTime();
+      auto latency = end - start;
+
+      // Determine the amount of work done.
+      auto operations = (int64_t)2 * problemSize * problemSize * problemSize;
+      operations = operations * duplicatedCommandCount;
+      auto gflops = (int)((double)operations / (double)latency / 1e9);
+
+      // Report the results.
+      // let latencyMicroseconds = Int(latency / 1e-6)
+      // print(latencyMicroseconds, "μs", gflops, "GFLOPS")
+      maxGFLOPS = std::max(maxGFLOPS, gflops);
+    }
+    // Copy the results to C.
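+    // bufferC holds the GPU result in operand C's memory precision, so each
+    // element is widened to FP32 before it is compared with the reference.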
+    {
+      auto precision = descriptor.memoryPrecisions.C;
+      auto raw = bufferC->contents();
+      for (int rowID = 0; rowID < problemSize; rowID++)
+      {
+        for (int columnID = 0; columnID < problemSize; columnID++)
+        {
+          const int address = rowID * problemSize + columnID;
+          float entry32;
+          switch (precision.value) {
+          case GEMMOperandPrecision::FP32:
+            entry32 = ((float*)raw)[address];
+            break;
+          case GEMMOperandPrecision::FP16: {
+            uint16_t value = ((uint16_t*)raw)[address];
+            ccv_half_precision_to_float(&value, &entry32, 1);
+            break;
+          }
+          case GEMMOperandPrecision::BF16: {
+            // BF16 is the upper half of an FP32 bit pattern; widen it by
+            // zero-filling the low 16 bits so entry32 is never read
+            // uninitialized.
+            uint16_t value = ((uint16_t*)raw)[address];
+            uint32_t pattern = (uint32_t)value << 16;
+            memcpy(&entry32, &pattern, sizeof(entry32));
+            break;
+          }
+          }
+          C[address] = entry32;
+        }
+      }
+    }
+  }
+
+  // Choose an error threshold.
+  auto createErrorThreshold =
+    [=](GEMMOperandPrecision precision) -> float {
+      switch (precision.value) {
+      case GEMMOperandPrecision::FP32:
+        return 1e-5;
+      case GEMMOperandPrecision::FP16:
+        return 5e-3;
+      case GEMMOperandPrecision::BF16:
+        return 5e-2;
+      }
+    };
+  float errorThreshold = 0;
+  {
+    auto memoryPrecisions = descriptor.memoryPrecisions;
+    auto thresholdA = createErrorThreshold(memoryPrecisions.A);
+    auto thresholdB = createErrorThreshold(memoryPrecisions.B);
+    auto thresholdC = createErrorThreshold(memoryPrecisions.C);
+    errorThreshold = std::max(errorThreshold, thresholdA);
+    errorThreshold = std::max(errorThreshold, thresholdB);
+    errorThreshold = std::max(errorThreshold, thresholdC);
+  }
+  // Check the results.
+  int errorCount = 0;
+  for (int m = 0; m < problemSize; m++)
+  {
+    for (int n = 0; n < problemSize; n++)
+    {
+      // Find the source row IDs.
+      int leftRowID = (m + problemSize - 1) % problemSize;
+      int centerRowID = m;
+      int rightRowID = (m + problemSize + 1) % problemSize;
+
+      // Find the source scalars.
+      float leftSource;
+      float centerSource;
+      float rightSource;
+      if (descriptor.transposeState[0])
+      {
+        leftSource = A[leftRowID * problemSize + n];
+        centerSource = A[centerRowID * problemSize + n];
+        rightSource = A[rightRowID * problemSize + n];
+      } else if (descriptor.transposeState[1]) {
+        leftSource = B[n * problemSize + leftRowID];
+        centerSource = B[n * problemSize + centerRowID];
+        rightSource = B[n * problemSize + rightRowID];
+      } else {
+        leftSource = B[leftRowID * problemSize + n];
+        centerSource = B[centerRowID * problemSize + n];
+        rightSource = B[rightRowID * problemSize + n];
+      }
+
+      // Find the expected result: one row of the periodic Laplacian applied
+      // to a column of the random operand.
+      float expected = leftSource - 2 * centerSource + rightSource;
+
+      // Find the actual result.
+      float actual;
+      if (descriptor.transposeState[0])
+      {
+        actual = C[n * problemSize + m];
+      } else {
+        actual = C[m * problemSize + n];
+      }
+
+      // Report whether it is correct.
+      float error = fabs(expected - actual);
+      if (error > errorThreshold)
+      {
+        if (errorCount < 10)
+        {
+          printf("error: %f / ~1.000\n", error);
+          errorCount += 1;
+        }
+      }
+    }
+  }
+  return std::make_pair(maxGFLOPS, occupancy);
+}
+
+struct TestDescriptor {
+  GEMMOperandPrecision precision;
+  int problemSize;
+  bool transposeState[2];
+};
+
+void runTest(TestDescriptor descriptor)
+{
+  // Set up the kernel.
+  GEMMDescriptor gemmDesc = GEMMDescriptor();
+  auto precision = descriptor.precision;
+  unsigned int n = (unsigned int)descriptor.problemSize;
+  gemmDesc.matrixDimensions = simd::uint3 { n, n, n };
+  gemmDesc.memoryPrecisions = {
+    .A = precision, .B = precision, .C = precision
+  };
+  gemmDesc.transposeState = simd::uchar3 { descriptor.transposeState[0], descriptor.transposeState[1] };
+  gemmDesc.useBias = false;
+
+  // Test the kernel.
+  auto statistic = profileProblemSize(gemmDesc);
+
+  // Report the results.
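+  // Example of the line printed below:
+  //   problemSize = 1488 | A^T B | 8818 GFLOPS 1024 threads/core |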
+  std::cout << "problemSize = " << descriptor.problemSize << " | ";
+  if (descriptor.transposeState[0])
+  {
+    std::cout << "A^T ";
+  } else {
+    std::cout << "A ";
+  }
+  if (descriptor.transposeState[1])
+  {
+    std::cout << "B^T | ";
+  } else {
+    std::cout << "B | ";
+  }
+
+  std::cout << statistic.first << " GFLOPS " << statistic.second << " threads/core | " << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+  ccv_nnc_init();
+  {
+    int problemSizes[] = {
+      7, 8, 9, 10,
+      15, 16, 17, 18,
+      23, 24, 25,
+      31, 32, 33,
+      47, 48, 49,
+      63, 64, 65,
+      103, 104, 112,
+      126, 127, 128, 129,
+      130, 131,
+      135, 136, 137,
+      143, 144, 145,
+      151, 152, 153,
+    };
+    bool transposeStates[] = {
+      false, false,
+      false, true,
+      true, false,
+    };
+    printf("Correctness tests:\n");
+    for (int i = 0; i < sizeof(problemSizes) / sizeof(int); i++)
+    {
+      for (int j = 0; j < sizeof(transposeStates) / (sizeof(bool) * 2); j++)
+      {
+        TestDescriptor testDescriptor = TestDescriptor();
+        testDescriptor.precision = GEMMOperandPrecision::FP32;
+        testDescriptor.problemSize = problemSizes[i];
+        testDescriptor.transposeState[0] = transposeStates[j * 2];
+        testDescriptor.transposeState[1] = transposeStates[j * 2 + 1];
+        runTest(testDescriptor);
+      }
+    }
+  }
+  {
+    bool transposeStates[] = {
+      false, false,
+      false, true,
+      true, false,
+      true, true,
+    };
+
+    printf("\nPerformance tests:\n");
+    for (int problemSize = 1488; problemSize <= 1489; problemSize++)
+    {
+      for (int j = 0; j < sizeof(transposeStates) / (sizeof(bool) * 2); j++)
+      {
+        TestDescriptor testDescriptor = TestDescriptor();
+        testDescriptor.precision = GEMMOperandPrecision::FP32;
+        testDescriptor.problemSize = problemSize;
+        testDescriptor.transposeState[0] = transposeStates[j * 2];
+        testDescriptor.transposeState[1] = transposeStates[j * 2 + 1];
+        runTest(testDescriptor);
+      }
+    }
+  }
+  return 0;
+}
diff --git a/bin/nnc/makefile b/bin/nnc/makefile
index 09feaa39a..422743656 100644
--- a/bin/nnc/makefile
+++ b/bin/nnc/makefile
@@ -4,7 +4,7 @@ LDFLAGS := -L"../../lib" -lccv $(LDFLAGS)
 CFLAGS := -O3 -Wall -I"../../lib" $(CFLAGS)
 NVFLAGS := -O3 -I"../../lib" -lineinfo $(NVFLAGS)
 
-TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm
+TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm laplacian_test
 
 FUZZ_TARGETS = csv_fuzz
 
@@ -37,6 +37,9 @@ libccv.a:
 %.o: %.c
 	$(CC) $< -o $@ -c $(CFLAGS)
 
+laplacian_test.o: laplacian_test.cpp
+	$(CC) $< -o $@ -c $(CFLAGS) -std=c++17
+
 .gitignore:
 	echo $(TARGETS) | tr ' ' '\n' > .gitignore
diff --git a/lib/nnc/mfa/v2/GEMMKernel.cpp b/lib/nnc/mfa/v2/GEMMKernel.cpp
index c3eaaa866..be2848732 100644
--- a/lib/nnc/mfa/v2/GEMMKernel.cpp
+++ b/lib/nnc/mfa/v2/GEMMKernel.cpp
@@ -448,7 +448,7 @@ kernel void gemm(device MEMORY_NAME_A *A [[buffer(0)]],
 )";
 
   if (useBias) {
-    if (descriptor.preferAsyncLoad) {
+    if (true) { // descriptor.preferAsyncLoad) {
       source += "\n";
       source += "#define USE_BIAS_ASYNC_COND false\n";
     } else {
@@ -500,8 +500,7 @@ kernel void gemm(device MEMORY_NAME_A *A [[buffer(0)]],
       simdgroup_matrix_storage bias;
       bias.BIAS_LOAD(
        bias_src, 0, ushort2(m, 0));
-      bias.thread_elements()[0][1] = bias.thread_elements()[0][0];
-
+#pragma clang loop unroll(full)
       for (ushort n = 0; n < REGISTER_N; n += 8) {
         vec biasForm = *(bias.thread_elements());
@@ -520,7 +519,7 @@ kernel void gemm(device MEMORY_NAME_A *A [[buffer(0)]],
       simdgroup_matrix_storage bias;
       bias.BIAS_LOAD(
        bias_src, 0, ushort2(n, 0));
-
+#pragma clang loop unroll(full)
       for (ushort m = 0; m < REGISTER_M; m += 8) {
         vec biasForm = *(bias.thread_elements());