From 97970c5d94b8c3c2bf268b55106ca98f4977477e Mon Sep 17 00:00:00 2001
From: bolun
Date: Thu, 20 Jul 2023 16:49:14 +0800
Subject: [PATCH] Add benchmark for conv

---
 benchmark/kernels/cuda/conv.cc | 179 +++++++++++++++++++++++++++------
 1 file changed, 146 insertions(+), 33 deletions(-)

diff --git a/benchmark/kernels/cuda/conv.cc b/benchmark/kernels/cuda/conv.cc
index 84ff7c6f..14a9851e 100644
--- a/benchmark/kernels/cuda/conv.cc
+++ b/benchmark/kernels/cuda/conv.cc
@@ -12,14 +12,48 @@
 
 using namespace infini;
 
+#define M 1048576
+
+const char algo_name[8][50] = {
+    "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM",
+    "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM",
+    "CUDNN_CONVOLUTION_FWD_ALGO_GEMM",
+    "CUDNN_CONVOLUTION_FWD_ALGO_DIRECT",
+    "CUDNN_CONVOLUTION_FWD_ALGO_FFT",
+    "CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING",
+    "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD",
+    "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED",
+};
+
+const char mode_name[2][50] = {
+    "CUDNN_CONVOLUTION",
+    "CUDNN_CROSS_CORRELATION"
+};
+
 int main() {
-    int warmupRounds = 200;
-    int timingRounds = 200;
-    DataType dtype = DataType::Float32;
+    int warmupRounds = 50;
+    int timingRounds = 100;
+    DataType dtype = DataType::Float32;
+
+    cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
+    cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    float alpha = 1.f, beta = 0.f;
+
+    int n, c, h, w, f, r, s;
+    int INPUT_BATCH_SIZE = n = 16;
+    int INPUT_CHANNELS = c = 128;
+    int INPUT_HEIGHT = h = 128;
+    int INPUT_WIDTH = w = 128;
+    Shape INPUT_SHAPE = {INPUT_BATCH_SIZE, INPUT_CHANNELS,
+                         INPUT_HEIGHT, INPUT_WIDTH};
+
+    int OUTPUT_CHANNELS = f = 256;
+    int KERNEL_HEIGHT = r = 3;
+    int KERNEL_WIDTH = s = 3;
+    Shape KERNEL_SHAPE = {INPUT_CHANNELS, OUTPUT_CHANNELS,
+                          KERNEL_HEIGHT, KERNEL_WIDTH};
 
-    Shape INPUT_SHAPE = {16, 128, 112, 112};
-    Shape KERNEL_SHAPE = {128, 256, 3, 3};
     int NUM_GROUPS = 1;
 
     int PAD_HEIGHT = 0;
@@ -79,42 +113,121 @@ int main() {
         warmupRounds, timingRounds
     );
 
-    auto []
+    int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS;
 
     // get inputs
-    cudnnTensorDescriptor_t inDesc;
-    checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
-    checkCudnnError(cudnnSetTensor4dDescriptor(
-        inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
-
-    // get kernels
-    cudnnFilterDescriptor_t knDesc;
-    checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
-    checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
-                                               CUDNN_TENSOR_NCHW, f,
-                                               channelsPerGrp, r, s));
-    // get bias
-    cudnnTensorDescriptor_t biasDesc;
-    checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
-    checkCudnnError(cudnnSetTensor4dDescriptor(
-        biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
-
-    // get convlution descriptor
-    cudnnConvolutionDescriptor_t convDesc;
-    checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
-    // TODO: CUDNN_CONVOLUTION is a tunable argument
-    checkCudnnError(cudnnSetConvolution2dDescriptor(
-        convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode],
-        CUDNN_DATA_FLOAT));
-    if (g > 1) {
-        checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
-    }
+    cudnnTensorDescriptor_t inDesc;
+    checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
+    checkCudnnError(cudnnSetTensor4dDescriptor(
+        inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
+
+    // get kernels
+    cudnnFilterDescriptor_t knDesc;
+    checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
+    checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
+                                               CUDNN_TENSOR_NCHW, f,
+                                               channelsPerGrp, r, s));
+
+    // get bias
+    // cudnnTensorDescriptor_t biasDesc;
+    // checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
+    // checkCudnnError(cudnnSetTensor4dDescriptor(
+    //     biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
+
+    // get convolution descriptor
+    cudnnConvolutionDescriptor_t convDesc;
+    checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
+    checkCudnnError(cudnnSetConvolution2dDescriptor(
+        convDesc, PAD_HEIGHT, PAD_WIDTH, VERTICAL_STRIDE, HORIZONTAL_STRIDE,
+        DILATION_HEIGHT, DILATION_WIDTH, convMode, CUDNN_DATA_FLOAT));
+    if (NUM_GROUPS > 1) {
+        checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS));
+    }
+
+    // get output shape
+    int outn, outc, outh, outw;
+    checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
+        convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
+    cudnnTensorDescriptor_t outDesc;
+    checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
+    checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
+                                               CUDNN_DATA_FLOAT, outn, outc,
+                                               outh, outw));
+    Shape OUTPUT_SHAPE = {outn, outc, outh, outw};
+    size_t outputSize = 1;
+    for (auto dim : OUTPUT_SHAPE) {
+        outputSize *= dim;
+    }
+    size_t outputSizeInBytes = outputSize * sizeof(dtype);
+
+    // Build output data on CPU
+    Tensor outputCpu =
+        make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cpuRuntime);
+    outputCpu->dataMalloc();
+
+    // Build output data on GPU
+    Tensor outputGpu =
+        make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime);
+    outputGpu->dataMalloc();
+
+    // get workspace
+    size_t workspaceSize = 0;
+    checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
+        cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc,
+        outDesc, convAlgo, &workspaceSize));
+
+    CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize);
+
+    time_op += timeit(
+        [&]() {
+            cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha,
+                                    inDesc, inputGpu->getRawDataPtr<void *>(),
+                                    knDesc, kernelGpu->getRawDataPtr<void *>(),
+                                    convDesc, convAlgo, workspace,
+                                    workspaceSize, &beta,
+                                    outDesc, outputGpu->getRawDataPtr<void *>());
+        },
+        [&]() { cudaRuntime->sync(); },
+        warmupRounds, timingRounds
+    );
+
+    checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
+    checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
+    // checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
+    checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
+    checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
+
+    time_memcpy_dtoh += timeit(
+        [&]() {
+            outputCpu = outputGpu->clone(cpuRuntime);
+        },
+        [&]() { cudaRuntime->sync(); },
+        warmupRounds, timingRounds
+    );
+
+    // Print Results
+    printf("Operator - Convolution:\n");
+    printf("Conv Algo: %s\n", algo_name[convAlgo]);
+    printf("Conv Mode: %s\n", mode_name[convMode]);
+    printf("Input shape: (%d, %d, %d, %d)\n",
+           INPUT_SHAPE[0], INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]);
+    printf("Kernel shape: (%d, %d, %d, %d)\n",
+           KERNEL_SHAPE[0], KERNEL_SHAPE[1], KERNEL_SHAPE[2], KERNEL_SHAPE[3]);
+    printf("Output shape: (%d, %d, %d, %d)\n",
+           OUTPUT_SHAPE[0], OUTPUT_SHAPE[1], OUTPUT_SHAPE[2], OUTPUT_SHAPE[3]);
+    printf("Workspace size: %zu Bytes, dtype: %s\n",
+           workspaceSize, dtype.toString().c_str());
+
+    printf("TFlops: %.5lf tflops\n",
+           2.0 * INPUT_BATCH_SIZE * channelsPerGrp * outh * outw *
+               OUTPUT_CHANNELS * KERNEL_HEIGHT * KERNEL_WIDTH /
+               VERTICAL_STRIDE / HORIZONTAL_STRIDE / 1e9 / time_op);
+    printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n",
+           time_memcpy_htod, time_memcpy_dtoh);
+    printf("Memcpy throughput: h2d - %.6lf MB/ms, d2h: %.6lf MB/ms\n",
+           (inputSizeInBytes + kernelSizeInBytes) / M / time_memcpy_htod,
+           outputSizeInBytes / M / time_memcpy_dtoh);
+    printf("Operation: %.6lf ms\n", time_op);
 
     return 0;
 }
\ No newline at end of file
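
A possible follow-up, not included in this patch: convAlgo is hard-coded to
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM even though algo_name lists
all eight forward algorithms. A minimal sketch of letting cuDNN rank the
algorithms for the same descriptors (inDesc, knDesc, convDesc, outDesc) is
shown below; it assumes cuDNN 7 or newer for
cudnnGetConvolutionForwardAlgorithm_v7, and requested/returned/perf are
illustrative names only.

    // Sketch (assumes cuDNN >= 7): ask cuDNN's heuristics to rank the
    // forward algorithms for the descriptors built in the benchmark above.
    int requested = CUDNN_CONVOLUTION_FWD_ALGO_COUNT, returned = 0;
    cudnnConvolutionFwdAlgoPerf_t perf[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    checkCudnnError(cudnnGetConvolutionForwardAlgorithm_v7(
        cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
        requested, &returned, perf));
    for (int i = 0; i < returned; ++i) {
        if (perf[i].status == CUDNN_STATUS_SUCCESS) {
            // perf[i].algo could replace the hard-coded convAlgo;
            // perf[i].memory is the workspace that algorithm would need.
            printf("rank %d: %s (workspace %zu bytes)\n", i,
                   algo_name[perf[i].algo], perf[i].memory);
        }
    }

Running this before the timed cudnnConvolutionForward loop and switching
convAlgo to perf[0].algo would benchmark the algorithm cuDNN itself recommends
for this shape.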