From 608f997042634bd8d3c7d7f9afccf454678d9825 Mon Sep 17 00:00:00 2001 From: bolun Date: Thu, 20 Jul 2023 16:57:27 +0800 Subject: [PATCH] Add comments in conv benchmark --- benchmark/kernels/cuda/conv.cc | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/benchmark/kernels/cuda/conv.cc b/benchmark/kernels/cuda/conv.cc index 14a9851e..cecff7d3 100644 --- a/benchmark/kernels/cuda/conv.cc +++ b/benchmark/kernels/cuda/conv.cc @@ -31,11 +31,12 @@ const char mode_name[2][50] = { }; int main() { - + // Benchmark Settings int warmupRounds = 50; int timingRounds = 100; DataType dtype = DataType::Float32; + // cudnn Conv Configurations cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION; cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; float alpha = 1.f, beta = 0.f; @@ -63,19 +64,21 @@ int main() { int DILATION_HEIGHT = 1; int DILATION_WIDTH = 1; + // Get input size size_t inputSize = 1; for (auto dim: INPUT_SHAPE) { inputSize *= dim; } size_t inputSizeInBytes = inputSize * sizeof(dtype); + // Get kernel size size_t kernelSize = 1; for (auto dim: KERNEL_SHAPE) { kernelSize *= dim; } size_t kernelSizeInBytes = kernelSize * sizeof(dtype); - // Init time + // Init time variables double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0; double time_op = 0.0; @@ -83,7 +86,7 @@ int main() { Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); auto cudaRuntime = make_ref(); - // Build input data on CPU + // Build input data and kernel on CPU Tensor inputCpu = make_ref(INPUT_SHAPE, dtype, cpuRuntime); inputCpu->dataMalloc(); @@ -94,7 +97,7 @@ int main() { kernelCpu->dataMalloc(); kernelCpu->setData(RandomGenerator()); - // Build input data on GPU + // Build input data and kernel on GPU Tensor inputGpu = make_ref(INPUT_SHAPE, dtype, cudaRuntime); inputGpu->dataMalloc(); @@ -115,26 +118,27 @@ int main() { int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS; - // get inputs + // Build cudnn descriptors + // input descriptor cudnnTensorDescriptor_t inDesc; checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); checkCudnnError(cudnnSetTensor4dDescriptor( inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); - // get kernels + // kernel descriptor cudnnFilterDescriptor_t knDesc; checkCudnnError(cudnnCreateFilterDescriptor(&knDesc)); checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, f, channelsPerGrp, r, s)); - // get bias + // bias descriptor // cudnnTensorDescriptor_t biasDesc; // checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc)); // checkCudnnError(cudnnSetTensor4dDescriptor( // biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1)); - // get convlution descriptor + // convlution descriptor cudnnConvolutionDescriptor_t convDesc; checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc)); checkCudnnError(cudnnSetConvolution2dDescriptor( @@ -144,16 +148,19 @@ int main() { checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS)); } + // Get output shape int outn, outc, outh, outw; checkCudnnError(cudnnGetConvolution2dForwardOutputDim( convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw)); + // Build output descriptor cudnnTensorDescriptor_t outDesc; checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outn, outc, outh, outw)); + // Get output size Shape OUTPUT_SHAPE = {outn, outc, outh, outw}; size_t outputSize = 1; for (auto dim: OUTPUT_SHAPE) { @@ -171,6 +178,7 @@ int main() { make_ref(OUTPUT_SHAPE, dtype, cudaRuntime); outputGpu->dataMalloc(); + // Get workspace size size_t workspaceSize = 0; checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize( cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc, @@ -178,6 +186,7 @@ int main() { CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize); + // Do forward time_op += timeit( [&]() { cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha, @@ -197,6 +206,7 @@ int main() { checkCudnnError(cudnnDestroyFilterDescriptor(knDesc)); checkCudnnError(cudnnDestroyTensorDescriptor(inDesc)); + // Do memcpy device to host time_memcpy_dtoh += timeit( [&]() { outputCpu = outputGpu->clone(cpuRuntime);