Add comments in conv benchmark

This commit is contained in:
bolun 2023-07-20 16:57:27 +08:00
parent 97970c5d94
commit 608f997042
1 changed files with 18 additions and 8 deletions

View File

@ -31,11 +31,12 @@ const char mode_name[2][50] = {
}; };
int main() { int main() {
// Benchmark Settings
int warmupRounds = 50; int warmupRounds = 50;
int timingRounds = 100; int timingRounds = 100;
DataType dtype = DataType::Float32; DataType dtype = DataType::Float32;
// cudnn Conv Configurations
cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION; cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
float alpha = 1.f, beta = 0.f; float alpha = 1.f, beta = 0.f;
@ -63,19 +64,21 @@ int main() {
int DILATION_HEIGHT = 1; int DILATION_HEIGHT = 1;
int DILATION_WIDTH = 1; int DILATION_WIDTH = 1;
// Get input size
size_t inputSize = 1; size_t inputSize = 1;
for (auto dim: INPUT_SHAPE) { for (auto dim: INPUT_SHAPE) {
inputSize *= dim; inputSize *= dim;
} }
size_t inputSizeInBytes = inputSize * sizeof(dtype); size_t inputSizeInBytes = inputSize * sizeof(dtype);
// Get kernel size
size_t kernelSize = 1; size_t kernelSize = 1;
for (auto dim: KERNEL_SHAPE) { for (auto dim: KERNEL_SHAPE) {
kernelSize *= dim; kernelSize *= dim;
} }
size_t kernelSizeInBytes = kernelSize * sizeof(dtype); size_t kernelSizeInBytes = kernelSize * sizeof(dtype);
// Init time // Init time variables
double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0; double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
double time_op = 0.0; double time_op = 0.0;
@ -83,7 +86,7 @@ int main() {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>(); auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU // Build input data and kernel on CPU
Tensor inputCpu = Tensor inputCpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime); make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
inputCpu->dataMalloc(); inputCpu->dataMalloc();
@ -94,7 +97,7 @@ int main() {
kernelCpu->dataMalloc(); kernelCpu->dataMalloc();
kernelCpu->setData(RandomGenerator()); kernelCpu->setData(RandomGenerator());
// Build input data on GPU // Build input data and kernel on GPU
Tensor inputGpu = Tensor inputGpu =
make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime); make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
inputGpu->dataMalloc(); inputGpu->dataMalloc();
@ -115,26 +118,27 @@ int main() {
int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS; int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS;
// get inputs // Build cudnn descriptors
// input descriptor
cudnnTensorDescriptor_t inDesc; cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor( checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
// get kernels // kernel descriptor
cudnnFilterDescriptor_t knDesc; cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc)); checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT, checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, f, CUDNN_TENSOR_NCHW, f,
channelsPerGrp, r, s)); channelsPerGrp, r, s));
// get bias // bias descriptor
// cudnnTensorDescriptor_t biasDesc; // cudnnTensorDescriptor_t biasDesc;
// checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc)); // checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
// checkCudnnError(cudnnSetTensor4dDescriptor( // checkCudnnError(cudnnSetTensor4dDescriptor(
// biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1)); // biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
// get convlution descriptor // convlution descriptor
cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionDescriptor_t convDesc;
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc)); checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
checkCudnnError(cudnnSetConvolution2dDescriptor( checkCudnnError(cudnnSetConvolution2dDescriptor(
@ -144,16 +148,19 @@ int main() {
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS)); checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS));
} }
// Get output shape
int outn, outc, outh, outw; int outn, outc, outh, outw;
checkCudnnError(cudnnGetConvolution2dForwardOutputDim( checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw)); convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
// Build output descriptor
cudnnTensorDescriptor_t outDesc; cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outn, outc, CUDNN_DATA_FLOAT, outn, outc,
outh, outw)); outh, outw));
// Get output size
Shape OUTPUT_SHAPE = {outn, outc, outh, outw}; Shape OUTPUT_SHAPE = {outn, outc, outh, outw};
size_t outputSize = 1; size_t outputSize = 1;
for (auto dim: OUTPUT_SHAPE) { for (auto dim: OUTPUT_SHAPE) {
@ -171,6 +178,7 @@ int main() {
make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime); make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime);
outputGpu->dataMalloc(); outputGpu->dataMalloc();
// Get workspace size
size_t workspaceSize = 0; size_t workspaceSize = 0;
checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize( checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc, cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc,
@ -178,6 +186,7 @@ int main() {
CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize); CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize);
// Do forward
time_op += timeit( time_op += timeit(
[&]() { [&]() {
cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha, cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha,
@ -197,6 +206,7 @@ int main() {
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc)); checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc)); checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
// Do memcpy device to host
time_memcpy_dtoh += timeit( time_memcpy_dtoh += timeit(
[&]() { [&]() {
outputCpu = outputGpu->clone(cpuRuntime); outputCpu = outputGpu->clone(cpuRuntime);