forked from jiuyuan/InfiniTensor

Add comments in conv benchmark

commit 608f997042 (parent 97970c5d94)
@@ -31,11 +31,12 @@ const char mode_name[2][50] = {
 };
 
 int main() {
+    // Benchmark Settings
     int warmupRounds = 50;
     int timingRounds = 100;
     DataType dtype = DataType::Float32;
 
+    // cudnn Conv Configurations
     cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
     cudnnConvolutionFwdAlgo_t convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     float alpha = 1.f, beta = 0.f;
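Note: warmupRounds and timingRounds feed the timeit() calls later in the benchmark. As a rough illustration of that pattern only (a sketch, not the project's timeit implementation), a warmup-then-measure helper built on CUDA events could look like:

#include <cuda_runtime.h>
#include <functional>

// Sketch only: run `func` warmupRounds times untimed, then report the
// average device time of timingRounds executions using CUDA events.
double timeitSketch(const std::function<void()> &func, int warmupRounds,
                    int timingRounds) {
    for (int i = 0; i < warmupRounds; ++i)
        func(); // warm up clocks, caches, and lazy initialization
    cudaEvent_t begin, end;
    cudaEventCreate(&begin);
    cudaEventCreate(&end);
    cudaEventRecord(begin);
    for (int i = 0; i < timingRounds; ++i)
        func();
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, begin, end); // elapsed time in milliseconds
    cudaEventDestroy(begin);
    cudaEventDestroy(end);
    return ms / timingRounds; // average milliseconds per round
}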
@@ -63,19 +64,21 @@ int main() {
     int DILATION_HEIGHT = 1;
     int DILATION_WIDTH = 1;
 
+    // Get input size
     size_t inputSize = 1;
     for (auto dim: INPUT_SHAPE) {
         inputSize *= dim;
     }
     size_t inputSizeInBytes = inputSize * sizeof(dtype);
 
+    // Get kernel size
     size_t kernelSize = 1;
     for (auto dim: KERNEL_SHAPE) {
         kernelSize *= dim;
     }
     size_t kernelSizeInBytes = kernelSize * sizeof(dtype);
 
-    // Init time
+    // Init time variables
     double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
     double time_op = 0.0;
 
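The element-count loops above simply multiply every dimension of the shape. For illustration only (the shape values below are placeholders, not the benchmark's INPUT_SHAPE), the same computation can be written with std::accumulate:

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Illustration only: product of all dimensions, equivalent to the loops above.
std::vector<int> shape = {1, 512, 14, 14};            // e.g. an NCHW shape
size_t elemCount = std::accumulate(shape.begin(), shape.end(),
                                   size_t{1}, std::multiplies<size_t>());
size_t sizeInBytes = elemCount * sizeof(float);       // Float32 elements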
@@ -83,7 +86,7 @@ int main() {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto cudaRuntime = make_ref<CudaRuntimeObj>();
 
-    // Build input data on CPU
+    // Build input data and kernel on CPU
     Tensor inputCpu =
         make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
     inputCpu->dataMalloc();
@@ -94,7 +97,7 @@ int main() {
     kernelCpu->dataMalloc();
     kernelCpu->setData(RandomGenerator());
 
-    // Build input data on GPU
+    // Build input data and kernel on GPU
     Tensor inputGpu =
         make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
     inputGpu->dataMalloc();
@@ -115,26 +118,27 @@ int main() {
 
     int channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS;
 
-    // get inputs
+    // Build cudnn descriptors
+    // input descriptor
     cudnnTensorDescriptor_t inDesc;
     checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
     checkCudnnError(cudnnSetTensor4dDescriptor(
         inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
 
-    // get kernels
+    // kernel descriptor
     cudnnFilterDescriptor_t knDesc;
     checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
     checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                CUDNN_TENSOR_NCHW, f,
                                                channelsPerGrp, r, s));
 
-    // get bias
+    // bias descriptor
     // cudnnTensorDescriptor_t biasDesc;
     // checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
     // checkCudnnError(cudnnSetTensor4dDescriptor(
     //     biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
 
-    // get convlution descriptor
+    // convolution descriptor
     cudnnConvolutionDescriptor_t convDesc;
     checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
     checkCudnnError(cudnnSetConvolution2dDescriptor(
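For a grouped convolution, cuDNN expects the filter descriptor's channel dimension to be the per-group channel count, which is why channelsPerGrp = INPUT_CHANNELS / NUM_GROUPS is passed to cudnnSetFilter4dDescriptor above. The cudnnSetConvolution2dDescriptor call is truncated by this hunk; for reference, its cuDNN parameter order is:

// cuDNN API reference (pad, stride, dilation, mode, compute type):
cudnnStatus_t cudnnSetConvolution2dDescriptor(
    cudnnConvolutionDescriptor_t convDesc,
    int pad_h, int pad_w,           // zero-padding height / width
    int u, int v,                   // vertical / horizontal filter stride
    int dilation_h, int dilation_w, // DILATION_HEIGHT / DILATION_WIDTH above
    cudnnConvolutionMode_t mode,    // convMode (CUDNN_CROSS_CORRELATION) above
    cudnnDataType_t computeType);   // e.g. CUDNN_DATA_FLOAT, matching the descriptors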
|
@ -144,16 +148,19 @@ int main() {
|
||||||
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS));
|
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, NUM_GROUPS));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get output shape
|
||||||
int outn, outc, outh, outw;
|
int outn, outc, outh, outw;
|
||||||
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
|
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
|
||||||
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
|
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
|
||||||
|
|
||||||
|
// Build output descriptor
|
||||||
cudnnTensorDescriptor_t outDesc;
|
cudnnTensorDescriptor_t outDesc;
|
||||||
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
|
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
|
||||||
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
|
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
|
||||||
CUDNN_DATA_FLOAT, outn, outc,
|
CUDNN_DATA_FLOAT, outn, outc,
|
||||||
outh, outw));
|
outh, outw));
|
||||||
|
|
||||||
|
// Get output size
|
||||||
Shape OUTPUT_SHAPE = {outn, outc, outh, outw};
|
Shape OUTPUT_SHAPE = {outn, outc, outh, outw};
|
||||||
size_t outputSize = 1;
|
size_t outputSize = 1;
|
||||||
for (auto dim: OUTPUT_SHAPE) {
|
for (auto dim: OUTPUT_SHAPE) {
|
||||||
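cudnnGetConvolution2dForwardOutputDim applies the standard convolution output-size formula per spatial dimension; written out (the numbers below are an example, not the benchmark's configuration):

// outDim = 1 + (inDim + 2*pad - ((kernelDim - 1)*dilation + 1)) / stride
// Example: inDim = 224, pad = 1, kernelDim = 3, dilation = 1, stride = 1
//          => outDim = 1 + (224 + 2 - 3) / 1 = 224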
|
@ -171,6 +178,7 @@ int main() {
|
||||||
make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime);
|
make_ref<TensorObj>(OUTPUT_SHAPE, dtype, cudaRuntime);
|
||||||
outputGpu->dataMalloc();
|
outputGpu->dataMalloc();
|
||||||
|
|
||||||
|
// Get workspace size
|
||||||
size_t workspaceSize = 0;
|
size_t workspaceSize = 0;
|
||||||
checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
|
checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
|
||||||
cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc,
|
cudaRuntime->cudnnHandle(), inDesc, knDesc, convDesc,
|
||||||
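The workspace query is cut off by the hunk (presumably followed by outDesc, convAlgo, and &workspaceSize). For reference, the full cuDNN signature is:

cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(
    cudnnHandle_t handle,
    const cudnnTensorDescriptor_t xDesc,          // input (inDesc)
    const cudnnFilterDescriptor_t wDesc,          // kernel (knDesc)
    const cudnnConvolutionDescriptor_t convDesc,
    const cudnnTensorDescriptor_t yDesc,          // output (outDesc)
    cudnnConvolutionFwdAlgo_t algo,               // convAlgo above
    size_t *sizeInBytes);                         // written into workspaceSize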
|
@ -178,6 +186,7 @@ int main() {
|
||||||
|
|
||||||
CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize);
|
CudaPtr workspace = cudaRuntime->getWorkspace(workspaceSize);
|
||||||
|
|
||||||
|
// Do forward
|
||||||
time_op += timeit(
|
time_op += timeit(
|
||||||
[&]() {
|
[&]() {
|
||||||
cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha,
|
cudnnConvolutionForward(cudaRuntime->cudnnHandle(), &alpha,
|
||||||
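The cudnnConvolutionForward call inside the timed lambda is also truncated here; its full argument order in the cuDNN API is:

cudnnStatus_t cudnnConvolutionForward(
    cudnnHandle_t handle,
    const void *alpha,                                    // scaling factor, 1.f above
    const cudnnTensorDescriptor_t xDesc, const void *x,   // input descriptor + data
    const cudnnFilterDescriptor_t wDesc, const void *w,   // kernel descriptor + data
    const cudnnConvolutionDescriptor_t convDesc,
    cudnnConvolutionFwdAlgo_t algo,                       // convAlgo (IMPLICIT_PRECOMP_GEMM)
    void *workSpace, size_t workSpaceSizeInBytes,         // workspace + its size
    const void *beta,                                     // 0.f above
    const cudnnTensorDescriptor_t yDesc, void *y);        // output descriptor + data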
|
@ -197,6 +206,7 @@ int main() {
|
||||||
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
||||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||||
|
|
||||||
|
// Do memcpy device to host
|
||||||
time_memcpy_dtoh += timeit(
|
time_memcpy_dtoh += timeit(
|
||||||
[&]() {
|
[&]() {
|
||||||
outputCpu = outputGpu->clone(cpuRuntime);
|
outputCpu = outputGpu->clone(cpuRuntime);
|
||||||
|
|