Compare commits

...

7 Commits

Author SHA1 Message Date
bolun 1c55c74151 Add comments 2023-07-20 14:14:06 +08:00
bolun 7945693131 Add: benchmark for cudnn softmax 2023-07-20 11:03:07 +08:00
bolun fdb2d30868 fix: Makefile 2023-07-19 11:29:15 +08:00
zhangyue207 f532784d4f update CMakeList.txt Makefile for micro benchmark 2023-07-19 11:10:18 +08:00
zhangyue207 454b7651a8 update CMakeList.txt Makefile for micro benchmark 2023-07-19 10:55:51 +08:00
zhangyue207 48322dbf27 Conv Kernel FP16 2023-07-19 10:06:47 +08:00
zhangyue207 523946cb8b add conv_half kernel 2023-07-18 16:55:00 +08:00
4 changed files with 148 additions and 0 deletions

View File

@ -262,3 +262,19 @@ if(BUILD_TEST)
target_link_libraries(nnet_reader InfiniTensor)
endif()
endif()
# build_bench(<files>)
#
# Creates one benchmark executable per source file matched by the glob
# expression(s) given in `files`.
#
# Each executable is named "benchmark_<name>", where <name> is the source
# file name without its directory and extension, and is linked against the
# InfiniTensor target.
#
# Example:
#   build_bench(benchmark/kernels/cuda/*.cc)
function(build_bench files)
  file(GLOB BENCH_SOURCES ${files})
  foreach(bench_source IN LISTS BENCH_SOURCES)
    get_filename_component(bench_name "${bench_source}" NAME_WE)
    set(bench_target "benchmark_${bench_name}")
    add_executable(${bench_target} "${bench_source}")
    # PRIVATE: an executable has no consumers, so nothing needs to propagate;
    # the keyword form also avoids the legacy plain-signature semantics.
    target_link_libraries(${bench_target} PRIVATE InfiniTensor)
  endforeach()
endfunction()
# Register the CUDA kernel benchmarks only when benchmarks are requested
# (BENCH) and the CUDA backend is enabled (USE_CUDA).
if(BENCH AND USE_CUDA)
  build_bench(benchmark/kernels/cuda/*.cc)
endif()

View File

@ -6,12 +6,14 @@ BANG ?= OFF
# Build switches. `?=` assigns only when the variable is not already set, so
# each default can be overridden from the command line or the environment.
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
# Benchmark switch; forwarded to CMake as -DBENCH below.
BENCH ?= ON
# Options accumulated for the CMake invocation.
# NOTE(review): the rule that consumes CMAKE_OPT is outside this view — confirm.
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
CMAKE_OPT += -DUSE_BANG=$(BANG)
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
CMAKE_OPT += -DBUILD_TEST=$(TEST)
CMAKE_OPT += -DBENCH=$(BENCH)
ifeq ($(INTELCPU), ON)
CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp

View File

@ -0,0 +1,126 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/softmax.h"
#include "benchmark.h"
#include <iostream>
#include <cmath>
#include <chrono>
#include <sys/time.h>
using namespace infini;
#define M 1048576
// Micro-benchmark for the cuDNN softmax forward pass.
//
// Times three phases separately with timeit(): the host-to-device copy of the
// input, cudnnSoftmaxForward itself, and the device-to-host copy of the
// output. Results are printed to stdout.
//
// NOTE(review): the printed units assume timeit() returns milliseconds per
// round; its definition is not in this file — confirm.
//
// Returns 0 on completion.
int main() {
    // Benchmark settings
    int warmupRounds = 200;
    int timingRounds = 200;
    Shape INPUT_SHAPE = {16, 3, 128, 128};
    DataType dtype = DataType::Float32;

    // Number of elements in the input tensor
    size_t size = 1;
    for (auto dim : INPUT_SHAPE) {
        size *= dim;
    }
    // Element size must be that of the tensor data, not of the DataType
    // object (which is what sizeof(dtype) measures). The tensors are Float32
    // and the cuDNN descriptors below use CUDNN_DATA_FLOAT, so one element is
    // a float.
    // NOTE(review): if DataType exposes an element-size accessor, prefer it.
    size_t sizeInBytes = size * sizeof(float);

    // Accumulated timings
    double time_memcpy_htod = 0.0, time_memcpy_dtoh = 0.0;
    double time_op = 0.0;

    // Create runtimes
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(RandomGenerator());

    // Build input data on GPU
    Tensor inputGpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cudaRuntime);
    inputGpu->dataMalloc();

    // Time memcpy host to device
    time_memcpy_htod += timeit(
        [&]() { inputGpu = inputCpu->clone(cudaRuntime); },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    // Build output data on GPU
    auto outputGpu = inputGpu->clone(cudaRuntime);

    // Build output data on CPU
    Tensor outputCpu = make_ref<TensorObj>(INPUT_SHAPE, dtype, cpuRuntime);
    outputCpu->dataMalloc();

    // Build cudnn descriptors
    cudnnTensorDescriptor_t inputDesc, outputDesc;

    // input descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

    // output descriptor
    checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
    checkCudnnError(cudnnSetTensor4dDescriptor(
        outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, INPUT_SHAPE[0],
        INPUT_SHAPE[1], INPUT_SHAPE[2], INPUT_SHAPE[3]));

    // cudnn operator settings
    float alpha = 1.0, beta = 0.0;
    cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST;
    cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;

    // Time the forward pass. The status is checked like every other cuDNN
    // call here, so a failing kernel is not silently reported as a timing.
    time_op += timeit(
        [&]() {
            checkCudnnError(cudnnSoftmaxForward(
                cudaRuntime->cudnnHandle(), algo, mode, &alpha, inputDesc,
                inputGpu->getRawDataPtr<void *>(), &beta, outputDesc,
                outputGpu->getRawDataPtr<void *>()));
        },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc));
    checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc));

    // Time memcpy device to host
    time_memcpy_dtoh += timeit(
        [&]() { outputCpu = outputGpu->clone(cpuRuntime); },
        [&]() { cudaRuntime->sync(); }, warmupRounds, timingRounds);

    // Convert to MB in floating point; integer division by M would truncate
    // the size to whole megabytes before computing throughput.
    const double sizeInMB = static_cast<double>(sizeInBytes) / M;

    // Print results
    printf("Operator - Softmax:\n");
    printf("Input shape: (%d, %d, %d, %d)\n", INPUT_SHAPE[0], INPUT_SHAPE[1],
           INPUT_SHAPE[2], INPUT_SHAPE[3]);
    // %zu is the portable conversion for size_t.
    printf("Input size: %zu, dtype: %s, size in bytes: %zu\n", size,
           dtype.toString().c_str(), sizeInBytes);
    printf("TFlops: %.5lf tflops\n", 5.0 * size / 1e9 / time_op);
    printf("Memcpy time: h2d - %.6lf ms, d2h - %.6lf ms\n", time_memcpy_htod,
           time_memcpy_dtoh);
    printf("Memcpy throughput: h2d - %.6lf MB/ms, d2h: %.6lf MB/ms\n",
           sizeInMB / time_memcpy_htod, sizeInMB / time_memcpy_dtoh);
    printf("Operation: %.6lf ms\n", time_op);

    return 0;
}

4
include/benchmark.h Normal file
View File

@ -0,0 +1,4 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include "utils/data_generator.h"