diff --git a/CMakeLists.txt b/CMakeLists.txt index d2993c04..7c737313 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF) cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF) set(DEFAULT_BUILD_TYPE "RelWithDebInfo") +# Build Type +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message("Configuring for Debug build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") + add_compile_definitions(DEBUG_MODE) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + message("Configuring for Release build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + add_compile_definitions(NDEBUG) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + message("Configuring for RelWithDebInfo build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") +else() + message("Build type not specified. Configuring for RelWithDebInfo build.") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") +endif() + if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake) message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory") diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc index 23f58ced..a8051a91 100644 --- a/src/cuda/cuda_runtime.cc +++ b/src/cuda/cuda_runtime.cc @@ -4,6 +4,19 @@ #include "core/runtime.h" #include "operators/conv.h" #include "operators/matmul.h" + +#ifdef DEBUG_MODE +void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) { + cudaError_t kernelError = cudaGetLastError(); + if (kernelError != cudaSuccess) { + std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError) + << std::endl + << "Failed Operator: " << op->toString() << std::endl; + exit(EXIT_FAILURE); + } +} +#endif + namespace infini { void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { @@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { } else { kernel->compute(op, this); } + +#ifdef DEBUG_MODE + CHECK_CUDA_KERNEL_ERROR(op); +#endif } } @@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const { opTime[op->getOpType()] += t; opCnt[op->getOpType()]++; } + +#ifdef DEBUG_MODE + CHECK_CUDA_KERNEL_ERROR(op); +#endif } } diff --git a/src/kernels/cuda/clip.cu b/src/kernels/cuda/clip.cu index eabc4926..7d3e97bd 100644 --- a/src/kernels/cuda/clip.cu +++ b/src/kernels/cuda/clip.cu @@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue, float maxValue) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _clip_kernel<<>>(input, output, num, minValue, + _clip_kernel<<>>(input, output, num, minValue, maxValue); } diff --git a/src/kernels/cuda/element_wise.cu b/src/kernels/cuda/element_wise.cu index b28f0144..93e384d3 100644 --- a/src/kernels/cuda/element_wise.cu +++ b/src/kernels/cuda/element_wise.cu @@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; } constexpr int thread_work_size() { return 4; } constexpr int block_work_size() { return thread_work_size() * num_threads(); } -__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3, - int b0, int b1, int b2, int b3, - int c0, int c1, int c2, int c3) { +__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, + int a2, int a3, int b0, int b1, int b2, int b3, + int c0, int c1, int c2, int c3) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; int n = c0 * c1 * c2 * c3; for (int i = index; i < n; i += stride) { - int c0_index = i/ (c1 * c2 * c3); + int c0_index = i / (c1 * c2 * c3); int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3); int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3; int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3; @@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2 int b1_index = c1_index % b1; int b2_index = c2_index % b2; int b3_index = c3_index % b3; - z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]; + z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 + + a3_index] / + y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 + + b3_index]; } } -__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3, - int b0, int b1, int b2, int b3, - int c0, int c1, int c2, int c3) { +__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, + int a2, int a3, int b0, int b1, int b2, int b3, + int c0, int c1, int c2, int c3) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; int n = c0 * c1 * c2 * c3; for (int i = index; i < n; i += stride) { - int c0_index = i/ (c1 * c2 * c3); + int c0_index = i / (c1 * c2 * c3); int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3); int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3; int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3; @@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2 int b1_index = c1_index % b1; int b2_index = c2_index % b2; int b3_index = c3_index % b3; - z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]); + z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + + a2_index * a3 + a3_index], + y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + + b2_index * b3 + b3_index]); } } namespace infini { void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, - int b0, int b1, int b2, int b3, - int c0, int c1, int c2, int c3) { + int b0, int b1, int b2, int b3, int c0, int c1, int c2, + int c3) { int blocksize = block_work_size(); - int num = c0*c1*c2*c3; + int num = c0 * c1 * c2 * c3; int gridsize = (num + block_work_size() - 1) / block_work_size(); - _div_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); + _div_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, + b3, c0, c1, c2, c3); } void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, - int b0, int b1, int b2, int b3, - int c0, int c1, int c2, int c3) { + int b0, int b1, int b2, int b3, int c0, int c1, int c2, + int c3) { int blocksize = block_work_size(); - int num = c0*c1*c2*c3; + int num = c0 * c1 * c2 * c3; int gridsize = (num + block_work_size() - 1) / block_work_size(); - _pow_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); + _pow_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, + b3, c0, c1, c2, c3); } }; // namespace infini diff --git a/src/kernels/cuda/extend.cu b/src/kernels/cuda/extend.cu index 03345e96..f6879105 100644 --- a/src/kernels/cuda/extend.cu +++ b/src/kernels/cuda/extend.cu @@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter, int oSize) { int blocksize = 32 * 16; int gridsize = (oSize + blocksize - 1) / blocksize; - _extend_kernel<<>>(in, out, blockSize, blockSizeOuter, + _extend_kernel<<>>(in, out, blockSize, blockSizeOuter, oSize); } } // namespace infini diff --git a/src/kernels/cuda/unary.cu b/src/kernels/cuda/unary.cu index 5a1fd272..b79bd53f 100644 --- a/src/kernels/cuda/unary.cu +++ b/src/kernels/cuda/unary.cu @@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _softmax_kernel1<<<1, 1>>>(input, output, num); - _softmax_kernel2<<>>(input, output, num); + _softmax_kernel2<<>>(input, output, num); } void relu_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _relu_kernel<<>>(input, output, num); + _relu_kernel<<>>(input, output, num); } void sigmoid_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _sigmoid_kernel<<>>(input, output, num); + _sigmoid_kernel<<>>(input, output, num); } void tanh_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _tanh_kernel<<>>(input, output, num); + _tanh_kernel<<>>(input, output, num); } void abs_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _abs_kernel<<>>(input, output, num); + _abs_kernel<<>>(input, output, num); } void sqrt_kernel(float *input, float *output, int num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); - _sqrt_kernel<<>>(input, output, num); + _sqrt_kernel<<>>(input, output, num); } }; // namespace infini