Fix kernel arguments, add debug mode

This commit is contained in:
panzezhong 2023-08-24 13:39:22 +08:00
parent 1e91979c76
commit 0ce7e7651f
6 changed files with 72 additions and 26 deletions

View File

@ -15,6 +15,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF) cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo") set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Build Type
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message("Configuring for Debug build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
add_compile_definitions(DEBUG_MODE)
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
message("Configuring for Release build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
add_compile_definitions(NDEBUG)
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
message("Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
else()
message("Build type not specified. Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
endif()
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory") message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")

View File

@ -4,6 +4,19 @@
#include "core/runtime.h" #include "core/runtime.h"
#include "operators/conv.h" #include "operators/conv.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#ifdef DEBUG_MODE
void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
cudaError_t kernelError = cudaGetLastError();
if (kernelError != cudaSuccess) {
std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
<< std::endl
<< "Failed Operator: " << op->toString() << std::endl;
exit(EXIT_FAILURE);
}
}
#endif
namespace infini { namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
} else { } else {
kernel->compute(op, this); kernel->compute(op, this);
} }
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
} }
} }
@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
opTime[op->getOpType()] += t; opTime[op->getOpType()] += t;
opCnt[op->getOpType()]++; opCnt[op->getOpType()]++;
} }
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
} }
} }

View File

@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
float maxValue) { float maxValue) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue, _clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
maxValue); maxValue);
} }

View File

@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; } constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); } constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3, __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
int b0, int b1, int b2, int b3, int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) { int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x; int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3; int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) { for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3); int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3); int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3; int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3; int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1; int b1_index = c1_index % b1;
int b2_index = c2_index % b2; int b2_index = c2_index % b2;
int b3_index = c3_index % b3; int b3_index = c3_index % b3;
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]; z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
a3_index] /
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
b3_index];
} }
} }
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3, __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
int b0, int b1, int b2, int b3, int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) { int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x; int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3; int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) { for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3); int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3); int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3; int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3; int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1; int b1_index = c1_index % b1;
int b2_index = c2_index % b2; int b2_index = c2_index % b2;
int b3_index = c3_index % b3; int b3_index = c3_index % b3;
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]); z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
a2_index * a3 + a3_index],
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
b2_index * b3 + b3_index]);
} }
} }
namespace infini { namespace infini {
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c0, int c1, int c2, int c3) { int c3) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int num = c0*c1*c2*c3; int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); _div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
} }
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c0, int c1, int c2, int c3) { int c3) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int num = c0*c1*c2*c3; int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); _pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
} }
}; // namespace infini }; // namespace infini

View File

@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
int oSize) { int oSize) {
int blocksize = 32 * 16; int blocksize = 32 * 16;
int gridsize = (oSize + blocksize - 1) / blocksize; int gridsize = (oSize + blocksize - 1) / blocksize;
_extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter, _extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
oSize); oSize);
} }
} // namespace infini } // namespace infini

View File

@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_softmax_kernel1<<<1, 1>>>(input, output, num); _softmax_kernel1<<<1, 1>>>(input, output, num);
_softmax_kernel2<<<blocksize, gridsize>>>(input, output, num); _softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
} }
void relu_kernel(float *input, float *output, int num) { void relu_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_relu_kernel<<<blocksize, gridsize>>>(input, output, num); _relu_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
void sigmoid_kernel(float *input, float *output, int num) { void sigmoid_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num); _sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
void tanh_kernel(float *input, float *output, int num) { void tanh_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_tanh_kernel<<<blocksize, gridsize>>>(input, output, num); _tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
void abs_kernel(float *input, float *output, int num) { void abs_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_abs_kernel<<<blocksize, gridsize>>>(input, output, num); _abs_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
void sqrt_kernel(float *input, float *output, int num) { void sqrt_kernel(float *input, float *output, int num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<blocksize, gridsize>>>(input, output, num); _sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
} }
}; // namespace infini }; // namespace infini