Fix kernel arguments, add debug mode (#119)

Add debug mode macro in cmakelist.
This commit is contained in:
PanZezhong1725 2023-08-28 08:58:38 +08:00 committed by GitHub
commit 69fd251e5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 72 additions and 26 deletions

View File

@ -15,6 +15,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Build Type
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message("Configuring for Debug build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
add_compile_definitions(DEBUG_MODE)
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
message("Configuring for Release build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
add_compile_definitions(NDEBUG)
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
message("Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
else()
message("Build type not specified. Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
endif()
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")

View File

@ -4,6 +4,19 @@
#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef DEBUG_MODE
void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
cudaError_t kernelError = cudaGetLastError();
if (kernelError != cudaSuccess) {
std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
<< std::endl
<< "Failed Operator: " << op->toString() << std::endl;
exit(EXIT_FAILURE);
}
}
#endif
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
} else {
kernel->compute(op, this);
}
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
}
}
@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
opTime[op->getOpType()] += t;
opCnt[op->getOpType()]++;
}
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
}
}

View File

@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
float maxValue) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
maxValue);
}

View File

@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3);
int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1;
int b2_index = c2_index % b2;
int b3_index = c3_index % b3;
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
a3_index] /
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
b3_index];
}
}
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3);
int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1;
int b2_index = c2_index % b2;
int b3_index = c3_index % b3;
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
a2_index * a3 + a3_index],
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
b2_index * b3 + b3_index]);
}
}
namespace infini {
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
int blocksize = block_work_size();
int num = c0*c1*c2*c3;
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
}
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
int blocksize = block_work_size();
int num = c0*c1*c2*c3;
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
}
}; // namespace infini

View File

@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
int oSize) {
int blocksize = 32 * 16;
int gridsize = (oSize + blocksize - 1) / blocksize;
_extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter,
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
oSize);
}
} // namespace infini

View File

@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_softmax_kernel1<<<1, 1>>>(input, output, num);
_softmax_kernel2<<<blocksize, gridsize>>>(input, output, num);
_softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
}
void relu_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_relu_kernel<<<blocksize, gridsize>>>(input, output, num);
_relu_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sigmoid_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num);
_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void tanh_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_tanh_kernel<<<blocksize, gridsize>>>(input, output, num);
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void abs_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_abs_kernel<<<blocksize, gridsize>>>(input, output, num);
_abs_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sqrt_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<blocksize, gridsize>>>(input, output, num);
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
}
}; // namespace infini