forked from jiuyuan/InfiniTensor
Fix kernel arguments, add debug mode
This commit is contained in:
parent
1e91979c76
commit
0ce7e7651f
|
@ -15,6 +15,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
|
|||
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
|
||||
|
||||
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
|
||||
# Build Type
|
||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
message("Configuring for Debug build.")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
|
||||
add_compile_definitions(DEBUG_MODE)
|
||||
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||
message("Configuring for Release build.")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
|
||||
add_compile_definitions(NDEBUG)
|
||||
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
||||
message("Configuring for RelWithDebInfo build.")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
|
||||
else()
|
||||
message("Build type not specified. Configuring for RelWithDebInfo build.")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
|
||||
endif()
|
||||
|
||||
|
||||
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
|
||||
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
|
||||
|
|
|
@ -4,6 +4,19 @@
|
|||
#include "core/runtime.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/matmul.h"
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
|
||||
cudaError_t kernelError = cudaGetLastError();
|
||||
if (kernelError != cudaSuccess) {
|
||||
std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
|
||||
<< std::endl
|
||||
<< "Failed Operator: " << op->toString() << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||
|
@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
|||
} else {
|
||||
kernel->compute(op, this);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
CHECK_CUDA_KERNEL_ERROR(op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
|||
opTime[op->getOpType()] += t;
|
||||
opCnt[op->getOpType()]++;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
CHECK_CUDA_KERNEL_ERROR(op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
|
|||
float maxValue) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
|
||||
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
|
||||
maxValue);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
|
|||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c0_index = i / (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
|
|||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
|
||||
z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
|
||||
a3_index] /
|
||||
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
|
||||
b3_index];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c0_index = i / (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
|
|||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
|
||||
z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
|
||||
a2_index * a3 + a3_index],
|
||||
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
|
||||
b2_index * b3 + b3_index]);
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int num = c0*c1*c2*c3;
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
|
||||
b3, c0, c1, c2, c3);
|
||||
}
|
||||
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
int blocksize = block_work_size();
|
||||
int num = c0*c1*c2*c3;
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
|
||||
b3, c0, c1, c2, c3);
|
||||
}
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
|
|||
int oSize) {
|
||||
int blocksize = 32 * 16;
|
||||
int gridsize = (oSize + blocksize - 1) / blocksize;
|
||||
_extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter,
|
||||
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
|
||||
oSize);
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
|
|||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_softmax_kernel1<<<1, 1>>>(input, output, num);
|
||||
_softmax_kernel2<<<blocksize, gridsize>>>(input, output, num);
|
||||
_softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void relu_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_relu_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_relu_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void sigmoid_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void tanh_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_tanh_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void abs_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_abs_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_abs_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void sqrt_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_sqrt_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
}; // namespace infini
|
||||
|
|
Loading…
Reference in New Issue