diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2993c04..7c737313 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
 
 set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
+# Build Type
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    message("Configuring for Debug build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
+    add_compile_definitions(DEBUG_MODE)
+elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    message("Configuring for Release build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 
+    add_compile_definitions(NDEBUG)
+elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    message("Configuring for RelWithDebInfo build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") 
+else()
+    message("Build type not specified. Configuring for RelWithDebInfo build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") 
+endif()
+
 
 if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
   message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
index 23f58ced..a8051a91 100644
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -4,6 +4,19 @@
 #include "core/runtime.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
+
+#ifdef DEBUG_MODE
+void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
+    cudaError_t kernelError = cudaGetLastError();
+    if (kernelError != cudaSuccess) {
+        std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
+                  << std::endl
+                  << "Failed Operator: " << op->toString() << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
+
 namespace infini {
 
 void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
@@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
         } else {
             kernel->compute(op, this);
         }
+
+#ifdef DEBUG_MODE
+        CHECK_CUDA_KERNEL_ERROR(op);
+#endif
     }
 }
 
@@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
             opTime[op->getOpType()] += t;
             opCnt[op->getOpType()]++;
         }
+
+#ifdef DEBUG_MODE
+        CHECK_CUDA_KERNEL_ERROR(op);
+#endif
     }
 }
 
diff --git a/src/kernels/cuda/clip.cu b/src/kernels/cuda/clip.cu
index eabc4926..7d3e97bd 100644
--- a/src/kernels/cuda/clip.cu
+++ b/src/kernels/cuda/clip.cu
@@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
                  float maxValue) {
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
+    _clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
                                           maxValue);
 }
 
diff --git a/src/kernels/cuda/element_wise.cu b/src/kernels/cuda/element_wise.cu
index b28f0144..93e384d3 100644
--- a/src/kernels/cuda/element_wise.cu
+++ b/src/kernels/cuda/element_wise.cu
@@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
 
-__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
-                                                          int b0, int b1, int b2, int b3,
-                                                          int c0, int c1, int c2, int c3) {
+__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
+                            int a2, int a3, int b0, int b1, int b2, int b3,
+                            int c0, int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
     int stride = blockDim.x * gridDim.x;
     int n = c0 * c1 * c2 * c3;
 
     for (int i = index; i < n; i += stride) {
-        int c0_index = i/ (c1 * c2 * c3);
+        int c0_index = i / (c1 * c2 * c3);
         int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
         int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
         int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
         int b1_index = c1_index % b1;
         int b2_index = c2_index % b2;
         int b3_index = c3_index % b3;
-        z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
+        z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
+                 a3_index] /
+               y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
+                 b3_index];
     }
 }
 
-__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
-                                                          int b0, int b1, int b2, int b3,
-                                                          int c0, int c1, int c2, int c3) {
+__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
+                            int a2, int a3, int b0, int b1, int b2, int b3,
+                            int c0, int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
     int stride = blockDim.x * gridDim.x;
     int n = c0 * c1 * c2 * c3;
 
     for (int i = index; i < n; i += stride) {
-        int c0_index = i/ (c1 * c2 * c3);
+        int c0_index = i / (c1 * c2 * c3);
         int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
         int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
         int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
         int b1_index = c1_index % b1;
         int b2_index = c2_index % b2;
         int b3_index = c3_index % b3;
-        z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
+        z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
+                     a2_index * a3 + a3_index],
+                   y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
+                     b2_index * b3 + b3_index]);
     }
 }
 
 namespace infini {
 void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
-                                              int b0, int b1, int b2, int b3,
-                                              int c0, int c1, int c2, int c3) {
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                int c3) {
 
     int blocksize = block_work_size();
-    int num = c0*c1*c2*c3;
+    int num = c0 * c1 * c2 * c3;
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
+    _div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
+                                         b3, c0, c1, c2, c3);
 }
 void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
-                                              int b0, int b1, int b2, int b3,
-                                              int c0, int c1, int c2, int c3) {
+                int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                int c3) {
     int blocksize = block_work_size();
-    int num = c0*c1*c2*c3;
+    int num = c0 * c1 * c2 * c3;
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
+    _pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
+                                         b3, c0, c1, c2, c3);
 }
 
 }; // namespace infini
diff --git a/src/kernels/cuda/extend.cu b/src/kernels/cuda/extend.cu
index 03345e96..f6879105 100644
--- a/src/kernels/cuda/extend.cu
+++ b/src/kernels/cuda/extend.cu
@@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
                    int oSize) {
     int blocksize = 32 * 16;
     int gridsize = (oSize + blocksize - 1) / blocksize;
-    _extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter,
+    _extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
                                             oSize);
 }
 } // namespace infini
diff --git a/src/kernels/cuda/unary.cu b/src/kernels/cuda/unary.cu
index 5a1fd272..b79bd53f 100644
--- a/src/kernels/cuda/unary.cu
+++ b/src/kernels/cuda/unary.cu
@@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
     _softmax_kernel1<<<1, 1>>>(input, output, num);
-    _softmax_kernel2<<<blocksize, gridsize>>>(input, output, num);
+    _softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
 }
 void relu_kernel(float *input, float *output, int num) {
 
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _relu_kernel<<<blocksize, gridsize>>>(input, output, num);
+    _relu_kernel<<<gridsize, blocksize>>>(input, output, num);
 }
 void sigmoid_kernel(float *input, float *output, int num) {
 
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num);
+    _sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
 }
 void tanh_kernel(float *input, float *output, int num) {
 
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _tanh_kernel<<<blocksize, gridsize>>>(input, output, num);
+    _tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
 }
 void abs_kernel(float *input, float *output, int num) {
 
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _abs_kernel<<<blocksize, gridsize>>>(input, output, num);
+    _abs_kernel<<<gridsize, blocksize>>>(input, output, num);
 }
 void sqrt_kernel(float *input, float *output, int num) {
 
     int blocksize = block_work_size();
     int gridsize = (num + block_work_size() - 1) / block_work_size();
-    _sqrt_kernel<<<blocksize, gridsize>>>(input, output, num);
+    _sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
 }
 }; // namespace infini