#include "cuda_utils.h"
|
|
|
|
// Kernel
|
// Elementwise ReLU: tensor_ptr_3[i] = max(tensor_ptr_2[i], 0).
//
// Launch layout (see invoke_func_0): grid = 108 blocks, block = 256 threads
// = 8 warps/block, so 108 * 8 = 864 warps total — hence the loop stride of
// 864 below. Each warp processes one 256-element tile per iteration
// (8 values per lane, 32 lanes), over 65536 tiles.
//
// Preconditions (not checked): both pointers are device pointers to at
// least 65536 * 256 floats.
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    // Global warp index: one warp owns one tile per loop iteration.
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[8];
    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
        // Tile base offsets. Source and destination use the same layout
        // (tile loop_idx starts at element loop_idx * 256).
        int offset_src = 0;
        int tmp_offset_src = loop_idx;
        offset_src += tmp_offset_src % 65536 * 256;
        tmp_offset_src /= 65536;
        int offset_dst = 0;
        int tmp_offset_dst = loop_idx;
        offset_dst += tmp_offset_dst % 65536 * 256;
        tmp_offset_dst /= 65536;
        // Load 8 values per lane; consecutive lanes read consecutive
        // addresses, so each access is coalesced.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            // BUG FIX: original read `offset`, which is never declared;
            // the source tile base is offset_src.
            buf[inst_idx] = tensor_ptr_2[offset_src + inst_idx * 32 + lane_id];
        }
        // ReLU in registers (branchless select, no divergence).
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
        }
        // Store the tile back, coalesced, to the destination base.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            // BUG FIX: destination tile base is offset_dst, not `offset`.
            tensor_ptr_3[offset_dst + inst_idx * 32 + lane_id] = buf[inst_idx];
        }
    }
}
// Kernel
|
|
|
|
// Elementwise add: tensor_ptr_4[i] = tensor_ptr_2[i] + tensor_ptr_3[i].
//
// Launch layout (see invoke_func_1): grid = 108 blocks, block = 256 threads
// = 8 warps/block, so 864 warps total — hence the loop stride of 864. Each
// warp processes one 256-element tile per iteration (8 values per lane,
// 32 lanes), over 65536 tiles.
//
// Register buffer layout: buf[0..7] = lhs, buf[8..15] = rhs,
// buf[16..23] = sum.
//
// Preconditions (not checked): all three pointers are device pointers to
// at least 65536 * 256 floats.
__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
                              float *tensor_ptr_4) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    // Global warp index: one warp owns one tile per loop iteration.
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
        // Tile base offsets. Source and destination use the same layout
        // (tile loop_idx starts at element loop_idx * 256).
        int offset_src = 0;
        int tmp_offset_src = loop_idx;
        offset_src += tmp_offset_src % 65536 * 256;
        tmp_offset_src /= 65536;
        int offset_dst = 0;
        int tmp_offset_dst = loop_idx;
        offset_dst += tmp_offset_dst % 65536 * 256;
        tmp_offset_dst /= 65536;
        // Load lhs tile, coalesced (consecutive lanes, consecutive addresses).
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            // BUG FIX: original read `offset`, which is never declared;
            // the source tile base is offset_src.
            buf[inst_idx] = tensor_ptr_2[offset_src + inst_idx * 32 + lane_id];
        }
        // Load rhs tile into the second register bank.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_3[offset_src + inst_idx * 32 + lane_id];
        }
        // Add in registers.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            // BUG FIX: original line was missing its terminating semicolon.
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
        // Store the sum tile, coalesced, to the destination base.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            // BUG FIX: destination tile base is offset_dst, not `offset`.
            tensor_ptr_4[offset_dst + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
|
|
|
|
// Host launcher for kernel_func_0 (elementwise ReLU, dst = max(src, 0)).
//
// src and dst must be device pointers to at least 65536 * 256 floats.
// Launch config (108 blocks x 256 threads = 864 warps) matches the
// hard-coded stride of 864 inside the kernel; change them together.
// Launches on the default stream; cudaCheckError() (cuda_utils.h)
// surfaces launch failures.
void invoke_func_0(float *src, float *dst) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    // BUG FIX: original launched `kernel_func`, which does not exist;
    // the ReLU kernel is kernel_func_0.
    kernel_func_0<<<gridDim, blockDim>>>(src, dst);
    cudaCheckError();
}
|
|
|
|
// Host launcher for kernel_func_1 (elementwise add, dst = src0 + src1).
//
// All three arguments must be device pointers to at least 65536 * 256
// floats. Launch config (108 blocks x 256 threads = 864 warps) matches
// the hard-coded stride of 864 inside the kernel; change them together.
// Launches on the default stream; cudaCheckError() (cuda_utils.h)
// surfaces launch failures.
//
// BUG FIX: the original took only (src, dst) and launched a nonexistent
// `kernel_func` with two arguments, but kernel_func_1 requires two inputs
// and one output. The signature is extended with the second input; the
// original never compiled, so no working caller is broken.
void invoke_func_1(float *src0, float *src1, float *dst) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_1<<<gridDim, blockDim>>>(src0, src1, dst);
    cudaCheckError();
}
|