forked from jiuyuan/InfiniTensor
add: codegen for all metaOps.
commit 18b79903ee (parent 2c8bd3729b)

@@ -1,2 +1,3 @@
 rm ./eval_transpose
 nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
+./eval_transpose
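The script builds the eval binary from eval_pfusion/eval_transpose.cu plus the newly generated transpose kernel, then runs it. The harness itself is not part of this diff; a minimal sketch of such a driver, assuming only the generated invoke_func(float *, float *) entry point and the cudaCheckError() helper from cuda_utils.h (the buffer size is a placeholder):

    // Hypothetical driver sketch; the real eval_transpose.cu is not shown here.
    #include "cuda_utils.h"

    void invoke_func(float *src, float *dst); // from ../generated_code/transpose.cu

    int main() {
        const size_t n = 1 << 24; // placeholder element count
        float *src, *dst;
        cudaMalloc(&src, n * sizeof(float));
        cudaMalloc(&dst, n * sizeof(float));
        invoke_func(src, dst); // launches kernel_func<<<gridDim, blockDim>>>
        cudaDeviceSynchronize();
        cudaCheckError();
        cudaFree(src);
        cudaFree(dst);
        return 0;
    }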

@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;

@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 65536;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] = tensor_ptr_2[0 + offset + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {

@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_3[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
+                              float *tensor_ptr_4) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 65536 * 256;
+        tmp_offset_src /= 65536;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 65536 * 256;
+        tmp_offset_dst /= 65536;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] = tensor_ptr_2[0 + offset + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_3[0 + offset + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_4[0 + offset + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_0(float *src, float *dst) {
     dim3 gridDim(108, 1);
     dim3 blockDim(256, 1);
     kernel_func<<<gridDim, blockDim>>>(src, dst);
+    cudaCheckError();
+}
+void invoke_func_1(float *src, float *dst) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func<<<gridDim, blockDim>>>(src, dst);
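The launch geometry in the generated invoke functions matches the loop bounds inside the kernels: blockDim 256 is 8 warps per block, gridDim 108 gives 108 * 8 = 864 warp-level workers (the `loop_idx += 864` stride), and each iteration moves 8 unrolled instructions * 32 lanes = 256 floats (the `% 65536 * 256` offset). A quick check of that arithmetic:

    // Sanity check of the launch geometry; all constants come from the diff above.
    #include <cassert>

    int main() {
        const int warpsPerBlock = 256 / 32;       // blockDim(256) -> 8 warps
        const int numBlocks = 108;                // gridDim(108)
        assert(numBlocks * warpsPerBlock == 864); // matches loop_idx += 864
        const int tile = 8 * 32;                  // 8 unrolled insts x 32 lanes
        assert(tile == 256);                      // matches % 65536 * 256
        assert(65536LL * tile == (1LL << 24));    // total elements per tensor
        return 0;
    }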

@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;

@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 1024;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] = tensor_ptr_9[0 + offset + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {

@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_10[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
+                              float *tensor_ptr_11) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 1024 * 256;
+        tmp_offset_src /= 1024;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 1024 * 256;
+        tmp_offset_dst /= 1024;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] = tensor_ptr_9[0 + offset + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_10[0 + offset + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_11[0 + offset + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_2(float *src, float *dst) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func<<<gridDim, blockDim>>>(src, dst);
+    cudaCheckError();
+}
+void invoke_func_3(float *src, float *dst) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func<<<gridDim, blockDim>>>(src, dst);
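This second file is the same pattern at a smaller size (main loop bound 1024 instead of 65536). In the binary kernels the 24-register buffer serves as three 8-slot windows, mirroring the buf0/buf1/buf2 pointers built in instantiateBinary below: slots 0-7 hold operand A, 8-15 operand B, 16-23 the result. A cleaned-up standalone sketch of one loop iteration (note the emitted code indexes with `offset` although its prologue defines `offset_src`/`offset_dst`; the sketch uses the prologue names):

    // Sketch of the fused binary tile, not the generator's literal output.
    __device__ void binary_tile(const float *a, const float *b, float *out,
                                int offset_src, int offset_dst, int lane_id) {
        float buf[24];
    #pragma unroll
        for (int i = 0; i < 8; i++) // load A into buf[0..8)
            buf[i] = a[offset_src + i * 32 + lane_id];
    #pragma unroll
        for (int i = 0; i < 8; i++) // load B into buf[8..16)
            buf[i + 8] = b[offset_src + i * 32 + lane_id];
    #pragma unroll
        for (int i = 0; i < 8; i++) // compute into buf[16..24)
            buf[i + 16] = buf[i] + buf[i + 8];
    #pragma unroll
        for (int i = 0; i < 8; i++) // store the result window
            out[offset_dst + i * 32 + lane_id] = buf[i + 16];
    }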

@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;

@@ -21,7 +21,8 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 33;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
-            buf[inst_idx] = src[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {

@@ -37,7 +38,7 @@ __global__ void kernel_func(float *src, float *dst) {
     if (lane_id < 31) {
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
-            dst[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
+            tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
                 buf[inst_idx];
         }
     }
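The transpose kernel's shared-memory indexing uses a stride of 33 floats per 32-element row (`inst_idx * 1056` = `inst_idx * 32 * 33` above, and `warp_id * 32 * 33` in instantiateTranspose below). Padding each 32-wide row by one column is the usual way to keep a column walk from landing in a single shared-memory bank; a minimal illustration of the layout (not the generator's output, assuming a 1024-thread block):

    // 32x32 tile padded to 33 columns: element (row, col) is tile[row * 33 + col],
    // so reading a column steps the bank index by 33 % 32 = 1 per element.
    __global__ void padded_transpose_tile(const float *in, float *out) {
        __shared__ float tile[32 * 33]; // 1056 floats, as in the generated code
        int lane = threadIdx.x % 32;
        int row = threadIdx.x / 32;
        tile[row * 33 + lane] = in[row * 32 + lane]; // row-major write
        __syncthreads();
        out[row * 32 + lane] = tile[lane * 33 + row]; // conflict-free column read
    }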

@@ -15,6 +15,7 @@ enum OpType {
     RELU,
     ADD,
+    SUB,
     TRANSPOSE,
 };
 
 enum MemType {
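SUB joins the metaOp-level op set; the generated `buf[i + 16] = buf[i] + buf[i + 8]` line earlier is the ADD instantiation, and a SUB metaOp would presumably emit `-`. The actual micro-op emitter is not in this diff; a hypothetical mapping helper, for illustration only:

    // Hypothetical sketch, not part of this commit: operator token for a
    // binary metaOp's compute micro-op.
    std::string binaryOpToken(OpType opType) {
        switch (opType) {
        case ADD:
            return "+";
        case SUB:
            return "-";
        default:
            IT_ASSERT(false); // RELU/TRANSPOSE are not binary arithmetic ops
            return "";
        }
    }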

@@ -5,10 +5,14 @@
 
 namespace memb {
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType);
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape);
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType);
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &_shape,
-                     const std::vector<int> &_perm);
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape);
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm);
 } // namespace memb
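The instantiate* entry points now take the I/O pointers explicitly instead of hard-coding "src"/"dst" names inside, so callers decide the generated parameter names. A usage sketch matching the new signatures (the guids and shape are arbitrary examples):

    // Pointers named from tensor guids, as done in memory_codegen.cc below.
    auto in = memb::Pointer::buildPtrByTensorGuid(2);  // "tensor_ptr_2"
    auto out = memb::Pointer::buildPtrByTensorGuid(3); // "tensor_ptr_3"
    auto metaOps = memb::instantiateUnary(memb::RELU, {in, out}, {32, 1024});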

@@ -30,6 +30,9 @@ class MetaOp {
     inline int getLoopSt() { return main_loop_st; }
     inline int getLoopEd() { return main_loop_ed; }
 
+    std::string genKernelFunc();
+    std::string genInvokeFunc();
+
     inline void print() {
         std::cout << "MetaOp: " << id << std::endl;
         for (auto microOp : microOps) {
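genKernelFunc/genInvokeFunc become per-MetaOp methods, each emitting its own kernel_func_<id> / invoke_func_<id> pair; the MetaGraph versions (see meta_op.cc below) just concatenate over nodes. A sketch of assembling a full generated source, assuming the MetaGraph::genHeader shown later in this diff:

    // Sketch: emit a complete .cu for a memb::MetaGraph `mg`.
    std::string emitSource(memb::MetaGraph &mg) {
        std::string code;
        code += mg.genHeader();     // e.g. #include "cuda_utils.h"
        code += mg.genKernelFunc(); // one __global__ kernel per metaOp
        code += mg.genInvokeFunc(); // one invoke_func_<id> per metaOp
        return code;
    }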

@@ -31,9 +31,9 @@ class Pointer {
         return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
                                          ptr->getOffset() + " + " + offset);
     }
-    static inline std::shared_ptr<Pointer>
-    buildPtrByTensor(std::string tensorName) {
-        return std::make_shared<Pointer>(MemType::DRAM, tensorName, "0");
+    static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
+        return std::make_shared<Pointer>(
+            MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
     }
 
     inline const MemType getType() { return memType; }
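buildPtrByTensor(name) is replaced by buildPtrByTensorGuid(guid), which derives the pointer's name from the tensor's guid. This is what ties generated kernel signatures to graph tensors:

    // The guid-derived name is exactly what appears in generated signatures,
    // e.g. __global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3).
    auto p = Pointer::buildPtrByTensorGuid(2);
    // p->getName()   == "tensor_ptr_2"
    // p->getType()   == MemType::DRAM
    // p->getOffset() == "0"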

@@ -15,7 +15,9 @@ size_t getSize(const std::vector<int> &shape) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType) {
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 

@@ -35,9 +37,7 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
     auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
 
     metaOp->microOps.emplace_back(std::make_shared<MemoryOp>(

@@ -55,7 +55,9 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType) {
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 

@@ -75,10 +77,7 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src0", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src1", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
     auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
     auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
     auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");

@@ -99,9 +98,9 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     return metaOps;
 }
 
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &shape,
-                     const std::vector<int> &perm) {
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
 
     size_t size = 1;

@@ -155,9 +154,7 @@ instantiateTranspose(const std::vector<int> &shape,
 
     // TODO: tiling is a metaOp or microOps?
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
    auto smem = Pointer::buildPtr(SRAM, "smem", "warp_id * 32 * 33");
    auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
 
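Offsets are carried as strings and composed by concatenation (see Pointer::buildPtr in pointer.h above), which is where index expressions like `0 + offset_src + 0 + inst_idx * 1056 + lane_id` in the generated transpose come from. A small illustration, assuming the two buildPtr overloads used in this diff:

    // A DRAM pointer with offset "0", then a view offset appended by buildPtr.
    auto base = Pointer::buildPtr(DRAM, "tensor_ptr_2", "0");
    auto view = Pointer::buildPtr(base, "offset_src + 0 + inst_idx * 1056 + lane_id");
    // view->getOffset() == "0 + offset_src + 0 + inst_idx * 1056 + lane_id"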

@@ -1,8 +1,10 @@
-#include "pfusion/memory_codegen.h"
 #include "core/graph.h"
 #include "operators/transpose.h"
 
+#include "pfusion/common.h"
 #include "pfusion/instantiate.h"
+#include "pfusion/memory_codegen.h"
+#include "pfusion/pointer.h"
 
 #include <cstdlib>
 #include <filesystem>

@@ -50,23 +52,44 @@ memb::MetaGraph instantiateGraph(infini::Graph graph) {
         switch (op->getOpType()) {
         case infini::OpType::Transpose:
             metaGraph.addNode(memb::instantiateTranspose(
-                convertShape(op->getOutputs()[0]->getDims()),
-                convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
+                memb::TRANSPOSE,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims()),
+                convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
             break;
         case infini::OpType::Relu:
             metaGraph.addNode(memb::instantiateUnary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::RELU));
+                memb::RELU,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
         case infini::OpType::Add:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::ADD));
+                memb::ADD,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
         case infini::OpType::Sub:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::SUB));
+                memb::SUB,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
         default:
             IT_ASSERT(false);

@@ -33,31 +33,48 @@ std::string MetaGraph::genHeader() {
 }
 
 std::string MetaGraph::genKernelFunc() {
-    auto metaOp = nodes[0].metaOps[0];
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
+    std::string code = "";
+    for (auto metaOp : metaOps) {
+        code += metaOp->genKernelFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genKernelFunc() {
     std::string code = "";
     code += "// Kernel\n";
-    code += "__global__ void kernel_func(float *src, float *dst) {\n";
+    code += "__global__ void kernel_func_" + std::to_string(id) + "(";
+    IT_ASSERT(ptrs.size() > 0);
+    code += "float *" + ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", float *" + ptrs[i]->getName();
+    }
+    code += ") {\n";
     code += "int lane_id = threadIdx.x % 32;\n";
     code += "int warp_id = threadIdx.x / 32;\n";
-    code += "int parallel_idx = blockIdx.x * " +
-            std::to_string(metaOp->numWarps) + " + warp_id;\n";
-    if (metaOp->numReg != 0) {
-        code += "float buf[" + std::to_string(metaOp->numReg) + "];\n";
+    code += "int parallel_idx = blockIdx.x * " + std::to_string(numWarps) +
+            " + warp_id;\n";
+    if (numReg != 0) {
+        code += "float buf[" + std::to_string(numReg) + "];\n";
     }
-    if (metaOp->numSmem != 0) {
-        code += "__shared__ float smem[" +
-                std::to_string(metaOp->numSmem * metaOp->numWarps) + "];\n";
+    if (numSmem != 0) {
+        code += "__shared__ float smem[" + std::to_string(numSmem * numWarps) +
+                "];\n";
     }
 
     code += "for (int loop_idx = parallel_idx; loop_idx < " +
-            std::to_string(metaOp->main_loop_ed) + "; loop_idx += " +
-            std::to_string(metaOp->numBlocks * metaOp->numWarps) + ") {\n";
+            std::to_string(main_loop_ed) +
+            "; loop_idx += " + std::to_string(numBlocks * numWarps) + ") {\n";
 
     // gen offset_src
-    code += genOffset("offset_src", metaOp->mappingSrc);
-    code += genOffset("offset_dst", metaOp->mappingDst);
+    code += genOffset("offset_src", mappingSrc);
+    code += genOffset("offset_dst", mappingDst);
 
-    for (auto microOp : metaOp->microOps) {
+    for (auto microOp : microOps) {
         code += microOp->generate();
     }
     code += "}\n}\n";

@@ -65,13 +82,24 @@ std::string MetaGraph::genKernelFunc() {
 }
 
 std::string MetaGraph::genInvokeFunc() {
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
     std::string code = "";
-    auto metaOp = nodes[0].metaOps[0];
-    code += "void invoke_func(float *src, float *dst) {\n";
-    int numBlocks = metaOp->numBlocks;
-    int numWarps = metaOp->numWarps * 32;
+    for (auto metaOp : metaOps) {
+        code += metaOp->genInvokeFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genInvokeFunc() {
+    std::string code = "";
+    code += "void invoke_func_" + std::to_string(id) +
+            "(float *src, float *dst) {\n";
+    int numThreads = numWarps * 32;
     code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
-    code += "dim3 blockDim(" + std::to_string(numWarps) + ", 1);";
+    code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
     code += "kernel_func<<<gridDim, blockDim>>>(src, dst);\n";
     code += "cudaCheckError();\n";
     code += "}\n";
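One mismatch worth noting in the emitted text: MetaOp::genKernelFunc now names kernels kernel_func_<id> with guid-named parameters, while MetaOp::genInvokeFunc still emits `kernel_func<<<...>>>(src, dst)` with a fixed (float *src, float *dst) signature, which is exactly what the generated invoke_func_0/invoke_func_1 bodies above show. A sketch of a launch line matching the renamed kernel, as an assumption about the intended direction rather than something this commit does:

    // Hypothetical follow-up inside MetaOp::genInvokeFunc:
    code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
    code += ptrs[0]->getName();
    for (size_t i = 1; i < ptrs.size(); i++) {
        code += ", " + ptrs[i]->getName();
    }
    code += ");\n";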