add: codegen for all metaOps.

mazx 2022-10-26 02:20:16 +08:00
parent 2c8bd3729b
commit 18b79903ee
11 changed files with 202 additions and 60 deletions

View File

@@ -1,2 +1,3 @@
rm ./eval_transpose
nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
./eval_transpose

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
buf[inst_idx] = tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
tensor_ptr_3[0 + offset_dst + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func(float *src, float *dst) {
// Kernel
__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
int offset_src = 0;
int tmp_offset_src = loop_idx;
offset_src += tmp_offset_src % 65536 * 256;
tmp_offset_src /= 65536;
int offset_dst = 0;
int tmp_offset_dst = loop_idx;
offset_dst += tmp_offset_dst % 65536 * 256;
tmp_offset_dst /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_3[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_4[0 + offset_dst + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}
void invoke_func_1(float *tensor_ptr_2, float *tensor_ptr_3, float *tensor_ptr_4) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_1<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3, tensor_ptr_4);

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
buf[inst_idx] = tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
tensor_ptr_10[0 + offset_dst + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func(float *src, float *dst) {
// Kernel
__global__ void kernel_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
float *tensor_ptr_11) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
int offset_src = 0;
int tmp_offset_src = loop_idx;
offset_src += tmp_offset_src % 1024 * 256;
tmp_offset_src /= 1024;
int offset_dst = 0;
int tmp_offset_dst = loop_idx;
offset_dst += tmp_offset_dst % 1024 * 256;
tmp_offset_dst /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_11[0 + offset_dst + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10);
cudaCheckError();
}
void invoke_func_3(float *tensor_ptr_9, float *tensor_ptr_10, float *tensor_ptr_11) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_3<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10, tensor_ptr_11);

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -21,7 +21,8 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 33;
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
buf[inst_idx] = src[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
@@ -37,7 +38,7 @@ __global__ void kernel_func(float *src, float *dst) {
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
dst[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
buf[inst_idx];
}
}

View File

@@ -15,6 +15,7 @@ enum OpType {
RELU,
ADD,
SUB,
TRANSPOSE,
};
enum MemType {

View File

@@ -5,10 +5,14 @@
namespace memb {
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const std::vector<int> &shape, const OpType opType);
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape);
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const std::vector<int> &shape, const OpType opType);
std::vector<std::shared_ptr<MetaOp>>
instantiateTranspose(const std::vector<int> &_shape,
const std::vector<int> &_perm);
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape);
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape, const std::vector<int> &perm);
} // namespace memb
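A minimal caller sketch for the reordered signatures (hypothetical guids and shape, mirroring the usage added in memory_codegen.cpp below):

auto in0 = memb::Pointer::buildPtrByTensorGuid(0);
auto in1 = memb::Pointer::buildPtrByTensorGuid(1);
auto out = memb::Pointer::buildPtrByTensorGuid(2);
auto metaOps = memb::instantiateBinary(memb::ADD, {in0, in1, out}, {65536, 256});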

View File

@@ -30,6 +30,9 @@ class MetaOp {
inline int getLoopSt() { return main_loop_st; }
inline int getLoopEd() { return main_loop_ed; }
std::string genKernelFunc();
std::string genInvokeFunc();
inline void print() {
std::cout << "MetaOp: " << id << std::endl;
for (auto microOp : microOps) {

View File

@@ -31,9 +31,9 @@ class Pointer {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset() + " + " + offset);
}
static inline std::shared_ptr<Pointer>
buildPtrByTensor(std::string tensorName) {
return std::make_shared<Pointer>(MemType::DRAM, tensorName, "0");
static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
return std::make_shared<Pointer>(
MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
}
inline const MemType getType() { return memType; }
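A quick sketch of what the new helper builds; the generated name is exactly the parameter name that appears in the kernels above (guid 2 is hypothetical):

auto p = memb::Pointer::buildPtrByTensorGuid(2);
// p->getType() == memb::DRAM, p->getName() == "tensor_ptr_2", offset "0"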

View File

@@ -15,7 +15,9 @@ size_t getSize(const std::vector<int> &shape) {
}
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const std::vector<int> &shape, const OpType opType) {
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
@@ -35,9 +37,7 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
metaOp->mappingDst->map = {1};
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
metaOp->microOps.emplace_back(std::make_shared<MemoryOp>(
@@ -55,7 +55,9 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
}
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const std::vector<int> &shape, const OpType opType) {
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
@@ -75,10 +77,7 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
metaOp->mappingDst->map = {1};
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src0", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src1", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
@@ -99,9 +98,9 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateTranspose(const std::vector<int> &shape,
const std::vector<int> &perm) {
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape, const std::vector<int> &perm) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = 1;
@@ -155,9 +154,7 @@ instantiateTranspose(const std::vector<int> &shape,
// TODO: should tiling be a metaOp or microOps?
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
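// Each warp gets a 32 x 33 float tile in shared memory; the extra
// column keeps the transposed reads free of shared-memory bank conflicts.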
auto smem = Pointer::buildPtr(SRAM, "smem", "warp_id * 32 * 33");
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");

View File

@@ -1,8 +1,10 @@
#include "pfusion/memory_codegen.h"
#include "core/graph.h"
#include "operators/transpose.h"
#include "pfusion/common.h"
#include "pfusion/instantiate.h"
#include "pfusion/memory_codegen.h"
#include "pfusion/pointer.h"
#include <cstdlib>
#include <filesystem>
@@ -50,23 +52,44 @@ memb::MetaGraph instantiateGraph(infini::Graph graph) {
switch (op->getOpType()) {
case infini::OpType::Transpose:
metaGraph.addNode(memb::instantiateTranspose(
memb::TRANSPOSE,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims()),
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
break;
case infini::OpType::Relu:
metaGraph.addNode(memb::instantiateUnary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::RELU));
memb::RELU,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
case infini::OpType::Add:
metaGraph.addNode(memb::instantiateBinary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::ADD));
memb::ADD,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
case infini::OpType::Sub:
metaGraph.addNode(memb::instantiateBinary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::SUB));
memb::SUB,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
default:
IT_ASSERT(false);

View File

@@ -33,31 +33,48 @@ std::string MetaGraph::genHeader() {
}
std::string MetaGraph::genKernelFunc() {
auto metaOp = nodes[0].metaOps[0];
std::vector<std::shared_ptr<MetaOp>> metaOps;
for (auto &node : nodes) {
metaOps.emplace_back(node.metaOps[0]);
}
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genKernelFunc();
}
return code;
}
std::string MetaOp::genKernelFunc() {
std::string code = "";
code += "// Kernel\n";
code += "__global__ void kernel_func(float *src, float *dst) {\n";
code += "__global__ void kernel_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " +
std::to_string(metaOp->numWarps) + " + warp_id;\n";
if (metaOp->numReg != 0) {
code += "float buf[" + std::to_string(metaOp->numReg) + "];\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(numWarps) +
" + warp_id;\n";
if (numReg != 0) {
code += "float buf[" + std::to_string(numReg) + "];\n";
}
if (metaOp->numSmem != 0) {
code += "__shared__ float smem[" +
std::to_string(metaOp->numSmem * metaOp->numWarps) + "];\n";
if (numSmem != 0) {
code += "__shared__ float smem[" + std::to_string(numSmem * numWarps) +
"];\n";
}
code += "for (int loop_idx = parallel_idx; loop_idx < " +
std::to_string(metaOp->main_loop_ed) + "; loop_idx += " +
std::to_string(metaOp->numBlocks * metaOp->numWarps) + ") {\n";
std::to_string(main_loop_ed) +
"; loop_idx += " + std::to_string(numBlocks * numWarps) + ") {\n";
// gen offset_src
code += genOffset("offset_src", metaOp->mappingSrc);
code += genOffset("offset_dst", metaOp->mappingDst);
code += genOffset("offset_src", mappingSrc);
code += genOffset("offset_dst", mappingDst);
for (auto microOp : metaOp->microOps) {
for (auto microOp : microOps) {
code += microOp->generate();
}
code += "}\n}\n";
@@ -65,13 +82,24 @@ std::string MetaGraph::genKernelFunc() {
}
std::string MetaGraph::genInvokeFunc() {
std::vector<std::shared_ptr<MetaOp>> metaOps;
for (auto &node : nodes) {
metaOps.emplace_back(node.metaOps[0]);
}
std::string code = "";
auto metaOp = nodes[0].metaOps[0];
code += "void invoke_func(float *src, float *dst) {\n";
int numBlocks = metaOp->numBlocks;
int numWarps = metaOp->numWarps * 32;
for (auto metaOp : metaOps) {
code += metaOp->genInvokeFunc();
}
return code;
}
std::string MetaOp::genInvokeFunc() {
std::string code = "";
code += "void invoke_func_" + std::to_string(id) +
"(float *src, float *dst) {\n";
int numThreads = numWarps * 32;
code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numWarps) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
code += "kernel_func<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";