From 18b79903eeae4489d9fa7ead8a05ec807d80c6ce Mon Sep 17 00:00:00 2001
From: mazx
Date: Wed, 26 Oct 2022 02:20:16 +0800
Subject: [PATCH] add: codegen for all metaOps.

---
 eval_pfusion/eval_transpose.sh |  1 +
 generated_code/sar_drn_0.cu    | 50 +++++++++++++++++++++++---
 generated_code/sar_drn_1.cu    | 50 +++++++++++++++++++++++---
 generated_code/transpose.cu    |  7 ++--
 include/pfusion/common.h       |  1 +
 include/pfusion/instantiate.h  | 14 +++++---
 include/pfusion/meta_op.h      |  3 ++
 include/pfusion/pointer.h      |  6 ++--
 src/pfusion/instantiate.cc     | 27 +++++++-------
 src/pfusion/memory_codegen.cc  | 37 +++++++++++++++----
 src/pfusion/meta_op.cc         | 66 ++++++++++++++++++++++++----------
 11 files changed, 202 insertions(+), 60 deletions(-)

diff --git a/eval_pfusion/eval_transpose.sh b/eval_pfusion/eval_transpose.sh
index 4e555b0c..588e96b5 100755
--- a/eval_pfusion/eval_transpose.sh
+++ b/eval_pfusion/eval_transpose.sh
@@ -1,2 +1,3 @@
+rm -f ./eval_transpose
 nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
 ./eval_transpose
diff --git a/generated_code/sar_drn_0.cu b/generated_code/sar_drn_0.cu
index 1e1e2bed..de97c058 100644
--- a/generated_code/sar_drn_0.cu
+++ b/generated_code/sar_drn_0.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 65536;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_3[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
+                              float *tensor_ptr_4) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 65536 * 256;
+        tmp_offset_src /= 65536;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 65536 * 256;
+        tmp_offset_dst /= 65536;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_3[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_4[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
+    cudaCheckError();
+}
+void invoke_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
+                   float *tensor_ptr_4) {
     dim3 gridDim(108, 1);
     dim3 blockDim(256, 1);
-    kernel_func<<<gridDim, blockDim>>>(src, dst);
+    kernel_func_1<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3,
+                                         tensor_ptr_4);
     cudaCheckError();
 }
diff --git a/generated_code/sar_drn_1.cu b/generated_code/sar_drn_1.cu
index 82b35db0..200b33d5 100644
--- a/generated_code/sar_drn_1.cu
+++ b/generated_code/sar_drn_1.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 1024;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_10[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
+                              float *tensor_ptr_11) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 1024 * 256;
+        tmp_offset_src /= 1024;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 1024 * 256;
+        tmp_offset_dst /= 1024;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] =
+                tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_11[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10);
+    cudaCheckError();
+}
+void invoke_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
+                   float *tensor_ptr_11) {
     dim3 gridDim(108, 1);
     dim3 blockDim(256, 1);
-    kernel_func<<<gridDim, blockDim>>>(src, dst);
+    kernel_func_3<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10,
+                                         tensor_ptr_11);
     cudaCheckError();
 }
diff --git a/generated_code/transpose.cu b/generated_code/transpose.cu
index 32f7d951..ea20af2e 100644
--- a/generated_code/transpose.cu
+++ b/generated_code/transpose.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -21,7 +21,8 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 33;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
-            buf[inst_idx] = src[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
@@ -37,7 +38,7 @@ __global__ void kernel_func(float *src, float *dst) {
         if (lane_id < 31) {
 #pragma unroll
             for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
-                dst[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
+                tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
                     buf[inst_idx];
             }
         }
diff --git a/include/pfusion/common.h b/include/pfusion/common.h
index 2edd0b3e..f217156d 100644
--- a/include/pfusion/common.h
+++ b/include/pfusion/common.h
@@ -15,6 +15,7 @@ enum OpType {
     RELU,
     ADD,
     SUB,
+    TRANSPOSE,
 };
 
 enum MemType {
diff --git a/include/pfusion/instantiate.h b/include/pfusion/instantiate.h
index 43d57b51..95508457 100644
--- a/include/pfusion/instantiate.h
+++ b/include/pfusion/instantiate.h
@@ -5,10 +5,14 @@
 namespace memb {
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType);
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape);
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType);
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &_shape,
-                     const std::vector<int> &_perm);
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape);
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm);
 } // namespace memb
diff --git a/include/pfusion/meta_op.h b/include/pfusion/meta_op.h
index 43e8d101..a8171378 100644
--- a/include/pfusion/meta_op.h
+++ b/include/pfusion/meta_op.h
@@ -30,6 +30,9 @@ class MetaOp {
     inline int getLoopSt() { return main_loop_st; }
     inline int getLoopEd() { return main_loop_ed; }
 
+    std::string genKernelFunc();
+    std::string genInvokeFunc();
+
     inline void print() {
         std::cout << "MetaOp: " << id << std::endl;
         for (auto microOp : microOps) {
diff --git a/include/pfusion/pointer.h b/include/pfusion/pointer.h
index efda5dd3..4003372a 100644
--- a/include/pfusion/pointer.h
+++ b/include/pfusion/pointer.h
@@ -31,9 +31,9 @@ class Pointer {
         return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
                                          ptr->getOffset() + " + " + offset);
     }
-    static inline std::shared_ptr<Pointer>
-    buildPtrByTensor(std::string tensorName) {
-        return std::make_shared<Pointer>(MemType::DRAM, tensorName, "0");
+    static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
+        return std::make_shared<Pointer>(
+            MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
     }
 
     inline const MemType getType() { return memType; }
diff --git a/src/pfusion/instantiate.cc b/src/pfusion/instantiate.cc
index 87de94f5..f7238511 100644
--- a/src/pfusion/instantiate.cc
+++ b/src/pfusion/instantiate.cc
@@ -15,7 +15,9 @@ size_t getSize(const std::vector<int> &shape) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType) {
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 
@@ -35,9 +37,7 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
 
     auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
     metaOp->microOps.emplace_back(std::make_shared<MemoryOp>(
@@ -55,7 +55,9 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType) {
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 
@@ -75,10 +77,7 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src0", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src1", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
     auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
     auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
     auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
@@ -99,9 +98,9 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     return metaOps;
 }
 
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &shape,
-                     const std::vector<int> &perm) {
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
 
     size_t size = 1;
@@ -155,9 +154,7 @@ instantiateTranspose(const std::vector<int> &shape,
 
     // TODO: tiling is a metaOp or microOps?
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
 
     auto smem = Pointer::buildPtr(SRAM, "smem", "warp_id * 32 * 33");
     auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
diff --git a/src/pfusion/memory_codegen.cc b/src/pfusion/memory_codegen.cc
index 5865752e..965bdc06 100644
--- a/src/pfusion/memory_codegen.cc
+++ b/src/pfusion/memory_codegen.cc
@@ -1,8 +1,10 @@
-#include "pfusion/memory_codegen.h"
 #include "core/graph.h"
 #include "operators/transpose.h"
+
 #include "pfusion/common.h"
 #include "pfusion/instantiate.h"
+#include "pfusion/memory_codegen.h"
+#include "pfusion/pointer.h"
 
 #include <fstream>
 #include <iostream>
@@ -50,23 +52,44 @@ memb::MetaGraph instantiateGraph(infini::Graph graph) {
         switch (op->getOpType()) {
         case infini::OpType::Transpose:
             metaGraph.addNode(memb::instantiateTranspose(
+                memb::TRANSPOSE,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
                 convertShape(op->getOutputs()[0]->getDims()),
                 convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
             break;
         case infini::OpType::Relu:
             metaGraph.addNode(memb::instantiateUnary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::RELU));
+                memb::RELU,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
        case infini::OpType::Add:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::ADD));
+                memb::ADD,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
        case infini::OpType::Sub:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::SUB));
+                memb::SUB,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
         default:
             IT_ASSERT(false);
diff --git a/src/pfusion/meta_op.cc b/src/pfusion/meta_op.cc
index 1676df4f..83c32fff 100644
--- a/src/pfusion/meta_op.cc
+++ b/src/pfusion/meta_op.cc
@@ -33,31 +33,48 @@ std::string MetaGraph::genHeader() {
 }
 
 std::string MetaGraph::genKernelFunc() {
-    auto metaOp = nodes[0].metaOps[0];
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
+    std::string code = "";
+    for (auto metaOp : metaOps) {
+        code += metaOp->genKernelFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genKernelFunc() {
     std::string code = "";
     code += "// Kernel\n";
-    code += "__global__ void kernel_func(float *src, float *dst) {\n";
+    code += "__global__ void kernel_func_" + std::to_string(id) + "(";
+    IT_ASSERT(ptrs.size() > 0);
+    code += "float *" + ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", float *" + ptrs[i]->getName();
+    }
+    code += ") {\n";
     code += "int lane_id = threadIdx.x % 32;\n";
     code += "int warp_id = threadIdx.x / 32;\n";
-    code += "int parallel_idx = blockIdx.x * " +
-            std::to_string(metaOp->numWarps) + " + warp_id;\n";
-    if (metaOp->numReg != 0) {
-        code += "float buf[" + std::to_string(metaOp->numReg) + "];\n";
+    code += "int parallel_idx = blockIdx.x * " + std::to_string(numWarps) +
+            " + warp_id;\n";
+    if (numReg != 0) {
+        code += "float buf[" + std::to_string(numReg) + "];\n";
     }
-    if (metaOp->numSmem != 0) {
-        code += "__shared__ float smem[" +
-                std::to_string(metaOp->numSmem * metaOp->numWarps) + "];\n";
+    if (numSmem != 0) {
+        code += "__shared__ float smem[" + std::to_string(numSmem * numWarps) +
+                "];\n";
     }
     code += "for (int loop_idx = parallel_idx; loop_idx < " +
-            std::to_string(metaOp->main_loop_ed) + "; loop_idx += " +
-            std::to_string(metaOp->numBlocks * metaOp->numWarps) + ") {\n";
+            std::to_string(main_loop_ed) +
+            "; loop_idx += " + std::to_string(numBlocks * numWarps) + ") {\n";
     // gen offset_src
-    code += genOffset("offset_src", metaOp->mappingSrc);
-    code += genOffset("offset_dst", metaOp->mappingDst);
+    code += genOffset("offset_src", mappingSrc);
+    code += genOffset("offset_dst", mappingDst);
 
-    for (auto microOp : metaOp->microOps) {
+    for (auto microOp : microOps) {
         code += microOp->generate();
     }
     code += "}\n}\n";
@@ -65,13 +82,24 @@ std::string MetaGraph::genKernelFunc() {
 }
 
 std::string MetaGraph::genInvokeFunc() {
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
     std::string code = "";
-    auto metaOp = nodes[0].metaOps[0];
-    code += "void invoke_func(float *src, float *dst) {\n";
-    int numBlocks = metaOp->numBlocks;
-    int numWarps = metaOp->numWarps * 32;
+    for (auto metaOp : metaOps) {
+        code += metaOp->genInvokeFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genInvokeFunc() {
+    std::string code = "";
+    code += "void invoke_func_" + std::to_string(id) + "(";
+    IT_ASSERT(ptrs.size() > 0);
+    code += "float *" + ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", float *" + ptrs[i]->getName();
+    }
+    code += ") {\n";
+    int numThreads = numWarps * 32;
     code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
-    code += "dim3 blockDim(" + std::to_string(numWarps) + ", 1);";
-    code += "kernel_func<<<gridDim, blockDim>>>(src, dst);\n";
+    code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
+    code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
+    code += ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", " + ptrs[i]->getName();
+    }
+    code += ");\n";
     code += "cudaCheckError();\n";
     code += "}\n";
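
For context, below is a minimal host-side harness sketch that could drive the generated invoke functions above. It is illustrative only and not part of the patch: it assumes sar_drn_0.cu is compiled alongside it (in the style of eval_transpose.sh, so that cuda_utils.h resolves), and the tensor extent of 65536 * 256 floats follows from kernel_func_0's loop bounds.

// harness.cu -- hypothetical driver for generated_code/sar_drn_0.cu (sketch).
#include <cuda_runtime.h>
#include <cstdio>

// Provided by the generated file; launches kernel_func_0 on a
// 108-block x 256-thread grid-stride loop.
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3);

int main() {
    const size_t n = 65536UL * 256UL; // elements per tensor
    float *src, *dst;
    cudaMalloc(&src, n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));
    invoke_func_0(src, dst);
    cudaDeviceSynchronize(); // wait for the kernel before freeing
    cudaFree(src);
    cudaFree(dst);
    printf("done\n");
    return 0;
}

Built roughly as eval_transpose.sh builds its own harness: nvcc harness.cu ../generated_code/sar_drn_0.cu -I ../eval_pfusion -o harness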