add: codegen for all metaOps.

mazx 2022-10-26 02:20:16 +08:00
parent 2c8bd3729b
commit 18b79903ee
11 changed files with 202 additions and 60 deletions

View File

@@ -1,2 +1,3 @@
rm ./eval_transpose
nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
./eval_transpose

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
buf[inst_idx] = tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
tensor_ptr_3[0 + offset_dst + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func(float *src, float *dst) {
// Kernel
__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
int offset_src = 0;
int tmp_offset_src = loop_idx;
offset_src += tmp_offset_src % 65536 * 256;
tmp_offset_src /= 65536;
int offset_dst = 0;
int tmp_offset_dst = loop_idx;
offset_dst += tmp_offset_dst % 65536 * 256;
tmp_offset_dst /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_3[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_4[0 + offset_dst + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}
void invoke_func_1(float *tensor_ptr_2, float *tensor_ptr_3, float *tensor_ptr_4) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_1<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3, tensor_ptr_4);

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
buf[inst_idx] = tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
tensor_ptr_10[0 + offset_dst + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func(float *src, float *dst) {
// Kernel
__global__ void kernel_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
float *tensor_ptr_11) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
int offset_src = 0;
int tmp_offset_src = loop_idx;
offset_src += tmp_offset_src % 1024 * 256;
tmp_offset_src /= 1024;
int offset_dst = 0;
int tmp_offset_dst = loop_idx;
offset_dst += tmp_offset_dst % 1024 * 256;
tmp_offset_dst /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_11[0 + offset_dst + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10);
cudaCheckError();
}
void invoke_func_3(float *tensor_ptr_9, float *tensor_ptr_10, float *tensor_ptr_11) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_3<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10, tensor_ptr_11);

View File

@@ -1,6 +1,6 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
__global__ void kernel_func(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -21,7 +21,8 @@ __global__ void kernel_func(float *src, float *dst) {
tmp_offset_dst /= 33;
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
buf[inst_idx] = src[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
@@ -37,7 +38,7 @@ __global__ void kernel_func(float *src, float *dst) {
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
dst[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
buf[inst_idx];
}
}

View File

@@ -15,6 +15,7 @@ enum OpType {
RELU,
ADD,
SUB,
TRANSPOSE,
};
enum MemType {

View File

@@ -5,10 +5,14 @@
namespace memb {
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const std::vector<int> &shape, const OpType opType);
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape);
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const std::vector<int> &shape, const OpType opType);
std::vector<std::shared_ptr<MetaOp>>
instantiateTranspose(const std::vector<int> &_shape,
const std::vector<int> &_perm);
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape);
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape, const std::vector<int> &perm);
} // namespace memb
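A minimal caller sketch for the reordered signatures (hypothetical guids and shape, mirroring the usage added in memory_codegen.cpp below):

auto in0 = memb::Pointer::buildPtrByTensorGuid(0);
auto in1 = memb::Pointer::buildPtrByTensorGuid(1);
auto out = memb::Pointer::buildPtrByTensorGuid(2);
auto metaOps = memb::instantiateBinary(memb::ADD, {in0, in1, out}, {65536, 256});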

View File

@@ -30,6 +30,9 @@ class MetaOp {
inline int getLoopSt() { return main_loop_st; }
inline int getLoopEd() { return main_loop_ed; }
std::string genKernelFunc();
std::string genInvokeFunc();
inline void print() {
std::cout << "MetaOp: " << id << std::endl;
for (auto microOp : microOps) {

View File

@@ -31,9 +31,9 @@ class Pointer {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset() + " + " + offset);
}
static inline std::shared_ptr<Pointer>
buildPtrByTensor(std::string tensorName) {
return std::make_shared<Pointer>(MemType::DRAM, tensorName, "0");
static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
return std::make_shared<Pointer>(
MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
}
inline const MemType getType() { return memType; }
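A quick sketch of what the new helper builds; the generated name is exactly the parameter name that appears in the kernels above (guid 2 is hypothetical):

auto p = memb::Pointer::buildPtrByTensorGuid(2);
// p->getType() == memb::DRAM, p->getName() == "tensor_ptr_2", offset "0"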

View File

@@ -15,7 +15,9 @@ size_t getSize(const std::vector<int> &shape) {
}
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const std::vector<int> &shape, const OpType opType) {
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
@@ -35,9 +37,7 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
metaOp->mappingDst->map = {1};
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
metaOp->microOps.emplace_back(std::make_shared<MemoryOp>(
@@ -55,7 +55,9 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
}
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const std::vector<int> &shape, const OpType opType) {
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
@@ -75,10 +77,7 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
metaOp->mappingDst->map = {1};
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src0", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src1", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
@@ -99,9 +98,9 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateTranspose(const std::vector<int> &shape,
const std::vector<int> &perm) {
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<int> &shape, const std::vector<int> &perm) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = 1;
@@ -155,9 +154,7 @@ instantiateTranspose(const std::vector<int> &shape,
// TODO: should tiling be a metaOp or microOps?
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
metaOp->ptrs = ptrs;
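// Each warp gets a 32 x 33 float tile in shared memory; the extra
// column keeps the transposed reads free of shared-memory bank conflicts.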
auto smem = Pointer::buildPtr(SRAM, "smem", "warp_id * 32 * 33");
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");

View File

@@ -1,8 +1,10 @@
#include "pfusion/memory_codegen.h"
#include "core/graph.h"
#include "operators/transpose.h"
#include "pfusion/common.h"
#include "pfusion/instantiate.h"
#include "pfusion/memory_codegen.h"
#include "pfusion/pointer.h"
#include <cstdlib>
#include <filesystem>
@@ -50,23 +52,44 @@ memb::MetaGraph instantiateGraph(infini::Graph graph) {
switch (op->getOpType()) {
case infini::OpType::Transpose:
metaGraph.addNode(memb::instantiateTranspose(
memb::TRANSPOSE,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims()),
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
break;
case infini::OpType::Relu:
metaGraph.addNode(memb::instantiateUnary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::RELU));
memb::RELU,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
case infini::OpType::Add:
metaGraph.addNode(memb::instantiateBinary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::ADD));
memb::ADD,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
case infini::OpType::Sub:
metaGraph.addNode(memb::instantiateBinary(
convertShape(op->getInputs()[0]->getDims()),
memb::OpType::SUB));
memb::SUB,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
break;
default:
IT_ASSERT(false);

View File

@@ -33,31 +33,48 @@ std::string MetaGraph::genHeader() {
}
std::string MetaGraph::genKernelFunc() {
auto metaOp = nodes[0].metaOps[0];
std::vector<std::shared_ptr<MetaOp>> metaOps;
for (auto &node : nodes) {
metaOps.emplace_back(node.metaOps[0]);
}
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genKernelFunc();
}
return code;
}
std::string MetaOp::genKernelFunc() {
std::string code = "";
code += "// Kernel\n";
code += "__global__ void kernel_func(float *src, float *dst) {\n";
code += "__global__ void kernel_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " +
std::to_string(metaOp->numWarps) + " + warp_id;\n";
if (metaOp->numReg != 0) {
code += "float buf[" + std::to_string(metaOp->numReg) + "];\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(numWarps) +
" + warp_id;\n";
if (numReg != 0) {
code += "float buf[" + std::to_string(numReg) + "];\n";
}
if (metaOp->numSmem != 0) {
code += "__shared__ float smem[" +
std::to_string(metaOp->numSmem * metaOp->numWarps) + "];\n";
if (numSmem != 0) {
code += "__shared__ float smem[" + std::to_string(numSmem * numWarps) +
"];\n";
}
code += "for (int loop_idx = parallel_idx; loop_idx < " +
std::to_string(metaOp->main_loop_ed) + "; loop_idx += " +
std::to_string(metaOp->numBlocks * metaOp->numWarps) + ") {\n";
std::to_string(main_loop_ed) +
"; loop_idx += " + std::to_string(numBlocks * numWarps) + ") {\n";
// gen offset_src
code += genOffset("offset_src", metaOp->mappingSrc);
code += genOffset("offset_dst", metaOp->mappingDst);
code += genOffset("offset_src", mappingSrc);
code += genOffset("offset_dst", mappingDst);
for (auto microOp : metaOp->microOps) {
for (auto microOp : microOps) {
code += microOp->generate();
}
code += "}\n}\n";
@@ -65,13 +82,24 @@ std::string MetaGraph::genKernelFunc() {
}
std::string MetaGraph::genInvokeFunc() {
std::vector<std::shared_ptr<MetaOp>> metaOps;
for (auto &node : nodes) {
metaOps.emplace_back(node.metaOps[0]);
}
std::string code = "";
auto metaOp = nodes[0].metaOps[0];
code += "void invoke_func(float *src, float *dst) {\n";
int numBlocks = metaOp->numBlocks;
int numWarps = metaOp->numWarps * 32;
for (auto metaOp : metaOps) {
code += metaOp->genInvokeFunc();
}
return code;
}
std::string MetaOp::genInvokeFunc() {
std::string code = "";
code += "void invoke_func_" + std::to_string(id) +
"(float *src, float *dst) {\n";
int numThreads = numWarps * 32;
code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numWarps) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
code += "kernel_func<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";