From 18b79903eeae4489d9fa7ead8a05ec807d80c6ce Mon Sep 17 00:00:00 2001
From: mazx
Date: Wed, 26 Oct 2022 02:20:16 +0800
Subject: [PATCH] add: codegen for all metaOps.

---
 eval_pfusion/eval_transpose.sh |  1 +
 generated_code/sar_drn_0.cu    | 50 +++++++++++++++++++++++---
 generated_code/sar_drn_1.cu    | 50 +++++++++++++++++++++++---
 generated_code/transpose.cu    |  7 ++--
 include/pfusion/common.h       |  1 +
 include/pfusion/instantiate.h  | 14 +++++---
 include/pfusion/meta_op.h      |  3 ++
 include/pfusion/pointer.h      |  6 ++--
 src/pfusion/instantiate.cc     | 27 +++++++-------
 src/pfusion/memory_codegen.cc  | 37 +++++++++++++++----
 src/pfusion/meta_op.cc         | 66 ++++++++++++++++++++++++----------
 11 files changed, 202 insertions(+), 60 deletions(-)

diff --git a/eval_pfusion/eval_transpose.sh b/eval_pfusion/eval_transpose.sh
index 4e555b0c..588e96b5 100755
--- a/eval_pfusion/eval_transpose.sh
+++ b/eval_pfusion/eval_transpose.sh
@@ -1,2 +1,3 @@
+rm -f ./eval_transpose
 nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
 ./eval_transpose
diff --git a/generated_code/sar_drn_0.cu b/generated_code/sar_drn_0.cu
index 1e1e2bed..de97c058 100644
--- a/generated_code/sar_drn_0.cu
+++ b/generated_code/sar_drn_0.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 65536;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_3[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
+                              float *tensor_ptr_4) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 65536 * 256;
+        tmp_offset_src /= 65536;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 65536 * 256;
+        tmp_offset_dst /= 65536;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_3[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_4[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
+    cudaCheckError();
+}
+void invoke_func_1(float *tensor_ptr_2, float *tensor_ptr_3,
+                   float *tensor_ptr_4) {
     dim3 gridDim(108, 1);
     dim3 blockDim(256, 1);
-    kernel_func<<<gridDim, blockDim>>>(src, dst);
+    kernel_func_1<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3,
+                                         tensor_ptr_4);
     cudaCheckError();
 }
diff --git a/generated_code/sar_drn_1.cu b/generated_code/sar_drn_1.cu
index 82b35db0..200b33d5 100644
--- a/generated_code/sar_drn_1.cu
+++ b/generated_code/sar_drn_1.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -16,7 +16,7 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 1024;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            buf[inst_idx] = src[0 + offset + inst_idx * 32 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
@@ -24,11 +24,53 @@ __global__ void kernel_func(float *src, float *dst) {
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
-            dst[0 + offset + inst_idx * 32 + lane_id] = buf[inst_idx];
+            tensor_ptr_10[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx];
         }
     }
 }
-void invoke_func(float *src, float *dst) {
+// Kernel
+__global__ void kernel_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
+                              float *tensor_ptr_11) {
+    int lane_id = threadIdx.x % 32;
+    int warp_id = threadIdx.x / 32;
+    int parallel_idx = blockIdx.x * 8 + warp_id;
+    float buf[24];
+    for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
+        int offset_src = 0;
+        int tmp_offset_src = loop_idx;
+        offset_src += tmp_offset_src % 1024 * 256;
+        tmp_offset_src /= 1024;
+        int offset_dst = 0;
+        int tmp_offset_dst = loop_idx;
+        offset_dst += tmp_offset_dst % 1024 * 256;
+        tmp_offset_dst /= 1024;
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx] =
+                tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 8] =
+                tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
+        }
+#pragma unroll
+        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
+            tensor_ptr_11[0 + offset_dst + inst_idx * 32 + lane_id] =
+                buf[inst_idx + 16];
+        }
+    }
+}
+void invoke_func_2(float *tensor_ptr_9, float *tensor_ptr_10) {
+    dim3 gridDim(108, 1);
+    dim3 blockDim(256, 1);
+    kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10);
+    cudaCheckError();
+}
+void invoke_func_3(float *tensor_ptr_9, float *tensor_ptr_10,
+                   float *tensor_ptr_11) {
     dim3 gridDim(108, 1);
     dim3 blockDim(256, 1);
-    kernel_func<<<gridDim, blockDim>>>(src, dst);
+    kernel_func_3<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_10,
+                                         tensor_ptr_11);
     cudaCheckError();
 }
diff --git a/generated_code/transpose.cu b/generated_code/transpose.cu
index 32f7d951..ea20af2e 100644
--- a/generated_code/transpose.cu
+++ b/generated_code/transpose.cu
@@ -1,6 +1,6 @@
 #include "cuda_utils.h"
 // Kernel
-__global__ void kernel_func(float *src, float *dst) {
+__global__ void kernel_func(float *tensor_ptr_2, float *tensor_ptr_3) {
     int lane_id = threadIdx.x % 32;
     int warp_id = threadIdx.x / 32;
     int parallel_idx = blockIdx.x * 8 + warp_id;
@@ -21,7 +21,8 @@ __global__ void kernel_func(float *src, float *dst) {
         tmp_offset_dst /= 33;
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
-            buf[inst_idx] = src[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
+            buf[inst_idx] =
+                tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
         }
 #pragma unroll
         for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
@@ -37,7 +38,7 @@ __global__ void kernel_func(float *src, float *dst) {
         if (lane_id < 31) {
 #pragma unroll
             for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
-                dst[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
+                tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
                     buf[inst_idx];
             }
         }
diff --git a/include/pfusion/common.h b/include/pfusion/common.h
index 2edd0b3e..f217156d 100644
--- a/include/pfusion/common.h
+++ b/include/pfusion/common.h
@@ -15,6 +15,7 @@ enum OpType {
     RELU,
     ADD,
     SUB,
+    TRANSPOSE,
 };
 
 enum MemType {
diff --git a/include/pfusion/instantiate.h b/include/pfusion/instantiate.h
index 43d57b51..95508457 100644
--- a/include/pfusion/instantiate.h
+++ b/include/pfusion/instantiate.h
@@ -5,10 +5,14 @@
 namespace memb {
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType);
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape);
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType);
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &_shape,
-                     const std::vector<int> &_perm);
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape);
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm);
 } // namespace memb
diff --git a/include/pfusion/meta_op.h b/include/pfusion/meta_op.h
index 43e8d101..a8171378 100644
--- a/include/pfusion/meta_op.h
+++ b/include/pfusion/meta_op.h
@@ -30,6 +30,9 @@ class MetaOp {
     inline int getLoopSt() { return main_loop_st; }
     inline int getLoopEd() { return main_loop_ed; }
 
+    std::string genKernelFunc();
+    std::string genInvokeFunc();
+
     inline void print() {
         std::cout << "MetaOp: " << id << std::endl;
         for (auto microOp : microOps) {
diff --git a/include/pfusion/pointer.h b/include/pfusion/pointer.h
index efda5dd3..4003372a 100644
--- a/include/pfusion/pointer.h
+++ b/include/pfusion/pointer.h
@@ -31,9 +31,9 @@ class Pointer {
         return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
                                          ptr->getOffset() + " + " + offset);
     }
-    static inline std::shared_ptr<Pointer>
-    buildPtrByTensor(std::string tensorName) {
-        return std::make_shared<Pointer>(MemType::DRAM, tensorName, "0");
+    static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
+        return std::make_shared<Pointer>(
+            MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
     }
 
     inline const MemType getType() { return memType; }
diff --git a/src/pfusion/instantiate.cc b/src/pfusion/instantiate.cc
index 87de94f5..f7238511 100644
--- a/src/pfusion/instantiate.cc
+++ b/src/pfusion/instantiate.cc
@@ -15,7 +15,9 @@ size_t getSize(const std::vector<int> &shape) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateUnary(const std::vector<int> &shape, const OpType opType) {
+instantiateUnary(const OpType opType,
+                 std::vector<std::shared_ptr<Pointer>> ptrs,
+                 const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 
@@ -35,9 +37,7 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
 
     auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
     metaOp->microOps.emplace_back(std::make_shared<MemoryOp>(
@@ -55,7 +55,9 @@ instantiateUnary(const std::vector<int> &shape, const OpType opType) {
 }
 
 std::vector<std::shared_ptr<MetaOp>>
-instantiateBinary(const std::vector<int> &shape, const OpType opType) {
+instantiateBinary(const OpType opType,
+                  std::vector<std::shared_ptr<Pointer>> ptrs,
+                  const std::vector<int> &shape) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
     size_t size = getSize(shape);
 
@@ -75,10 +77,7 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     metaOp->mappingDst->shape = {32 * 8, int(size / 32 / 8)};
     metaOp->mappingDst->map = {1};
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src0", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src1", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
     auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
     auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
     auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
@@ -99,9 +98,9 @@ instantiateBinary(const std::vector<int> &shape, const OpType opType) {
     return metaOps;
 }
 
-std::vector<std::shared_ptr<MetaOp>>
-instantiateTranspose(const std::vector<int> &shape,
-                     const std::vector<int> &perm) {
+std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
+    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
+    const std::vector<int> &shape, const std::vector<int> &perm) {
     std::vector<std::shared_ptr<MetaOp>> metaOps;
 
     size_t size = 1;
@@ -155,9 +154,7 @@ instantiateTranspose(const std::vector<int> &shape,
 
     // TODO: tiling is a metaOp or microOps?
 
-    auto &ptrs = metaOp->ptrs;
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "src", "0"));
-    ptrs.emplace_back(Pointer::buildPtr(DRAM, "dst", "0"));
+    metaOp->ptrs = ptrs;
 
     auto smem = Pointer::buildPtr(SRAM, "smem", "warp_id * 32 * 33");
     auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
diff --git a/src/pfusion/memory_codegen.cc b/src/pfusion/memory_codegen.cc
index 5865752e..965bdc06 100644
--- a/src/pfusion/memory_codegen.cc
+++ b/src/pfusion/memory_codegen.cc
@@ -1,8 +1,10 @@
-#include "pfusion/memory_codegen.h"
 #include "core/graph.h"
 #include "operators/transpose.h"
+
 #include "pfusion/common.h"
 #include "pfusion/instantiate.h"
+#include "pfusion/memory_codegen.h"
+#include "pfusion/pointer.h"
 
 #include <fstream>
 #include <iostream>
@@ -50,23 +52,44 @@ memb::MetaGraph instantiateGraph(infini::Graph graph) {
         switch (op->getOpType()) {
         case infini::OpType::Transpose:
             metaGraph.addNode(memb::instantiateTranspose(
+                memb::TRANSPOSE,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
                 convertShape(op->getOutputs()[0]->getDims()),
                 convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
             break;
         case infini::OpType::Relu:
             metaGraph.addNode(memb::instantiateUnary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::RELU));
+                memb::RELU,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
        case infini::OpType::Add:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::ADD));
+                memb::ADD,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
        case infini::OpType::Sub:
             metaGraph.addNode(memb::instantiateBinary(
-                convertShape(op->getInputs()[0]->getDims()),
-                memb::OpType::SUB));
+                memb::SUB,
+                {memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[0]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getInputs()[1]->getGuid()),
+                 memb::Pointer::buildPtrByTensorGuid(
+                     op->getOutputs()[0]->getGuid())},
+                convertShape(op->getOutputs()[0]->getDims())));
             break;
         default:
             IT_ASSERT(false);
diff --git a/src/pfusion/meta_op.cc b/src/pfusion/meta_op.cc
index 1676df4f..83c32fff 100644
--- a/src/pfusion/meta_op.cc
+++ b/src/pfusion/meta_op.cc
@@ -33,31 +33,48 @@ std::string MetaGraph::genHeader() {
 }
 
 std::string MetaGraph::genKernelFunc() {
-    auto metaOp = nodes[0].metaOps[0];
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
+    std::string code = "";
+    for (auto metaOp : metaOps) {
+        code += metaOp->genKernelFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genKernelFunc() {
     std::string code = "";
     code += "// Kernel\n";
-    code += "__global__ void kernel_func(float *src, float *dst) {\n";
+    code += "__global__ void kernel_func_" + std::to_string(id) + "(";
+    IT_ASSERT(ptrs.size() > 0);
+    code += "float *" + ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", float *" + ptrs[i]->getName();
+    }
+    code += ") {\n";
     code += "int lane_id = threadIdx.x % 32;\n";
     code += "int warp_id = threadIdx.x / 32;\n";
-    code += "int parallel_idx = blockIdx.x * " +
-            std::to_string(metaOp->numWarps) + " + warp_id;\n";
-    if (metaOp->numReg != 0) {
-        code += "float buf[" + std::to_string(metaOp->numReg) + "];\n";
+    code += "int parallel_idx = blockIdx.x * " + std::to_string(numWarps) +
+            " + warp_id;\n";
+    if (numReg != 0) {
+        code += "float buf[" + std::to_string(numReg) + "];\n";
     }
-    if (metaOp->numSmem != 0) {
-        code += "__shared__ float smem[" +
-                std::to_string(metaOp->numSmem * metaOp->numWarps) + "];\n";
+    if (numSmem != 0) {
+        code += "__shared__ float smem[" + std::to_string(numSmem * numWarps) +
+                "];\n";
     }
     code += "for (int loop_idx = parallel_idx; loop_idx < " +
-            std::to_string(metaOp->main_loop_ed) + "; loop_idx += " +
-            std::to_string(metaOp->numBlocks * metaOp->numWarps) + ") {\n";
+            std::to_string(main_loop_ed) +
+            "; loop_idx += " + std::to_string(numBlocks * numWarps) + ") {\n";
     // gen offset_src
-    code += genOffset("offset_src", metaOp->mappingSrc);
-    code += genOffset("offset_dst", metaOp->mappingDst);
+    code += genOffset("offset_src", mappingSrc);
+    code += genOffset("offset_dst", mappingDst);
 
-    for (auto microOp : metaOp->microOps) {
+    for (auto microOp : microOps) {
         code += microOp->generate();
     }
     code += "}\n}\n";
@@ -65,13 +82,24 @@ std::string MetaGraph::genKernelFunc() {
 }
 
 std::string MetaGraph::genInvokeFunc() {
+    std::vector<std::shared_ptr<MetaOp>> metaOps;
+    for (auto &node : nodes) {
+        metaOps.emplace_back(node.metaOps[0]);
+    }
     std::string code = "";
-    auto metaOp = nodes[0].metaOps[0];
-    code += "void invoke_func(float *src, float *dst) {\n";
-    int numBlocks = metaOp->numBlocks;
-    int numWarps = metaOp->numWarps * 32;
+    for (auto metaOp : metaOps) {
+        code += metaOp->genInvokeFunc();
+    }
+    return code;
+}
+
+std::string MetaOp::genInvokeFunc() {
+    std::string code = "";
+    code += "void invoke_func_" + std::to_string(id) + "(";
+    IT_ASSERT(ptrs.size() > 0);
+    code += "float *" + ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", float *" + ptrs[i]->getName();
+    }
+    code += ") {\n";
+    int numThreads = numWarps * 32;
     code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
-    code += "dim3 blockDim(" + std::to_string(numWarps) + ", 1);";
-    code += "kernel_func<<<gridDim, blockDim>>>(src, dst);\n";
+    code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
+    code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
+    code += ptrs[0]->getName();
+    for (size_t i = 1; i < ptrs.size(); i++) {
+        code += ", " + ptrs[i]->getName();
+    }
+    code += ");\n";
     code += "cudaCheckError();\n";
     code += "}\n";
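
For context, below is a minimal host-side harness sketch that could drive the generated invoke functions above. It is illustrative only and not part of the patch: it assumes sar_drn_0.cu is compiled alongside it (in the style of eval_transpose.sh, so that cuda_utils.h resolves), and the tensor extent of 65536 * 256 floats follows from kernel_func_0's loop bounds.

// harness.cu -- hypothetical driver for generated_code/sar_drn_0.cu (sketch).
#include <cuda_runtime.h>
#include <cstdio>

// Provided by the generated file; launches kernel_func_0 on a
// 108-block x 256-thread grid-stride loop.
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3);

int main() {
    const size_t n = 65536UL * 256UL; // elements per tensor
    float *src, *dst;
    cudaMalloc(&src, n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));
    invoke_func_0(src, dst);
    cudaDeviceSynchronize(); // wait for the kernel before freeing
    cudaFree(src);
    cudaFree(dst);
    printf("done\n");
    return 0;
}

Built roughly as eval_transpose.sh builds its own harness: nvcc harness.cu ../generated_code/sar_drn_0.cu -I ../eval_pfusion -o harness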