Merge branch 'NNET_e2e_fix' into NNET_e2e

Support CUDA Graph for TVM kernels
2023-04-21 13:18:44 +08:00 · 2023-04-21 13:18:44 +08:00 · 2cd75bd79b
parent f0fcbe825f 8c91faa948
commit 2cd75bd79b
2 changed files with 38 additions and 0 deletions
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -5,6 +5,9 @@
 #include "cuda_profiler_api.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
+#ifdef INFINI_USE_TVM
+#include "tvm/runtime/device_api.h"
+#endif
 namespace infini {

 CudaRuntimeObj::CudaRuntimeObj()
@@ -145,6 +148,13 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
        dbg(op);
    }

+    // Init tvm stream: set TVM's stream for CUDA device 0 to getStream() before graph capture begins
+    #ifdef INFINI_USE_TVM
+    DLDevice tvm_device_id = {kDLCUDA, 0};
+    auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
+    tvm_device->SetStream(tvm_device_id, getStream());
+    #endif
+
    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
--- a/test/kernels/cuda/test_cuda_runtime.cc
+++ b/test/kernels/cuda/test_cuda_runtime.cc
@@ -3,6 +3,8 @@
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/conv.h"
+#include "nnet/nmutator.h"
+#include "operators/matmul.h"
 #include "test.h"

 namespace infini {
@@ -27,4 +29,30 @@ TEST(TestCudaRuntime, CudaGraph) {
    EXPECT_GE(time, 0.01);
 }

+TEST(TestCudaRuntime, CudaGraphMembound) { // Smoke test: pass a graph produced by NMutator (ToNaiveMembound mode) to timeWithCudaGraph
+    auto runtime = make_ref<CudaRuntimeObj>(); // CUDA runtime under test
+    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // NOTE(review): only used to construct gCpu below
+    Graph gCpu = make_ref<GraphObj>(cpu); // NOTE(review): gCpu is never used in this test
+    Graph g = make_ref<GraphObj>(runtime); // graph constructed with the CUDA runtime
+
+    Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32); // first matmul operand
+    Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32); // second matmul operand
+    Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32); // matmul output
+    g->dataMalloc(); // presumably allocates tensor storage before the copyin calls -- TODO confirm
+    i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); // NOTE(review): 12 values for a {1, 2, 3} tensor (6 elements) -- confirm copyin accepts the surplus
+    w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); // 12 values for the {1, 3, 4} tensor
+    g->addOpWithOutputs<MatmulObj>(i0, w0, o0); // add a Matmul op over i0 and w0 with o0 as its output
+    NMutator nmutator(NMutator::Mode::ToNaiveMembound);
+    auto mutations = nmutator.run(g); // candidate graphs derived from g
+    ASSERT_EQ(mutations.size(), 2u); // abort the test unless exactly two graphs are returned
+    Graph gNew = mutations[1]; // use the second returned graph (index 1)
+    gNew->print();
+    gNew->dataMalloc(); // gNew is a separate graph, so dataMalloc is called on it as well
+
+    runtime->run(gNew, true); // tune kernels
+    runtime->run(gNew, false); // second run with the flag off -- presumably reuses the tuned kernels
+    runtime->getPerfTime(gNew); // return value discarded; only checks that the call completes
+
+    runtime->timeWithCudaGraph(gNew); // exercises the CUDA Graph timing path; result is not asserted
+}
 } // namespace infini