Add: CUDA graph stream capture (MemboundOp fails)

2023-04-19 16:32:16 +08:00 · 2023-04-19 16:32:16 +08:00 · e86e993ed4
parent e4c20a9ae2
commit e86e993ed4
5 changed files with 136 additions and 20 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
 endif()
 # Python bindings
-file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
+file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
 pybind11_add_module(backend MODULE ${FFIS})
 target_link_libraries(backend PRIVATE InfiniTensor)
@ -168,6 +168,7 @@ endif()
 if(USE_CUDA)
  add_compile_definitions(USE_CUDA=1)
  add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream caputre
  # Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
  set(CMAKE_CUDA_HOST_COMPILER
      ${CMAKE_CXX_COMPILER}
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -1,35 +1,22 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
 #include "nnet/dbg.h"
 namespace infini {
 class CudaRuntimeObj : public RuntimeObj {
  private:
    cudaStream_t stream;
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
    CudaPtr workspace;
    size_t workspaceSize;
    bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
  public:
-    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
+    CudaRuntimeObj();
-
+    virtual ~CudaRuntimeObj();
        checkCudnnError(cudnnCreate(&cudnn));
        checkCublasError(cublasCreate(&cublas));
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        workspace = alloc(workspaceSize);
    }
    virtual ~CudaRuntimeObj() {
        try {
            dealloc(workspace);
            checkCudnnError(cudnnDestroy(cudnn));
            checkCublasError(cublasDestroy(cublas));
        } catch (const std::exception &e) {
            std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
        }
    }
    string toString() const override;
    void run(const Graph &graph, bool tune = false,
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
    void runWithoutSync(const Graph &graph) const;
    bool isInCudaGraph() const { return cudaGraphStatus; }
    cudaStream_t getStream() const { return stream; }
    double timeWithCudaGraph(Graph graph);
  private:
    void tune(const Graph &graph, bool profiling) const;
    void beginCudaGraphStreamCapture();
    cudaGraphExec_t endCudaGraphStreamCapture();
 };
 } // namespace infini
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@ -6,6 +6,46 @@
 #include "operators/matmul.h"
 namespace infini {
 CudaRuntimeObj::CudaRuntimeObj()
    : RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
      cudaGraphStatus(false) {
    checkCudnnError(cudnnCreate(&cudnn));
    checkCublasError(cublasCreate(&cublas));
    checkCudnnError(cudnnSetStream(cudnn, stream));
    checkCublasError(cublasSetStream(cublas, stream));
    // 10GB for Longformer
    // size_t longformerNum = 3lu * (1 << 30);
    workspaceSize = 7ll << 30; // 7 GB
    workspace = alloc(workspaceSize);
 }
 CudaRuntimeObj::~CudaRuntimeObj() {
    try {
        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
    } catch (const std::exception &e) {
        std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
    }
 }
 void CudaRuntimeObj::beginCudaGraphStreamCapture() {
    enum cudaStreamCaptureStatus pCaptureStatus;
    checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
    dbg(pCaptureStatus);
    cudaGraphStatus = true;
    checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
 }
 cudaGraphExec_t CudaRuntimeObj::endCudaGraphStreamCapture() {
    cudaGraph_t cudaGraph;
    cudaGraphExec_t instance;
    checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
    cudaGraphStatus = false;
    checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
    return instance;
 }
 void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
@ -75,4 +115,52 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
 string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
 double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    // compile-time computable
    map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
    vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
        // if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
        // if (op->getOpType() == OpType::Matmul)
        // if (op->getOpType() == OpType::Matmul ||
        //     op->getOpType() == OpType::Relu
        //     // || op->getOpType() == OpType::MemBound
        // )
        kernels.emplace_back(op, kernel, perfData);
    }
    for (auto &[op, kernel, perfData] : kernels) {
        dbg(op);
    }
    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
    }
    auto cudaGraphInstance = endCudaGraphStreamCapture();
    return timeit(
        [&, stream = getStream()]() {
            checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
        },
        [&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
        1000);
 }
 } // namespace infini
--- a/src/nnet/App/test_models.cc
+++ b/src/nnet/App/test_models.cc
@ -82,7 +82,8 @@ void printGraph(Graph g) {
    }
 }
-Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
+Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
    auto runtime = as<CudaRuntimeObj>(_runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu);
@ -145,6 +146,7 @@ Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
            dbg(go0->equalData(bgo0, 1e-3));
            dbg(runtime->getPerfTime(bestGraph, true));
            dbg(runtime->timeNonCtcOperators(bestGraph));
            dbg(runtime->timeWithCudaGraph(bestGraph));
        }
        dbg("Best graph");
--- a/test/kernels/cuda/test_cuda_runtime.cc
+++ b/test/kernels/cuda/test_cuda_runtime.cc
@ -0,0 +1,30 @@
 #include "core/graph.h"
 #include "core/runtime.h"
 #include "cuda/cuda_runtime.h"
 #include "cuda/cuda_utility.h"
 #include "operators/conv.h"
 #include "test.h"
 namespace infini {
 TEST(TestCudaRuntime, CudaGraph) {
    auto runtime = make_ref<CudaRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
    auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
    auto w0 =
        g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
    g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
    g->dataMalloc();
    runtime->run(g, true);
    runtime->run(g, false);
    runtime->getPerfTime(g);
    auto time = runtime->timeWithCudaGraph(g);
    EXPECT_GE(time, 0.01);
 }
 } // namespace infini