InfiniTensor/include/cuda/cuda_runtime.h

#pragma once
#include "core/runtime.h"
#include "cuda/cuda_common.h"

namespace infini {

class CudaRuntimeObj : public RuntimeObj {
  private:
    cudaStream_t stream;
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
    CudaPtr workspace;
    size_t workspaceSize;

    // Memory information
    size_t allocatedGPUMemorySize = 0;
    map<void *, size_t> allocationMap;

    bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled

  public:
    CudaRuntimeObj();
    virtual ~CudaRuntimeObj();
    string toString() const override;

    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
    void sync() const;
    CudaPtr alloc(size_t size) override {
        void *ptr;
        checkCudaError(cudaMalloc(&ptr, size));
        allocatedGPUMemorySize += size;
        allocationMap[ptr] = size;
        // printf("cuda malloc: %p %lu bytes, total %lu bytes (%.2lf GB)\n",
        // ptr, size, allocatedGPUMemorySize,
        //        double(allocatedGPUMemorySize) / 1024 / 1024 / 1024);
        return ptr;
    }
    void dealloc(void *ptr) override {
        checkCudaError(cudaFree(ptr));
        allocatedGPUMemorySize -= allocationMap.at(ptr);
        allocationMap.erase(ptr);
        // printf("cuda dealloc: %p %lu bytes, total %lu\n", ptr,
        //        allocationMap.at(ptr), allocatedGPUMemorySize);
    }
    cudnnHandle_t cudnnHandle() const { return cudnn; }
    cublasHandle_t cublasHandle() const { return cublas; }
    size_t getWorkspaceSize() const { return workspaceSize; }
    CudaPtr getWorkspace(size_t size) const {
        IT_ASSERT(size <= workspaceSize);
        return workspace;
    }

    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
    }

    void copyBlobToCPU(void *dst, const void *src,
                       size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
    }

    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
        checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
    }

    void runWithoutSync(const Graph &graph) const;

    bool isInCudaGraph() const { return cudaGraphStatus; }
    cudaStream_t getStream() const { return stream; }

    double timeWithCudaGraph(Graph graph, int rounds = 1000);

  private:
    void tune(const Graph &graph, bool profiling) const;

    void beginCudaGraphStreamCapture();
    tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
};
} // namespace infini