forked from jiuyuan/InfiniTensor
85 lines
2.7 KiB
C++
85 lines
2.7 KiB
C++
#pragma once
|
|
#include "core/runtime.h"
|
|
#include "cuda/cuda_common.h"
|
|
|
|
namespace infini {
|
|
|
|
class CudaRuntimeObj : public RuntimeObj {
|
|
private:
|
|
cudaStream_t stream;
|
|
cudnnHandle_t cudnn;
|
|
cublasHandle_t cublas;
|
|
CudaPtr workspace;
|
|
size_t workspaceSize;
|
|
|
|
// Memory information
|
|
size_t allocatedGPUMemorySize = 0;
|
|
map<void *, size_t> allocationMap;
|
|
|
|
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
|
|
|
|
public:
|
|
CudaRuntimeObj();
|
|
virtual ~CudaRuntimeObj();
|
|
string toString() const override;
|
|
|
|
void run(const Graph &graph, bool tune = false,
|
|
bool profiling = false) const;
|
|
// double runEvaluation(const Graph &graph, int nWarmups,
|
|
// int nEvaluations) const;
|
|
void sync() const;
|
|
CudaPtr alloc(size_t size) override {
|
|
void *ptr;
|
|
checkCudaError(cudaMalloc(&ptr, size));
|
|
allocatedGPUMemorySize += size;
|
|
allocationMap[ptr] = size;
|
|
// printf("cuda malloc: %p %lu bytes, total %lu bytes (%.2lf GB)\n",
|
|
// ptr, size, allocatedGPUMemorySize,
|
|
// double(allocatedGPUMemorySize) / 1024 / 1024 / 1024);
|
|
return ptr;
|
|
}
|
|
void dealloc(void *ptr) override {
|
|
checkCudaError(cudaFree(ptr));
|
|
allocatedGPUMemorySize -= allocationMap.at(ptr);
|
|
allocationMap.erase(ptr);
|
|
// printf("cuda dealloc: %p %lu bytes, total %lu\n", ptr,
|
|
// allocationMap.at(ptr), allocatedGPUMemorySize);
|
|
}
|
|
cudnnHandle_t cudnnHandle() const { return cudnn; }
|
|
cublasHandle_t cublasHandle() const { return cublas; }
|
|
size_t getWorkspaceSize() const { return workspaceSize; }
|
|
CudaPtr getWorkspace(size_t size) const {
|
|
IT_ASSERT(size <= workspaceSize);
|
|
return workspace;
|
|
}
|
|
|
|
void copyBlobFromCPU(void *dst, const void *src,
|
|
size_t bytes) const override {
|
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
|
|
}
|
|
|
|
void copyBlobToCPU(void *dst, const void *src,
|
|
size_t bytes) const override {
|
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
|
|
}
|
|
|
|
void copyBlobInsideRuntime(void *dst, const void *src,
|
|
size_t bytes) const override {
|
|
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
|
|
}
|
|
|
|
void runWithoutSync(const Graph &graph) const;
|
|
|
|
bool isInCudaGraph() const { return cudaGraphStatus; }
|
|
cudaStream_t getStream() const { return stream; }
|
|
|
|
double timeWithCudaGraph(Graph graph, int rounds = 1000);
|
|
|
|
private:
|
|
void tune(const Graph &graph, bool profiling) const;
|
|
|
|
void beginCudaGraphStreamCapture();
|
|
tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
|
|
};
|
|
} // namespace infini
|