Add: CUDA graph stream capture (MemboundOp fails)

This commit is contained in:
Liyan Zheng 2023-04-19 16:32:16 +08:00
parent e4c20a9ae2
commit e86e993ed4
5 changed files with 136 additions and 20 deletions

View File

@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
endif() endif()
# Python bindings # Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc) file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS}) pybind11_add_module(backend MODULE ${FFIS})
target_link_libraries(backend PRIVATE InfiniTensor) target_link_libraries(backend PRIVATE InfiniTensor)
@ -168,6 +168,7 @@ endif()
if(USE_CUDA) if(USE_CUDA)
add_compile_definitions(USE_CUDA=1) add_compile_definitions(USE_CUDA=1)
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong # Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
set(CMAKE_CUDA_HOST_COMPILER set(CMAKE_CUDA_HOST_COMPILER
${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER}

View File

@ -1,35 +1,22 @@
#pragma once #pragma once
#include "core/runtime.h" #include "core/runtime.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "nnet/dbg.h"
namespace infini { namespace infini {
class CudaRuntimeObj : public RuntimeObj { class CudaRuntimeObj : public RuntimeObj {
private: private:
cudaStream_t stream;
cudnnHandle_t cudnn; cudnnHandle_t cudnn;
cublasHandle_t cublas; cublasHandle_t cublas;
CudaPtr workspace; CudaPtr workspace;
size_t workspaceSize; size_t workspaceSize;
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
public: public:
CudaRuntimeObj() : RuntimeObj(Device::CUDA) { CudaRuntimeObj();
virtual ~CudaRuntimeObj();
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
virtual ~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
string toString() const override; string toString() const override;
void run(const Graph &graph, bool tune = false, void run(const Graph &graph, bool tune = false,
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const; void runWithoutSync(const Graph &graph) const;
bool isInCudaGraph() const { return cudaGraphStatus; }
cudaStream_t getStream() const { return stream; }
double timeWithCudaGraph(Graph graph);
private: private:
void tune(const Graph &graph, bool profiling) const; void tune(const Graph &graph, bool profiling) const;
void beginCudaGraphStreamCapture();
cudaGraphExec_t endCudaGraphStreamCapture();
}; };
} // namespace infini } // namespace infini

View File

@ -6,6 +6,46 @@
#include "operators/matmul.h" #include "operators/matmul.h"
namespace infini { namespace infini {
// Construct the CUDA runtime: bind cuDNN/cuBLAS to the per-thread default
// stream and pre-allocate a large reusable device workspace.
// Using cudaStreamPerThread (instead of the legacy default stream) is what
// allows work to be recorded during CUDA graph stream capture.
CudaRuntimeObj::CudaRuntimeObj()
    : RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
      cudaGraphStatus(false) {
    checkCudnnError(cudnnCreate(&cudnn));
    checkCublasError(cublasCreate(&cublas));
    // Route all cuDNN/cuBLAS launches onto our stream so stream capture
    // observes them; handles must exist before the stream is attached.
    checkCudnnError(cudnnSetStream(cudnn, stream));
    checkCublasError(cublasSetStream(cublas, stream));
    // 10GB for Longformer
    // size_t longformerNum = 3lu * (1 << 30);
    workspaceSize = 7ll << 30; // 7 GB
    workspace = alloc(workspaceSize);
}
// Tear down the runtime: free the workspace and destroy the cuDNN/cuBLAS
// handles. Errors from the check* helpers are caught and logged rather than
// propagated, because throwing out of a destructor would terminate the
// program.
CudaRuntimeObj::~CudaRuntimeObj() {
    try {
        dealloc(workspace);
        checkCudnnError(cudnnDestroy(cudnn));
        checkCublasError(cublasDestroy(cublas));
    } catch (const std::exception &e) {
        std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
    }
}
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
enum cudaStreamCaptureStatus pCaptureStatus;
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
dbg(pCaptureStatus);
cudaGraphStatus = true;
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}
// Stop capturing on `stream` and instantiate an executable graph from the
// recording. Returns the executable instance; the caller owns it and must
// release it with cudaGraphExecDestroy when finished.
cudaGraphExec_t CudaRuntimeObj::endCudaGraphStreamCapture() {
    cudaGraph_t cudaGraph;
    cudaGraphExec_t instance;
    checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
    cudaGraphStatus = false;
    checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
    // The graph template is no longer needed once the executable instance
    // exists; destroy it here, otherwise it leaks on every capture.
    checkCudaError(cudaGraphDestroy(cudaGraph));
    return instance;
}
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance(); const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance(); auto &perfEngine = PerfEngine::getInstance();
@ -75,4 +115,52 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; } string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
// Measure the replay time (as reported by timeit) of `graph` executed as a
// single CUDA graph: run every kernel once eagerly (warm-up / perf-record
// application), record all kernel launches via stream capture, then time
// repeated launches of the captured graph.
// NOTE(review): MemboundOp kernels may fail under stream capture — see the
// commit message; callers should restrict the graph accordingly.
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");
    // Resolve each operator's kernel and tuned perf record up front so the
    // capture loop below does nothing but launch kernels.
    vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        // Eager warm-up run outside of capture (lazy initialization inside
        // cuDNN/cuBLAS must not be recorded into the graph).
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
        kernels.emplace_back(op, kernel, perfData);
    }
    // Record one pass over all kernels into a CUDA graph.
    beginCudaGraphStreamCapture();
    for (auto &[op, kernel, perfData] : kernels) {
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
    }
    auto cudaGraphInstance = endCudaGraphStreamCapture();
    double time = timeit(
        [&, stream = getStream()]() {
            checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
        },
        [&, stream = getStream()]() {
            // Check the sync result too: an in-graph failure surfaces here.
            checkCudaError(cudaStreamSynchronize(stream));
        },
        1000, 1000);
    // Release the executable graph; it was previously leaked on every call.
    checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
    return time;
}
} // namespace infini } // namespace infini

View File

@ -82,7 +82,8 @@ void printGraph(Graph g) {
} }
} }
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) { Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu); Graph gCpu = make_ref<GraphObj>(cpu);
@ -145,6 +146,7 @@ Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
dbg(go0->equalData(bgo0, 1e-3)); dbg(go0->equalData(bgo0, 1e-3));
dbg(runtime->getPerfTime(bestGraph, true)); dbg(runtime->getPerfTime(bestGraph, true));
dbg(runtime->timeNonCtcOperators(bestGraph)); dbg(runtime->timeNonCtcOperators(bestGraph));
dbg(runtime->timeWithCudaGraph(bestGraph));
} }
dbg("Best graph"); dbg("Best graph");

View File

@ -0,0 +1,30 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
// Build a small conv graph on the CUDA runtime, tune it, then check that
// replaying it via CUDA graph stream capture reports a plausible time.
// (Removed the unused `cpu`/`gCpu` locals the test never referenced.)
TEST(TestCudaRuntime, CudaGraph) {
    auto runtime = make_ref<CudaRuntimeObj>();
    Graph g = make_ref<GraphObj>(runtime);
    // Conv shape: NCHW input, FCRS weight.
    const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
    auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
    auto w0 =
        g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
    g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
    g->dataMalloc();
    runtime->run(g, true);  // tune kernels and record perf data
    runtime->run(g, false); // plain run using the tuned kernels
    runtime->getPerfTime(g);
    auto time = runtime->timeWithCudaGraph(g);
    // A conv of this size cannot take (essentially) zero time; a smaller
    // value would indicate the graph replay did nothing.
    EXPECT_GE(time, 0.01);
}
} // namespace infini