forked from jiuyuan/InfiniTensor
Add: CUDA graph stream capture (MemboundOp fails)
This commit is contained in:
parent
e4c20a9ae2
commit
e86e993ed4
|
@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Python bindings
|
# Python bindings
|
||||||
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
|
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
|
||||||
pybind11_add_module(backend MODULE ${FFIS})
|
pybind11_add_module(backend MODULE ${FFIS})
|
||||||
target_link_libraries(backend PRIVATE InfiniTensor)
|
target_link_libraries(backend PRIVATE InfiniTensor)
|
||||||
|
|
||||||
|
@ -168,6 +168,7 @@ endif()
|
||||||
|
|
||||||
if(USE_CUDA)
|
if(USE_CUDA)
|
||||||
add_compile_definitions(USE_CUDA=1)
|
add_compile_definitions(USE_CUDA=1)
|
||||||
|
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream caputre
|
||||||
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
|
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
|
||||||
set(CMAKE_CUDA_HOST_COMPILER
|
set(CMAKE_CUDA_HOST_COMPILER
|
||||||
${CMAKE_CXX_COMPILER}
|
${CMAKE_CXX_COMPILER}
|
||||||
|
|
|
@ -1,35 +1,22 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/runtime.h"
|
#include "core/runtime.h"
|
||||||
#include "cuda/cuda_common.h"
|
#include "cuda/cuda_common.h"
|
||||||
|
#include "nnet/dbg.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
class CudaRuntimeObj : public RuntimeObj {
|
class CudaRuntimeObj : public RuntimeObj {
|
||||||
private:
|
private:
|
||||||
|
cudaStream_t stream;
|
||||||
cudnnHandle_t cudnn;
|
cudnnHandle_t cudnn;
|
||||||
cublasHandle_t cublas;
|
cublasHandle_t cublas;
|
||||||
CudaPtr workspace;
|
CudaPtr workspace;
|
||||||
size_t workspaceSize;
|
size_t workspaceSize;
|
||||||
|
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
|
CudaRuntimeObj();
|
||||||
|
virtual ~CudaRuntimeObj();
|
||||||
checkCudnnError(cudnnCreate(&cudnn));
|
|
||||||
checkCublasError(cublasCreate(&cublas));
|
|
||||||
// 10GB for Longformer
|
|
||||||
// size_t longformerNum = 3lu * (1 << 30);
|
|
||||||
workspaceSize = 7ll << 30; // 7 GB
|
|
||||||
workspace = alloc(workspaceSize);
|
|
||||||
}
|
|
||||||
virtual ~CudaRuntimeObj() {
|
|
||||||
try {
|
|
||||||
dealloc(workspace);
|
|
||||||
checkCudnnError(cudnnDestroy(cudnn));
|
|
||||||
checkCublasError(cublasDestroy(cublas));
|
|
||||||
} catch (const std::exception &e) {
|
|
||||||
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
|
||||||
void run(const Graph &graph, bool tune = false,
|
void run(const Graph &graph, bool tune = false,
|
||||||
|
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
|
||||||
|
|
||||||
void runWithoutSync(const Graph &graph) const;
|
void runWithoutSync(const Graph &graph) const;
|
||||||
|
|
||||||
|
bool isInCudaGraph() const { return cudaGraphStatus; }
|
||||||
|
cudaStream_t getStream() const { return stream; }
|
||||||
|
|
||||||
|
double timeWithCudaGraph(Graph graph);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void tune(const Graph &graph, bool profiling) const;
|
void tune(const Graph &graph, bool profiling) const;
|
||||||
|
|
||||||
|
void beginCudaGraphStreamCapture();
|
||||||
|
cudaGraphExec_t endCudaGraphStreamCapture();
|
||||||
};
|
};
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -6,6 +6,46 @@
|
||||||
#include "operators/matmul.h"
|
#include "operators/matmul.h"
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
CudaRuntimeObj::CudaRuntimeObj()
|
||||||
|
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
|
||||||
|
cudaGraphStatus(false) {
|
||||||
|
checkCudnnError(cudnnCreate(&cudnn));
|
||||||
|
checkCublasError(cublasCreate(&cublas));
|
||||||
|
checkCudnnError(cudnnSetStream(cudnn, stream));
|
||||||
|
checkCublasError(cublasSetStream(cublas, stream));
|
||||||
|
// 10GB for Longformer
|
||||||
|
// size_t longformerNum = 3lu * (1 << 30);
|
||||||
|
workspaceSize = 7ll << 30; // 7 GB
|
||||||
|
workspace = alloc(workspaceSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
CudaRuntimeObj::~CudaRuntimeObj() {
|
||||||
|
try {
|
||||||
|
dealloc(workspace);
|
||||||
|
checkCudnnError(cudnnDestroy(cudnn));
|
||||||
|
checkCublasError(cublasDestroy(cublas));
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
|
||||||
|
enum cudaStreamCaptureStatus pCaptureStatus;
|
||||||
|
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
|
||||||
|
dbg(pCaptureStatus);
|
||||||
|
cudaGraphStatus = true;
|
||||||
|
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaGraphExec_t CudaRuntimeObj::endCudaGraphStreamCapture() {
|
||||||
|
cudaGraph_t cudaGraph;
|
||||||
|
cudaGraphExec_t instance;
|
||||||
|
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
|
||||||
|
cudaGraphStatus = false;
|
||||||
|
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||||
auto &perfEngine = PerfEngine::getInstance();
|
auto &perfEngine = PerfEngine::getInstance();
|
||||||
|
@ -75,4 +115,52 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
|
||||||
|
|
||||||
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
|
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
|
||||||
|
|
||||||
|
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
|
||||||
|
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||||
|
auto &perfEngine = PerfEngine::getInstance();
|
||||||
|
// compile-time computable
|
||||||
|
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
|
||||||
|
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
|
||||||
|
bool status = graph->topo_sort();
|
||||||
|
IT_ASSERT(status, "Topological sort failed");
|
||||||
|
|
||||||
|
for (auto &op : graph->getOperators()) {
|
||||||
|
// HACK: set correct data type
|
||||||
|
auto kernelAttrs =
|
||||||
|
KernelAttrs{device, op->getOpType(), DataType::Float32};
|
||||||
|
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||||
|
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||||
|
auto perfData = perfEngine.getPerfData(perfKey);
|
||||||
|
if (perfData)
|
||||||
|
kernel->compute(op, perfData, this);
|
||||||
|
else
|
||||||
|
kernel->compute(op, this);
|
||||||
|
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
|
||||||
|
// if (op->getOpType() == OpType::Matmul)
|
||||||
|
// if (op->getOpType() == OpType::Matmul ||
|
||||||
|
// op->getOpType() == OpType::Relu
|
||||||
|
// // || op->getOpType() == OpType::MemBound
|
||||||
|
// )
|
||||||
|
kernels.emplace_back(op, kernel, perfData);
|
||||||
|
}
|
||||||
|
for (auto &[op, kernel, perfData] : kernels) {
|
||||||
|
dbg(op);
|
||||||
|
}
|
||||||
|
|
||||||
|
beginCudaGraphStreamCapture();
|
||||||
|
for (auto &[op, kernel, perfData] : kernels) {
|
||||||
|
if (perfData)
|
||||||
|
kernel->compute(op, perfData, this);
|
||||||
|
else
|
||||||
|
kernel->compute(op, this);
|
||||||
|
}
|
||||||
|
auto cudaGraphInstance = endCudaGraphStreamCapture();
|
||||||
|
return timeit(
|
||||||
|
[&, stream = getStream()]() {
|
||||||
|
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
|
||||||
|
},
|
||||||
|
[&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
|
||||||
|
1000);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -82,7 +82,8 @@ void printGraph(Graph g) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
|
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
|
||||||
|
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||||
|
|
||||||
|
@ -145,6 +146,7 @@ Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
|
||||||
dbg(go0->equalData(bgo0, 1e-3));
|
dbg(go0->equalData(bgo0, 1e-3));
|
||||||
dbg(runtime->getPerfTime(bestGraph, true));
|
dbg(runtime->getPerfTime(bestGraph, true));
|
||||||
dbg(runtime->timeNonCtcOperators(bestGraph));
|
dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||||
|
dbg(runtime->timeWithCudaGraph(bestGraph));
|
||||||
}
|
}
|
||||||
|
|
||||||
dbg("Best graph");
|
dbg("Best graph");
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "cuda/cuda_runtime.h"
|
||||||
|
#include "cuda/cuda_utility.h"
|
||||||
|
#include "operators/conv.h"
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
TEST(TestCudaRuntime, CudaGraph) {
|
||||||
|
auto runtime = make_ref<CudaRuntimeObj>();
|
||||||
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
|
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||||
|
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||||
|
|
||||||
|
const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
|
||||||
|
auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
|
||||||
|
auto w0 =
|
||||||
|
g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
|
||||||
|
g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
|
||||||
|
g->dataMalloc();
|
||||||
|
runtime->run(g, true);
|
||||||
|
runtime->run(g, false);
|
||||||
|
runtime->getPerfTime(g);
|
||||||
|
|
||||||
|
auto time = runtime->timeWithCudaGraph(g);
|
||||||
|
EXPECT_GE(time, 0.01);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
Loading…
Reference in New Issue