forked from jiuyuan/InfiniTensor
Add: CUDA graph stream capture (MemboundOp fails)
This commit is contained in:
parent
e4c20a9ae2
commit
e86e993ed4
|
@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
|
|||
endif()
|
||||
|
||||
# Python bindings
|
||||
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
|
||||
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
|
||||
pybind11_add_module(backend MODULE ${FFIS})
|
||||
target_link_libraries(backend PRIVATE InfiniTensor)
|
||||
|
||||
|
@ -168,6 +168,7 @@ endif()
|
|||
|
||||
if(USE_CUDA)
|
||||
add_compile_definitions(USE_CUDA=1)
|
||||
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream caputre
|
||||
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
|
||||
set(CMAKE_CUDA_HOST_COMPILER
|
||||
${CMAKE_CXX_COMPILER}
|
||||
|
|
|
@ -1,35 +1,22 @@
|
|||
#pragma once
|
||||
#include "core/runtime.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "nnet/dbg.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class CudaRuntimeObj : public RuntimeObj {
|
||||
private:
|
||||
cudaStream_t stream;
|
||||
cudnnHandle_t cudnn;
|
||||
cublasHandle_t cublas;
|
||||
CudaPtr workspace;
|
||||
size_t workspaceSize;
|
||||
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
|
||||
|
||||
public:
|
||||
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
|
||||
|
||||
checkCudnnError(cudnnCreate(&cudnn));
|
||||
checkCublasError(cublasCreate(&cublas));
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 7ll << 30; // 7 GB
|
||||
workspace = alloc(workspaceSize);
|
||||
}
|
||||
virtual ~CudaRuntimeObj() {
|
||||
try {
|
||||
dealloc(workspace);
|
||||
checkCudnnError(cudnnDestroy(cudnn));
|
||||
checkCublasError(cublasDestroy(cublas));
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
CudaRuntimeObj();
|
||||
virtual ~CudaRuntimeObj();
|
||||
string toString() const override;
|
||||
|
||||
void run(const Graph &graph, bool tune = false,
|
||||
|
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
|
||||
void runWithoutSync(const Graph &graph) const;
|
||||
|
||||
bool isInCudaGraph() const { return cudaGraphStatus; }
|
||||
cudaStream_t getStream() const { return stream; }
|
||||
|
||||
double timeWithCudaGraph(Graph graph);
|
||||
|
||||
private:
|
||||
void tune(const Graph &graph, bool profiling) const;
|
||||
|
||||
void beginCudaGraphStreamCapture();
|
||||
cudaGraphExec_t endCudaGraphStreamCapture();
|
||||
};
|
||||
} // namespace infini
|
||||
|
|
|
@ -6,6 +6,46 @@
|
|||
#include "operators/matmul.h"
|
||||
namespace infini {
|
||||
|
||||
CudaRuntimeObj::CudaRuntimeObj()
|
||||
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
|
||||
cudaGraphStatus(false) {
|
||||
checkCudnnError(cudnnCreate(&cudnn));
|
||||
checkCublasError(cublasCreate(&cublas));
|
||||
checkCudnnError(cudnnSetStream(cudnn, stream));
|
||||
checkCublasError(cublasSetStream(cublas, stream));
|
||||
// 10GB for Longformer
|
||||
// size_t longformerNum = 3lu * (1 << 30);
|
||||
workspaceSize = 7ll << 30; // 7 GB
|
||||
workspace = alloc(workspaceSize);
|
||||
}
|
||||
|
||||
CudaRuntimeObj::~CudaRuntimeObj() {
|
||||
try {
|
||||
dealloc(workspace);
|
||||
checkCudnnError(cudnnDestroy(cudnn));
|
||||
checkCublasError(cublasDestroy(cublas));
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
|
||||
enum cudaStreamCaptureStatus pCaptureStatus;
|
||||
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
|
||||
dbg(pCaptureStatus);
|
||||
cudaGraphStatus = true;
|
||||
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
|
||||
}
|
||||
|
||||
cudaGraphExec_t CudaRuntimeObj::endCudaGraphStreamCapture() {
|
||||
cudaGraph_t cudaGraph;
|
||||
cudaGraphExec_t instance;
|
||||
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
|
||||
cudaGraphStatus = false;
|
||||
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
|
||||
return instance;
|
||||
}
|
||||
|
||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
|
@ -75,4 +115,52 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
|
|||
|
||||
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
|
||||
|
||||
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// compile-time computable
|
||||
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
|
||||
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
|
||||
bool status = graph->topo_sort();
|
||||
IT_ASSERT(status, "Topological sort failed");
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType(), DataType::Float32};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
if (perfData)
|
||||
kernel->compute(op, perfData, this);
|
||||
else
|
||||
kernel->compute(op, this);
|
||||
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
|
||||
// if (op->getOpType() == OpType::Matmul)
|
||||
// if (op->getOpType() == OpType::Matmul ||
|
||||
// op->getOpType() == OpType::Relu
|
||||
// // || op->getOpType() == OpType::MemBound
|
||||
// )
|
||||
kernels.emplace_back(op, kernel, perfData);
|
||||
}
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
dbg(op);
|
||||
}
|
||||
|
||||
beginCudaGraphStreamCapture();
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
if (perfData)
|
||||
kernel->compute(op, perfData, this);
|
||||
else
|
||||
kernel->compute(op, this);
|
||||
}
|
||||
auto cudaGraphInstance = endCudaGraphStreamCapture();
|
||||
return timeit(
|
||||
[&, stream = getStream()]() {
|
||||
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
|
||||
},
|
||||
[&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
|
||||
1000);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -82,7 +82,8 @@ void printGraph(Graph g) {
|
|||
}
|
||||
}
|
||||
|
||||
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
|
||||
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
|
||||
auto runtime = as<CudaRuntimeObj>(_runtime);
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
|
||||
|
@ -145,6 +146,7 @@ Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
|
|||
dbg(go0->equalData(bgo0, 1e-3));
|
||||
dbg(runtime->getPerfTime(bestGraph, true));
|
||||
dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||
dbg(runtime->timeWithCudaGraph(bestGraph));
|
||||
}
|
||||
|
||||
dbg("Best graph");
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include "operators/conv.h"
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
TEST(TestCudaRuntime, CudaGraph) {
|
||||
auto runtime = make_ref<CudaRuntimeObj>();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||
|
||||
const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
|
||||
auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
|
||||
auto w0 =
|
||||
g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
|
||||
g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
|
||||
g->dataMalloc();
|
||||
runtime->run(g, true);
|
||||
runtime->run(g, false);
|
||||
runtime->getPerfTime(g);
|
||||
|
||||
auto time = runtime->timeWithCudaGraph(g);
|
||||
EXPECT_GE(time, 0.01);
|
||||
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue