Add: python API for timing ConvTranspose (#46)

* Add: python interface for timing operators

* Fix: CUDA Runtime run

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
Author: zhengly123 · 2022-10-07 16:03:11 +08:00 · committed by GitHub
parent b0c2a08252
commit 1152adc94a
12 changed files with 93 additions and 21 deletions

View File

@@ -44,7 +44,7 @@ using HashType = uint64_t; // compatible with std::hash
         ? void(0)                                                             \
         : throw ::infini::Exception(                                          \
               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
-              "] Assertion failed (" + #name + "): " + #info))
+              "] Assertion failed (" + #name + "): " + info))
 #define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)

View File

@@ -16,7 +16,7 @@ class GraphObj : public Object {
     GraphObj(Runtime runtime) : runtime(runtime){};
     string toString() const override;
-    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
     Tensor cloneTensor(const Tensor &tensor) {
         auto ret = addTensor(tensor->getDims(), tensor->getDType());
         ret->dataMalloc();

View File

@@ -102,7 +102,11 @@ class KernelRegistry {
     }
     Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
         auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found.");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
+                      ", " + std::get<2>(kernelAttrs).toString());
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {

View File

@@ -71,6 +71,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                                size_t bytes) const = 0;
     virtual void copyBlobToCPU(void *dst, const void *src,
                                size_t bytes) const = 0;
+    virtual string toString() const = 0;
 
   protected:
     void printProfilingData(double totTime,
@@ -102,6 +103,7 @@ class CpuRuntimeObj : public RuntimeObj {
     void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
     void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override;
+    string toString() const override;
 };
 
 } // namespace infini

View File

@@ -34,6 +34,7 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCublasError(cublasDestroy(cublas));
         checkCUresult(cuCtxDestroy(newContext));
     }
+    string toString() const override;
 
     void run(const Graph &graph, bool tune = false,
              bool profiling = false) const;
@@ -68,7 +69,9 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
     }
+    void runWithoutSync(const Graph &graph) const;
 
   private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void tune(const Graph &graph, bool profiling) const;
 };
 
 } // namespace infini

View File

@@ -6,6 +6,11 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
                         int dilationh, int dilationw, int group,
                         const char *name);
+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group);
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name);
 } // namespace opTimer
 } // namespace infini

View File

@@ -2,12 +2,14 @@ from tokenize import Double
 import pyinfinitensor  # import getPerfConv, getPerfMatmul
 
-def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name):
+def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
     return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
                                            strideh, stridew, dilationh, dilationw, group, name)
 
-def getPerfMatmul(b, m, n, k, name):
+def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
+    return pyinfinitensor.getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group)
+
+def getPerfMatmul(b, m, n, k, name=""):
     return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
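
A quick usage sketch of these wrappers. The import path is an assumption (the module's location in the package is not shown in this diff); the sample configuration mirrors the commented tuple in the C++ timer below, with zero output padding:

# Usage sketch; import path assumed, not confirmed by this diff.
from pyinfinitensor.operator_timer import getPerfConvTransposed2dCudnn, getPerfMatmul

# n, c, h, w, f, r, s, padh, padw, strideh, stridew,
# dilationh, dilationw, oph, opw, group
t_deconv = getPerfConvTransposed2dCudnn(1, 512, 14, 14, 512, 3, 3,
                                        2, 2, 1, 1, 2, 2, 0, 0, 1)
t_matmul = getPerfMatmul(1, 1024, 1024, 1024)  # name now defaults to ""
print(t_deconv, t_matmul)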

View File

@@ -139,4 +139,6 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
     memcpy(dst, src, bytes);
 }
 
+string CpuRuntimeObj::toString() const { return "CPU Runtime"; }
+
 } // namespace infini

View File

@@ -5,8 +5,25 @@
 #include "operators/matmul.h"
 
 namespace infini {
 
-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto &perfEngine = PerfEngine::getInstance();
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        auto perfData = perfEngine.getPerfData(perfKey);
+        // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
+        if (perfData)
+            kernel->compute(op, perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto &perfEngine = PerfEngine::getInstance();
     double totalTime = 0;
@@ -19,11 +36,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
-            kernel->compute(op, this);
-            continue;
-        }
-
         PerfRecord record;
         if (!perfData) {
             record = kernel->tune(op, this);
@@ -46,13 +58,19 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
     }
 }
 
-void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+void CudaRuntimeObj::run(const Graph &graph, bool runTune,
+                         bool profiling) const {
     if (profiling)
         IT_TODO_HALT();
-    runWithoutSync(graph, tune, profiling);
+    if (runTune)
+        tune(graph, profiling);
+    else
+        runWithoutSync(graph);
     sync();
 }
 
 void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
 
+string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
+
 } // namespace infini

View File

@@ -22,8 +22,9 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     Runtime cuda = make_ref<CudaRuntimeObj>();
     Graph gCuda = make_ref<GraphObj>(cuda);
     // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
     Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({f, c, r, s}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
     // Malloc data for all tensors in a graph. Do we need implicit allocation?
     gCpu->dataMalloc();
     i0Cpu->setData(IncrementalGenerator());
@@ -43,6 +44,41 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     return cuda->getPerfTime(gCuda);
 }
 
+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group) {
+    // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
+    //              dilationh, dilationw, group] =
+    //     tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime cuda = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cuda);
+    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
+    Tensor i0Cpu = gCpu->addTensor({n, f, h, w}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to CUDA
+    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
+    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
+    // Build CUDA graph
+    auto conv = gCuda->addOp<ConvTransposed2dObj>(
+        i0Cuda, w0Cuda, nullptr, padh, padw, strideh, stridew, dilationh,
+        dilationw, oph, opw, group);
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    // Execute on CUDA
+    bool tune = true;
+    cuda->run(gCuda, tune);
+    return cuda->getPerfTime(gCuda);
+}
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name) {
     // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
     //              dilationh, dilationw, group] =
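
As a sanity check on the parameters fed into the timer, the standard ConvTranspose2d output extent along one spatial axis is (h - 1) * stride - 2 * pad + dilation * (r - 1) + output_padding + 1. A minimal sketch; this helper is illustrative and not part of the commit:

# Illustrative helper (not part of this commit): expected ConvTranspose2d
# output extent along one spatial axis, per the standard formula.
def conv_transposed2d_out(h, r, pad, stride, dilation, op):
    return (h - 1) * stride - 2 * pad + dilation * (r - 1) + op + 1

# For the commented configuration {1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1}:
print(conv_transposed2d_out(14, 3, 2, 1, 2, 0))  # 14: spatial size is preserved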

View File

@@ -13,6 +13,7 @@ void register_operator_timer(py::module &m) {
 #ifdef USE_CUDA
     using namespace opTimer;
     m.def("getPerfConvCudnn", &getPerfConvCudnn);
+    m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
     m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
 #endif
 }
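
Since the binding is registered only under USE_CUDA, a caller can probe for it before use. A small sketch; whether the symbol lands on the top-level pyinfinitensor module is an assumption:

# Sketch: the ConvTranspose timer is only registered when the extension
# was built with USE_CUDA, so probe before calling the raw binding.
import pyinfinitensor

if hasattr(pyinfinitensor, "getPerfConvTransposed2dCudnn"):
    t = pyinfinitensor.getPerfConvTransposed2dCudnn(
        1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 0, 0, 1)
    print("ConvTransposed2d time:", t)
else:
    print("pyinfinitensor was built without CUDA support")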

View File

@@ -250,8 +250,7 @@ class convBackwardDataCudnn : public Kernel {
                     outData);
             },
             [&]() { context->sync(); });
-        // printf("mode:%d algo:%d :%.8lf\n", mode, algo,
-        //        record.time);
+        // printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
         // Update the tune result
         if (ret.time > record.time)