Add: python API for timing ConvTranspose (#46)

* Add: Python interface for timing operators

* Fix: CUDA Runtime run

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
zhengly123 2022-10-07 16:03:11 +08:00 committed by GitHub
parent b0c2a08252
commit 1152adc94a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 93 additions and 21 deletions

View File

@ -44,7 +44,7 @@ using HashType = uint64_t; // compatible with std::hash
? void(0) \
: throw ::infini::Exception( \
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
"] Assertion failed (" + #name + "): " + #info))
"] Assertion failed (" + #name + "): " + info))
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)

View File

@ -16,7 +16,7 @@ class GraphObj : public Object {
GraphObj(Runtime runtime) : runtime(runtime){};
string toString() const override;
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor cloneTensor(const Tensor &tensor) {
auto ret = addTensor(tensor->getDims(), tensor->getDType());
ret->dataMalloc();

View File

@ -102,7 +102,11 @@ class KernelRegistry {
}
/**
 * Look up the kernel registered for the given attributes.
 *
 * @param kernelAttrs Lookup key. Its three tuple elements are read with
 *        std::get<0>, std::get<1> and std::get<2> to build the failure
 *        message.
 * @return The first element of the record stored under kernelAttrs.
 *
 * Fails through IT_ASSERT when no kernel is registered for kernelAttrs.
 */
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
    auto it = kernels.find(kernelAttrs);
    // One assertion only: an earlier generic "Kernel not found." check would
    // fire first and hide this message, which names the full key.
    IT_ASSERT(it != kernels.end(),
              "Kernel not found for key {" +
                  to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
                  ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
                  ", " + std::get<2>(kernelAttrs).toString() + "}");
    return std::get<0>(it->second);
}
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {

View File

@ -71,6 +71,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
size_t bytes) const = 0;
virtual void copyBlobToCPU(void *dst, const void *src,
size_t bytes) const = 0;
virtual string toString() const = 0;
protected:
void printProfilingData(double totTime,
@ -102,6 +103,7 @@ class CpuRuntimeObj : public RuntimeObj {
void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
void copyBlobInsideRuntime(void *dst, const void *src,
size_t bytes) const override;
string toString() const override;
};
} // namespace infini

View File

@ -34,6 +34,7 @@ class CudaRuntimeObj : public RuntimeObj {
checkCublasError(cublasDestroy(cublas));
checkCUresult(cuCtxDestroy(newContext));
}
string toString() const override;
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
@ -68,7 +69,9 @@ class CudaRuntimeObj : public RuntimeObj {
checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
}
void runWithoutSync(const Graph &graph) const;
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
void tune(const Graph &graph, bool profiling) const;
};
} // namespace infini

View File

@ -6,6 +6,11 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
int dilationh, int dilationw, int group,
const char *name);
// Builds a single ConvTransposed2d operator from the given sizes, runs it on
// the CUDA runtime with tuning enabled and returns the time reported by the
// runtime's getPerfTime(). Unlike getPerfConvCudnn and getPerfMatmulCublas
// this function takes no `name` argument; oph/opw are forwarded to the
// operator as given.
double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
int s, int padh, int padw, int strideh,
int stridew, int dilationh, int dilationw,
int oph, int opw, int group);
double getPerfMatmulCublas(int b, int m, int n, int k, const char *name);
} // namespace opTimer
} // namespace infini

View File

@ -2,12 +2,14 @@ from tokenize import Double
import pyinfinitensor # import getPerfConv, getPerfMatmul
def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
    """Forward the arguments to ``pyinfinitensor.getPerfConvCudnn``.

    ``name`` is optional and defaults to the empty string. The value returned
    by the binding is returned unchanged.
    """
    return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
                                           strideh, stridew, dilationh, dilationw, group, name)


def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
    """Forward the arguments to ``pyinfinitensor.getPerfConvTransposed2dCudnn``.

    All sixteen arguments are passed positionally in the order given; there is
    no ``name`` argument. The value returned by the binding is returned
    unchanged.
    """
    return pyinfinitensor.getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw,
                                                       strideh, stridew, dilationh, dilationw,
                                                       oph, opw, group)


def getPerfMatmul(b, m, n, k, name=""):
    """Forward the arguments to ``pyinfinitensor.getPerfMatmulCublas``.

    ``name`` is optional and defaults to the empty string. The value returned
    by the binding is returned unchanged.
    """
    return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)

View File

@ -139,4 +139,6 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
memcpy(dst, src, bytes);
}
// Returns the fixed display name of the CPU runtime.
string CpuRuntimeObj::toString() const {
    return "CPU Runtime";
}
} // namespace infini

View File

@ -5,8 +5,25 @@
#include "operators/matmul.h"
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
bool profiling = false) const {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
// IT_ASSERT(perfData, "No perf data for OP " + op->toString());
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
}
void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
double totalTime = 0;
@ -19,11 +36,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (!perfData && !tune) {
kernel->compute(op, this);
continue;
}
PerfRecord record;
if (!perfData) {
record = kernel->tune(op, this);
@ -46,13 +58,19 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
}
}
void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
void CudaRuntimeObj::run(const Graph &graph, bool runTune,
bool profiling) const {
if (profiling)
IT_TODO_HALT();
runWithoutSync(graph, tune, profiling);
if (runTune)
tune(graph, profiling);
else
runWithoutSync(graph);
sync();
}
void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }
// Returns the fixed display name of the CUDA runtime.
string CudaRuntimeObj::toString() const {
    return "CUDA Runtime";
}
} // namespace infini

View File

@ -22,8 +22,9 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
Runtime cuda = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cuda);
// Set input data on CPU in a CPU Graph
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, c, r, s}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
@ -43,6 +44,41 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
return cuda->getPerfTime(gCuda);
}
/**
 * Time one ConvTransposed2d operator on the CUDA runtime.
 *
 * The input tensor is created with dims {n, f, h, w} and the weight tensor
 * with dims {f, c / group, r, s}. Both are filled on the CPU with
 * IncrementalGenerator and then cloned into the CUDA graph. The padding,
 * stride, dilation, oph/opw and group arguments are passed to
 * ConvTransposed2dObj unchanged.
 *
 * Asserts (IT_ASSERT) that c is divisible by group.
 *
 * @return The value of getPerfTime() for the CUDA graph after a run with
 *         tuning enabled.
 */
double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
                                    int s, int padh, int padw, int strideh,
                                    int stridew, int dilationh, int dilationw,
                                    int oph, int opw, int group) {
    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime cuda = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cuda);
    // Set input data on CPU in a CPU Graph
    IT_ASSERT(c % group == 0);
    Tensor i0Cpu = gCpu->addTensor({n, f, h, w}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(IncrementalGenerator());
    w0Cpu->setData(IncrementalGenerator());

    // Copy input tensors from CPU to CUDA
    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
    // Build CUDA graph; the operator handle is not needed afterwards, so the
    // return value of addOp is not kept.
    gCuda->addOp<ConvTransposed2dObj>(i0Cuda, w0Cuda, nullptr, padh, padw,
                                      strideh, stridew, dilationh, dilationw,
                                      oph, opw, group);
    // allocate CUDA memory
    gCuda->dataMalloc();
    // Execute on CUDA
    bool tune = true;
    cuda->run(gCuda, tune);
    return cuda->getPerfTime(gCuda);
}
double getPerfMatmulCublas(int b, int m, int n, int k, const char *name) {
// const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
// dilationh, dilationw, group] =

View File

@ -13,6 +13,7 @@ void register_operator_timer(py::module &m) {
#ifdef USE_CUDA
using namespace opTimer;
m.def("getPerfConvCudnn", &getPerfConvCudnn);
m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
#endif
}

View File

@ -250,8 +250,7 @@ class convBackwardDataCudnn : public Kernel {
outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo,
// record.time);
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
// Update the tune result
if (ret.time > record.time)