forked from jiuyuan/InfiniTensor

Add: python API for timing ConvTranspose (#46)

* Add: python interface for timing operators
* Fix: CUDA Runtime run

Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>

parent b0c2a08252
commit 1152adc94a
@@ -44,7 +44,7 @@ using HashType = uint64_t; // compatible with std::hash
         ? void(0)                                                             \
         : throw ::infini::Exception(                                          \
               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
-              "] Assertion failed (" + #name + "): " + #info))
+              "] Assertion failed (" + #name + "): " + info))
 #define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "");
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
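The only functional change in this hunk is dropping the `#` before `info`: the macro now concatenates the evaluated message string instead of its stringified token text. A minimal standalone sketch of the difference (macro names here are illustrative, not from the codebase):

#include <iostream>
#include <string>

// Stand-ins for the old and new message handling (illustrative only).
#define OLD_MSG(info) std::string(#info) // stringify: yields the token text
#define NEW_MSG(info) std::string(info)  // evaluate: yields the runtime value

int main() {
    std::string detail = "group = " + std::to_string(3);
    std::cout << OLD_MSG(detail) << "\n"; // prints: detail
    std::cout << NEW_MSG(detail) << "\n"; // prints: group = 3
    return 0;
}

With the old `#info`, a caller passing a runtime-built string would see the expression text rather than the message; the fix is what makes the richer getKernel() diagnostic below possible.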
@@ -16,7 +16,7 @@ class GraphObj : public Object {
     GraphObj(Runtime runtime) : runtime(runtime){};
     string toString() const override;

-    Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
+    Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
     Tensor cloneTensor(const Tensor &tensor) {
         auto ret = addTensor(tensor->getDims(), tensor->getDType());
         ret->dataMalloc();
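With the default dtype flipped from UInt32 to Float32, tensors added without an explicit type now come out as floats, matching the DataType::Float32 the timer code below always passes. A quick sketch using only calls visible in this diff (include paths are assumptions):

#include "core/graph.h"   // assumed header path
#include "core/runtime.h" // assumed header path

using namespace infini;

int main() {
    Runtime cpu = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(cpu);
    Tensor t = g->addTensor({2, 3});                   // Float32 by default now
    Tensor u = g->addTensor({2, 3}, DataType::UInt32); // explicit opt-out
    g->dataMalloc();
    return 0;
}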
@@ -102,7 +102,11 @@ class KernelRegistry {
     }
     Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
         auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(), "Kernel not found.");
+        IT_ASSERT(it != kernels.end(),
+                  "Kernel not found for key {" +
+                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
+                      ", " + std::get<2>(kernelAttrs).toString());
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@@ -71,6 +71,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                                size_t bytes) const = 0;
     virtual void copyBlobToCPU(void *dst, const void *src,
                                size_t bytes) const = 0;
+    virtual string toString() const = 0;

   protected:
     void printProfilingData(double totTime,
@@ -102,6 +103,7 @@ class CpuRuntimeObj : public RuntimeObj {
     void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
     void copyBlobInsideRuntime(void *dst, const void *src,
                                size_t bytes) const override;
+    string toString() const override;
 };

 } // namespace infini
@@ -34,6 +34,7 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCublasError(cublasDestroy(cublas));
         checkCUresult(cuCtxDestroy(newContext));
     }
+    string toString() const override;

     void run(const Graph &graph, bool tune = false,
              bool profiling = false) const;
@@ -68,7 +69,9 @@ class CudaRuntimeObj : public RuntimeObj {
         checkCudaError(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
     }

+    void runWithoutSync(const Graph &graph) const;
+
   private:
-    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+    void tune(const Graph &graph, bool profiling) const;
 };
 } // namespace infini
@@ -6,6 +6,11 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
                         int dilationh, int dilationw, int group,
                         const char *name);

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group);
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name);
 } // namespace opTimer
 } // namespace infini
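A sketch of invoking the new timer from C++; the argument values echo the commented tuple in the implementation further down, and oph = opw = 0 is an illustrative assumption (the header path is also assumed):

#include "cuda/operator_timer.h" // assumed header path
#include <cstdio>

int main() {
    // n, c, h, w, f, r, s, padh, padw, strideh, stridew,
    // dilationh, dilationw, oph, opw, group
    double t = infini::opTimer::getPerfConvTransposed2dCudnn(
        1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 0, 0, 1);
    std::printf("ConvTransposed2d perf time: %lf\n", t);
    return 0;
}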
@@ -2,12 +2,14 @@ from tokenize import Double
 import pyinfinitensor  # import getPerfConv, getPerfMatmul


-def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name):
+def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
     return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
                                            strideh, stridew, dilationh, dilationw, group, name)


-def getPerfMatmul(b, m, n, k, name):
+def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
+    return pyinfinitensor.getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group)
+
+
+def getPerfMatmul(b, m, n, k, name=""):
     return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
@@ -139,4 +139,6 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
     memcpy(dst, src, bytes);
 }

+string CpuRuntimeObj::toString() const { return "CPU Runtime"; }
+
 } // namespace infini
@@ -5,8 +5,25 @@
 #include "operators/matmul.h"
 namespace infini {

-void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
-                                    bool profiling = false) const {
+void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
+    const auto &kernelRegistry = KernelRegistry::getInstance();
+    auto &perfEngine = PerfEngine::getInstance();
+    for (auto &op : graph->getOperators()) {
+        // HACK: set correct data type
+        auto kernelAttrs =
+            KernelAttrs{device, op->getOpType(), DataType::Float32};
+        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
+        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
+        auto perfData = perfEngine.getPerfData(perfKey);
+        // IT_ASSERT(perfData, "No perf data for OP " + op->toString());
+        if (perfData)
+            kernel->compute(op, perfData, this);
+        else
+            kernel->compute(op, this);
+    }
+}
+
+void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto &perfEngine = PerfEngine::getInstance();
     double totalTime = 0;
@@ -19,11 +36,6 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
-        if (!perfData && !tune) {
-            kernel->compute(op, this);
-            continue;
-        }
-
         PerfRecord record;
         if (!perfData) {
             record = kernel->tune(op, this);
@@ -46,13 +58,19 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
         }
     }

-void CudaRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
+void CudaRuntimeObj::run(const Graph &graph, bool runTune,
+                         bool profiling) const {
     if (profiling)
         IT_TODO_HALT();
-    runWithoutSync(graph, tune, profiling);
+    if (runTune)
+        tune(graph, profiling);
+    else
+        runWithoutSync(graph);
     sync();
 }

 void CudaRuntimeObj::sync() const { cudaDeviceSynchronize(); }

+string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
+
 } // namespace infini
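After the split, run() is a thin dispatcher: run(graph, true) benchmarks each kernel via the new private tune() and fills the PerfEngine cache, while run(graph, false) replays cached PerfRecords through runWithoutSync(); the old early-exit in the tuning loop (removed above) is no longer needed because untuned ops are handled in runWithoutSync(). A usage sketch mirroring the calls in getPerfConvTransposed2dCudnn below (graph construction elided, InfiniTensor headers assumed available):

#include <cstdio>

void timeGraph(const infini::Graph &g, const infini::Runtime &cuda) {
    cuda->run(g, /*tune=*/true);  // first pass: tune kernels, cache PerfRecords
    cuda->run(g, /*tune=*/false); // later passes: replay the cached choices
    std::printf("graph perf time: %lf\n", cuda->getPerfTime(g));
}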
@@ -22,8 +22,9 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     Runtime cuda = make_ref<CudaRuntimeObj>();
     Graph gCuda = make_ref<GraphObj>(cuda);
     // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
     Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
-    Tensor w0Cpu = gCpu->addTensor({f, c, r, s}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
     // Malloc data for all tensors in a graph. Do we need implicit allocation?
     gCpu->dataMalloc();
     i0Cpu->setData(IncrementalGenerator());
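For example, with c = 512 and group = 2, each group convolves 256 input channels, so the weight tensor becomes {f, 256, r, s}; the new IT_ASSERT(c % group == 0) guards exactly that divisibility before the shape is computed.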
@@ -43,6 +44,41 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
     return cuda->getPerfTime(gCuda);
 }

+double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
+                                    int s, int padh, int padw, int strideh,
+                                    int stridew, int dilationh, int dilationw,
+                                    int oph, int opw, int group) {
+    // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
+    //             dilationh, dilationw, group] =
+    //     tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
+    Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+    Runtime cuda = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cuda);
+    // Set input data on CPU in a CPU Graph
+    IT_ASSERT(c % group == 0);
+    Tensor i0Cpu = gCpu->addTensor({n, f, h, w}, DataType::Float32);
+    Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
+    // Malloc data for all tensors in a graph. Do we need implicit allocation?
+    gCpu->dataMalloc();
+    i0Cpu->setData(IncrementalGenerator());
+    w0Cpu->setData(IncrementalGenerator());
+
+    // Copy input tensors from CPU to CUDA
+    Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
+    Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
+    // Build CUDA graph
+    auto conv = gCuda->addOp<ConvTransposed2dObj>(
+        i0Cuda, w0Cuda, nullptr, padh, padw, strideh, stridew, dilationh,
+        dilationw, oph, opw, group);
+    // allocate CUDA memory
+    gCuda->dataMalloc();
+    // Execute on CUDA
+    bool tune = true;
+    cuda->run(gCuda, tune);
+    return cuda->getPerfTime(gCuda);
+}
+
 double getPerfMatmulCublas(int b, int m, int n, int k, const char *name) {
     // const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
     //             dilationh, dilationw, group] =
@@ -13,6 +13,7 @@ void register_operator_timer(py::module &m) {
 #ifdef USE_CUDA
     using namespace opTimer;
     m.def("getPerfConvCudnn", &getPerfConvCudnn);
+    m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
     m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
 #endif
 }
@@ -250,8 +250,7 @@ class convBackwardDataCudnn : public Kernel {
                 outData);
             },
             [&]() { context->sync(); });
-        // printf("mode:%d algo:%d :%.8lf\n", mode, algo,
-        // record.time);
+        // printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);

         // Update the tune result
         if (ret.time > record.time)