Add: warmup and repeat args in timeNonCtcOperators

This commit is contained in:
Liyan Zheng 2023-04-19 16:22:59 +08:00
parent 537b3b4ea4
commit e4c20a9ae2
4 changed files with 19 additions and 13 deletions

View File

@@ -64,7 +64,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false) const;
bool allowEstimation = false,
bool ignoreMemboundOp = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::INTELCPU;
@@ -82,7 +83,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
int repeat = 1000) const;
protected:
void printProfilingData(double totTime,

View File

@@ -78,7 +78,8 @@ RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation) const {
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
@@ -97,10 +98,14 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::Reshape) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Tune the kernel if there is no record
time = perfData->time;
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else {
// TODO: should tensors automatically allocate when accessing data?
// allocate memory for empty tensors and release it after
@@ -189,7 +194,8 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
@@ -209,14 +215,13 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
if (op->getOpType() == OpType::Matmul)
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
cudaProfilerStart(); // HACK: Debug
// cudaProfilerStart();
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
@@ -226,8 +231,8 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
kernel->compute(op, this);
}
},
[&]() { sync(); });
cudaProfilerStop(); // HACK: Debug
[&]() { sync(); }, warmup, repeat);
// cudaProfilerStop();
return ret;
}

View File

@@ -450,7 +450,7 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
return runtimeExec->getPerfTime(graph, false, true);
return runtimeExec->getPerfTime(graph, false, true, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {

View File

@@ -41,7 +41,6 @@ class MemboundTVMPackedFunction : public Kernel {
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// IT_ASSERT(packedFunc != nullptr);
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;