forked from jiuyuan/InfiniTensor
Add: warmup and repeat args in timeNonCtcOperators
This commit is contained in:
parent
537b3b4ea4
commit
e4c20a9ae2
|
@ -64,7 +64,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
* @return double Return the sum of perf time for each operator
|
||||
*/
|
||||
double getPerfTime(const Graph &graph, bool printProfiling = false,
|
||||
bool allowEstimation = false) const;
|
||||
bool allowEstimation = false,
|
||||
bool ignoreMemboundOp = false) const;
|
||||
Blob allocBlob(size_t size);
|
||||
bool isCpu() const {
|
||||
return device == Device::CPU || device == Device::INTELCPU;
|
||||
|
@ -82,7 +83,8 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
|||
map<UidBaseType, bool>
|
||||
getCompileTimeComputableAttribute(const Graph &graph) const;
|
||||
|
||||
double timeNonCtcOperators(const Graph &graph) const;
|
||||
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
|
||||
int repeat = 1000) const;
|
||||
|
||||
protected:
|
||||
void printProfilingData(double totTime,
|
||||
|
|
|
@ -78,7 +78,8 @@ RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
|
|||
}
|
||||
|
||||
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
|
||||
bool allowEstimation) const {
|
||||
bool allowEstimation,
|
||||
bool ignoreMemboundOp) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// Statistics
|
||||
|
@ -97,10 +98,14 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
|
|||
double time = -1e9;
|
||||
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
|
||||
time = 0;
|
||||
} else if (op->getOpType() == OpType::Reshape) {
|
||||
time = 0;
|
||||
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
|
||||
time = 0;
|
||||
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
|
||||
time = as<MemBoundObj>(op)->getEstimatedTime();
|
||||
} else if (perfData) { // Tune the kernel if there is no record
|
||||
time = perfData->time;
|
||||
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
|
||||
time = as<MemBoundObj>(op)->getEstimatedTime();
|
||||
} else {
|
||||
// TODO: should tensors automatically allocate memory when their data is accessed?
|
||||
// allocate memory for empty tensors and release it after
|
||||
|
@ -189,7 +194,8 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
|
|||
|
||||
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
|
||||
|
||||
double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
|
||||
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
|
||||
int repeat) const {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// compile-time computable
|
||||
|
@ -209,14 +215,13 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
|
|||
kernel->compute(op, perfData, this);
|
||||
else
|
||||
kernel->compute(op, this);
|
||||
// if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
|
||||
if (op->getOpType() == OpType::Matmul)
|
||||
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
|
||||
kernels.emplace_back(op, kernel, perfData);
|
||||
}
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
dbg(op);
|
||||
}
|
||||
cudaProfilerStart(); // HACK: Debug
|
||||
// cudaProfilerStart();
|
||||
double ret = timeit(
|
||||
[&]() {
|
||||
for (auto &[op, kernel, perfData] : kernels) {
|
||||
|
@ -226,8 +231,8 @@ double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
|
|||
kernel->compute(op, this);
|
||||
}
|
||||
},
|
||||
[&]() { sync(); });
|
||||
cudaProfilerStop(); // HACK: Debug
|
||||
[&]() { sync(); }, warmup, repeat);
|
||||
// cudaProfilerStop();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -450,7 +450,7 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
|
|||
}
|
||||
|
||||
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
|
||||
return runtimeExec->getPerfTime(graph, false, true);
|
||||
return runtimeExec->getPerfTime(graph, false, true, true);
|
||||
}
|
||||
|
||||
Graph SearchEngine::fuseVertically(const Graph &graph) {
|
||||
|
|
|
@ -41,7 +41,6 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
|
||||
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
|
||||
// IT_ASSERT(packedFunc != nullptr);
|
||||
|
||||
// prepare inputs and outputs
|
||||
vector<DLTensorHolder> inputsHolder;
|
||||
|
|
Loading…
Reference in New Issue