From e72fe791689b071c37e364c2c18fc40fdd5e3e9a Mon Sep 17 00:00:00 2001 From: Liyan Zheng Date: Thu, 13 Apr 2023 19:46:54 +0800 Subject: [PATCH] Add: search engine uses estimated time --- include/core/runtime.h | 5 +- include/core/search_engine.h | 25 +++--- include/operators/membound.h | 1 + python/cpp_plugin/gen_ansor_so.py | 133 ++++++++++++++++-------------- src/core/runtime.cc | 28 ++++--- src/core/search_engine.cc | 20 ++--- src/core/tensor.cc | 4 +- src/nnet/App/test_models.cc | 124 ++++++++++++++++++++++++++++ src/nnet/nmutator.cc | 4 +- 9 files changed, 243 insertions(+), 101 deletions(-) create mode 100644 src/nnet/App/test_models.cc diff --git a/include/core/runtime.h b/include/core/runtime.h index 53920fdb..16b9b60b 100644 --- a/include/core/runtime.h +++ b/include/core/runtime.h @@ -59,10 +59,11 @@ class RuntimeObj : public std::enable_shared_from_this { * execution happens. * * @param graph - * @param profiling Whether to print breakdown of time + * @param printProfiling Whether to print breakdown of time * @return double Return the sum of perf time for each operator */ - double getPerfTime(const Graph &graph, bool profiling = false) const; + double getPerfTime(const Graph &graph, bool printProfiling = false, + bool allowEstimation = false) const; Blob allocBlob(size_t size); bool isCpu() const { return device == Device::CPU || device == Device::INTELCPU; diff --git a/include/core/search_engine.h b/include/core/search_engine.h index 55c90bc5..05c4063f 100644 --- a/include/core/search_engine.h +++ b/include/core/search_engine.h @@ -11,11 +11,15 @@ class SearchEngine { private: Runtime runtimeExec; Ref mutator; + std::function graphTimeComparer; public: - SearchEngine(Runtime _runtime, Ref _mutator) { - runtimeExec = _runtime; - mutator = _mutator; + SearchEngine(Runtime runtime, Ref mutator) + : runtimeExec(runtime), mutator(mutator) { + // Compare graph with estimated time + graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool { + return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b); + }; } ~SearchEngine() {} @@ -24,11 +28,7 @@ class SearchEngine { 3; // cut nodes whose #in + #out >= partitionThreshold size_t GRAPH_SIZE = 16; // num of best graphs. - private: // Composed objects - std::shared_ptr mutationEngine; - public: - std::shared_ptr getMutationEngine() { return mutationEngine; }; struct GroupEdge { int v, next; GroupEdge() = delete; @@ -38,10 +38,7 @@ class SearchEngine { std::shared_ptr graph; double perf = INFINITY; }; - class MetaGraph { // a graph of subgraphs, for searching. - public: - MetaGraph() {} - ~MetaGraph() {} + struct MetaGraph { // a graph of subgraphs, for searching. struct Node { Graph graph; std::vector suc; @@ -51,7 +48,7 @@ class SearchEngine { std::vector nodes; }; - Graph run(const Graph graph); // entrance of search engine. + Graph run(const Graph graph); // entrance to search engine. std::vector search(const Graph &graph); // search for a partition. private: @@ -76,5 +73,9 @@ class SearchEngine { * branch. */ bool isMultiBranchMergable(const Graph graph); + + double getEstimatedGraphPerf(Graph graph) { + return runtimeExec->getPerfTime(graph, false, true); + } }; } // namespace infini diff --git a/include/operators/membound.h b/include/operators/membound.h index 4a444553..d1707913 100644 --- a/include/operators/membound.h +++ b/include/operators/membound.h @@ -30,6 +30,7 @@ class MemBoundObj : public OperatorObj { pair getSimplifiedNnetExpr() const { return {expr, hash}; } + double getEstimatedTime() const { return exec_time; } private: vector getWorkloadVector() const override; diff --git a/python/cpp_plugin/gen_ansor_so.py b/python/cpp_plugin/gen_ansor_so.py index d06bc3be..9580b3c7 100644 --- a/python/cpp_plugin/gen_ansor_so.py +++ b/python/cpp_plugin/gen_ansor_so.py @@ -1,4 +1,5 @@ -import re +from contextlib import redirect_stdout +import time import numpy as np import tvm @@ -8,99 +9,111 @@ import json import logging USE_CACHE = True +logging.basicConfig() logger = logging.getLogger('InfiniTensor') -logger.setLevel(logging.DEBUG) +logger.setLevel(logging.INFO) def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype, tvm_code, func_name, nnet_expression: str, nnet_simplified_expression: str, hash_code=None): assert len(input_tensors) == len(input_dtypes) - - logging.debug(f'Work on hash {hash_code}') + + logger.debug(f'Work on hash {hash_code}') dir_name = os.path.join(".cache", "generated_kernels", str(hash_code)) - + if not os.path.exists(dir_name): os.makedirs(dir_name) - + so_fn = os.path.join(dir_name, f"{func_name}.so") config_fn = os.path.join(dir_name, "config_so.json") - - print("Generating Ansor op: ") - print(tvm_code) - - print("Input shape: ") - print(input_tensors) - print("Output shape: ") - print(output_tensor) - + desc_fn = os.path.join(dir_name, "desc.txt") + log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json") + out_fn = os.path.join(dir_name, "out.txt") + + logger.debug(f"Generating Ansor op: {tvm_code}") + logger.debug(f"Input shape: {input_tensors}") + logger.debug(f"Output shape: {output_tensor}") + if USE_CACHE and hash_code is not None: if os.path.exists(dir_name) and \ - os.path.exists(so_fn) and \ - os.path.exists(config_fn): + os.path.exists(so_fn) and \ + os.path.exists(config_fn): print(f"Use cache in {dir_name}") with open(config_fn, "r") as config_fin: config = json.loads(config_fin.read().strip()) conv_time = config["conv_time"] - logger.debug(f'Find tuning log for {hash_code}') + logger.info(f'Find tuning log for {hash_code} in {so_fn}') return so_fn, conv_time - + logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}") + + time_start = time.perf_counter() + # Print descriptions of the task + if USE_CACHE and hash_code is not None: + with redirect_stdout(open(desc_fn, "w")): + print("====NNET tensor expression====") + print(nnet_expression+"\n") + print("====NNET simplified tensor expression====") + print(nnet_simplified_expression+"\n") + print("====TVM compute====") + print(tvm_code+"\n") + print("Input shape: ", input_tensors) + print("Output shape: ", output_tensor) + @auto_scheduler.register_workload(func_name) def compute(): _locals = locals() - exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals) + exec(tvm_code, {'tvm': tvm, 'te': te, + 'tir': tir, 'topi': topi}, _locals) return _locals['ret'] - + target = tvm.target.Target("cuda") task = auto_scheduler.SearchTask(func=func_name, args=(), target=target) - # Inspect the computational graph - print("Computational DAG:") - print(task.compute_dag) + with redirect_stdout(open(out_fn, 'w')): + # Inspect the computational graph + print("Computational DAG:") + print(task.compute_dag) - log_file = f"ansor_{func_name}_log.json" - measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, - ) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_fn)], + verbose=2, + ) - # Run auto-tuning (search) - task.tune(tune_option) - # Apply the best schedule - sch, args = task.apply_best(log_file) + # Run auto-tuning (search) + task.tune(tune_option) + # Apply the best schedule + sch, args = task.apply_best(log_fn) + + # Kill the measurement process + del measure_ctx + + func = tvm.build(sch, args, target, name=func_name) + func.export_library(so_fn) + + ctx = tvm.cuda(0) + input_a = [] + for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)): + a_np = np.random.uniform(size=shape).astype(dtype) + input_a.append(tvm.nd.array(a_np, ctx)) + a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx) + func(a_out, *input_a) + evaluator = func.time_evaluator(func.entry_name, ctx, number=100) + conv_time = evaluator(a_out, *input_a).mean * 1e3 + + time_end = time.perf_counter() - # Kill the measurement process - del measure_ctx - - func = tvm.build(sch, args, target, name=func_name) - func.export_library(so_fn) - - ctx = tvm.cuda(0) - input_a = [] - for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)): - a_np = np.random.uniform(size=shape).astype(dtype) - input_a.append(tvm.nd.array(a_np, ctx)) - a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx) - func(a_out, *input_a) - evaluator = func.time_evaluator(func.entry_name, ctx, number=100) - conv_time = evaluator(a_out, *input_a).mean * 1e3 - - print("====NNET tensor expression====") - print(nnet_expression+"\n") - print("====NNET simplified tensor expression====") - print(nnet_simplified_expression+"\n") - print("====Time====") - print(conv_time) - if USE_CACHE and hash_code is not None: with open(config_fn, "w") as config_fout: config_fout.write(json.dumps({ "conv_time": conv_time, + "tuning_time": time_end - time_start, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), }, ensure_ascii=False, indent=2)) - + return so_fn, conv_time diff --git a/src/core/runtime.cc b/src/core/runtime.cc index 1e1e7c1d..8151a6f0 100644 --- a/src/core/runtime.cc +++ b/src/core/runtime.cc @@ -2,6 +2,7 @@ #include "core/blob.h" #include "core/kernel.h" #include "core/perf_engine.h" +#include "operators/membound.h" #include "utils/data_generator.h" #include #include @@ -56,7 +57,8 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const { printProfilingData(totalTime, opTime, opCnt); } -double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const { +double RuntimeObj::getPerfTime(const Graph &graph, bool profiling, + bool allowEstimation) const { const auto &kernelRegistry = KernelRegistry::getInstance(); auto &perfEngine = PerfEngine::getInstance(); // Statistics @@ -70,11 +72,16 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const { auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfData = perfEngine.getPerfData(perfKey); - PerfRecord record; + double time = -1e9; // Tune the kernel if there is no record - if (!perfData) { + if (perfData) { + time = perfData->time; + } else if (allowEstimation && op->getOpType() == OpType::MemBound) { + time = as(op)->getEstimatedTime(); + } else { // TODO: should tenosrs automatically allocate when access data? - // allocate memory for empty tensors and release it after profiling + // allocate memory for empty tensors and release it after + // profiling TensorVec allocatedTensors; for (auto t : op->getInputs()) if (!t->hasData()) @@ -88,21 +95,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const { } // Profile operators and record the results - record = kernel->tune(op, this); + PerfRecord record = kernel->tune(op, this); + time = record->time; perfEngine.setPerfData(perfKey, record); // Free allocated memory for (auto t : allocatedTensors) t->freeData(); - } else - record = perfData; + } - double t = record->time; - totalTime += t; + totalTime += time; if (profiling) { op->print(); - printf(" op_time %lf\n", t); - opTime[op->getOpType()] += t; + printf(" op_time %lf\n", time); + opTime[op->getOpType()] += time; opCnt[op->getOpType()]++; } } diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc index af643686..b1c621f2 100644 --- a/src/core/search_engine.cc +++ b/src/core/search_engine.cc @@ -32,8 +32,7 @@ Graph SearchEngine::run(const Graph graph) { IT_ASSERT(runtimeExec == graph->getRuntime()); std::cout << "[INFO] original graph: " << std::endl; std::cout << graph->toString(); - std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph) - << std::endl; + std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl; std::vector partitions = partitionGraph(graph); @@ -65,9 +64,7 @@ Graph SearchEngine::run(const Graph graph) { nextGraphs.emplace_back(tmp); } } - std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) { - return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y); - }); + std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer); if (nextGraphs.size() > GRAPH_SIZE) { nextGraphs.resize(GRAPH_SIZE); } @@ -81,7 +78,7 @@ Graph SearchEngine::run(const Graph graph) { for (size_t i = 0; i < bestGraphs.size(); i++) { std::cout << "bestGraph " << i << ":" << std::endl; std::cout << bestGraphs[i]->toString(); - std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i]) + std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i]) << std::endl; } @@ -102,9 +99,8 @@ std::vector SearchEngine::search(const Graph &graph) { } } - sort(results.begin(), results.end(), [&](Graph x, Graph y) { - return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y); - }); // compare with perf time + // compare with perf time + std::sort(results.begin(), results.end(), graphTimeComparer); if (results.size() > GRAPH_SIZE) { results.resize(GRAPH_SIZE); } @@ -360,9 +356,7 @@ std::vector SearchEngine::searchMutation( for (auto g : nextGraphs) { g->dataMalloc(); } - std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) { - return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y); - }); + std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer); if (nextGraphs.size() > GRAPH_SIZE) { nextGraphs.resize(GRAPH_SIZE); } @@ -372,7 +366,7 @@ std::vector SearchEngine::searchMutation( } bool SearchEngine::isMultiBranchMergable(const Graph graph) { - return mutationEngine->isMultiBranchMergable(graph); + return mutator->isMultiBranchMergable(graph); } // Split a graph into multiple independt graphs. Search engine will search for diff --git a/src/core/tensor.cc b/src/core/tensor.cc index e63039d5..609b1720 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -24,8 +24,7 @@ string TensorObj::toString() const { ss << "nullptr data"; string ret = "Tensor " + std::to_string(guid) + ", Fuid " + std::to_string(fuid) + ", shape " + vecToString(shape) + - ", dtype " + dtype.toString() + ", " + runtime->toString() + - ", " + ss.str() + "\n"; + ", dtype " + dtype.toString(); vector targetGuids; for (const auto &op : targets) targetGuids.emplace_back(op.lock()->getGuid()); @@ -34,6 +33,7 @@ string TensorObj::toString() const { else ret += ", source None"; ret += ", targets " + vecToString(targetGuids); + ret += ", " + runtime->toString() + ", " + ss.str(); return ret; } diff --git a/src/nnet/App/test_models.cc b/src/nnet/App/test_models.cc new file mode 100644 index 00000000..570f713c --- /dev/null +++ b/src/nnet/App/test_models.cc @@ -0,0 +1,124 @@ +#include "core/blob.h" +#include "core/dummy_mutator.h" +#include "core/graph.h" +#include "core/runtime.h" +#include "core/search_engine.h" +#include "cuda/cuda_runtime.h" +#include "nnet/nmutator.h" +#include "operators/conv.h" +#include "test.h" +#include + +namespace infini { + +// NHWC format +Graph getInfoGAN(int batch, Runtime runtime) { + Graph g = make_ref(runtime); + vector weights; + vector> cs{ + // Channel, kernelSize, pad, stride + {448, 2, 0, 1}, {256, 4, 1, 2}, {128, 4, 1, 2}, + {64, 4, 1, 2}, {32, 4, 1, 2}, + }; + Tensor input = g->addTensor({batch, 1, 1, 228}); + for (auto [channel, kernelSize, pad, stride] : cs) { + int f = input->getDims()[3]; // n, h, w, f + auto weight = + g->addTensor({f, kernelSize, kernelSize, channel}); // f, r, s, c + input = g->addOp(input, weight, nullptr, pad, + pad, stride, stride, 1, 1) + ->getOutput(); + // TODO: activation + } + return g; +} + +void printGraph(Graph g) { + g->print(); + puts("============ Data ============"); + for (auto t : g->getTensors()) { + dbg(t); + t->printData(); + } +} + +vector runInfoGAN() { + const bool useMutatorDirectly = true; + Runtime cuda = make_ref(); + Runtime cpu = NativeCpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(cpu); + + Graph g = getInfoGAN(1, cuda); + + auto mutator = + make_ref(NMutator::Mode::RuleBased, + vector{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90}); + // // Translate OP to membound without derivation + // mutator->setToNaiveMembound(); + + vector bestGraphs; + SearchEngine searchEngine(cuda, mutator); + bestGraphs.emplace_back(searchEngine.run(g)); + g->topo_sort(); + dbg(g, bestGraphs[0], bestGraphs.size()); + g->print(); + + g->dataMalloc(); + map fuidToInputTensor; + for (auto t : g->getInputs()) { + IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0); + fuidToInputTensor[t->getFuid()] = t; + } + + auto gen = RandomGenerator(-1, 1, 0); + for (auto t : g->getInputs()) { + t->setData(gen); + } + for (auto t : g->getOutputs()) { + t->setData(ZeroGenerator()); + } + cuda->run(g); + dbg("Baseline graph"); + printGraph(g); + dbg(cuda->getPerfTime(g, true)); + + for (size_t i = 0; i < bestGraphs.size(); i++) { + auto bestGraphCpu = bestGraphs[i]; + auto bestGraph = make_ref(cuda, bestGraphCpu->getOperators()); + bestGraph->topo_sort(); + + bestGraph->dataMalloc(); + // Initialize inputs with random data + for (auto t : bestGraph->getInputs()) { + t->copyData(fuidToInputTensor[t->getFuid()]); + } + + // Initialize outputs with zeros + for (auto t : bestGraph->getOutputs()) { + t->setData(ZeroGenerator()); + } + + dbg(bestGraph); + dbg(bestGraph->getOutputs()); + + cuda->run(bestGraph, true); // Tune kernels + cuda->run(bestGraph, false); // Execute transfomraed graph + + auto go0 = gCpu->cloneTensor(g->getOutputs()[0]); + auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]); + // EXPECT_TRUE(go0->equalData(bgo0, 1e-3)); + std::cout << go0->equalData(bgo0, 1e-3) << std::endl; + bgo0->printData(); + go0->printData(); + dbg(cuda->getPerfTime(bestGraph, true)); + + dbg("Best graph"); + printGraph(bestGraph); + return {g->getOutputs()[0], bestGraph->getOutputs()[0]}; + } + return {}; +} + +// TEST(ModelE2E, InfoGAN) { runInfoGAN(); } + +} // namespace infini diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc index 50575082..9cc7ef59 100644 --- a/src/nnet/nmutator.cc +++ b/src/nnet/nmutator.cc @@ -49,7 +49,7 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph, assert(computeOps.size() == 1); const auto &computeOp = computeOps[0]; auto g = infini::make_ref(in_graph->getRuntime()); - auto expr = opToExpression(computeOp); + nnet::Expr expr = opToExpression(computeOp); auto inputsN = nnet::GetTensorsVisitor().get(expr); dbg(inputsN, expr); IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1, @@ -258,6 +258,8 @@ nnet::Expr NMutator::opToExpression(Operator op) { const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS(); const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation(); IT_ASSERT_TODO(convOp->getNumGroups() == 1); + if (r != 4) + return nullptr; IT_ASSERT_TODO(r == 4); IT_ASSERT_TODO(ph == pw); IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));