diff --git a/include/core/tensor.h b/include/core/tensor.h
index 61ba2bac..931019b6 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -77,6 +77,7 @@ class TensorObj : public TensorBaseObj {
     // Thus the internal state of generator cannot be updated.
     void setData(
         std::function<void(void *, size_t, DataType)> const &generator) const;
+    void setData(const Blob &_blob) { data = _blob; }
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc
index 3ef686c5..0e85bbd1 100644
--- a/src/core/search_engine.cc
+++ b/src/core/search_engine.cc
@@ -1,6 +1,7 @@
 #include "core/search_engine.h"
 #include "core/hash.h"
 #include "core/runtime.h"
+#include "nnet/dbg.h"
 #include <algorithm>
 #include <iostream>
 
@@ -74,7 +75,9 @@ Graph SearchEngine::run(const Graph graph) {
             nextGraphs.emplace_back(tmp);
         }
     }
+    dbg("===Num" + std::to_string(nextGraphs.size()));
     std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
+
     if (nextGraphs.size() > GRAPH_SIZE) {
         nextGraphs.resize(GRAPH_SIZE);
     }
@@ -122,6 +125,7 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
     }
 
     // compare with perf time
+    dbg("===Num" + std::to_string(results.size()));
     std::sort(results.begin(), results.end(), graphTimeComparer);
     if (results.size() > GRAPH_SIZE) {
         results.resize(GRAPH_SIZE);
@@ -341,6 +345,9 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         std::vector<Graph> nextGraphs;
         if (node.type == 1) { // If it has computing OPs
             auto mutatedGraphs = mutator->run(node.graph);
+            // HACK: only try the first two mutants for debugging
+            if (mutatedGraphs.size() > 2)
+                mutatedGraphs.resize(2);
             for (auto graph : graphs) {
                 for (auto mutatedGraph : mutatedGraphs) {
                     std::vector<Operator> ops;
@@ -373,6 +380,7 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         for (auto g : nextGraphs) {
             g->dataMalloc();
         }
+        dbg("===Num" + std::to_string(nextGraphs.size()));
         std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
         if (nextGraphs.size() > GRAPH_SIZE) {
             nextGraphs.resize(GRAPH_SIZE);
diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc
index 0b15e4b6..1743dfdb 100644
--- a/src/kernels/cuda/matmul.cc
+++ b/src/kernels/cuda/matmul.cc
@@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
         const float alpha = 1.f, beta = 0.f;
         // TODO: use compute type
         cublasStatus_t stat;
-        if (b > 1) {
+        if (b >= 1) {
             // Support batch broadcast with zero stride
             int dimA = op->getInputs(0)->getDims().size();
             int dimB = op->getInputs(1)->getDims().size();
@@ -63,6 +63,10 @@
                 (dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
                     ? 0 // Broadcast the batch dimension if batch size is 1
                     : n * k;
+            // printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f, "
+            //        "B %d %lld, A %d %lld, C %d %d, b %d %d\n",
+            //        opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
+            //        m * n, b, record->algo);
             stat = cublasGemmStridedBatchedEx(
                 context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
                 CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
diff --git a/src/kernels/cuda/reshape.cc b/src/kernels/cuda/reshape.cc
index 77070c23..ffefa52d 100644
--- a/src/kernels/cuda/reshape.cc
+++ b/src/kernels/cuda/reshape.cc
@@ -4,10 +4,13 @@ namespace infini {
 class CopyCuda : public CudaKernelWithoutConfig {
     void compute(const Operator &op,
                  const RuntimeObj *_context) const override {
-        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
-        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
-        cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
-                        cudaMemcpyDeviceToDevice);
+        // auto inData = op->getInputs(0)->getRawDataPtr<void *>();
+        // auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
+        // cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
+        //                 cudaMemcpyDeviceToDevice);
+
+        // HACK: optimization; alias the input blob instead of copying
+        op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
     }
 };
 // reshape/flatten/identity all act as copying from input to output.
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index b5bf5f89..f88c6eb1 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -77,6 +77,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
 void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
+    // HACK: remove this
+    if (auto op = as<ConvTransposed2dObj>(computeOps[0]); !op)
+        return;
 
     // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
     //     out_graphs.emplace_back(g);
@@ -408,6 +411,9 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
                                     nameNToTensorT.at(BN->getName())};
             TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
             const auto &[b, m, n, k, transa, transb] = op->getArgs();
+            // HACK: pruning for debug
+            if (!((transa == 0) && (transb == 1)))
+                return nullptr;
             g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
                                            outputsPET[0], transa, transb);
         }
@@ -536,11 +542,15 @@ Graph NMutator::transformConvtransposed1x1(Operator _op) {
     auto newA = g->addTensor(
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
     // FRSC
+    // auto newW = g->addTensor(
+    //     {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
+    //     dtype);
+    // HACK: without transpose
     auto newW = g->addTensor(
-        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
+        {weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
-    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
+    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
     g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
                                     op->getOutput()->getDims());
     return g;
diff --git a/test/nnet/test_mutator.cc b/test/nnet/test_mutator.cc
index 1e2b0623..9219c526 100644
--- a/test/nnet/test_mutator.cc
+++ b/test/nnet/test_mutator.cc
@@ -55,7 +55,6 @@ TEST(NMutator, NaiveConvWithInterpreter) {
                       gg->getOutputs()[0]->getRawDataPtr<void *>());
 }
 
-// FIXME: failed since implicit transpose for DLT
 TEST(NMutator, InfoGAN_TConv_3_correctness) {
     const bool useMutatorDirectly = false;
     Runtime runtime = make_ref<CudaRuntimeObj>();
@@ -123,6 +122,7 @@ TEST(NMutator, InfoGAN_TConv_3_correctness) {
 
     auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
     auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+    dbg(runtime->getPerfTime(bestGraph, true));
     EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
 }
 }
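
Note on the "b >= 1" change in src/kernels/cuda/matmul.cc: every matmul now takes the
strided-batched cuBLAS path, and an operand without a batch dimension (or with batch
size 1) is broadcast by passing a batch stride of 0. Below is a standalone sketch of
that calling pattern, not the project's kernel; it assumes CUDA 11+ and column-major
data, all names are local to the example, and it builds with "nvcc sketch.cu -lcublas".

// Zero-stride batch broadcast with cublasGemmStridedBatchedEx: one A matrix
// (strideA == 0) is reused against b distinct B matrices.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int b = 4, m = 2, n = 3, k = 5;
    std::vector<float> hA(m * k, 1.0f);     // a single A, shared by all batches
    std::vector<float> hB(b * k * n, 1.0f); // b distinct B matrices
    std::vector<float> hC(b * m * n, 0.0f);

    float *dA, *dB, *dC;
    cudaMalloc(&dA, hA.size() * sizeof(float));
    cudaMalloc(&dB, hB.size() * sizeof(float));
    cudaMalloc(&dC, hC.size() * sizeof(float));
    cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.f, beta = 0.f;
    // strideA == 0 tells cuBLAS to read the same A for every one of the b
    // GEMMs; B and C advance by a full matrix per batch as usual.
    cublasGemmStridedBatchedEx(
        handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
        dA, CUDA_R_32F, m, /*strideA=*/0,
        dB, CUDA_R_32F, k, /*strideB=*/(long long)k * n, &beta,
        dC, CUDA_R_32F, m, /*strideC=*/(long long)m * n, b,
        CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

    cudaMemcpy(hC.data(), dC, hC.size() * sizeof(float), cudaMemcpyDeviceToHost);
    printf("C[0][0] = %.1f (expect %d)\n", hC[0], k); // ones dotted k times

    cublasDestroy(handle);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return 0;
}

The reshape.cc change applies a related zero-copy idea: instead of a device-to-device
cudaMemcpyAsync, the output tensor now shares the input's data blob through the new
TensorObj::setData(const Blob &) overload. The two tensors then alias the same device
memory, so any later kernel that writes the output in place also writes the input.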