forked from jiuyuan/InfiniTensor
Add: debug hacks for InfoGAN
parent e86e993ed4
commit 0b23a065ca
@@ -77,6 +77,7 @@ class TensorObj : public TensorBaseObj {
     // Thus the internal state of generator cannot be updated.
     void setData(
         std::function<void(void *, size_t, DataType)> const &generator) const;
+    void setData(const Blob &_blob) { data = _blob; }
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
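The new setData(const Blob &) overload lets a tensor adopt another tensor's storage handle instead of generating or copying data; the CopyCuda change further down relies on it. A minimal, self-contained sketch of the sharing semantics, with std::shared_ptr standing in for the reference-counted Blob (the names and types here are illustrative, not the project's real classes):

    // Minimal sketch (not project code): what blob sharing amounts to.
    #include <cassert>
    #include <memory>
    #include <vector>

    using FakeBlob = std::shared_ptr<std::vector<float>>;

    struct FakeTensor {
        FakeBlob data;
        void setData(const FakeBlob &blob) { data = blob; } // alias, no copy
    };

    int main() {
        FakeTensor in, out;
        in.data = std::make_shared<std::vector<float>>(228, 0.0f);
        out.setData(in.data);           // out now shares in's storage
        (*in.data)[7] = 3.5f;           // a write through one tensor ...
        assert((*out.data)[7] == 3.5f); // ... is visible through the other
        return 0;
    }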
@@ -1,6 +1,7 @@
 #include "core/search_engine.h"
 #include "core/hash.h"
 #include "core/runtime.h"
+#include "nnet/dbg.h"

 #include <algorithm>
 #include <iostream>
@@ -74,7 +75,9 @@ Graph SearchEngine::run(const Graph graph) {
             nextGraphs.emplace_back(tmp);
         }
     }
+    dbg("===Num" + std::to_string(nextGraphs.size()));
     std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
+
     if (nextGraphs.size() > GRAPH_SIZE) {
         nextGraphs.resize(GRAPH_SIZE);
     }
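The dbg(...) calls added here and in the hunks below come from the newly included nnet/dbg.h. Judging by the usage, it behaves like the common dbg(...) tracing macro that prints the expression text together with its value and source location; the stand-in below only approximates that behaviour (DBG and its output format are assumptions, not the real macro):

    #include <iostream>
    #include <string>

    // Rough stand-in for the dbg(...) macro from nnet/dbg.h (assumed behaviour).
    #define DBG(expr)                                                          \
        do {                                                                   \
            std::cerr << __FILE__ << ":" << __LINE__ << "  " << #expr          \
                      << " = " << (expr) << std::endl;                         \
        } while (0)

    int main() {
        std::size_t candidates = 3;
        DBG("===Num" + std::to_string(candidates)); // prints the expression and "===Num3"
        return 0;
    }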
@@ -122,6 +125,7 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
     }

     // compare with perf time
+    dbg("===Num" + std::to_string(results.size()));
     std::sort(results.begin(), results.end(), graphTimeComparer);
     if (results.size() > GRAPH_SIZE) {
         results.resize(GRAPH_SIZE);
@@ -341,6 +345,9 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         std::vector<Graph> nextGraphs;
         if (node.type == 1) { // If it has computing OPs
             auto mutatedGraphs = mutator->run(node.graph);
+            // // HACK: only try the first one for debug
+            if (mutatedGraphs.size() > 2)
+                mutatedGraphs.resize(2);
             for (auto graph : graphs) {
                 for (auto mutatedGraph : mutatedGraphs) {
                     std::vector<Operator> ops;
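Note that although the added HACK comment says "only try the first one", the code keeps the first two mutated graphs (resize(2)), so one alternative besides the top mutation candidate still enters the search.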
@@ -373,6 +380,7 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
     for (auto g : nextGraphs) {
         g->dataMalloc();
     }
+    dbg("===Num" + std::to_string(nextGraphs.size()));
     std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
     if (nextGraphs.size() > GRAPH_SIZE) {
         nextGraphs.resize(GRAPH_SIZE);
@@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
         const float alpha = 1.f, beta = 0.f;
         // TODO:use compute type
         cublasStatus_t stat;
-        if (b > 1) {
+        if (b >= 1) {
            // Support batch broadcast with zero stride
            int dimA = op->getInputs(0)->getDims().size();
            int dimB = op->getInputs(1)->getDims().size();
@@ -63,6 +63,11 @@ class matmulCublas : public Kernel {
                     (dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
                        ? 0 // Broadcast the batch dimension if batch size is 1
                        : n * k;
+            // printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f,
+            // B "
+            // "%d %lld, A %d %lld, C %d %d, b %d %d\n",
+            // opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
+            // m * n, b, record->algo);
            stat = cublasGemmStridedBatchedEx(
                context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
                CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
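Two things change in the cuBLAS matmul kernel: the b >= 1 condition now routes even single-batch matmuls through cublasGemmStridedBatchedEx, and an input whose batch dimension is missing or equal to 1 is broadcast by giving it a batch stride of 0, so the same matrix is reused for every batch. A small self-contained sketch of that stride selection (the helper name and the example shapes are made up for illustration):

    #include <cstdio>

    // Illustrative helper (not project code): pick the batch stride the way
    // the kernel above does; 0 broadcasts one matrix across the whole batch.
    long long pickBatchStride(int ndims, int batchDim, long long matrixElems) {
        return (ndims == 2 || (ndims == 3 && batchDim == 1)) ? 0 : matrixElems;
    }

    int main() {
        int m = 4, n = 456, k = 448;
        // A: [b, m, k] with b > 1; B: [1, k, n] gets broadcast over the batch.
        long long strideA = pickBatchStride(3, /*batchDim=*/2, 1LL * m * k); // m * k
        long long strideB = pickBatchStride(3, /*batchDim=*/1, 1LL * k * n); // 0
        std::printf("strideA=%lld strideB=%lld\n", strideA, strideB);
        return 0;
    }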
@@ -4,10 +4,13 @@ namespace infini {
 class CopyCuda : public CudaKernelWithoutConfig {
     void compute(const Operator &op,
                  const RuntimeObj *_context) const override {
-        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
-        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
-        cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
-                        cudaMemcpyDeviceToDevice);
+        // auto inData = op->getInputs(0)->getRawDataPtr<void *>();
+        // auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
+        // cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
+        //                 cudaMemcpyDeviceToDevice);
+
+        // HACK: optimization
+        op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
     }
 };
 // reshape/flatten/identity all act as copying from input to output.
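With this change the reshape/flatten/identity kernels stop issuing a device-to-device cudaMemcpyAsync and instead make the output tensor point at the input's blob (the same sharing shown in the sketch after the TensorObj hunk above). That removes a copy from the InfoGAN graphs, but it also means the output permanently aliases the input, which is presumably why the change is flagged as a HACK rather than a general optimization.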
@@ -77,6 +77,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
 void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
+    // HACK: remove this
+    if (auto op = as<ConvTransposed2dNHWCObj>(computeOps[0]); !op)
+        return;

     // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
     //     out_graphs.emplace_back(g);
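The early return restricts single-op mutation to ConvTransposed2dNHWC operators, which matches the InfoGAN transposed-convolution focus of this commit. A self-contained sketch of the if-with-initializer filter, assuming as<T> is a dynamic_pointer_cast-style downcast (stand-in types, not the project's):

    #include <memory>

    // Stand-in types; the real ones live in the InfiniTensor operator hierarchy.
    struct OperatorObj { virtual ~OperatorObj() = default; };
    struct ConvTransposed2dNHWCObj : OperatorObj {};

    template <class T>
    std::shared_ptr<T> as(const std::shared_ptr<OperatorObj> &op) {
        return std::dynamic_pointer_cast<T>(op); // null if op is not a T
    }

    // Mirrors the guard above: while the HACK is in place, anything that is
    // not a ConvTransposed2dNHWC op is skipped by the mutator.
    bool wouldBeMutated(const std::shared_ptr<OperatorObj> &candidate) {
        if (auto op = as<ConvTransposed2dNHWCObj>(candidate); !op)
            return false;
        return true;
    }

    int main() {
        return wouldBeMutated(std::make_shared<ConvTransposed2dNHWCObj>()) ? 0 : 1;
    }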
@@ -408,6 +411,9 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
                                nameNToTensorT.at(BN->getName())};
        TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
        const auto &[b, m, n, k, transa, transb] = op->getArgs();
+        // // HACK: pruning for debug
+        if (!((transa == 0) && (transb == 1)))
+            return nullptr;
        g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
                                       outputsPET[0], transa, transb);
    }
@@ -536,11 +542,15 @@ Graph NMutator::transformConvtransposed1x1(Operator _op) {
     auto newA = g->addTensor(
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
     // FRSC
+    // auto newW = g->addTensor(
+    //     {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
+    //     dtype);
+    // HACK: without transpose
     auto newW = g->addTensor(
-        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
+        {weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
-    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
+    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
     g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
                                     op->getOutput()->getDims());
     return g;
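Taken together with the transa == 0 / transb == 1 pruning added in expressionToGraph, the transposed-convolution-to-matmul rewrite now reshapes the weight with its two matrix dimensions swapped ({weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]} instead of {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}) and reads it through the Matmul with transb = 1. As the "HACK: without transpose" comment suggests, no explicit transpose of the weight data is inserted, so this looks like a debugging shortcut to exercise the A x B^T matmul path rather than a fully equivalent transformation.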
@@ -55,7 +55,6 @@ TEST(NMutator, NaiveConvWithInterpreter) {
               gg->getOutputs()[0]->getRawDataPtr<void *>());
 }

-// FIXME: failed since implicit transpose for DLT
 TEST(NMutator, InfoGAN_TConv_3_correctness) {
     const bool useMutatorDirectly = false;
     Runtime runtime = make_ref<CudaRuntimeObj>();
@@ -123,6 +122,7 @@ TEST(NMutator, InfoGAN_TConv_3_correctness) {

     auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
     auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+    dbg(runtime->getPerfTime(bestGraph, true));
     EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
 }
 }
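The test-side changes are consistent with the rest of the commit: the FIXME marking InfoGAN_TConv_3_correctness as failing (implicit transpose for DLT) is dropped, and a dbg(runtime->getPerfTime(bestGraph, true)) trace is added so the measured time of the selected graph is printed alongside the correctness check (equalData with a 1e-4 tolerance).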