diff --git a/include/core/tensor.h b/include/core/tensor.h
index 61ba2bac..931019b6 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -77,6 +77,7 @@ class TensorObj : public TensorBaseObj {
     // Thus the internal state of generator cannot be updated.
     void setData(
         std::function<void(void *, size_t, DataType)> const &generator) const;
+    void setData(const Blob &_blob) { data = _blob; }
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc
index 3ef686c5..0e85bbd1 100644
--- a/src/core/search_engine.cc
+++ b/src/core/search_engine.cc
@@ -1,6 +1,7 @@
 #include "core/search_engine.h"
 #include "core/hash.h"
 #include "core/runtime.h"
+#include "nnet/dbg.h"
 #include <algorithm>
 #include <iostream>
 
@@ -74,7 +75,9 @@ Graph SearchEngine::run(const Graph graph) {
             nextGraphs.emplace_back(tmp);
         }
     }
+    dbg("===Num" + std::to_string(nextGraphs.size()));
     std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
+
     if (nextGraphs.size() > GRAPH_SIZE) {
         nextGraphs.resize(GRAPH_SIZE);
     }
@@ -122,6 +125,7 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
     }
 
     // compare with perf time
+    dbg("===Num" + std::to_string(results.size()));
     std::sort(results.begin(), results.end(), graphTimeComparer);
     if (results.size() > GRAPH_SIZE) {
         results.resize(GRAPH_SIZE);
@@ -341,6 +345,9 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         std::vector<Graph> nextGraphs;
         if (node.type == 1) { // If it has computing OPs
             auto mutatedGraphs = mutator->run(node.graph);
+            // HACK: only try the first two mutants for debugging
+            if (mutatedGraphs.size() > 2)
+                mutatedGraphs.resize(2);
             for (auto graph : graphs) {
                 for (auto mutatedGraph : mutatedGraphs) {
                     std::vector<Operator> ops;
@@ -373,6 +380,7 @@ std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
         for (auto g : nextGraphs) {
             g->dataMalloc();
         }
+        dbg("===Num" + std::to_string(nextGraphs.size()));
         std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
         if (nextGraphs.size() > GRAPH_SIZE) {
             nextGraphs.resize(GRAPH_SIZE);
diff --git a/src/kernels/cuda/matmul.cc b/src/kernels/cuda/matmul.cc
index 0b15e4b6..1743dfdb 100644
--- a/src/kernels/cuda/matmul.cc
+++ b/src/kernels/cuda/matmul.cc
@@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
         const float alpha = 1.f, beta = 0.f;
         // TODO: use compute type
         cublasStatus_t stat;
-        if (b > 1) {
+        if (b >= 1) {
             // Support batch broadcast with zero stride
             int dimA = op->getInputs(0)->getDims().size();
             int dimB = op->getInputs(1)->getDims().size();
@@ -63,6 +63,10 @@
                 (dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
                     ? 0 // Broadcast the batch dimension if batch size is 1
                     : n * k;
+            // printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f, "
+            //        "B %d %lld, A %d %lld, C %d %d, b %d %d\n",
+            //        opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
+            //        m * n, b, record->algo);
             stat = cublasGemmStridedBatchedEx(
                 context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
                 CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
diff --git a/src/kernels/cuda/reshape.cc b/src/kernels/cuda/reshape.cc
index 77070c23..ffefa52d 100644
--- a/src/kernels/cuda/reshape.cc
+++ b/src/kernels/cuda/reshape.cc
@@ -4,10 +4,13 @@ namespace infini {
 class CopyCuda : public CudaKernelWithoutConfig {
     void compute(const Operator &op,
                  const RuntimeObj *_context) const override {
-        auto inData = op->getInputs(0)->getRawDataPtr<void *>();
-        auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
-        cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
-                        cudaMemcpyDeviceToDevice);
+        // auto inData = op->getInputs(0)->getRawDataPtr<void *>();
+        // auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
+        // cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
+        //                 cudaMemcpyDeviceToDevice);
+
+        // HACK: optimization; alias the input blob instead of copying
+        op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
     }
 };
 // reshape/flatten/identity all act as copying from input to output.
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index b5bf5f89..f88c6eb1 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -77,6 +77,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
 void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
     OpVec computeOps = in_graph->getComputeOps();
     IT_ASSERT(computeOps.size() == 1);
+    // HACK: remove this
+    if (auto op = as<ConvTransposed2dObj>(computeOps[0]); !op)
+        return;
 
     // if (infini::Graph g = transformTConv1x1(computeOps[0])) {
     //     out_graphs.emplace_back(g);
@@ -408,6 +411,9 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
                                     nameNToTensorT.at(BN->getName())};
             TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
             const auto &[b, m, n, k, transa, transb] = op->getArgs();
+            // HACK: pruning for debug
+            if (!((transa == 0) && (transb == 1)))
+                return nullptr;
             g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
                                            outputsPET[0], transa, transb);
         }
@@ -536,11 +542,15 @@ Graph NMutator::transformConvtransposed1x1(Operator _op) {
     auto newA = g->addTensor(
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
     // FRSC
+    // auto newW = g->addTensor(
+    //     {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
+    //     dtype);
+    // HACK: without transpose
     auto newW = g->addTensor(
-        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
+        {weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
-    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
+    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
     g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
                                     op->getOutput()->getDims());
     return g;
diff --git a/test/nnet/test_mutator.cc b/test/nnet/test_mutator.cc
index 1e2b0623..9219c526 100644
--- a/test/nnet/test_mutator.cc
+++ b/test/nnet/test_mutator.cc
@@ -55,7 +55,6 @@ TEST(NMutator, NaiveConvWithInterpreter) {
                       gg->getOutputs()[0]->getRawDataPtr<void *>());
 }
 
-// FIXME: failed since implicit transpose for DLT
 TEST(NMutator, InfoGAN_TConv_3_correctness) {
     const bool useMutatorDirectly = false;
     Runtime runtime = make_ref<CudaRuntimeObj>();
@@ -123,6 +122,7 @@ TEST(NMutator, InfoGAN_TConv_3_correctness) {
 
     auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
     auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+    dbg(runtime->getPerfTime(bestGraph, true));
     EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
 }
 }
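
Note on the "b >= 1" change in src/kernels/cuda/matmul.cc: every matmul now takes the
strided-batched cuBLAS path, and an operand without a batch dimension (or with batch
size 1) is broadcast by passing a batch stride of 0. Below is a standalone sketch of
that calling pattern, not the project's kernel; it assumes CUDA 11+ and column-major
data, all names are local to the example, and it builds with "nvcc sketch.cu -lcublas".

// Zero-stride batch broadcast with cublasGemmStridedBatchedEx: one A matrix
// (strideA == 0) is reused against b distinct B matrices.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int b = 4, m = 2, n = 3, k = 5;
    std::vector<float> hA(m * k, 1.0f);     // a single A, shared by all batches
    std::vector<float> hB(b * k * n, 1.0f); // b distinct B matrices
    std::vector<float> hC(b * m * n, 0.0f);

    float *dA, *dB, *dC;
    cudaMalloc(&dA, hA.size() * sizeof(float));
    cudaMalloc(&dB, hB.size() * sizeof(float));
    cudaMalloc(&dC, hC.size() * sizeof(float));
    cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.f, beta = 0.f;
    // strideA == 0 tells cuBLAS to read the same A for every one of the b
    // GEMMs; B and C advance by a full matrix per batch as usual.
    cublasGemmStridedBatchedEx(
        handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
        dA, CUDA_R_32F, m, /*strideA=*/0,
        dB, CUDA_R_32F, k, /*strideB=*/(long long)k * n, &beta,
        dC, CUDA_R_32F, m, /*strideC=*/(long long)m * n, b,
        CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

    cudaMemcpy(hC.data(), dC, hC.size() * sizeof(float), cudaMemcpyDeviceToHost);
    printf("C[0][0] = %.1f (expect %d)\n", hC[0], k); // ones dotted k times

    cublasDestroy(handle);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return 0;
}

The reshape.cc change applies a related zero-copy idea: instead of a device-to-device
cudaMemcpyAsync, the output tensor now shares the input's data blob through the new
TensorObj::setData(const Blob &) overload. The two tensors then alias the same device
memory, so any later kernel that writes the output in place also writes the input.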