diff --git a/3rd-party/backward-cpp b/3rd-party/backward-cpp
index 3bb9240c..f30744bc 160000
--- a/3rd-party/backward-cpp
+++ b/3rd-party/backward-cpp
@@ -1 +1 @@
-Subproject commit 3bb9240cb15459768adb3e7d963a20e1523a6294
+Subproject commit f30744bcf726ea3735df7ecf9e9de9ddac540283
diff --git a/3rd-party/googletest b/3rd-party/googletest
index b796f7d4..e2239ee6 160000
--- a/3rd-party/googletest
+++ b/3rd-party/googletest
@@ -1 +1 @@
-Subproject commit b796f7d44681514f58a683a3a71ff17c94edb0c1
+Subproject commit e2239ee6043f73722e7aa812a459f54a28552929
diff --git a/3rd-party/nlohmann_json_cmake_fetchcontent b/3rd-party/nlohmann_json_cmake_fetchcontent
index 13132dd3..6aebf092 160000
--- a/3rd-party/nlohmann_json_cmake_fetchcontent
+++ b/3rd-party/nlohmann_json_cmake_fetchcontent
@@ -1 +1 @@
-Subproject commit 13132dd361c8c5b5753983d5186cf54f689d90f9
+Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756
diff --git a/3rd-party/pybind11 b/3rd-party/pybind11
index 0bd8896a..1e3400b6 160000
--- a/3rd-party/pybind11
+++ b/3rd-party/pybind11
@@ -1 +1 @@
-Subproject commit 0bd8896a4010f2d91b2340570c24fa08606ec406
+Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae
diff --git a/include/cuda/cuda_runtime.h b/include/cuda/cuda_runtime.h
index a4a0114a..b0b181aa 100644
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@@ -1,7 +1,6 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
-#include "nnet/dbg.h"
 
 namespace infini {
 
@@ -31,7 +30,6 @@ class CudaRuntimeObj : public RuntimeObj {
     void sync() const;
     CudaPtr alloc(size_t size) override {
         void *ptr;
-        // dbg(size);
         checkCudaError(cudaMalloc(&ptr, size));
         allocatedGPUMemorySize += size;
         allocationMap[ptr] = size;
diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc
index c94607b1..d190a689 100644
--- a/src/core/search_engine.cc
+++ b/src/core/search_engine.cc
@@ -502,9 +502,10 @@ Graph SearchEngine::fuseVertically(const Graph &graph) {
         auto bestGraph = make_ref(runtimeExec, chainOps);
         // Eliminate transpose and reshape operators
-        // if (auto eliminatedGraph = mutator->eliminateVertically(
-        //         make_ref(runtimeExec, chainOps)))
-        //     bestGraph = eliminatedGraph;
+        // FIXME: current Relu only support 3D and 4D tensors
+        if (auto eliminatedGraph = mutator->eliminateVertically(
+                make_ref(runtimeExec, chainOps)))
+            bestGraph = eliminatedGraph;
         // Fuse membound operators
         if (auto optGraph = mutator->fuseVertically(bestGraph))
             bestGraph = optGraph;
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index ba81fa21..627bb021 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -2,7 +2,6 @@
 #include "core/blob.h"
 #include "core/operator.h"
 #include "core/runtime.h"
-#include "nnet/dbg.h"
 #include "utils/dataloader.h"
 #include
 #include
diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc
index c8d08092..da65104a 100644
--- a/src/cuda/cuda_runtime.cc
+++ b/src/cuda/cuda_runtime.cc
@@ -3,6 +3,7 @@
 #include "core/perf_engine.h"
 #include "core/runtime.h"
 #include "cuda_profiler_api.h"
+#include "nnet/dbg.h"
 #include "operators/conv.h"
 #include "operators/matmul.h"
 #ifdef INFINI_USE_TVM
@@ -165,9 +166,9 @@ double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
     auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
     // Since one TVM packed function may contaion more than one CUDA kernel, the
     // number of captured kernels may exceed the number of operators.
-    // IT_ASSERT(numCudaGraphNodes >= kernels.size(),
-    //           std::to_string(numCudaGraphNodes) +
-    //               " != " + std::to_string(kernels.size()));
+    IT_ASSERT(numCudaGraphNodes >= kernels.size(),
+              std::to_string(numCudaGraphNodes) +
+                  " != " + std::to_string(kernels.size()));
     printf("numCudaGraphNodes = %lu\n", numCudaGraphNodes);
     return timeit(
         [&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
diff --git a/src/kernels/cuda/conv.cc b/src/kernels/cuda/conv.cc
index 7f5f6031..cf214cb3 100644
--- a/src/kernels/cuda/conv.cc
+++ b/src/kernels/cuda/conv.cc
@@ -82,6 +82,9 @@ class convCudnn : public Kernel {
         // get kernels
         cudnnFilterDescriptor_t knDesc;
         checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
+        // FIXME: filter data layout is not changed with input data layout
+        // since FCRS shows better performance for NHWC inputs in some cases.
+        // This should be tunable.
         checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
                                                    CUDNN_TENSOR_NCHW, f,
                                                    channelsPerGrp, r, s));
diff --git a/src/kernels/cuda/conv2dreduce.cc b/src/kernels/cuda/conv2dreduce.cc
index 2afbfc47..aaf1b5f6 100644
--- a/src/kernels/cuda/conv2dreduce.cc
+++ b/src/kernels/cuda/conv2dreduce.cc
@@ -41,4 +41,4 @@ REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduce, DataType::Float32,
 REGISTER_KERNEL(Device::CUDA, OpType::Conv2dReduceTranspose, DataType::Float32,
                 Conv2dReduceCuda, "Conv2dReduceTranspose_CUDA_Float32");
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc
index 00a56860..ee4e590b 100644
--- a/src/kernels/cuda/unary.cc
+++ b/src/kernels/cuda/unary.cc
@@ -32,7 +32,6 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
         } else if (dim.size() == 3) {
             n = 1, c = dim[0], h = dim[1], w = dim[2];
         } else {
-            dbg(vecToString(dim));
             IT_TODO_HALT();
         }
 
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index 8941bd23..8e9c75fa 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -574,24 +574,21 @@ Graph NMutator::transformConvToGEMMReduce(Operator _op) {
     IT_ASSERT(inputDims[2] == w);
     IT_ASSERT(inputDims[3] == c);
     const DataType dtype = A->getDType();
-    // IT_ASSERT(outputDims[0] == n);
-    // IT_ASSERT(outputDims[1] == h);
-    // IT_ASSERT(outputDims[2] == w);
-    // IT_ASSERT(outputDims[3] == f);
     auto g = make_ref(runtime);
-    dbg(vecToString(inputDims));
-    dbg(vecToString(weightDims));
     auto newA = g->addTensor(
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
-    auto newW = g->addTensor(
-        {weightDims[3], weightDims[0] * weightDims[1] * weightDims[2]}, dtype);
+
+    // If use Matmul with transpose 0,0
     // auto newW = g->addTensor(
-    //     {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
-    //     dtype);
+    //     {weightDims[3], weightDims[0] * weightDims[1] * weightDims[2]}, dtype);
+
+    // If use Matmul with transpose 0, 1
+    auto newW = g->addTensor(
+        {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
+        dtype);
     g->addOpWithOutputs(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs(g->cloneTensor(W), newW, newW->getDims());
-    Tensor newO = g->addOp(newA, newW, nullptr, 0, 0)->getOutput();
+    Tensor newO = g->addOp(newA, newW, nullptr, 0, 1)->getOutput();
     auto new1 = g->addTensor({n, h, w, f, r, s}, dtype);
     g->addOpWithOutputs(newO, new1, new1->getDims());
     g->addOpWithOutputs(
@@ -605,39 +602,27 @@ Graph NMutator::transformConvTranposeToGEMMReduce(Operator _op) {
         return nullptr;
     const auto &A = op->getInputs()[0];
     const auto &W = op->getInputs()[1];
+    // f is the de-facto input channel for ConvTranspose
     const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
     const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
     const Shape inputDims = op->getInputs(0)->getDims();
     const Shape weightDims = op->getInputs(1)->getDims();
     const Shape outputDims = op->getOutput()->getDims();
-    dbg(vecToString(inputDims));
-    dbg(vecToString(weightDims));
-    dbg(vecToString(op->getOutput()->getDims()));
-    // dbg(vecToString(op->getNCHWFRS());
-    IT_ASSERT(weightDims[0] == f);
-    IT_ASSERT(weightDims[1] == r);
-    IT_ASSERT(weightDims[2] == s);
-    IT_ASSERT(weightDims[3] == c);
-    IT_ASSERT(inputDims[0] == n);
-    IT_ASSERT(inputDims[1] == h);
-    IT_ASSERT(inputDims[2] == w);
-    IT_ASSERT(inputDims[3] == f);
     const DataType dtype = A->getDType();
     auto g = make_ref(runtime);
-    auto newA = g->addTensor(
+    auto newA = g->addTensor( // [N,H,W,F]
         {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
-    auto newW = g->addTensor(
+    auto newW = g->addTensor( // [F, CRS]
         {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
-        dtype); // hack
+        dtype); // HACK: this should be a transpose
-    // auto newW = g->addTensor(
-    //     {weightDims[0] * weightDims[1] * weightDims[2], weightDims[3]},
-    //     dtype);
     g->addOpWithOutputs(g->cloneTensor(A), newA, newA->getDims());
     g->addOpWithOutputs(g->cloneTensor(W), newW, newW->getDims());
+    // newO [NHW, CRS]
     Tensor newO = g->addOp(newA, newW, nullptr, 0, 0)->getOutput();
     auto new1 = g->addTensor({n, h, w, c, r, s}, dtype);
     g->addOpWithOutputs(newO, new1, new1->getDims());
+    // [NHW, CRS] -> [N,H,W,C]
     g->addOpWithOutputs(
         new1, nullptr, g->cloneTensor(op->getOutput()), false, 0.f, ph, pw);
     return g;
diff --git a/src/operators/conv2dreduce.cc b/src/operators/conv2dreduce.cc
index 798e6d9c..86d3700d 100644
--- a/src/operators/conv2dreduce.cc
+++ b/src/operators/conv2dreduce.cc
@@ -95,4 +95,4 @@ Conv2dReduceTranspose::inferShape(const TensorVec &inputs) const {
     return {{{on, oh, ow, of}}};
 }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/test/nnet/run_models_nnet.py b/test/nnet/run_models_nnet.py
index 008a7c21..383a4393 100644
--- a/test/nnet/run_models_nnet.py
+++ b/test/nnet/run_models_nnet.py
@@ -155,7 +155,7 @@ if __name__ == "__main__":
         # construct_convTranspose2d(runtime)
         # (load_onnx(runtime, '/mnt/auxHome/models/einnet/fsrcnn.bs1.onnx'), 'fsrcnn.bs1'),
         (ft.getFSRCNNGraph(1, runtime), "fsrcnn.bs1"),
-        (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16")
+        (ft.getFSRCNNGraph(16, runtime), "fsrcnn.bs16"),
         # (construct_conv_nhwc(runtime, 1, 56, 32, 32, 12, 1, 1, 0, 1, 1), 'conv1x1')
     ]