Compare commits

...

39 Commits

Author SHA1 Message Date
Liyan Zheng 28bf2f0e83 Add: time non-compile-time-computable operators 2023-04-18 16:08:05 +08:00
Liyan Zheng 133d91bc76 Chore: remove deprecated function 2023-04-18 14:56:35 +08:00
Liyan Zheng 37f3e9cf22 Add: NMutator::memboundToJson to export memboundOp 2023-04-18 09:56:14 +08:00
Liyan Zheng 885a978016 Add: nnet::Serializer supports FuncNode 2023-04-17 20:15:40 +08:00
Liyan Zheng a65dbff5f9 Chore: disable nnet_unimplemented_continue output 2023-04-17 13:24:56 +08:00
Liyan Zheng da3f5605e5 Add: exclude compile-time computable operator time 2023-04-17 13:24:14 +08:00
Liyan Zheng 45f7841d83 Add: C++ callback to export ONNX 2023-04-17 11:48:26 +08:00
Liyan Zheng 7a1d271c79 Add: RangeOpNode::getFullExpression() 2023-04-17 11:41:13 +08:00
Liyan Zheng b2c53458d9 Add: fuse membound operators 2023-04-17 11:38:48 +08:00
Liyan Zheng e76f2d0f10 Chore: simplify type names 2023-04-15 13:58:18 +08:00
Liyan Zheng f881db7ceb Add: export to ONNX with custom operators 2023-04-15 00:47:11 +08:00
Liyan Zheng b72662bb9b Add: infogan python interface 2023-04-14 16:36:55 +08:00
Liyan Zheng 582de83629 Fix: avoid reload library 2023-04-14 15:10:47 +08:00
Liyan Zheng b6b37ccf33 Add: invoke TVM through pipe 2023-04-13 21:43:56 +08:00
Liyan Zheng 9a009be337 Add: search engine uses estimated time 2023-04-13 19:46:54 +08:00
Liyan Zheng f0785225d7 Chore: format and comments 2023-04-12 11:48:11 +08:00
Liyan Zheng f939ebf8bb Add: comments for Matmul 2023-04-12 11:24:31 +08:00
Liyan Zheng 31b03ef91a Add: detailed output of Matmul 2023-04-12 11:24:31 +08:00
Liyan Zheng bd76a5c8d6 Add: CMake support of TVM 2023-04-12 11:24:31 +08:00
Liyan Zheng 83d5842f56 Fix: specify runtime 2023-04-12 11:24:31 +08:00
Liyan Zheng 005c113948 Add: support TVM packed function 2023-04-12 11:24:31 +08:00
Liyan Zheng 0d70643867 Add: Random generator 2023-04-12 11:24:31 +08:00
Liyan Zheng f29f54687c Add: PRelu Expr and AsTVMVisitor 2023-04-12 11:24:31 +08:00
huangshuhong 67e9fb9632 Fix: test_memboundOp 2023-04-12 11:24:29 +08:00
huangshuhong af80830d56 Fix: membound_tvm 2023-04-12 11:24:25 +08:00
huangshuhong f19bd4d1d6 Fix: remove cuCtxCreate which makes TVM fails 2023-04-12 11:24:25 +08:00
huangshuhong 2e501de8db Add: PackedFunc (broken) 2023-04-12 11:24:13 +08:00
Liyan Zheng 31b1edc600 Add: CMake config 2023-04-12 11:24:13 +08:00
Liyan Zheng 69d894e003 Add: TVM headers and CMake include paths 2023-04-12 11:24:13 +08:00
Liyan Zheng e8b4e3f03f Add: expr simplication for TVM 2023-04-12 11:24:13 +08:00
Liyan Zheng 0d8685b1e0 Fix: graph OpList ctor 2023-04-12 11:24:13 +08:00
Liyan Zheng 04858012bf Add: Exception for checkCuError 2023-04-12 11:24:13 +08:00
Liyan Zheng 6aaf963f50 Fix: cuBLAS failure when codegen is enabled 2023-04-12 11:24:13 +08:00
Liyan Zheng 6441abb88f Fix: GraphObj ctor should use cloneTensor 2023-04-12 11:24:13 +08:00
huangshuhong 220b76f355 Add: matmul broadcast 2023-04-12 11:24:09 +08:00
Liyan Zheng aac4bd4650 Fix: [Intermediate] NMutator::expressionToGraph
To be fixed: matmul with implicit broadcast
2023-04-12 11:24:09 +08:00
Liyan Zheng 7478b698dc Add: expression reader as a cmake target 2023-04-12 11:24:09 +08:00
huangshuhong 8e6fc4136e Add: cache and padding (bugs!!) 2023-04-12 11:24:01 +08:00
Liyan Zheng bd12b67ea3 Add: mutator InfoGAN minimum test 2023-04-11 20:44:08 +08:00
55 changed files with 3084 additions and 477 deletions

View File

@ -16,6 +16,16 @@ cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Load an optional user configuration file, preferring the build tree
# (CMAKE_CURRENT_BINARY_DIR) over the source tree when both exist.
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
  message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
  include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
  message(STATUS "Using config.cmake in CMAKE_CURRENT_SOURCE_DIR directory")
  include(${CMAKE_CURRENT_SOURCE_DIR}/config.cmake)
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
@ -63,6 +73,19 @@ include_directories(3rd-party/pybind11/include)
add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
# TVM backend
# TVM backend: header paths and compile definitions needed only when the
# EINNET tests (which exercise the TVM codegen path) are enabled.
if(BUILD_TEST_EINNET)
  # Fix: the original condition tested DLPACK_INCLUDE_DIR twice and thus
  # matched the error message only by accident; each directory is now
  # checked exactly once.
  if(NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
    message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
  endif()
  # TVM and DMLC for invoking TVM packed functions
  include_directories(${TVM_INCLUDE_DIR})
  include_directories(${DMLC_INCLUDE_DIR})
  include_directories(${DLPACK_INCLUDE_DIR})
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
  # Fix: append instead of overwrite — the original `set(CMAKE_CXX_FLAGS
  # "-DINFINI_USE_TVM=1")` discarded every flag accumulated above,
  # including the DMLC logging define set on the previous line.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
endif()
if(BUILD_TEST)
set(BUILD_GMOCK
OFF
@ -100,6 +123,11 @@ endif()
target_link_libraries(InfiniTensor pybind11::embed)
# TVM backend
if(BUILD_TEST_EINNET)
  # Link the TVM runtime shared library by absolute path; TVM_LIB_DIR is
  # supplied by config.cmake. The keyword-less signature is kept because
  # InfiniTensor is already linked with the plain signature above, and
  # CMake forbids mixing plain and keyword forms on one target.
  # NOTE(review): a hardcoded .so path is non-portable — an IMPORTED
  # target wrapping libtvm would be preferable; confirm before changing.
  target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
endif()
# Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS})
@ -219,5 +247,9 @@ if(BUILD_TEST)
endif()
# EINNET tests plus a small standalone tool for reading derivation logs.
if(BUILD_TEST_EINNET)
  build_test(test/nnet/test_*.cc)
  # Build expression reader
  add_executable(nnet_reader test/nnet/readlog.cc)
  # PRIVATE: nnet_reader is a leaf executable defined and linked only here,
  # so the keyword form is safe and makes the dependency scope explicit.
  target_link_libraries(nnet_reader PRIVATE InfiniTensor)
endif()
endif()

View File

@ -0,0 +1,13 @@
# Example build configuration; CMakeLists.txt includes this file from the
# build or source directory automatically.
#
# TVM installation layout. The hardcoded default below is machine-specific;
# setting the TVM_HOME environment variable overrides it so the same
# config file works on other machines (backward compatible: when the
# variable is unset, behavior is unchanged).
if(DEFINED ENV{TVM_HOME})
  set(TVM_HOME "$ENV{TVM_HOME}")
else()
  set(TVM_HOME "/home/zly/Apps/tvm-v0.10.0")
endif()
set(TVM_INCLUDE_DIR "${TVM_HOME}/include")
set(TVM_LIB_DIR "${TVM_HOME}/build")
set(DMLC_INCLUDE_DIR "${TVM_HOME}/3rdparty/dmlc-core/include")
set(DLPACK_INCLUDE_DIR "${TVM_HOME}/3rdparty/dlpack/include")
# Feature switches consumed by the top-level CMakeLists.txt.
set(USE_CUDA ON)
set(USE_BANG OFF)
set(BUILD_TEST ON)
set(BUILD_TEST_CORE ON)
set(BUILD_TEST_PET OFF)
set(BUILD_TEST_EINNET ON)

View File

@ -16,9 +16,13 @@ class GraphObj : public Object {
string toString() const override;
Runtime getRuntime() const { return runtime; }
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
TensorType tensorType = TensorType::Other);
Tensor addTensor(const Tensor &tensor);
TensorVec addTensor(const TensorVec &tensors);
/**
* @brief Clone a tensor and add it to the graph.
*/
Tensor cloneTensor(const Tensor &tensor) {
return addTensor(tensor->clone(runtime));
}
@ -79,6 +83,8 @@ class GraphObj : public Object {
return ret;
}
bool selfCheck(bool assert = false) const;
private:
/**
* @brief Add reverse connections and Op relationship in ctor.

View File

@ -30,6 +30,7 @@ class Mutator {
virtual bool isMultiBranchMergable(const Graph &in_graph) {
IT_TODO_HALT();
}
virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
};
} // namespace infini

View File

@ -39,6 +39,7 @@ enum class OpType {
Tanh,
Abs,
Resize,
PRelu,
//
MemBound = 300,
};

View File

@ -1,5 +1,6 @@
#pragma once
#include "core/common.h"
#include "core/object.h"
#include "core/ref.h"
#include <memory>
@ -59,10 +60,11 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* execution happens.
*
* @param graph
* @param profiling Whether to print breakdown of time
* @param printProfiling Whether to print breakdown of time
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool profiling = false) const;
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::MKL;
@ -77,6 +79,11 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
size_t bytes) const = 0;
virtual string toString() const = 0;
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph) const;
protected:
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,

View File

@ -4,19 +4,15 @@
#include "graph.h"
#include "mutator.h"
#include <unordered_map>
namespace infini {
class SearchEngine {
private:
Runtime runtimeExec;
Ref<Mutator> mutator;
std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
public:
SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
runtimeExec = _runtime;
mutator = _mutator;
}
SearchEngine(Runtime runtime, Ref<Mutator> mutator);
~SearchEngine() {}
private: // Configurations
@ -24,24 +20,17 @@ class SearchEngine {
3; // cut nodes whose #in + #out >= partitionThreshold
size_t GRAPH_SIZE = 16; // num of best graphs.
private: // Composed objects
std::shared_ptr<Mutator> mutationEngine;
public:
std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
struct GroupEdge {
int v, next;
GroupEdge() = delete;
};
struct Candidate { // a graph with perf
std::shared_ptr<Graph> graph;
double perf = INFINITY;
};
class MetaGraph { // a graph of subgraphs, for searching.
public:
MetaGraph() {}
~MetaGraph() {}
// struct Candidate { // a graph with perf
// Graph graph;
// double perf = INFINITY;
// };
struct MetaGraphObj { // a graph of subgraphs, for searching.
struct Node {
Graph graph;
std::vector<int> suc;
@ -50,31 +39,33 @@ class SearchEngine {
};
std::vector<Node> nodes;
};
using MetaGraph = Ref<MetaGraphObj>;
Graph run(const Graph graph); // entrance of search engine.
Graph run(const Graph graph); // entrance to search engine.
std::vector<Graph> search(const Graph &graph); // search for a partition.
private:
std::vector<Graph> partitionGraph(const Graph graph);
std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
std::shared_ptr<MetaGraph>
buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
const std::vector<int> &plan);
MetaGraph buildMetaGraphWithGraph(const Graph graph);
MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan);
// search horizontal merges
std::vector<std::shared_ptr<MetaGraph>>
searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan, std::vector<int> &frontier,
std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet);
std::vector<Graph>
searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
std::vector<Graph> searchMutation(const MetaGraph &metaGraph);
void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
void printMetaGraph(MetaGraph metaGraph);
/**
* @brief Check whether a multi-brach graph can be merged into a single
* branch.
*/
bool isMultiBranchMergable(const Graph graph);
Graph fuseVertically(const Graph &graph);
double getEstimatedGraphPerf(Graph graph);
};
} // namespace infini

View File

@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
class TensorObj : public TensorBaseObj {
private:
Shape shape;
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
TensorObj(Shape shape, DataType dtype, Runtime runtime,
TensorType tensorType = TensorType::Other);
virtual ~TensorObj() {}
string toString() const override;
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
TensorType getTensorType() const { return tensorType; }
void load(std::string file_path);
void save(std::string file_path);
@ -69,6 +72,9 @@ class TensorObj : public TensorBaseObj {
void copyData(const TensorObj *src);
void copyData(const Tensor &src) { copyData(src.get()); }
// FIXME: std::function copies the generator instead of passing it by ref.
// Thus the internal state of generator cannot be updated.
void setData(
const std::function<void(void *, size_t, DataType)> &generator) const;
Tensor clone() const {
@ -92,29 +98,31 @@ class TensorObj : public TensorBaseObj {
}
void printData() const;
bool equalData(const Tensor &rhs) const;
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
template <typename T> bool equalData(const vector<T> &dataVector) {
IT_ASSERT(DataType::get<T>() == dtype);
IT_ASSERT(size() == dataVector.size());
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size(),
1e-6);
}
size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
private:
void printDataFloat() const;
void printDataUint32_t() const;
void printDataFloat(float *ptr) const;
void printDataUint32_t(uint32_t *ptr) const;
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size) const {
bool equalDataImpl(const T *a, const T *b, size_t size,
double relativeError) const {
for (size_t i = 0; i < size; ++i) {
if constexpr (std::is_integral_v<T>) {
if (a[i] != b[i])
return false;
} else if constexpr (std::is_floating_point_v<T>) {
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
1e-6) {
relativeError) {
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
return false;
}

View File

@ -23,9 +23,8 @@
const char *errName; \
if (CUDA_SUCCESS != err) { \
cuGetErrorString(err, &errName); \
fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
errName); \
exit(EXIT_FAILURE); \
IT_ASSERT(err == CUDA_SUCCESS, \
(string("CU error: ") + string(errName))); \
} \
}

View File

@ -11,18 +11,8 @@ class CudaRuntimeObj : public RuntimeObj {
CudaPtr workspace;
size_t workspaceSize;
public:
CUdevice cuDevice;
CUcontext newContext;
public:
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
// Prepare for nvrtc. cuCtxCreate should be called befero others.
// Otherwise it will result in strange failure, such as cuBLAS failed on
// certian inputs.
checkCUresult(cuInit(0));
checkCUresult(cuDeviceGet(&cuDevice, 0));
checkCUresult(cuCtxCreate(&newContext, 0, cuDevice));
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
@ -32,10 +22,13 @@ class CudaRuntimeObj : public RuntimeObj {
workspace = alloc(workspaceSize);
}
virtual ~CudaRuntimeObj() {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
checkCUresult(cuCtxDestroy(newContext));
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
string toString() const override;

View File

@ -0,0 +1,9 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"

namespace infini {
namespace callback {

/// Export `graph` as an ONNX model written to `path`.
/// Declaration only — NOTE(review): presumably implemented via the Python
/// FFI (commit "Add: C++ callback to export ONNX"); confirm where the
/// definition lives before relying on link-time availability.
void exportONNX(const Graph &graph, const string &path);

} // namespace callback
} // namespace infini

View File

@ -22,6 +22,7 @@ class HashVisitor : public Functor<HashType(void)> {
HashType visit_(const Subscript &c) override;
HashType visit_(const Tensor &c) override;
HashType visit_(const Var &c) override;
HashType visit_(const Func &c) override;
public:
HashVisitor(int _verobse = 0) : Functor(_verobse) {}

View File

@ -20,7 +20,13 @@ class MergeMemboundMutator : public Mutator {
*/
MergeMemboundMutator(const VecExpr &kernels)
: Mutator(), kernels(kernels), curDepth(kernels.size() - 1) {}
Expr merge(bool allowEmptyMembound = false);
/// @brief Merged multiple expressions into one with one or several stages.
/// @param allowEmptyMembound
/// @param allowFailure If true, return nullptr when merging fails. If
/// false, assert will fail.
/// @return
Expr merge(bool allowEmptyMembound = false, bool allowFailure = false);
};
} // namespace nnet

View File

@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
string visit_(const Subscript &c) override;
string visit_(const Var &c) override;
string visit_(const Tensor &c) override;
string visit_(const Func &c) override;
string dispatchRoutine(const Routine &c);
Expr buildExprTree(string key);

View File

@ -66,10 +66,11 @@ static inline HashType genhash(string s) {
}
#define nnet_unimplemented_halt() \
{ assert(!"Unimplemented"); }
{ IT_TODO_HALT(); }
#define nnet_unimplemented_continue() \
{ dbg("Unimplemented"); }
{}
// { dbg("Unimplemented"); }
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))

View File

@ -104,10 +104,11 @@ enum class NodeType {
FuncNodeType
};
enum class FuncType { Relu, Tanh };
enum class FuncType { Relu = 1000, Tanh, PRelu };
#define DEFINE_GETTYPE(CLASS) \
NodeType getType() const override { return NodeType::CLASS##Type; }
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
NodeType getType() const override { return NodeType::CLASS##Type; } \
bool isScalar() const override { return isScalar_v; }
class ExprNode {
public:
@ -119,6 +120,7 @@ class ExprNode {
friend std::ostream &operator<<(std::ostream &ios, const ExprNode &expr);
virtual NodeType getType() const = 0;
virtual bool isScalar() const = 0;
};
class VarNode : public ExprNode {
@ -127,7 +129,7 @@ class VarNode : public ExprNode {
public:
VarNode(std::string _name) : name(_name){};
virtual ~VarNode() {}
DEFINE_GETTYPE(VarNode);
DEFINE_GETTYPE(VarNode, true);
const std::string &getName() const { return name; }
HashType hash() const override { return genhash(name); };
@ -152,7 +154,7 @@ class TensorNode : public ExprNode {
TensorNode(string _name, vector<int> _shape, vector<int> _paddings = {},
Routine _source = nullptr);
virtual ~TensorNode() {}
DEFINE_GETTYPE(TensorNode);
DEFINE_GETTYPE(TensorNode, false);
bool operator==(const string &rhs) { return name == rhs; }
friend bool operator==(const string &lhs, const TensorNode &rhs) {
@ -174,6 +176,7 @@ class TensorNode : public ExprNode {
const Routine &getSource() const { return source; }
int getData(const Ref<vector<int>> &data, const vector<int> &idx);
size_t getOffset(const vector<int> &idx);
bool hasPadding();
};
enum class OpType { Range, Add, Mul, Div, Mod, Sub };
@ -203,7 +206,8 @@ struct IterationType {
enum { Loop, Sum };
constexpr static int NumIterationType = 2;
};
class RangeOpNode : public OperatorNode {
class RangeOpNode : public OperatorNode,
public std::enable_shared_from_this<RangeOpNode> {
public:
enum { Summand, END_POS };
constexpr static int Loop = IterationType::Loop;
@ -220,13 +224,14 @@ class RangeOpNode : public OperatorNode {
const vector<int> &paddings)
: OperatorNode(OpType::Range, {_summand}), vars{_loopIters, _sumIters},
paddings(paddings){};
DEFINE_GETTYPE(RangeOpNode);
DEFINE_GETTYPE(RangeOpNode, false);
virtual HashType hash() const override {
nnet_unimplemented_halt();
return 0;
};
string toReadable() const override;
string getFullExpression();
const Expr &getSummand() const { return subExprs[Summand]; }
const vector<VarRangePair> &getVarRanges(int _index) const {
return vars[_index];
@ -289,7 +294,7 @@ class BinaryOpNode : public OperatorNode {
BinaryOpNode(OpType _opType, Expr _lhs, Expr _rhs)
: OperatorNode(_opType, {_lhs, _rhs}){};
virtual ~BinaryOpNode() {}
DEFINE_GETTYPE(BinaryOpNode);
DEFINE_GETTYPE(BinaryOpNode, true);
virtual HashType hash() const override {
return genhash((HashType)opType,
@ -314,7 +319,7 @@ class ConstantNode : public ExprNode {
ConstantNode(int _val) : val(_val){};
ConstantNode(const ConstantNode &rhs) : ExprNode(rhs), val(rhs.val){};
virtual ~ConstantNode() {}
DEFINE_GETTYPE(ConstantNode);
DEFINE_GETTYPE(ConstantNode, true);
int getValue() const { return val; }
virtual HashType hash() const override { return genhash(val, 6214587); };
@ -334,7 +339,7 @@ class SubscriptNode : public ExprNode {
SubscriptNode(Expr _indexed, vector<Expr> _subExprs) : subExprs(_subExprs) {
setObject(_indexed);
};
DEFINE_GETTYPE(SubscriptNode);
DEFINE_GETTYPE(SubscriptNode, true);
virtual HashType hash() const override {
nnet_unimplemented_continue();
@ -358,14 +363,15 @@ class SubscriptNode : public ExprNode {
class FuncNode : public ExprNode {
protected:
Subscript object;
Expr object;
FuncType funcType;
public:
FuncNode(Expr object, FuncType funcType) : funcType(funcType) {
setObject(object);
FuncNode(Expr object, FuncType funcType)
: object(object), funcType(funcType) {
nnet_assert(object->isScalar(), "FuncNode operates on a scalar");
}
DEFINE_GETTYPE(FuncNode);
DEFINE_GETTYPE(FuncNode, true);
virtual HashType hash() const override {
nnet_unimplemented_continue();
@ -373,7 +379,7 @@ class FuncNode : public ExprNode {
};
virtual string toReadable() const override;
const Subscript &getObject() const { return object; }
const Expr &getObject() const { return object; }
void setObject(Expr e);
FuncType getFuncType() const { return funcType; }

View File

@ -7,12 +7,13 @@ namespace infini {
class NMutator : public Mutator {
public:
enum class Mode { Normal, ToNaiveMembound, RuleBased };
using NameNToTensorT = map<string, Tensor>;
private:
// Suffix -N: NNet objects.
// Suffix -T: tpm objects.
// Map: NNet tensors -> tpm tensor.
std::map<std::string, Tensor> inputsNameNToTensorT;
NameNToTensorT inputsNameNToTensorT;
Mode mode;
const double bandwidth = double(200) * 1024 * 1024 * 1024;
// If in RuleBased mode, use derivationRules in derivator
@ -20,19 +21,30 @@ class NMutator : public Mutator {
public:
NMutator(Mode mode = Mode::Normal);
NMutator(const std::vector<int> &derivationRules);
NMutator(Mode mode, const std::vector<int> &derivationRules);
~NMutator();
vector<Graph> run(const Graph &in_graph) override;
void setToNaiveMembound();
Graph fuseVertically(const Graph &in_graph) override;
void setToNaiveMembound();
void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
long long cntStates = 0;
long long cntCandidates = 0;
static void memboundToJson(const Graph &g, const string path);
private:
int maxDepth = 8;
nnet::Expr opToExpression(Operator op);
/// @brief
/// @param op
/// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
static pair<nnet::Expr, NMutator::NameNToTensorT>
generateUnaryExpr(const Operator &op);
static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
/**
@ -48,8 +60,8 @@ class NMutator : public Mutator {
// TODO: recover these rules
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
// Graph transformTConv1x1(Operator op);
// Graph transformTConv3x3(Operator op);
Graph transformConvtransposed1x1(Operator _op);
// Graph transformConvtransposed(Operator op);
// Graph transformDialtedConv(Operator op);
// Graph transformConv1x1(Operator op);
// Graph transformConv1xk(Operator op);

View File

@ -19,9 +19,15 @@ class MatmulObj : public OperatorObj {
public:
/**
* @brief Construct a new Matmul object. This comments show how operators is
* defined in InfiniTensor. The constructor can create output tensors for
* the operator or not, which depends on `graph`.
* @brief Matmul operator with batch broadcast and tensor transpose
* supports. Only one tensor with a single batch can be broadcast due to the
* BLAS interface restriction. Transpose indicates whether the last two
* dimensions should be transposed before Matmul and does not affect other
* leading dimensions.
*
* Matmul show how operators are defined in InfiniTensor. The constructor of
* an operator can create output tensors for the operator or not, which
* depends on `graph`.
*
* @param graph The computation graph that this operator belongs to.
* @param A The input tensor.

View File

@ -6,10 +6,13 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr;
std::vector<nnet::Tensor>
nnetInputs; // The order of inputs in nnetInputs should be consistent
// with inputs in infinitensor
nnet::Expr expr, simplifiedExpr;
double exec_time;
std::string hint;
HashType hash, simplifiedHash;
int n, f, h, w;
public:
@ -26,11 +29,17 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {expr, hash};
}
double getEstimatedTime() const { return exec_time; }
void saveAsJson(string path) const;
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
HashType getHash() const;
static HashType calcHash(nnet::Expr expr);
static bool checkOOB(nnet::Expr expr);
};
} // namespace infini

View File

@ -41,4 +41,5 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
DEFINE_UNARY_OBJ(Abs, OpType::Abs)
DEFINE_UNARY_OBJ(PRelu, OpType::PRelu)
}; // namespace infini

View File

@ -1,5 +1,7 @@
#pragma once
#include "core/common.h"
#include "core/tensor_base.h"
#include <random>
namespace infini {
@ -38,6 +40,34 @@ class IncrementalGenerator : public DataGenerator {
void fill(float *data, size_t size) override { fill<float>(data, size); }
};
// Fills tensors with uniformly distributed random values. Deterministic for
// a fixed seed (mt19937 engine), so tests can reproduce data.
class RandomGenerator : public DataGenerator {
  private:
    double l, r;          // requested value range
    std::mt19937 e;       // engine seeded once in the constructor
    std::uniform_int_distribution<int> di;
    std::uniform_real_distribution<float> dr;
    bool generateInteger; // when true, fill(float*) also emits integer values

  public:
    // l/r: value range; seed: RNG seed (same seed -> same sequence);
    // generateInteger: draw from the integer distribution even for floats.
    // NOTE(review): l and r are narrowed to int when initializing `di`,
    // so fractional bounds are truncated for integer generation — confirm
    // this is intended.
    RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
                    bool generateInteger = false)
        : l(l), r(r), e(seed), di(l, r), dr(l, r),
          generateInteger(generateInteger) {}
    virtual ~RandomGenerator() {}

  private:
    void fill(uint32_t *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = di(e);
        }
    }
    void fill(float *data, size_t size) override {
        for (size_t i = 0; i < size; i++) {
            data[i] = (generateInteger) ? di(e) : dr(e);
        }
    }
};
template <int val> class ValGenerator : public DataGenerator {
public:
virtual ~ValGenerator() {}

View File

@ -1 +1,2 @@
from .gen_ansor_op import gen_ansor_op
from .gen_ansor_so import gen_ansor_so

View File

@ -3,19 +3,50 @@ import re
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
import os
import json
import logging
USE_CACHE = True
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.DEBUG)
def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, func_name, input_names, output_name):
def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f,
func_name, input_names, output_name, nnet_expression: str,
nnet_simplified_expression: str, hash_code=None):
assert len(input_tensors) == len(input_dtypes)
assert len(input_tensors) == len(input_names)
logging.debug(f'Work on hash {hash_code}')
dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
func_code_fn = os.path.join(dir_name, "kernel.cu")
invoke_code_fn = os.path.join(dir_name, "invoke.cpp")
config_fn = os.path.join(dir_name, "config.json")
if USE_CACHE and hash_code is not None:
if os.path.exists(dir_name):
print(f"Use cache in {dir_name}")
with open(func_code_fn, "r") as func_code_fin:
func_code = func_code_fin.read()
with open(invoke_code_fn, "r") as invoke_code_fin:
invoke_code = invoke_code_fin.read()
with open(config_fn, "r") as config_fin:
config = json.loads(config_fin.read().strip())
conv_time = config["conv_time"]
invoke_params = config["invoke_params"]
logger.debug(f'Find tuning log for {hash_code}')
return func_code, invoke_code, conv_time, invoke_params
print("Generating Ansor op: ")
print(f)
@auto_scheduler.register_workload(func_name)
def compute():
_locals = locals()
exec(f, {'tvm': tvm, 'te': te, 'tir': tir}, _locals)
exec(f, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
return _locals['ret']
target = tvm.target.Target("cuda")
@ -43,6 +74,28 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu
# Kill the measurement process
del measure_ctx
def test_mutator():
# test part
tgt_temp = tvm.target.Target(target="llvm", host="llvm")
all_tensors = compute()
sch = te.create_schedule(all_tensors[0].op)
args = all_tensors
C0, K0, A0 = args
func_temp = tvm.build(sch, args, tgt_temp, name="temp")
# print result
n, c, h, w, f, r, s = 1, 1, 2, 2, 1, 4, 4
dev_temp = tvm.device(tgt_temp.kind.name, 0)
A_temp = tvm.nd.array(
np.arange(n*h*w*f).reshape(n, h, w, f).astype(A0.dtype), dev_temp)
K_temp = tvm.nd.array(
np.arange(f*r*s*c).reshape(f, r, s, c).astype(K0.dtype), dev_temp)
C_temp = tvm.nd.array(
np.zeros((1, 4, 4, 1)).astype(C0.dtype), dev_temp)
func_temp(C_temp, K_temp, A_temp)
print("================= Test Result =====================")
print(C_temp)
ir = str(tvm.lower(sch, args, simple_mode=True))
thread_dim = [1, 1, 1]
block_dim = [1, 1, 1]
@ -83,11 +136,27 @@ def gen_ansor_op(input_tensors, input_dtypes, output_tensor, output_dtype, f, fu
print("Func Code")
# Attach TVM code behind func_code
func_code += "\n/* " + f + "*/"
func_code += "\n/* NNET tensor expression \n" + nnet_expression + "\n*/\n"
func_code += "\n/* NNET simplified tensor expression \n" + \
nnet_simplified_expression + "\n*/\n"
func_code += "\n/* TVM compute\n" + f + "\n*/\n"
print(func_code)
print("Invoke Code")
print(invoke_code)
print("Time")
print(conv_time)
if hash_code is not None:
if not os.path.exists(dir_name):
os.makedirs(dir_name)
with open(func_code_fn, "w") as func_code_fout:
func_code_fout.write(func_code)
with open(invoke_code_fn, "w") as invoke_code_fout:
invoke_code_fout.write(invoke_code)
with open(config_fn, "w") as config_fout:
config_fout.write(json.dumps({
"conv_time": conv_time,
"invoke_params": invoke_params
}, ensure_ascii=False, indent=2))
return func_code, invoke_code, conv_time, invoke_params # ms

View File

@ -0,0 +1,131 @@
import os
import sys
import json
from contextlib import redirect_stdout
import time
import logging
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
USE_CACHE = True
logging.basicConfig()
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.INFO)
def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
                 tvm_code, func_name, nnet_expression: str,
                 nnet_simplified_expression: str, hash_code: str = None):
    """Tune a TVM compute definition with Ansor and build it into a .so.

    `tvm_code` is a Python source string that, when exec'd, binds `ret` to a
    TVM tensor-expression tuple. The compiled library and its measured run
    time are cached under .cache/generated_kernels/<hash_code>/ and reused
    on later calls with the same hash.

    Returns (so_fn, conv_time): path to the built shared library and the
    measured kernel time in milliseconds.

    NOTE(review): requires a CUDA device at tuning time (tvm.cuda(0)).
    """
    assert len(input_tensors) == len(input_dtypes)

    logger.debug(f'Work on hash {hash_code}')
    # Per-kernel cache directory, keyed by the expression hash.
    dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    so_fn = os.path.join(dir_name, f"{func_name}.so")
    config_fn = os.path.join(dir_name, "config_so.json")
    desc_fn = os.path.join(dir_name, "desc.txt")
    log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
    out_fn = os.path.join(dir_name, "out.txt")

    logger.debug(f"Generating Ansor op: {tvm_code}")
    logger.debug(f"Input shape: {input_tensors}")
    logger.debug(f"Output shape: {output_tensor}")

    # Cache hit: both the built library and its timing config must exist.
    if USE_CACHE and hash_code is not None:
        if os.path.exists(dir_name) and \
                os.path.exists(so_fn) and \
                os.path.exists(config_fn):
            print(f"Use cache in {dir_name}")
            with open(config_fn, "r") as config_fin:
                config = json.loads(config_fin.read().strip())
            conv_time = config["conv_time"]
            logger.info(f'Find tuning log for {hash_code} in {so_fn}')
            return so_fn, conv_time

    logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
    time_start = time.perf_counter()
    # Print descriptions of the task
    if USE_CACHE and hash_code is not None:
        with redirect_stdout(open(desc_fn, "w")):
            print("====NNET tensor expression====")
            print(nnet_expression+"\n")
            print("====NNET simplified tensor expression====")
            print(nnet_simplified_expression+"\n")
            print("====TVM compute====")
            print(tvm_code+"\n")
            print("Input shape: ", input_tensors)
            print("Output shape: ", output_tensor)

    # Workload definition: exec the generated TVM code and return `ret`.
    @auto_scheduler.register_workload(func_name)
    def compute():
        _locals = locals()
        exec(tvm_code, {'tvm': tvm, 'te': te,
                        'tir': tir, 'topi': topi}, _locals)
        return _locals['ret']

    target = tvm.target.Target("cuda")

    task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)

    # All tuning/build chatter goes to out_fn instead of stdout.
    with redirect_stdout(open(out_fn, 'w')):
        # Inspect the computational graph
        print("Computational DAG:")
        print(task.compute_dag)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=10,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
            verbose=2,
        )

        # Run auto-tuning (search)
        task.tune(tune_option)
        # Apply the best schedule
        sch, args = task.apply_best(log_fn)

        # Kill the measurement process
        del measure_ctx

        func = tvm.build(sch, args, target, name=func_name)
        func.export_library(so_fn)

        # Benchmark the built kernel with random inputs on GPU 0.
        ctx = tvm.cuda(0)
        input_a = []
        for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
            a_np = np.random.uniform(size=shape).astype(dtype)
            input_a.append(tvm.nd.array(a_np, ctx))
        a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
        func(a_out, *input_a)
        evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
        conv_time = evaluator(a_out, *input_a).mean * 1e3  # seconds -> ms

    time_end = time.perf_counter()

    # Persist timing metadata so the next call with this hash skips tuning.
    if USE_CACHE and hash_code is not None:
        with open(config_fn, "w") as config_fout:
            config_fout.write(json.dumps({
                "conv_time": conv_time,
                "tuning_time": time_end - time_start,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
            }, ensure_ascii=False, indent=2))

    return so_fn, conv_time
# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.
def pipe_gen(fd: int):
    """Serve one kernel-generation request over pipes.

    Reads a JSON dict of `gen_ansor_so` keyword arguments from stdin
    (which the caller has redirected from the request pipe) and writes the
    resulting shared-library path, without a trailing newline, to the
    write-end pipe given by file descriptor `fd`.
    """
    request = json.load(sys.stdin)  # read from pipe
    so_path, _conv_time = gen_ansor_so(**request)
    with os.fdopen(fd, 'w') as pipe_out:
        pipe_out.write(so_path)  # write to pipe

View File

@ -0,0 +1,7 @@
# Package initializer: re-export the compiled C++ `backend` extension at
# package level so callers can do `from <pkg> import *`.
import backend
from backend import *
import sys
# Make modules placed next to this package importable by bare name.
sys.path.extend(__path__)
print("import backend: {}".format(backend))

View File

@ -0,0 +1,941 @@
import backend
import onnx
from onnx import (
ModelProto,
TensorProto,
NodeProto,
AttributeProto,
TensorShapeProto,
ValueInfoProto,
)
from onnx.helper import (
make_node,
make_tensor_value_info,
make_tensor,
make_graph,
make_model,
)
from onnx.checker import (
check_graph,
check_model,
check_node,
check_value_info,
check_tensor,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
class OnnxStub:
"""
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
# inputs: Dict[str, backend.Tensor] = {}
# outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
# handler: backend.GraphHandler
# def __init__(self, model: ModelProto, runtime):
# model = infer_shapes(model)
# self.handler = backend.GraphHandler(runtime)
# tensors: Dict[str, backend.Tensor] = dict()
# data: Dict[str, TensorProto] = dict()
# for input in model.graph.input:
# dims = _take_shape_dim(input.type.tensor_type.shape)
# tensors[input.name] = self.handler.tensor(
# dims, input.type.tensor_type.elem_type
# )
# for output in model.graph.output:
# dims = _take_shape_dim(output.type.tensor_type.shape)
# tensors[output.name] = self.handler.tensor(
# dims, output.type.tensor_type.elem_type
# )
# for initializer in model.graph.initializer:
# dims = [d for d in initializer.dims]
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
# data[initializer.name] = initializer
# for node in model.graph.node:
# if node.op_type == "Conv":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (d, p, s) = (
# attributes[name] for name in ["dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors[node.input[0]], None, p, [-2, -1]
# )
# p = [0, 0, 0, 0]
# else:
# adapt = node.input[0]
# if len(node.input) > 2:
# bias = "{}-bias".format(node.output[0])
# reshape = "{}-reshape".format(node.output[0])
# tensors[bias] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# None,
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# tensors[reshape] = self.handler.reshape(
# tensors[node.input[2]],
# None,
# [
# 1,
# reduce(
# lambda acc, x: acc * x,
# _search_shape(model, node.input[2]),
# ),
# 1,
# 1,
# ],
# )
# tensors[node.output[0]] = self.handler.add(
# tensors[bias],
# tensors[reshape],
# tensors.get(node.output[0]),
# )
# else:
# tensors[node.output[0]] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# elif node.op_type == "ConvTranspose":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0],
# "strides": [1, 1],
# "output_padding": [0, 0],
# },
# )
# (d, p, s, op) = (
# attributes[name]
# for name in ["dilations", "pads", "strides", "output_padding"]
# )
# tensors[node.output[0]] = self.handler.convTransposed2d(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# op[0],
# op[1],
# )
# elif node.op_type == "MatMul":
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# False,
# False,
# None,
# backend.ActType.Linear,
# )
# elif node.op_type == "Gemm":
# attributes = _parse_attribute(
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
# )
# (alpha, beta, transA, transB) = (
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
# )
# # FIXME unsupport attributes: `alpha` `beta`
# assert alpha == 1.0
# assert beta == 1.0
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# transA == 1,
# transB == 1,
# tensors[node.input[2]] if len(node.input) > 2 else None,
# backend.ActType.Linear,
# )
# elif node.op_type == "BatchNormalization":
# (input, mean, var, scale, bias) = (
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
# )
# output = tensors.get(node.output[0])
# attributes = _parse_attribute(
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
# )
# (momentum, eps, training) = (
# attributes[name]
# for name in ["momentum", "epsilon", "training_mode"]
# )
# tensors[node.output[0]] = self.handler.batchNorm(
# input, output, mean, var, scale, bias, momentum, eps, training != 0
# )
# elif node.op_type == "MaxPool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, d, p, s) = (
# attributes[name]
# for name in ["kernel_shape", "dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "AveragePool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, p, s) = (
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "GlobalAveragePool":
# [_, _, h, w] = _search_shape(model, node.input[0])
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# h,
# w,
# 1,
# 1,
# 0,
# 0,
# 1,
# 1,
# )
# elif node.op_type == "Add":
# tensors[node.output[0]] = self.handler.add(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sub":
# tensors[node.output[0]] = self.handler.sub(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Mul":
# tensors[node.output[0]] = self.handler.mul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Div":
# tensors[node.output[0]] = self.handler.div(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Pow":
# tensors[node.output[0]] = self.handler.pow(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Relu":
# tensors[node.output[0]] = self.handler.relu(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sigmoid":
# tensors[node.output[0]] = self.handler.sigmoid(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Tanh":
# tensors[node.output[0]] = self.handler.tanh(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Softmax":
# tensors[node.output[0]] = self.handler.softmax(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Abs":
# tensors[node.output[0]] = self.handler.abs(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Shape":
# tensors[node.output[0]] = self.handler.shape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Identity":
# tensors[node.output[0]] = self.handler.identity(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Flatten":
# # FIXME axis must be 1
# axis = next(
# (attr.i for attr in node.attribute if attr.name == "axis"), None
# )
# assert axis == None or axis == 1
# tensors[node.output[0]] = self.handler.flatten(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "PRelu":
# tensors[node.output[0]] = self.handler.pRelu(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Clip":
# tensors[node.output[0]] = self.handler.clip(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(_parse_data(data[node.input[1]]).__iter__(), None)
# if len(node.input) > 1
# else None,
# next(_parse_data(data[node.input[2]]).__iter__(), None)
# if len(node.input) > 2
# else None,
# )
# elif node.op_type == "Transpose":
# perm = next(
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
# )
# tensors[node.output[0]] = self.handler.transpose(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# perm,
# )
# elif node.op_type == "Reshape":
# dims = _search_shape(model, node.input[0])
# size = reduce(lambda acc, x: acc * x, dims)
# input_shape = _parse_data(data[node.input[1]])
# for i, x in enumerate(input_shape):
# if x == 0:
# input_shape[i] = dims[i]
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
# if temp < 0:
# input_shape[input_shape.index(-1)] = size // -temp
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Squeeze":
# input_shape = _search_shape(model, node.input[0])
# axes = set(
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# assert all(input_shape[d] == 1 for d in axes)
# output_shape = []
# for i, x in enumerate(input_shape):
# if i not in axes:
# output_shape.append(x)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# output_shape,
# )
# elif node.op_type == "Unsqueeze":
# input_shape = _search_shape(model, node.input[0])
# axes = (
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# for i in axes:
# input_shape.insert(i, 1)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Concat":
# tensors[node.output[0]] = self.handler.concat(
# [tensors[name] for name in node.input],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "Split":
# for name, tensor in zip(
# node.output,
# self.handler.split(
# tensors[node.input[0]],
# None,
# next(
# (attr.i for attr in node.attribute if attr.name == "axis"),
# 0,
# ),
# len(node.output),
# ),
# ):
# tensors[name] = tensor
# elif node.op_type == "Gather":
# tensors[node.output[0]] = self.handler.gather(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "ReduceMean":
# tensors[node.output[0]] = self.handler.reduce_mean(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(
# (attr.ints for attr in node.attribute if attr.name == "axes"),
# None,
# ),
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
# != 0,
# )
# elif node.op_type == "Slice":
# tensors[node.output[0]] = self.handler.slice(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[2]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
# )
# elif node.op_type == "Pad":
# tensors[node.output[0]] = self.handler.pad(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# )
# elif node.op_type == "Dropout":
# for name, tensor in zip(
# node.output,
# self.handler.dropout(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
# _parse_data(data[node.input[1]])[0]
# if len(node.input) > 1
# else 0.5,
# _parse_data(data[node.input[2]])[0]
# if len(node.input) > 2
# else False,
# ),
# ):
# tensors[name] = tensor
# else:
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
# self.handler.data_malloc()
# for name, obj in tensors.items():
# tensor = data.get(name)
# if tensor == None:
# if any(input.name == name for input in model.graph.input):
# self.inputs[name] = obj
# else:
# self.initializer[obj.fuid()] = tensor
# if tensor.data_type == TensorProto.INT32:
# obj.copyin_int32(_parse_data(tensor))
# elif tensor.data_type == TensorProto.INT64:
# obj.copyin_int64(_parse_data(tensor))
# elif tensor.data_type == TensorProto.FLOAT:
# obj.copyin_float(_parse_data(tensor))
# else:
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
# for output in model.graph.output:
# self.outputs[output.name] = tensors[output.name]
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
class Context:
# saves object names, including tensors and operators
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
# counts the occurrence times of each operator for naming
count_op: Dict[backend.OpType, int] = dict()
# counts input and output tensors for naming
count_in, count_out = 0, 0
# saves nodes (operators)
nodes: List[NodeProto] = []
# saves global input tensors
inputs: List[ValueInfoProto] = []
# saves global output tensors
outputs: List[ValueInfoProto] = []
# saves global input tensors
initializers: List[TensorProto] = []
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
name = "{}_{}".format(ty.name, op.guid())
self.names[op] = name
self.count_op[ty] = self.count_op.get(ty, 0) + 1
return ty, name
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.outputs.append(value_info)
return name
def push_input(
self, tensor: backend.Tensor, init: Optional[TensorProto]
) -> str:
name = self.names.get(tensor)
# means that this input is a global input
if name is None:
self.count_in += 1
name = "input_{}".format(tensor.guid())
self.names[tensor] = name
if init != None:
init.name = name
self.initializers.append(init)
else:
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.inputs.append(value_info)
return name
def push_data_input(
self,
node_name: str,
attr_name: str,
elem_type: int,
shape: Sequence[int],
vals: Any,
) -> str:
name = "{}_{}".format(node_name, attr_name)
tensor = make_tensor(name, elem_type, shape, vals)
check_tensor(tensor)
self.initializers.append(tensor)
return name
def push_node(self, node: NodeProto) -> None:
# check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
)
# check_graph(graph)
model = make_model(graph)
# check_model(model)
return model
# 拓扑排序
if not g.topo_sort():
raise Exception("Sorting fails")
ops = g.operators() # 图中所有算子(节点)
ctx = Context()
for op in ops:
ty, name = ctx.name_op(op)
inputs = [
ctx.push_input(it, self.initializer.get(it.fuid()))
for it in op.inputs()
]
outputs = [
ctx.push_output("{}_{}_{}".format(
name, i, tensor.guid()), tensor)
for (i, tensor) in enumerate(op.outputs())
]
if ty == backend.OpType.Conv:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
pads=[ph, pw, ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
group=op.inputs()[0].shape()[
1] // op.inputs()[1].shape()[1],
)
)
elif ty == backend.OpType.ConvTrans:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
pads=[ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
output_padding=[oph, opw],
)
)
elif ty == backend.OpType.ConvTransNHWC:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.MemBound:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"Membound",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.Matmul:
# transA, transB = backend.matmul_attrs_of(op)
# HACK: recover this
transA, transB = False, False
ctx.push_node(
make_node(
"Gemm", inputs, outputs, name, transA=transA, transB=transB
)
)
elif ty == backend.OpType.BatchNorm:
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
momentum, eps, training = backend.batch_norm_attrs_of(op)
ctx.push_node(
make_node(
"BatchNormalization",
inputs,
outputs,
name,
epsilon=eps,
momentum=momentum,
training_mode=training,
)
)
elif ty == backend.OpType.MaxPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
dilations=[dh, dw],
strides=[sh, sw],
)
)
elif ty == backend.OpType.AvgPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
"AveragePool",
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
strides=[sh, sw],
)
)
elif ty in [
backend.OpType.Add,
backend.OpType.Sub,
backend.OpType.Mul,
backend.OpType.Div,
backend.OpType.Pow,
backend.OpType.Relu,
backend.OpType.Sigmoid,
backend.OpType.Tanh,
backend.OpType.Softmax,
backend.OpType.Abs,
backend.OpType.Identity,
backend.OpType.PRelu,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Flatten:
raise Exception("TODO")
elif ty == backend.OpType.Transpose:
perm = backend.transpose_permute_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, perm=perm))
elif ty == backend.OpType.Reshape:
shape = backend.reshape_shape_of(op)
inputs.append(
ctx.push_data_input(
name,
"shape",
TensorProto.INT64,
[len(shape)],
shape,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.Split:
axis = backend.split_axis_of(op)
num_outputs = len(outputs)
split = op.inputs()[0].shape()[axis] // num_outputs
inputs.append(
ctx.push_data_input(
name,
"split",
TensorProto.INT64,
[len(outputs)],
[split for _ in range(0, num_outputs)],
)
)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
axis=axis,
)
)
elif ty == backend.OpType.Gather:
axis = backend.gather_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.ReduceMean:
axes, keepdims = backend.reduce_mean_attrs_of(op)
inputs.append(
ctx.push_data_input(
name, "axes", TensorProto.INT64, [len(axes)], axes
)
)
ctx.push_node(
make_node(ty.name, inputs, outputs,
name, keepdims=keepdims)
)
elif ty == backend.OpType.Slice:
raise Exception("TODO")
elif ty == backend.OpType.Pad:
pads = backend.pad_pads_of(op)
inputs.append(
ctx.push_data_input(
name, "pads", TensorProto.INT64, [len(pads)], pads
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
# elif ty == backend.OpType.Clip:
# min, max = backend.clip_attrs_of(op)
# if min != None:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
# )
# if max != None:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
# )
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
else:
raise Exception("Unsupported OpType", ty)
model = ctx.build(name)
onnx.save(model, path)
return model
# def init(self) -> None:
# self.handler.data_malloc()
# def optimize(self) -> None:
# self.handler.optimize()
# def run(self) -> None:
# self.handler.run()
# def from_onnx(model: ModelProto, runtime):
# stub = OnnxStub(model, runtime)
# return stub.inputs, stub.outputs, stub.handler
# def _search_shape(model: ModelProto, name: str) -> List[int]:
# ans = (
# next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.value_info
# if tensor.name == name
# ),
# None,
# )
# or next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.input
# if tensor.name == name
# ),
# None,
# )
# or next(
# [int(d) for d in tensor.dims]
# for tensor in model.graph.initializer
# if tensor.name == name
# )
# )
# return ans
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
# for attr in node.attribute:
# if attr.name in attrs:
# if attr.type == AttributeProto.INT:
# attrs[attr.name] = attr.i
# elif attr.type == AttributeProto.INTS:
# attrs[attr.name] = attr.ints
# elif attr.type == AttributeProto.FLOAT:
# attrs[attr.name] = attr.f
# elif attr.type == AttributeProto.STRING:
# attrs[attr.name] = attr.s
# elif attr.type == AttributeProto.TENSOR:
# attrs[attr.name] = attr.t
# else:
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
# return attrs
# def _parse_data(tensor: TensorProto) -> List[Any]:
# return to_array(tensor).flatten().tolist()
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
def export_onnx(g: backend.Graph, path: str) -> None:
    """Serialize backend graph `g` as an ONNX model file at `path`.

    Convenience wrapper: builds a fresh OnnxStub and delegates to
    OnnxStub.to_onnx with the default model name.
    """
    OnnxStub().to_onnx(g, path)

View File

@ -11,13 +11,11 @@ GraphObj::GraphObj(Runtime runtime, OpVec ops_in)
for (const auto &op : ops_in) {
for (const auto &t : op->getInputs())
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = t->clone();
tensorPool[t->getFuid()] = cloneTensor(t);
for (const auto &t : op->getOutputs())
if (tensorPool.find(t->getFuid()) == tensorPool.end())
tensorPool[t->getFuid()] = t->clone();
tensorPool[t->getFuid()] = cloneTensor(t);
}
for (const auto &[_, t] : tensorPool)
addTensor(t);
// Clone operators and add connections
for (const auto &op : ops_in) {
TensorVec inputs, outputs;
@ -122,13 +120,18 @@ void GraphObj::dataMalloc() {
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}
Tensor GraphObj::addTensor(const Tensor &tensor) {
IT_ASSERT(tensor->getRuntime() == runtime, "Tensor runtime mismatch");
return tensors.emplace_back(tensor);
IT_ASSERT(tensor->getRuntime() == runtime,
std::string("Tensor runtime mismatch: cannot add a tenosr in ") +
tensor->getRuntime()->toString() + " to " +
runtime->toString());
tensors.emplace_back(tensor);
return tensor;
}
TensorVec GraphObj::addTensor(const TensorVec &tensors) {
@ -145,4 +148,19 @@ OpVec GraphObj::getComputeOps() const {
return opList;
};
bool GraphObj::selfCheck(bool assert) const {
std::set<UidBaseType> s;
// check whether two tensors with the same FUID exist
for (auto tensor : tensors) {
int cnt = s.count(tensor->getFuid());
if (assert)
IT_ASSERT(cnt == 0, std::to_string(tensor->getFuid()));
else if (cnt > 0)
return false;
s.insert(tensor->getFuid());
}
return true;
}
} // namespace infini

View File

@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"
namespace infini {
@ -24,8 +25,8 @@ bool OperatorObj::isConcatOp() const { return type == OpType::Concat; }
bool OperatorObj::isComputeOp() const {
return type == OpType::Conv || type == OpType::Matmul ||
type == OpType::ConvTrans || type == OpType::G2BMM ||
type == OpType::GBMM;
type == OpType::ConvTrans || type == OpType::ConvTransNHWC ||
type == OpType::G2BMM || type == OpType::GBMM;
}
bool OperatorObj::isTransposeOp() const { return type == OpType::Transpose; }
@ -34,7 +35,8 @@ bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }
bool OperatorObj::isMemBoundOp() const {
return type == OpType::MemBound || type == OpType::Activation ||
type == OpType::Transpose;
type == OpType::Transpose || type == OpType::Relu ||
type == OpType::Tanh;
}
OpPerfKey OperatorObj::getOpPerfKey() const {
@ -57,22 +59,29 @@ HashType OperatorObj::hash() const {
bool OperatorObj::checkValid(GraphObj *graph) {
auto optShapes = inferShape();
IT_ASSERT(optShapes);
if (!optShapes) // shape inference failed
return false;
const vector<Shape> &shapes = *optShapes;
IT_ASSERT(shapes.size() == outputs.size());
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i]);
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
outputs[i] =
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
}
} else { // if outputs have been created, check their shapes
for (size_t i = 0; i < shapes.size(); ++i) {
if (shapes[i] != outputs[i]->getDims())
IT_ASSERT(shapes[i] == outputs[i]->getDims());
if (shapes[i] != outputs[i]->getDims()) {
dbg(shapes[i], outputs[i]->getDims());
return false;
}
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
}
}
return true;

View File

@ -2,6 +2,8 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "cuda_profiler_api.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>
@ -60,13 +62,35 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
printProfilingData(totalTime, opTime, opCnt);
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
/// Classify every operator in `graph` as compile-time computable (CTC) or
/// not. An operator is marked CTC when all of its inputs are Initialized
/// tensors (weights known before execution), so its result could be folded
/// ahead of time. Returns a map from operator GUID to the CTC flag.
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
    map<UidBaseType, bool> ctcMap; // compile-time computable
    // Sort first so operators are visited producers-before-consumers.
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");
    for (auto &op : graph->getOperators()) {
        bool compileTimeComputable = true;
        for (auto input : op->getInputs()) {
            // FIXME: propagate the tensor type. Currently only the first
            // operator after weights is recognized as compile-time
            // computable, because outputs are never marked Initialized.
            if (input->getTensorType() != TensorType::Initialized)
                compileTimeComputable = false;
        }
        ctcMap[op->getGuid()] = compileTimeComputable;
    }
    return ctcMap;
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@ -74,11 +98,17 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
PerfRecord record;
// Tune the kernel if there is no record
if (!perfData) {
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (perfData) { // Tune the kernel if there is no record
time = perfData->time;
} else if (allowEstimation && op->getOpType() == OpType::MemBound) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else {
// TODO: should tensors automatically allocate when accessing data?
// allocate memory for empty tensors and release it after profiling
// allocate memory for empty tensors and release it after
// profiling
TensorVec allocatedTensors;
for (auto t : op->getInputs())
if (!t->hasData())
@ -92,21 +122,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
}
// Profile operators and record the results
record = kernel->tune(op, this);
PerfRecord record = kernel->tune(op, this);
time = record->time;
perfEngine.setPerfData(perfKey, record);
// Free allocated memory
for (auto t : allocatedTensors)
t->freeData();
} else
record = perfData;
}
double t = record->time;
totalTime += t;
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", t);
opTime[op->getOpType()] += t;
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
}
}
@ -164,4 +193,46 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
/// Execute every operator once (warming kernels and data), then time a
/// selected subset of them end-to-end and return the measured time.
/// NOTE(review): despite the name, the CTC-based filter is commented out
/// and the current selection keeps ONLY Matmul operators — confirm this
/// HACK before relying on the result.
double RuntimeObj::timeNonCtcOperators(const Graph &graph) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    auto &perfEngine = PerfEngine::getInstance();
    // compile-time computable
    map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
    // Operators to include in the timed run, with their kernel and any
    // cached tuning record.
    vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
    bool status = graph->topo_sort();
    IT_ASSERT(status, "Topological sort failed");
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Float32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        auto perfData = perfEngine.getPerfData(perfKey);
        // Warm-up execution: materializes outputs so later ops have valid
        // inputs even when they are excluded from the timed subset.
        if (perfData)
            kernel->compute(op, perfData, this);
        else
            kernel->compute(op, this);
        // if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
        if (op->getOpType() == OpType::Matmul)
            kernels.emplace_back(op, kernel, perfData);
    }
    for (auto &[op, kernel, perfData] : kernels) {
        dbg(op);
    }
    cudaProfilerStart(); // HACK: Debug
    // Time the selected kernels back-to-back; sync() flushes the device
    // queue so the measurement covers actual execution.
    double ret = timeit(
        [&]() {
            for (auto &[op, kernel, perfData] : kernels) {
                if (perfData)
                    kernel->compute(op, perfData, this);
                else
                    kernel->compute(op, this);
            }
        },
        [&]() { sync(); });
    cudaProfilerStop(); // HACK: Debug
    return ret;
}
} // namespace infini

View File

@ -8,7 +8,17 @@
namespace infini {
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
using MetaGraph = SearchEngine::MetaGraph;
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
: runtimeExec(runtime), mutator(mutator) {
// Compare graph with estimated time
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
};
}
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
auto &node = metaGraph->nodes[i];
std::cout << "id: " << i << std::endl;
@ -32,8 +42,7 @@ Graph SearchEngine::run(const Graph graph) {
IT_ASSERT(runtimeExec == graph->getRuntime());
std::cout << "[INFO] original graph: " << std::endl;
std::cout << graph->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
<< std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
std::vector<Graph> partitions = partitionGraph(graph);
@ -65,9 +74,7 @@ Graph SearchEngine::run(const Graph graph) {
nextGraphs.emplace_back(tmp);
}
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -81,10 +88,22 @@ Graph SearchEngine::run(const Graph graph) {
for (size_t i = 0; i < bestGraphs.size(); i++) {
std::cout << "bestGraph " << i << ":" << std::endl;
std::cout << bestGraphs[i]->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
<< std::endl;
}
// Fuse vertically and sort according to performance
for (size_t i = 0; i < bestGraphs.size(); ++i) {
// Debug
bestGraphs[i] = fuseVertically(bestGraphs[i]);
}
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
std::cout << "[INFO] best fused graph: " << std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
<< std::endl;
std::cout << bestGraphs[0] << std::endl;
return bestGraphs[0];
}
@ -102,9 +121,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
}
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
}); // compare with perf time
// compare with perf time
std::sort(results.begin(), results.end(), graphTimeComparer);
if (results.size() > GRAPH_SIZE) {
results.resize(GRAPH_SIZE);
}
@ -112,9 +130,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
// Build metagraph with a graph, each operator is a node.
std::shared_ptr<SearchEngine::MetaGraph>
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = std::make_shared<MetaGraph>();
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = make_ref<MetaGraphObj>();
int numOps = graph->getOperators().size();
std::vector<int> cnt(numOps, 0);
@ -123,7 +140,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
std::vector<int> q(0);
for (size_t i = 0; i < graph->getOperators().size(); i++) {
auto &op = graph->getOperators()[i];
MetaGraph::Node node;
MetaGraphObj::Node node;
std::vector<Operator> ops;
ops.emplace_back(op);
node.graph = make_ref<GraphObj>(runtimeExec, ops);
@ -157,9 +174,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
// Build a metagraph with graph and a plan, a plan is which ops should be a
// node.
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
const std::vector<int> &plan) {
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan) {
int numGroups = 0;
for (auto i : plan) {
if (i > numGroups) {
@ -172,12 +188,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
groups[plan[i]].emplace_back(i);
}
auto resultMetaGraph = make_ref<MetaGraph>();
auto resultMetaGraph = make_ref<MetaGraphObj>();
for (auto &group : groups) {
std::vector<Operator> ops;
std::unordered_set<int> preSet, sucSet;
for (auto id : group) {
MetaGraph::Node node;
MetaGraphObj::Node node;
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
ops.emplace_back(op);
}
@ -204,8 +220,7 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
}
// Search how to merge multiple ops.
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
IT_ASSERT(metaGraph != nullptr);
std::vector<int> plan(metaGraph->nodes.size());
for (size_t i = 0; i < plan.size(); i++) {
@ -222,7 +237,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::unordered_set<HashType> planSet;
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
vector<MetaGraph> metaGraphs;
for (auto &curPlan : plans) {
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
}
@ -230,8 +245,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
}
// DFS impl for search merge.
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan,
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet) {
@ -320,8 +334,7 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
}
// Search mutation for each compute op.
std::vector<Graph> SearchEngine::searchMutation(
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
std::vector<Graph> graphs = {nullptr};
// Append a node to all existing candidates
for (auto &node : metaGraph->nodes) {
@ -360,9 +373,7 @@ std::vector<Graph> SearchEngine::searchMutation(
for (auto g : nextGraphs) {
g->dataMalloc();
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -372,7 +383,7 @@ std::vector<Graph> SearchEngine::searchMutation(
}
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
return mutationEngine->isMultiBranchMergable(graph);
return mutator->isMultiBranchMergable(graph);
}
// Split a graph into multiple independt graphs. Search engine will search for
@ -438,4 +449,60 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
return partitions;
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
return runtimeExec->getPerfTime(graph, false, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {
std::unordered_map<UidBaseType, int> visitTime;
std::vector<Operator> ops;
graph->topo_sort();
int cnt = 0;
for (auto op : graph->getOperators()) {
// Skip visited OP
if (visitTime.find(op->getGuid()) != visitTime.end()) {
continue;
}
// Skip compute OP and multi-input/output OP
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
op->getSuccessors().size() != 1)) {
visitTime.emplace(op->getGuid(), ++cnt);
ops.emplace_back(op);
continue;
}
vector<Operator> chainOps;
visitTime.emplace(op->getGuid(), ++cnt);
vector<Operator> tmp;
auto cur = op;
while (cur->getPredecessors().size() == 1 &&
cur->getPredecessors()[0]->isMemBoundOp()) {
cur = cur->getPredecessors()[0];
tmp.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
for (int i = tmp.size() - 1; i >= 0; i--) {
chainOps.emplace_back(tmp[i]);
}
chainOps.emplace_back(op);
cur = op;
while (cur->getSuccessors().size() == 1 &&
cur->getSuccessors()[0]->isMemBoundOp()) {
cur = cur->getSuccessors()[0];
chainOps.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
make_ref<GraphObj>(runtimeExec, chainOps)->print();
Graph optGraph =
mutator->fuseVertically(make_ref<GraphObj>(runtimeExec, chainOps));
for (auto op : optGraph->getOperators()) {
ops.emplace_back(op);
}
}
return make_ref<GraphObj>(runtimeExec, ops);
}
} // namespace infini

View File

@ -8,17 +8,26 @@
namespace infini {
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}
string TensorObj::toString() const {
// Convert data pointer to string
std::stringstream ss;
if (data != nullptr)
ss << data->getPtr<void *>();
else
ss << "nullptr data";
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString();
", dtype " + dtype.toString() + ", tensorType " +
std::to_string(enum_to_underlying(tensorType));
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());
@ -27,6 +36,7 @@ string TensorObj::toString() const {
else
ret += ", source None";
ret += ", targets " + vecToString(targetGuids);
ret += ", " + runtime->toString() + ", " + ss.str();
return ret;
}
@ -57,25 +67,36 @@ vector<size_t> TensorObj::getStride() const {
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
void *ptr = nullptr;
Blob buffer;
if (!runtime->isCpu()) {
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
getRawDataPtr<void *>(), getBytes());
ptr = buffer->getPtr<void *>();
} else
ptr = data->getPtr<float *>();
if (dtype == DataType::Float32)
printDataFloat();
printDataFloat(static_cast<float *>(ptr));
else if (dtype == DataType::UInt32)
printDataUint32_t();
printDataUint32_t(static_cast<uint32_t *>(ptr));
else
IT_TODO_HALT();
}
void TensorObj::printDataFloat() const {
void TensorObj::printDataFloat(float *ptr) const {
std::cout << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = std::vector<int>(numDims, 1);
auto ptr = data->getPtr<float *>();
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)
dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
if (iEnd > 1000 && i > 20 && i < iEnd - 20) {
printf("... , ");
i = iEnd - 20;
continue;
}
for (size_t j = 0; j < numDims; ++j) {
if (i % dimSzVec[j] == 0) {
std::cout << "[";
@ -94,12 +115,11 @@ void TensorObj::printDataFloat() const {
}
}
void TensorObj::printDataUint32_t() const {
void TensorObj::printDataUint32_t(uint32_t *ptr) const {
IT_ASSERT(data != nullptr);
std::cout << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = std::vector<int>(numDims, 1);
auto ptr = data->getPtr<VType *>();
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)
dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
@ -122,7 +142,7 @@ void TensorObj::printDataUint32_t() const {
}
}
bool TensorObj::equalData(const Tensor &rhs) const {
bool TensorObj::equalData(const Tensor &rhs, double relativeError) const {
IT_ASSERT(data != nullptr);
IT_ASSERT(rhs->data != nullptr);
IT_ASSERT(getDType() == rhs->getDType());
@ -132,10 +152,11 @@ bool TensorObj::equalData(const Tensor &rhs) const {
return false;
if (getDType() == DataType::UInt32)
return equalDataImpl(getRawDataPtr<uint32_t *>(),
rhs->getRawDataPtr<uint32_t *>(), size());
rhs->getRawDataPtr<uint32_t *>(), size(), 0);
else if (getDType() == DataType::Float32)
return equalDataImpl(getRawDataPtr<float *>(),
rhs->getRawDataPtr<float *>(), size());
rhs->getRawDataPtr<float *>(), size(),
relativeError);
else
IT_TODO_HALT();
}

22
src/ffi/ffi_callback.cc Normal file
View File

@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>
namespace py = pybind11;
namespace infini {
namespace callback {
using namespace py::literals;
static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
static auto exportONNXImpl =
py::module_::import("infinitensor.if_onnx").attr("export_onnx");
exportONNXImpl(graph, path);
}
} // namespace callback
} // namespace infini

View File

@ -1,4 +1,7 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
@ -53,6 +56,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Conv)
.VALUE(OpType, Matmul)
.VALUE(OpType, ConvTrans)
.VALUE(OpType, ConvTransNHWC)
.VALUE(OpType, G2BMM)
.VALUE(OpType, GBMM)
.VALUE(OpType, Pad)
@ -82,6 +86,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Abs)
.VALUE(OpType, Resize)
.VALUE(OpType, MemBound)
.VALUE(OpType, PRelu)
.export_values();
#undef VALUE
@ -176,6 +181,9 @@ void export_functions(py::module &m) {
void init_graph_builder(py::module &m) {
using Handler = GraphHandlerObj;
py::class_<Object, Ref<Object>>(m, "_Object")
.def("__str__", &Object::toString)
.def("guid", &Object::getGuid);
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
RuntimeObj>(m, "CpuRuntime");
@ -183,7 +191,7 @@ void init_graph_builder(py::module &m) {
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
m, "CudaRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
@ -194,8 +202,9 @@ void init_graph_builder(py::module &m) {
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("print_data", &TensorObj::printData)
.def("data_malloc", &TensorObj::dataMalloc);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
policy::reference)
@ -232,6 +241,36 @@ void init_graph_builder(py::module &m) {
.def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, policy::automatic)
.def("run", &Handler::run, policy::automatic);
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
py::enum_<NMutator::Mode>(m, "NMutatorMode")
.value("RuleBased", NMutator::Mode::RuleBased);
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
.def(py::init<NMutator::Mode>())
.def(py::init<NMutator::Mode, vector<int>>())
.def("run", &NMutator::run)
.def_static("memboundToJson", &NMutator::memboundToJson);
py::class_<SearchEngine>(m, "SearchEngine")
.def(py::init<Runtime, Ref<Mutator>>())
.def("run", &SearchEngine::run);
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
.def("tensors", &GraphObj::getTensors)
.def("operators", &GraphObj::getOperators)
.def("inputs", &GraphObj::getInputs)
.def("outputs", &GraphObj::getOutputs)
.def("print", &GraphObj::print)
.def("topo_sort", &GraphObj::topo_sort);
}
Graph getInfoGAN(int batch, Runtime runtime, int nLayers);
vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning);
void export_test_model(py::module &m) {
m.def("runInfoGAN", &runInfoGAN);
m.def("getInfoGAN", &getInfoGAN);
m.def("getConvtransposedNHWC", &getConvtransposedNHWC);
m.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
"tuning"_a = false);
}
} // namespace infini
@ -241,4 +280,5 @@ PYBIND11_MODULE(backend, m) {
infini::export_values(m);
infini::export_functions(m);
infini::init_graph_builder(m);
infini::export_test_model(m);
}

View File

@ -50,10 +50,23 @@ class matmulCublas : public Kernel {
// TODO:use compute type
cublasStatus_t stat;
if (b > 1) {
// Support batch broadcast with zero stride
int dimA = op->getInputs(0)->getDims().size();
int dimB = op->getInputs(1)->getDims().size();
long long strideA =
(dimA == 2 ||
(dimA == 3 && op->getInputs(0)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: m * k;
long long strideB =
(dimB == 2 ||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: n * k;
stat = cublasGemmStridedBatchedEx(
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
CUDA_R_32F, ldb, k * n, inAData, CUDA_R_32F, lda, m * k, &beta,
outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
&beta, outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
(cublasGemmAlgo_t)record->algo);
} else {
stat = cublasGemmEx(
@ -61,6 +74,8 @@ class matmulCublas : public Kernel {
CUDA_R_32F, ldb, inAData, CUDA_R_32F, lda, &beta, outData,
CUDA_R_32F, ldc, CUDA_R_32F, (cublasGemmAlgo_t)record->algo);
}
// if (stat != CUBLAS_STATUS_SUCCESS)
// cout << cublasGetErrorString(stat);
return (stat == CUBLAS_STATUS_SUCCESS);
}
@ -79,6 +94,8 @@ class matmulCublas : public Kernel {
const RuntimeObj *_context) const override {
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<MatmulObj>(_op);
IT_ASSERT(context);
IT_ASSERT(op);
auto ret = make_ref<MatmulCublasPerfRecordObj>();
ret->time = std::numeric_limits<double>::max();
for (int i = 0; i < N_ALGO; i++) {
@ -91,9 +108,8 @@ class matmulCublas : public Kernel {
if (rcd->time < ret->time)
ret = rcd;
}
IT_ASSERT(ret->time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");
IT_ASSERT(ret->time < std::numeric_limits<double>::max(),
"No valid algorithm found for " + op->toString());
return ret;
}
};

View File

@ -1,7 +1,11 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_embed.h"
#include "nnet/Visitor/AsTVMVisitor.h"
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nvrtc.h"
#include "operators/membound.h"
#include "operators/pooling.h"
@ -17,11 +21,12 @@ class TVMRecordObj : public PerfRecordObj {
std::string log, ptx;
std::vector<int> invokeParams;
std::string kernelName;
HashType simplifiedExprHash;
};
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVM : public Kernel {
class MemboundTVMExtractSource : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
@ -65,6 +70,11 @@ class MemboundTVM : public Kernel {
return "var_" + std::to_string(t->getGuid());
}
bool checkOOB(nnet::Expr expr) const {
return nnet::CheckOOBVisitor().checkRangeOp(
nnet::as<nnet::RangeOpNode>(expr));
}
// Premise: op is idempotent since it is called multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
@ -73,10 +83,18 @@ class MemboundTVM : public Kernel {
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// invoke Ansor to tune a membound kernel
std::string func = "mem_bound_" + std::to_string(op->getGuid());
std::string kernelName = func + "_kernel0";
nnet::AsTVMVisitor visitor;
visitor.dispatch(op->getNnetExpr());
IT_ASSERT(!checkOOB(op->getNnetExpr()));
// fuse stages in nnet expr to reduce kernels generated by TVM
auto expr = op->getNnetExpr();
if (auto mergedExpr =
nnet::MergeMemboundMutator({expr}).merge(false, true))
expr = mergedExpr;
nnet::HashVisitor hashVisitor;
HashType hashCode = hashVisitor.getHash(expr);
visitor.dispatch(expr);
auto &&stmts = visitor.getStmts();
auto &&inShapes = visitor.getInputShapes();
auto &&outShape = visitor.getOutputShape();
@ -85,10 +103,14 @@ class MemboundTVM : public Kernel {
for (auto &&in : op->getInputs()) {
inputs.emplace_back(getVarName(in));
}
std::string output = getVarName(op->getOutput());
const std::string output = getVarName(op->getOutput());
const std::string func = "membound_" + std::to_string(hashCode);
const std::string kernelName = func + "_kernel0";
auto res = getAnsorCode(
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
outShape, "float32", stmts, func, inputs, output);
outShape, "float32", stmts, func, inputs, output, op->toString(),
expr->toReadable(), hashCode);
// compile the kernel
auto funcCode = res.first;
@ -119,6 +141,7 @@ class MemboundTVM : public Kernel {
nvrtcGetPTX(prog, ret->ptx.data());
ret->invokeParams = invokeParams;
ret->kernelName = kernelName;
ret->simplifiedExprHash = hashCode;
// prepare for evaluation
CUmodule module;
@ -151,20 +174,43 @@ class MemboundTVM : public Kernel {
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
}
/// @brief
/// @param inDims
/// @param inDTypes
/// @param outDims
/// @param outDType
/// @param lambda
/// @param funcName Generated function name
/// @param inputNames Input array names in the generated invocation code.
/// @param outputName Output array names in the generated invocation code.
/// @param nnetExpressionString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return
std::pair<std::string, std::vector<int>>
getAnsorCode(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims, const std::string &outDType,
const std::string &lambda, const std::string &funcName,
const std::vector<std::string> &inputNames,
const std::string &outputName) const {
const std::string &outputName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string funcCode;
std::vector<int> invokeParams;
try {
start_interpreter();
auto func = py::module::import("cpp_plugin").attr("gen_ansor_op");
py::tuple code = func(inDims, inDTypes, outDims, outDType, lambda,
funcName, inputNames, outputName);
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_op");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
inputNames, outputName, nnetExprString,
nnetSimplifiedExprString, std::to_string(hashCode));
funcCode = py::str(code[0]);
auto temp = py::list(code[3]);
for (int i = 0; i < 6; ++i) {
@ -183,6 +229,9 @@ class MemboundTVM : public Kernel {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32, MemboundTVM,
"Memobund_TVM_Ansor");
// REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
// MemboundTVMExtractSource,
// "Memobund_TVM_Ansor_extract_source");
}; // namespace infini
#endif

View File

@ -0,0 +1,273 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
#include "nnet/Visitor/AsTVMVisitor.h"
#include "operators/membound.h"
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
#include <nlohmann/json.hpp>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
using json = nlohmann::json;
namespace py = pybind11;
namespace infini {
using DLTensorHolder = pair<DLTensor, Ref<vector<int64_t>>>;
class TVMRecordObj : public PerfRecordObj {
public:
std::string kernelName;
HashType simplifiedExprHash;
std::string dllPath;
std::string funcName;
std::vector<int> inputIdx;
tvm::runtime::PackedFunc packedFunc;
};
using TVMRecord = Ref<TVMRecordObj>;
class MemboundTVMPackedFunction : public Kernel {
public:
void compute(const Operator &_op, const PerfRecord &record,
const RuntimeObj *_context) const override {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// IT_ASSERT(packedFunc != nullptr);
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;
for (auto idx : tvmRecord->inputIdx) {
inputsHolder.emplace_back(
convertTensorToDLTensor(op->getInputs()[idx]));
}
DLTensorHolder outputHolder = convertTensorToDLTensor(op->getOutput());
// make tvm arg and rv
pair<vector<TVMValue>, vector<int>> preArgs =
convertInOutToTVMArgs(inputsHolder, outputHolder);
tvm::runtime::TVMRetValue rv;
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
preArgs.first.size());
packedFunc.CallPacked(args, &rv);
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(false, "A TVM record is required for membound kernel.");
}
// Premise: op is idempotent since it is called multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
visitor.dispatch(expr);
auto &&stmts = visitor.getStmts();
auto &&inShapes = visitor.getInputShapes();
auto &&outShape = visitor.getOutputShape();
const std::string func = "membound_" + std::to_string(hash);
const std::string kernelName = func + "_kernel0";
// Set the dllPath directly when debugging
auto dllPath = getAnsorDLL(
inShapes, std::vector<std::string>(inShapes.size(), "float32"),
outShape, "float32", stmts, func, op->toString(),
expr->toReadable(), hash);
// remap input
vector<int> inputIdx;
int numInputs = op->getInputs().size();
for (int i = 0; i < numInputs; ++i) {
string inputName = visitor.getInputs()[i];
int j = 0;
for (; j < numInputs; ++j) {
if (inputName == op->getNnetInputs()[j]->getName())
break;
}
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
inputIdx.emplace_back(j);
}
tvm::runtime::PackedFunc packedFunc = getPackedFunction(dllPath, func);
IT_ASSERT(packedFunc != nullptr);
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;
for (auto idx : inputIdx) {
inputsHolder.emplace_back(
convertTensorToDLTensor(op->getInputs()[idx]));
}
DLTensorHolder outputHolder = convertTensorToDLTensor(op->getOutput());
// make tvm arg and rv
pair<vector<TVMValue>, vector<int>> preArgs =
convertInOutToTVMArgs(inputsHolder, outputHolder);
tvm::runtime::TVMRetValue rv;
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
preArgs.first.size());
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
[&]() { context->sync(); });
ret->kernelName = kernelName;
ret->dllPath = dllPath;
ret->funcName = func;
ret->inputIdx = inputIdx;
ret->packedFunc = packedFunc;
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
}
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
const std::string &outDType,
const std::string &lambda,
const std::string &funcName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
json j;
// Consistant with python API interface
j["input_tensors"] = inDims;
j["input_dtypes"] = inDTypes;
j["output_tensor"] = outDims;
j["output_dtype"] = outDType;
j["tvm_code"] = lambda;
j["func_name"] = funcName;
j["nnet_expression"] = nnetExprString;
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
j["hash_code"] = std::to_string(hashCode);
return j.dump();
}
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
const std::string &outDType,
const std::string &lambda,
const std::string &funcName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
int fdP2C[2], fdC2P[2];
for (auto fd : {fdP2C, fdC2P}) {
int status = pipe(fd);
IT_ASSERT(status == 0, "pipe failed");
}
pid_t pid = fork();
IT_ASSERT(pid >= 0, "fork failed");
if (pid == 0) { // Child process
close(fdP2C[1]);
close(fdC2P[0]);
dup2(fdP2C[0], STDIN_FILENO);
close(fdP2C[0]);
string cmd =
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
std::to_string(fdC2P[1]) + ")";
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
execvp("python3", const_cast<char *const *>(argv));
} else { // Parent process
close(fdP2C[0]);
close(fdC2P[1]);
// Write to pipe
string serializedArgs = serializeTVMArgs(
inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString, hashCode);
int status = -1;
status =
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
IT_ASSERT((size_t)status == serializedArgs.size(),
"Failed to write to pipe");
close(fdP2C[1]);
// Wait for TVM
waitpid(pid, &status, 0);
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
const int es = WEXITSTATUS(status);
IT_ASSERT(es == 0,
"TVM process exit with code " + std::to_string(es));
// Read from pipe
FILE *stream;
stream = fdopen(fdC2P[0], "r");
char buf_read[257] = {0};
status = std::fscanf(stream, "%256c", buf_read);
IT_ASSERT(status == 1, "Failed to read from pipe");
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
fclose(stream);
close(fdC2P[0]);
return buf_read;
}
IT_ASSERT(false, "Should not reach here");
return "";
}
tvm::runtime::PackedFunc getPackedFunction(string path,
string functionName) const {
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(path);
return mod.GetFunction(functionName);
}
DLTensorHolder convertTensorToDLTensor(const Tensor &tensor) const {
IT_ASSERT(tensor->getRuntime()->isCuda());
// The lifecycle of shapeInt64 is managed by the caller.
auto shapeInt64 = make_ref<vector<int64_t>>();
for (auto v : tensor->getDims())
shapeInt64->push_back(v);
DLTensor ret{
.data = tensor->getRawDataPtr<void *>(),
.device = DLDevice{.device_type = kDLCUDA, .device_id = 0},
.ndim = (int32_t)shapeInt64->size(),
.dtype =
DLDataType{.code = (uint8_t)kDLFloat, .bits = 32, .lanes = 1},
.shape = static_cast<int64_t *>(shapeInt64->data()),
.strides = nullptr,
.byte_offset = 0,
};
return {ret, shapeInt64};
}
// Pack the output and input DLTensors into TVM's parallel argument arrays
// (TVMValue handles plus their type codes). The output handle is placed
// first, followed by the inputs, matching the argument order of the TVM
// computation defined in Python, which is determined by AsTVMVisitor.
pair<vector<TVMValue>, vector<int>>
convertInOutToTVMArgs(const vector<DLTensorHolder> &inputs,
                      const DLTensorHolder &output) const {
    vector<TVMValue> values;
    vector<int> type_codes;
    auto appendHandle = [&](const DLTensorHolder &holder) {
        TVMValue v;
        v.v_handle = (void *)&holder.first;
        values.emplace_back(v);
        type_codes.emplace_back(kTVMDLTensorHandle);
    };
    appendHandle(output);
    for (const auto &in : inputs)
        appendHandle(in);
    return {values, type_codes};
}
};
// Register this kernel for MemBound operators on CUDA, float32 only.
// NOTE(review): the registration string contains typos ("Memobund",
// "funciton"); left untouched since it may be looked up by exact string
// elsewhere — confirm before renaming.
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
                MemboundTVMPackedFunction,
                "Memobund_TVM_Ansor_packed_funciton");
}; // namespace infini
#endif

237
src/nnet/App/test_models.cc Normal file
View File

@ -0,0 +1,237 @@
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
namespace infini {
// NHWC format
// Build the InfoGAN generator in NHWC format: a stack of up to five
// ConvTransposed2d layers, each followed by Relu, except the last layer
// of the full network which uses Tanh.
//   batch   - batch size of the {batch, 1, 1, 228} input tensor
//   runtime - runtime the graph object is created for
//   nLayers - number of deconv layers to instantiate (1..5)
// Returns the constructed (untrained, data-uninitialized) graph.
// Fix vs. original: removed the unused local `vector<Tensor> weights`.
Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
    IT_ASSERT(1 <= nLayers && nLayers <= 5);
    Graph g = make_ref<GraphObj>(runtime);
    vector<tuple<int, int, int, int, bool>> cs{
        // Channel, kernelSize, pad, stride, isTanh
        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
        {64, 4, 1, 2, false},  {3, 4, 1, 2, true},
    };
    Tensor input =
        g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
    for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
        auto [channel, kernelSize, pad, stride, tanh] = cs[i];
        int f = input->getDims()[3]; // n, h, w, f
        auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
                                   DataType::Float32,
                                   TensorType::Initialized); // f, r, s, c
        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
                                                  pad, stride, stride, 1, 1)
                    ->getOutput();
        // The final layer of the full network uses Tanh; others use Relu.
        if (tanh) {
            input = g->addOp<TanhObj>(input, nullptr)->getOutput();
        } else {
            input = g->addOp<ReluObj>(input, nullptr)->getOutput();
        }
    }
    return g;
}
// Build a graph containing exactly one InfoGAN deconv layer (and its
// activation), selected by `layerId` from the same layer table used by
// getInfoGAN, starting from an input tensor of the given NHWC `shape`.
// Fixes vs. original: removed the unused local `vector<Tensor> weights`
// and flattened the degenerate single-iteration loop
// `for (int i = layerId; i < layerId + 1; ++i)` into straight-line code.
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
    IT_ASSERT(0 <= layerId && layerId < 5);
    Graph g = make_ref<GraphObj>(runtime);
    vector<tuple<int, int, int, int, bool>> cs{
        // Channel, kernelSize, pad, stride, isTanh
        {448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
        {64, 4, 1, 2, false},  {3, 4, 1, 2, true},
    };
    Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
    auto [channel, kernelSize, pad, stride, tanh] = cs[layerId];
    int f = input->getDims()[3]; // n, h, w, f
    auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
                               DataType::Float32,
                               TensorType::Initialized); // f, r, s, c
    input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad, pad,
                                              stride, stride, 1, 1)
                ->getOutput();
    // Layer 4 (the network's last) uses Tanh; all others use Relu.
    if (tanh) {
        input = g->addOp<TanhObj>(input, nullptr)->getOutput();
    } else {
        input = g->addOp<ReluObj>(input, nullptr)->getOutput();
    }
    return g;
}
// Print the graph topology, then dump the contents of every tensor in it.
// Note: `dbg(t)` echoes the expression text, so the variable name is part
// of the debug output.
void printGraph(Graph g) {
    g->print();
    puts("============ Data ============");
    for (auto t : g->getTensors()) {
        dbg(t);
        t->printData();
    }
}
// Search for an optimized version of `g`, run the baseline on `runtime`,
// rebuild the best candidate on the same runtime with identical input
// data, and (when `tuning`) tune it, execute it, and compare its output
// against the baseline. Returns the best mutated graph, or nullptr when
// the search produced no candidate.
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu); // CPU graph for output comparison
    // Rule-based derivation; this rule sequence is annotated elsewhere in
    // the project as the Tconv rules.
    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    vector<Graph> bestGraphs;
    SearchEngine searchEngine(runtime, mutator);
    bestGraphs.emplace_back(searchEngine.run(g));
    g->topo_sort();
    dbg(g, bestGraphs[0], bestGraphs.size());
    g->print();
    g->dataMalloc();
    // Map each input's fuid to the baseline tensor so the mutated graph
    // can be fed the exact same data below.
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (auto t : g->getInputs()) {
        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
        fuidToInputTensor[t->getFuid()] = t;
    }
    auto gen = RandomGenerator(-0.1, 0.1, 0);
    for (auto t : g->getInputs()) {
        t->setData(gen);
    }
    for (auto t : g->getOutputs()) {
        t->setData(ZeroGenerator());
    }
    runtime->run(g);
    dbg("Baseline graph");
    printGraph(g);
    dbg(runtime->getPerfTime(g, true));
    // NOTE(review): the unconditional `return bestGraph;` at the bottom of
    // this loop means only bestGraphs[0] is ever processed.
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph =
            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
        bestGraph->topo_sort();
        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : bestGraph->getInputs()) {
            t->copyData(fuidToInputTensor[t->getFuid()]);
        }
        // Initialize outputs with zeros
        for (auto t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }
        dbg(bestGraph);
        dbg(bestGraph->getOutputs());
        if (tuning) {
            runtime->run(bestGraph, true);  // Tune kernels
            runtime->run(bestGraph, false); // Execute transformed graph
            auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
            auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
            // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
            dbg(go0->equalData(bgo0, 1e-3));
            dbg(runtime->getPerfTime(bestGraph, true));
            dbg(runtime->timeNonCtcOperators(bestGraph));
        }
        dbg("Best graph");
        printGraph(bestGraph);
        return bestGraph;
    }
    return nullptr;
}
// End-to-end InfoGAN experiment on CUDA: build the model, search for an
// optimized graph, run baseline and best candidate with identical random
// inputs, print/compare their outputs, and export the best graph to ONNX.
// Returns {baseline output, optimized output}, or {} when the search
// produced no candidate graphs.
vector<Tensor> runInfoGAN(int nLayers) {
    auto cuda = make_ref<CudaRuntimeObj>();
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(cpu); // CPU graph for output comparison
    Graph g = getInfoGAN(1, cuda, nLayers);
    // Rule-based derivation; this rule sequence is annotated elsewhere in
    // the project as the Tconv rules.
    auto mutator =
        make_ref<NMutator>(NMutator::Mode::RuleBased,
                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
    // // Translate OP to membound without derivation
    // mutator->setToNaiveMembound();
    vector<Graph> bestGraphs;
    SearchEngine searchEngine(cuda, mutator);
    bestGraphs.emplace_back(searchEngine.run(g));
    g->topo_sort();
    dbg(g, bestGraphs[0], bestGraphs.size());
    g->print();
    g->dataMalloc();
    // Map each input's fuid to the baseline tensor so the mutated graph
    // can be fed the exact same data below.
    map<UidBaseType, Tensor> fuidToInputTensor;
    for (auto t : g->getInputs()) {
        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
        fuidToInputTensor[t->getFuid()] = t;
    }
    auto gen = RandomGenerator(-0.1, 0.1, 0);
    // auto gen = RandomGenerator(-5, 5, 0, true);
    for (auto t : g->getInputs()) {
        t->setData(gen);
    }
    for (auto t : g->getOutputs()) {
        t->setData(ZeroGenerator());
    }
    cuda->run(g);
    dbg("Baseline graph");
    printGraph(g);
    dbg(cuda->getPerfTime(g, true));
    // NOTE(review): the return inside the first loop iteration means only
    // bestGraphs[0] is ever processed.
    for (size_t i = 0; i < bestGraphs.size(); i++) {
        auto bestGraphCpu = bestGraphs[i];
        auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
        bestGraph->topo_sort();
        bestGraph->dataMalloc();
        // Initialize inputs with random data
        for (auto t : bestGraph->getInputs()) {
            t->copyData(fuidToInputTensor[t->getFuid()]);
        }
        // Initialize outputs with zeros
        for (auto t : bestGraph->getOutputs()) {
            t->setData(ZeroGenerator());
        }
        dbg(bestGraph);
        dbg(bestGraph->getOutputs());
        cuda->run(bestGraph, true);  // Tune kernels
        cuda->run(bestGraph, false); // Execute transformed graph
        auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
        auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
        // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
        std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
        bgo0->printData();
        go0->printData();
        dbg(cuda->getPerfTime(bestGraph, true));
        dbg("Best graph");
        printGraph(bestGraph);
        callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
        return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
    }
    return {};
}
// TEST(ModelE2E, InfoGAN) { runInfoGAN(); }
} // namespace infini

View File

@ -27,13 +27,16 @@ std::string AsTVMVisitor::visit_(const BinaryOp &c) {
}
}
std::string AsTVMVisitor::visit_(const Func &c) {
string nested = dispatch(c->getObject());
switch (c->getFuncType()) {
case FuncType::Relu:
// TODO: Deduce the dtype
return "te.max(" + dispatch(c->getObject()) +
", tvm.tir.const(0, 'float32'))";
return "te.max(" + nested + ", tvm.tir.const(0, 'float32'))";
case FuncType::Tanh:
return "te.tanh(" + dispatch(c->getObject()) + ")";
return "te.tanh(" + nested + ")";
case FuncType::PRelu:
return "tir.if_then_else(0.0 < " + nested + ", " + nested +
", (0.25 * " + nested + "))";
default:
assert(false);
}
@ -114,6 +117,11 @@ std::string AsTVMVisitor::visit_(const Subscript &c) {
str += " - " +
std::to_string(rangeOp->getLoopVarRanges()[i].second.first -
rangeOp->getPaddings(i));
} else if (c->getObject()->getType() == NodeType::TensorNodeType) {
auto tensor = as<TensorNode>(c->getObject());
if (auto pad_i = tensor->getPadding(i); pad_i > 0) {
str += " + " + std::to_string(pad_i);
}
}
}
str += "]";
@ -138,6 +146,24 @@ std::string AsTVMVisitor::visit_(const Tensor &c) {
}
stmt += "), name='" + c->getName() + "')";
stmts += stmt + "\n";
if (c->hasPadding()) {
std::string name_after_pad = "pad_" + c->getName();
pythonVars.emplace_back(name_after_pad);
// inputs.emplace_back(name_after_pad);
std::string pad_tuple = "(";
for (auto pad : c->getPaddings()) {
pad_tuple += std::to_string(pad) + ", ";
}
pad_tuple += ")";
std::string pad_stmt = name_after_pad + " = " + "topi.nn.pad(" +
c->getName() + ", " + pad_tuple + ", " +
pad_tuple + ", 0.0, \"" + name_after_pad + "\")";
stmts += pad_stmt + "\n";
return name_after_pad;
}
return c->getName();
}
std::string AsTVMVisitor::getStmts() const {

View File

@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
oss << "==> ROOT\n" << root->toReadable() << "\n";
for (size_t i = 0; i < q.size(); ++i) {
const auto &[name, routine, tensor] = q[i];
oss << "==> " << name << " : ";
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
<< " : ";
if (routine) {
oss << routine->toReadable() << "\n";
if (routine->getExpr()) {

View File

@ -5,7 +5,7 @@
namespace nnet {
Expr MergeMemboundMutator::merge(bool allowEmptyMembound) {
Expr MergeMemboundMutator::merge(bool allowEmptyMembound, bool allowFailure) {
// FIXME: fix empty expression in membound
assert(kernels.size() >= 1);
if (checkEmpty()) {
@ -27,19 +27,30 @@ Expr MergeMemboundMutator::merge(bool allowEmptyMembound) {
assert(CheckOOBVisitor().checkRangeOp(curRangeOp) == false);
auto summand = curRangeOp->getSummand();
if (auto subscriptOp = as<SubscriptNode>(summand)) {
// Try merging the current and next stages
if (auto mergedExpr = rule4StageMerging(*curExpr, true)) {
// dbg(*curExpr, mergedExpr);
*curExpr = mergedExpr;
merged = true;
break;
}
// If merging fails, try the next stage
curExpr = subscriptOp->getObjectPtr();
nnet_assert(*curExpr != nullptr, __LINE__);
} else if (auto funcOp = as<FuncNode>(summand)) {
// Relu({...}[i,j])
curExpr = funcOp->getObject()->getObjectPtr();
} else
nnet_unimplemented_halt();
// If the object of FuncNode is a subscript, like
// Relu({...}[i,j]), we can further merge it. Otherwise, like
// Relu(A[i]+B[j]), we cannot.
if (auto sub = as<SubscriptNode>(funcOp->getObject()))
curExpr = sub->getObjectPtr();
else
break;
} else {
if (allowFailure)
return nullptr;
else
nnet_unimplemented_halt();
}
}
} while (merged);
return expr;

View File

@ -73,6 +73,14 @@ string Serializer::visit_(const Tensor &c) {
return key;
}
string Serializer::visit_(const Func &c) {
const string key = std::to_string(id++);
j[key]["type"] = c->getType();
j[key]["funcType"] = c->getFuncType();
j[key]["object"] = dispatch(c->getObject());
return key;
}
bool Serializer::serialize(const Expr &expr, const string &filePath,
const string &msg) {
// Metadata
@ -180,6 +188,10 @@ Expr Serializer::buildExprTree(string key) {
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
j[key]["paddings"], source);
}
case NodeType::FuncNodeType: {
auto object = buildExprTree(j[key]["object"]);
return make_ref<FuncNode>(object, j[key]["funcType"]);
}
default: {
nnet_unimplemented_halt();
break;

View File

@ -153,4 +153,11 @@ HashType HashVisitor::visit_(const Var &c) {
return varHash[c];
}
HashType HashVisitor::visit_(const Func &c) {
HashType objHash = dispatch(c->getObject());
return hash(binPrefix,
hash((((HashType)c->getFuncType()) + 10086), objHash));
return 0;
}
} // namespace nnet

View File

@ -1,4 +1,5 @@
#include "nnet/expr.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
namespace nnet {
@ -90,6 +91,14 @@ size_t TensorNode::getOffset(const vector<int> &idx) {
return offset;
}
bool TensorNode::hasPadding() {
for (auto pad : paddings) {
if (pad > 0)
return true;
}
return false;
}
string RangeOpNode::toReadable() const {
string ret;
for (int i = 0; i < IterationType::NumIterationType; ++i) {
@ -264,10 +273,15 @@ string FuncNode::toReadable() const {
ret += "Relu";
else if (funcType == FuncType::Tanh)
ret += "Tanh";
else if (funcType == FuncType::PRelu)
ret += "PRelu";
else
nnet_unimplemented_halt();
ret += "( ... " + serializeVec(object->getIndex()) + ")\n {" +
object->getObject()->toReadable() + "}";
if (auto sub = as<SubscriptNode>(object))
ret += "( ... " + serializeVec(sub->getIndex()) + ")\n {" +
sub->getObject()->toReadable() + "}";
else
ret += "(" + object->toReadable() + ")";
return ret;
}
@ -380,6 +394,7 @@ int64_t TensorNode::getSize() const {
size *= len;
return size;
}
int RangeOpNode::getPaddings(int dim) const {
return dim < (int)paddings.size() ? paddings[dim] : 0;
}
@ -445,8 +460,13 @@ vector<Range> RangeOpNode::getOutputRanges() const {
}
void FuncNode::setObject(Expr e) {
object = as<SubscriptNode>(e);
nnet_assert(object, "Illegal subscripted object");
nnet_assert(e->isScalar(), "FuncNode operates on scalars");
object = e;
}
string RangeOpNode::getFullExpression() {
FullPrinterVisitor printer;
return printer.print(this->shared_from_this());
}
} // namespace nnet

View File

@ -574,7 +574,8 @@ Expr ConvTransPattern::getExpr(Tensor A, Tensor K, int N, int C, int H, int W,
auto subA = makeSubscript(A, {n, x1 + r - 1, y1 + s - 1, f});
auto subK =
makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
// makeSubscript(K, {(R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, f, c});
makeSubscript(K, {f, (R - 2) - 2 * r + x2, (S - 2) - 2 * s + y2, c});
// x1=(h+1)//2, x2=(h+1)%2, y1=(w+1)//2
auto range1 = makeRangeOperator(

View File

@ -1,21 +1,27 @@
#include "nnet/nmutator.h"
#include "core/graph.h"
#include "ffi/ffi_callback.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
#include "nnet/Visitor/MatchReshapeVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/derivator.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/reshape.h"
#include "operators/unary.h"
namespace infini {
NMutator::NMutator(Mode mode) : Mutator(10), mode{mode} {
IT_ASSERT(mode != Mode::RuleBased, "Use RuleBased in the other ctor.");
IT_ASSERT(mode != Mode::RuleBased, "Specify rules for the RuleBased mode.");
}
NMutator::NMutator(const std::vector<int> &derivationRules)
: Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {}
NMutator::NMutator(Mode mode, const std::vector<int> &derivationRules)
: Mutator(10), mode{Mode::RuleBased}, derivationRules{derivationRules} {
IT_ASSERT(mode == Mode::RuleBased);
}
NMutator::~NMutator() {}
@ -46,9 +52,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
assert(computeOps.size() == 1);
const auto &computeOp = computeOps[0];
auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
auto expr = opToExpression(computeOp);
nnet::Expr expr = opToExpression(computeOp);
auto inputsN = nnet::GetTensorsVisitor().get(expr);
dbg(inputsN, expr);
// dbg(inputsN, expr);
IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
"Which one is the second input tensor?");
vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};
@ -69,18 +75,17 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
}
void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
IT_TODO_HALT();
// OpVec computeOps = in_graph->getComputeOps();
OpVec computeOps = in_graph->getComputeOps();
IT_ASSERT(computeOps.size() == 1);
// if (infini::Graph g = transformTConv1x1(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
// }
// // Commented for debug, not implemented yet
// // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
// // Graph graph = new Graph(g->getOperators());
// // out_graphs.emplace_back(graph);
// // return;
// // }
if (Graph g = transformConvtransposed1x1(computeOps[0])) {
out_graphs.emplace_back(g);
return;
}
// if (infini::Graph g = transformDialtedConv(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
@ -96,38 +101,35 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
// // return;
// // }
// auto expr = opToExpression(computeOps[0]);
// if (!expr)
// return;
auto expr = opToExpression(computeOps[0]);
if (!expr)
return;
// nnet::Derivator derivator(maxDepth);
// nnet::Formula conv_9x9(expr, 0);
// // const std::vector<int> rules{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90}; //
// Tconv
// // const std::vector<int> rules{1, 7, 7, 2, 8, 6, 6}; // G2BMM
// if (mode == Mode::Normal) {
// derivator.search(conv_9x9, 0);
// } else if (mode == Mode::RuleBased) {
// dbg(derivationRules);
// derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
// } else
// nnet_assert(0, "Unknown mode");
// const auto &candidates = derivator.getCandidates();
nnet::Derivator derivator(maxDepth);
nnet::Formula conv_9x9(expr, 0);
if (mode == Mode::Normal) {
derivator.search(conv_9x9, 0);
} else if (mode == Mode::RuleBased) {
// dbg(derivationRules);
derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
} else
IT_TODO_HALT_MSG("Unknown NMutator search mode.");
const auto &candidates = derivator.getCandidates();
// dbg(candidates.size());
// // derivator.print();
// for (const auto &candidate : candidates) {
// // dbg(nnet::FullPrinterVisitor().print(candidate.root));
// if (auto g = expressionToGraph(candidate.root, in_graph)) {
// out_graphs.emplace_back(g);
// }
// // break; // HACK:Debug only for the first subgraph
// derivator.print();
for (const auto &candidate : candidates) {
// dbg(nnet::FullPrinterVisitor().print(candidate.root));
if (auto g = expressionToGraph(candidate.root, in_graph)) {
out_graphs.emplace_back(g);
}
// break; // HACK:Debug only for the first subgraph
}
// dbg(out_graphs);
// for (auto graph : out_graphs) {
// graph->print();
// }
// // dbg(out_graphs);
// // for (auto graph : out_graphs) {
// // graph->print();
// // }
// cntStates += derivator.getNumIntermediateStates();
// cntCandidates += derivator.getNumCandidates();
cntStates += derivator.getNumIntermediateStates();
cntCandidates += derivator.getNumCandidates();
}
void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
@ -228,31 +230,38 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
// }
// }
nnet::Expr NMutator::opToExpression(Operator op) {
// IT_TODO_HALT();
if (auto convOp = as<ConvObj>(op)) {
nnet::Expr NMutator::opToExpression(Operator opT) {
auto [expr, mapNameNToTensorT] = extractOp(opT);
for (auto &[name, tensorT] : mapNameNToTensorT) {
IT_ASSERT(inputsNameNToTensorT.count(name) == 0);
inputsNameNToTensorT[name] = tensorT;
}
return expr;
}
pair<nnet::Expr, NMutator::NameNToTensorT> NMutator::extractOp(Operator opT) {
if (auto convOp = as<ConvObj>(opT)) {
const auto &inputs = convOp->getInputs();
const auto &AT = inputs[0];
const auto &KT = inputs[1];
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
return nullptr;
return {};
assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto A = nnet::makeTensor("A", AT->getDims(),
std::vector<int>{0, 0, ph, pw});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
} else if (auto convOp = as<ConvTransposed2dObj>(op)) {
return {nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(opT)) {
const auto &AT = convOp->getInputs()[0];
const auto &KT = convOp->getInputs()[1];
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
IT_ASSERT_TODO(convOp->getNumGroups() == 1);
if (r != 4)
return {};
IT_ASSERT_TODO(r == 4);
IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
@ -264,8 +273,9 @@ nnet::Expr NMutator::opToExpression(Operator op) {
const auto A = nnet::makeTensor(
"A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
return {nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(opT)) {
// const auto &AT = g2bmmOp->getInputs()[0];
// const auto &BT = g2bmmOp->getInputs()[1];
// const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();
@ -275,7 +285,7 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.first->getName()] = AT;
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// return expr;
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(opT)) {
// const auto &AT = gbmmlOp->getInputs()[0];
// const auto &BT = gbmmlOp->getInputs()[1];
// const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();
@ -285,118 +295,146 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, w, k, dilation, expr);
// return expr;
} else if (auto matmulOp = as<MatmulObj>(op)) {
} else if (auto matmulOp = as<MatmulObj>(opT)) {
const auto &AT = matmulOp->getInputs()[0];
const auto &BT = matmulOp->getInputs()[1];
const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
const auto &[expr, inputsN] =
nnet::MatmulPattern::getExpr(transA, transB, b, m, n, k);
inputsNameNToTensorT[inputsN.first->getName()] = AT;
inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, n, k, expr);
return expr;
return {
expr,
{{inputsN.first->getName(), AT}, {inputsN.second->getName(), BT}}};
} else if (auto op = as<MemBoundObj>(opT)) {
NameNToTensorT m;
for (int i = 0; i < op->numInputs(); ++i)
m[op->getNnetInputs()[i]->getName()] = opT->getInputs()[i];
return {op->getNnetExpr(), m};
} else if (opT->getOpType() == OpType::Relu ||
opT->getOpType() == OpType::Tanh) {
return generateUnaryExpr(opT);
}
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(op)) {
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(opT)) {
// // return transposeOpToExpression(transposeOp);
// // }
nnet_unimplemented_continue();
return nullptr;
IT_TODO_HALT_MSG("Cannot convert " + opT->toString() +
" to an NNet expression");
return {};
}
infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
IT_TODO_HALT();
// auto g = make_ref<GraphObj>();
// nnet::FullPrinterVisitor fullVisitor;
// const auto &tensorQueueN = fullVisitor.traverse(expr);
// // Build tensors: Skip the first one, which is output
// auto nameNToTensorT = inputsNameNToTensorT;
// for (size_t i = 1; i < tensorQueueN.size(); ++i) {
// const auto &[nameN, routineN, tensorN] = tensorQueueN[i];
// // dbg(nameN, routineN, tensorN);
// if (!routineN) {
// // This is an inputs
// assert(nameNToTensorT.count(nameN));
// } else {
// assert(!nameNToTensorT.count(nameN));
// nameNToTensorT[nameN] = g->addTensor(tensorN->getShape());
// }
// }
// const auto &outputsPET = in_graph->getOutputs();
// if (outputsPET.size() != 1) {
// nnet_unimplemented_continue();
// return nullptr;
// }
// nameNToTensorT[std::get<0>(tensorQueueN.at(0))] = outputsPET[0];
// // Build computation graph in PET:
// for (int i = tensorQueueN.size() - 1; i >= 0; --i) {
// const auto &[outputNameN, routineN, tensorN] = tensorQueueN[i];
// if (!routineN)
// continue;
// // dbg(outputNameN, routineN, tensorN, routineN->getType());
// if (auto op = nnet::as<nnet::ConvNode>(routineN)) {
// // g->conv(i8, w9, 2, 2);
// std::vector<nnet::Tensor> inputsN = op->getInputs();
// auto A = nameNToTensorT.at(inputsN[0]->getName());
// auto K = nameNToTensorT.at(inputsN[1]->getName());
// auto output = nameNToTensorT.at(outputNameN);
// const auto &[ph, pw, sh, sw, dh, dw] = op->getArgs();
// g->conv(A, K, output, ph, pw, sh, sw, dh, dw);
// } else if (auto op = nnet::as<nnet::ElementWiseNode>(routineN)) {
// assert(op->getInputs().size() == 1);
// nnet::MatchReshapeVisitor matchReshapeVisitor;
// if (matchReshapeVisitor(op->getExpr())) {
// auto input =
// nameNToTensorT.at(op->getInputs().at(0)->getName());
// auto output = nameNToTensorT.at(outputNameN);
// g->reshape(input, output);
// } else {
// TensorVec inputsPET;
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// for (const auto &inputN : op->getInputs())
// inputsPET.emplace_back(
// nameNToTensorT.at(inputN->getName()));
// // Re-estimate time here.
// ssize_t cnt = 0;
// for (const auto tensor : inputsPET)
// cnt += tensor->size();
// for (const auto tensor : outputsPET)
// cnt += tensor->size();
// g->membound(inputsPET, outputsPET, op->getInputs(),
// op->getExpr(), memboundTime(cnt));
// }
// } else if (auto op = nnet::as<nnet::MatmulNode>(routineN)) {
// assert(op->getInputs().size() == 2);
// nnet::Tensor AN = op->getInputs()[0];
// nnet::Tensor BN = op->getInputs()[1];
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
// nameNToTensorT.at(BN->getName())};
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// const auto &[b, m, n, k, transa, transb] = op->getArgs();
// g->matmul(inputsPET[0], inputsPET[1], outputsPET[0], transa,
// transb);
// } else if (auto op = nnet::as<nnet::G2bmmNode>(routineN)) {
// assert(op->getInputs().size() == 2);
// nnet::Tensor AN = op->getInputs()[0];
// nnet::Tensor BN = op->getInputs()[1];
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
// nameNToTensorT.at(BN->getName())};
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// const auto &[b, m, w, k, dilation] = op->getArgs();
// g->g2bmm(inputsPET[0], inputsPET[1], outputsPET[0], w, dilation);
// } else if (auto op = nnet::as<nnet::GbmmNode>(routineN)) {
// assert(op->getInputs().size() == 2);
// nnet::Tensor AN = op->getInputs()[0];
// nnet::Tensor BN = op->getInputs()[1];
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
// nameNToTensorT.at(BN->getName())};
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// const auto &[b, m, w, n, dilation] = op->getArgs();
// g->gbmml(inputsPET[0], inputsPET[1], outputsPET[0], dilation);
// }
// }
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
auto g = make_ref<GraphObj>(runtime);
nnet::FullPrinterVisitor fullVisitor;
// Get tensors in the reversed topological order
const auto &tensorQueueN = fullVisitor.traverse(expr);
// dbg(fullVisitor.print(expr));
// Build a map: name in nnet -> tensors in infini
// Add input tensors to the map
std::map<std::string, Tensor> nameNToTensorT;
for (const auto &[k, v] : inputsNameNToTensorT)
nameNToTensorT[k] = g->cloneTensor(v);
// Add output tensors to the map
const auto &outputsT = in_graph->getOutputs();
if (outputsT.size() != 1) {
nnet_unimplemented_continue();
return nullptr;
}
nameNToTensorT[std::get<0>(tensorQueueN.at(0))] =
g->cloneTensor(outputsT[0]);
// Skip the first tensor, which is output and should be created by clone
for (size_t i = 1; i < tensorQueueN.size(); ++i) {
const auto &[nameN, routineN, tensorN] = tensorQueueN[i];
// dbg(nameN, routineN, tensorN);
if (!routineN) {
// this tensor is an input as it is not contrusted by a routine
IT_ASSERT(nameNToTensorT.count(nameN),
"Missing an input tensor in graph or a rountine for this "
"tensor.");
} else { // this tensor is an intermediate result
IT_ASSERT(!nameNToTensorT.count(nameN),
"An NNET tensor appears twice or it is an input tensor "
"with routine specified.");
nameNToTensorT[nameN] = g->addTensor(tensorN->getShape());
}
}
// Build computation graph in InfiniTensor
for (int i = tensorQueueN.size() - 1; i >= 0; --i) {
const auto &[outputNameN, routineN, tensorN] = tensorQueueN[i];
if (!routineN)
continue;
// dbg(outputNameN, routineN, tensorN, routineN->getType());
if (auto op = nnet::as<nnet::ConvNode>(routineN)) {
std::vector<nnet::Tensor> inputsN = op->getInputs();
auto A = nameNToTensorT.at(inputsN[0]->getName());
auto K = nameNToTensorT.at(inputsN[1]->getName());
auto output = nameNToTensorT.at(outputNameN);
const auto &[ph, pw, sh, sw, dh, dw] = op->getArgs();
g->addOpWithOutputs<ConvObj>(A, K, output, ph, pw, sh, sw, dh, dw);
} else if (auto op = nnet::as<nnet::ElementWiseNode>(routineN)) {
assert(op->getInputs().size() == 1);
nnet::MatchReshapeVisitor matchReshapeVisitor;
// If this routine only change the shape, translate it to a Reshape
if (matchReshapeVisitor(op->getExpr())) {
auto input =
nameNToTensorT.at(op->getInputs().at(0)->getName());
auto output = nameNToTensorT.at(outputNameN);
g->addOpWithOutputs<ReshapeObj>(input, output,
output->getDims());
} else {
TensorVec inputsPET;
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
for (const auto &inputN : op->getInputs())
inputsPET.emplace_back(
nameNToTensorT.at(inputN->getName()));
// Re-estimate time here.
ssize_t cnt = 0;
for (const auto &tensor : inputsPET)
cnt += tensor->size();
for (const auto &tensor : outputsPET)
cnt += tensor->size();
// dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
// memboundTime(cnt));
g->addOpWithOutputs<MemBoundObj>(inputsPET, outputsPET,
op->getInputs(), op->getExpr(),
memboundTime(cnt));
}
} else if (auto op = nnet::as<nnet::MatmulNode>(routineN)) {
assert(op->getInputs().size() == 2);
nnet::Tensor AN = op->getInputs()[0];
nnet::Tensor BN = op->getInputs()[1];
TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
nameNToTensorT.at(BN->getName())};
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
const auto &[b, m, n, k, transa, transb] = op->getArgs();
g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
outputsPET[0], transa, transb);
}
// TODO
// else if (auto op = nnet::as<nnet::G2bmmNode>(routineN)) {
// assert(op->getInputs().size() == 2);
// nnet::Tensor AN = op->getInputs()[0];
// nnet::Tensor BN = op->getInputs()[1];
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
// nameNToTensorT.at(BN->getName())};
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// const auto &[b, m, w, k, dilation] = op->getArgs();
// g->g2bmm(inputsPET[0], inputsPET[1], outputsPET[0], w, dilation);
// } else if (auto op = nnet::as<nnet::GbmmNode>(routineN)) {
// assert(op->getInputs().size() == 2);
// nnet::Tensor AN = op->getInputs()[0];
// nnet::Tensor BN = op->getInputs()[1];
// TensorVec inputsPET = {nameNToTensorT.at(AN->getName()),
// nameNToTensorT.at(BN->getName())};
// TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
// const auto &[b, m, w, n, dilation] = op->getArgs();
// g->gbmml(inputsPET[0], inputsPET[1], outputsPET[0], dilation);
// }
else
IT_TODO_HALT();
}
return g;
}
double NMutator::memboundTime(ssize_t cnt) {
@ -475,43 +513,82 @@ double NMutator::memboundTime(const Shape &dims) {
// return nullptr;
// }
// Graph NMutator::transformTConv3x3(Operator op) {
// if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
// dbg(tconvOp->getInputs()[1]->getDims());
// if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
// tconvOp->getInputs()[1]->getDims()[0] == 3 &&
// tconvOp->getInputs()[1]->getDims()[1] == 3) {
// auto g = new infini::Graph();
// auto inputDims = tconvOp->getInputs(0)->getDims();
// auto weightDims = tconvOp->getInputs(1)->getDims();
// auto outputDims = tconvOp->getOutput()->getDims();
// // NHWF
// auto newA = g->tensor(
// {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3],
// weightDims[2]});
// auto newO =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
/// Lower a 1x1 ConvTransposed2dNHWC into Reshape -> Matmul -> Reshape.
///
/// Returns nullptr when _op is not a ConvTransposed2dNHWCObj and an empty
/// graph when the kernel is not 1x1; otherwise returns a new graph (on
/// `runtime`) computing the same result via a single Matmul.
/// Fix vs. original: removed the unused local `outputDims`.
Graph NMutator::transformConvtransposed1x1(Operator _op) {
    auto op = as<ConvTransposed2dNHWCObj>(_op);
    if (!op)
        return nullptr;
    const auto &A = op->getInputs()[0];
    const auto &W = op->getInputs()[1];
    // Only h and w gate this rule; the remaining bindings are unpacked for
    // completeness of the NCHWFRS tuple.
    const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
    const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
    const Shape inputDims = op->getInputs(0)->getDims();
    const Shape weightDims = op->getInputs(1)->getDims();
    const DataType dtype = A->getDType();
    IT_ASSERT_TODO(op->getNumGroups() == 1);
    // Rule only applies to 1x1 spatial input.
    if (h != 1 || w != 1)
        return {};
    IT_ASSERT_TODO(ph == pw);
    IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
    IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
    auto g = make_ref<GraphObj>(runtime);
    // NHWF input flattened to (N*H*W, F)
    auto newA = g->addTensor(
        {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
    // FRSC weight flattened to (F, R*S*C)
    auto newW = g->addTensor(
        {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]}, dtype);
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
    g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
    Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 0)->getOutput();
    // Reshape the matmul result back to the original output shape.
    g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
                                    op->getOutput()->getDims());
    return g;
}
// Graph NMutator::transformConvtransposed(Operator _op) {
// auto op = as<ConvTransposed2dNHWCObj>(_op);
// if (!op)
// return nullptr;
// const auto &AT = op->getInputs()[0];
// const auto &KT = op->getInputs()[1];
// const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
// const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
// IT_ASSERT_TODO(op->getNumGroups() == 1);
// if (r != 4)
// return {};
// IT_ASSERT_TODO(ph == pw);
// IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
// IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
// auto g = make_ref<Graph>();
// // TODO: implement transformation rules
// // How to efficiently write an expression...
// auto inputDims = op->getInputs(0)->getDims();
// auto weightDims = op->getInputs(1)->getDims();
// auto outputDims = op->getOutput()->getDims();
// // NHWF
// auto newA =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
// auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// weightDims[0] * weightDims[1] * weightDims[3]});
// g->reshape(tconvOp->getInputs(0), newA);
// g->reshape(tconvOp->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound(
// {newO}, {tconvOp->getOutput()}, {}, nullptr,
// g->reshape(op->getInputs(0), newA);
// g->reshape(op->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
// memboundTime(newO->size() + tconvOp->getOutput()->size()),
// "TConv3x3 reduce");
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// }
// return nullptr;
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// Graph NMutator::transformTConv1x1(Operator op) {
@ -607,4 +684,95 @@ double NMutator::memboundTime(const Shape &dims) {
// return graph;
// }
// Fuse a vertical chain of memory-bound operators into one MemBoundObj.
// Preconditions (asserted): every operator in `inputGraph` is mem-bound and
// has exactly one input and one output. Returns a new graph on `runtime`.
Graph NMutator::fuseVertically(const Graph &inputGraph) {
    Graph optGraph = make_ref<GraphObj>(runtime);

    auto chainOps = inputGraph->getOperators();
    IT_ASSERT(!chainOps.empty());
    // Validate the chain shape before attempting any fusion.
    for (auto &op : chainOps) {
        IT_ASSERT(op->isMemBoundOp());
        IT_ASSERT_TODO(op->getInputs().size() == 1);
        IT_ASSERT(op->getOutputs().size() == 1);
    }
    // A single operator has nothing to fuse with: return it as-is.
    if (chainOps.size() == 1) {
        return make_ref<GraphObj>(runtime, chainOps);
    }
    // Extract one nnet expression per operator, in chain order.
    std::vector<nnet::Expr> exprs;
    for (const auto &op : chainOps) {
        auto [expr, _] = extractOp(op);
        exprs.emplace_back(expr);
        // dbg(op, infini::as<nnet::RangeOpNode>(expr)->getFullExpression());
    }
    // double maxTime = getMaxPerf(std::make_shared<SubGraph>(chainOps));
    // Fuse a MemboundOp chain
    auto expr = nnet::MergeMemboundMutator(exprs).merge(true);
    // Collect the input tensor(s) of the first stage's expression; the chain
    // is expected to have exactly one external nnet input.
    auto inputNMap = nnet::GetTensorsVisitor().get(exprs.front());
    IT_ASSERT(inputNMap.size() == 1);
    vector<nnet::Tensor> inputsN;
    for (const auto &[name, t] : inputNMap) {
        inputsN.emplace_back(t);
    }
    // Exec time is left as 0 for now (see TODO below).
    optGraph->addOpWithOutputs<MemBoundObj>(chainOps.front()->getInputs(),
                                            chainOps.back()->getOutputs(),
                                            inputsN, expr, 0);
    // TODO: set time
    return optGraph;
}
// Build the nnet expression for a supported element-wise unary operator
// (PRelu/Relu/Tanh): a RangeOp applying the function to tensor "T" at every
// index. Returns the expression and the mapping from "T" to the op's input.
pair<nnet::Expr, NMutator::NameNToTensorT>
NMutator::generateUnaryExpr(const Operator &op) {
    using namespace nnet;
    // Supported framework op -> nnet function kind.
    const map<OpType, nnet::FuncType> funcKindOf = {
        {OpType::PRelu, nnet::FuncType::PRelu},
        {OpType::Relu, nnet::FuncType::Relu},
        {OpType::Tanh, nnet::FuncType::Tanh}};
    const Shape dims = op->getInputs()[0]->getDims();
    const nnet::FuncType funcKind = funcKindOf.at(op->getOpType());
    auto tensorN = make_ref<TensorNode>("T", dims);
    // One iterator i<k> with range [0, dims[k]) per tensor dimension.
    VecExpr iterVars;
    vector<VarRangePair> loopRanges;
    for (size_t k = 0; k < dims.size(); ++k) {
        auto var = make_ref<VarNode>("i" + std::to_string(k));
        iterVars.emplace_back(var);
        loopRanges.emplace_back(var, Range{0, dims[k]});
    }
    auto elem = makeSubscript(tensorN, iterVars);
    auto applied = nnet::make_ref<FuncNode>(elem, funcKind);
    return {makeRangeOperator(loopRanges, {}, applied),
            NameNToTensorT{{"T", op->getInputs()[0]}}};
}
void NMutator::memboundToJson(const Graph &g, const string path) {
for (auto &_op : g->getOperators()) {
if (auto op = as<MemBoundObj>(_op)) {
op->saveAsJson(path + "/" + "membound_" +
std::to_string(op->getGuid()) + ".json");
}
}
}
// Build an nnet expression whose loop nest iterates the dimensions of `in`
// in reverse order: the RangeOp declares iterators from the last dimension
// outwards while subscripting tensor "T" (shaped like `in`) in natural order.
// Returns the expression together with the created tensor node.
// Fixes vs. original: removed the dead `newShape`/`std::reverse` pair
// (computed but never used) and the `orignalShape` typo.
pair<nnet::Expr, vector<nnet::Tensor>> NMutator::generateRevert(Tensor in) {
    using namespace nnet;
    using infini::make_ref;
    const Shape &originalShape = in->getDims();
    auto tensor = makeTensor("T", in->getDims());
    // One iterator per dimension, named i0, i1, ...
    VecExpr iters;
    for (size_t i = 0; i < originalShape.size(); ++i) {
        iters.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
    }
    auto sub = makeSubscript(tensor, iters);
    // Declare loop iterators from the last dimension to the first, which
    // reverses the dimension order relative to the source tensor.
    vector<VarRangePair> loopIters;
    for (int i = static_cast<int>(originalShape.size()) - 1; i >= 0; --i) {
        loopIters.emplace_back(infini::as<VarNode>(iters[i]),
                               Range{0, originalShape[i]});
    }
    auto range = makeRangeOperator(loopIters, {}, sub);
    return {range, {tensor}};
}
} // namespace infini

View File

@ -5,22 +5,24 @@ namespace infini {
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
bool transB, [[maybe_unused]] Tensor bias, ActType act)
: OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
act(act), b(1) {
act(act) {
auto shape_a = A->getDims();
auto shape_b = B->getDims();
IT_ASSERT(shape_a.size() == shape_b.size());
switch (shape_a.size()) {
case 0:
case 1:
IT_ASSERT(false);
case 2:
break;
default:
int dimA = shape_a.size(), dimB = shape_b.size();
IT_ASSERT(dimA >= 2 && dimB >= 2);
b = 1;
if (dimA <= 3 && dimB <= 3) {
int b1 = dimA == 2 ? 1 : A->getDims()[0];
int b2 = dimB == 2 ? 1 : B->getDims()[0];
b = std::max(b1, b2);
} else {
IT_ASSERT_TODO(dimA == dimB);
for (size_t i = 0; i < shape_a.size() - 2; ++i) {
IT_ASSERT(shape_a[i] == shape_b[i]);
IT_ASSERT_TODO(shape_a[i] == shape_b[i]);
b *= shape_a[i];
}
break;
}
m = *(transA ? shape_a.rbegin() : shape_a.rbegin() + 1);
n = *(transB ? shape_b.rbegin() + 1 : shape_b.rbegin());
@ -38,11 +40,44 @@ string MatmulObj::toString() const {
}
optional<vector<Shape>> MatmulObj::inferShape(const TensorVec &inputs) const {
auto shape_a = inputs[0]->getDims();
auto it = shape_a.rbegin();
*it++ = n;
*it++ = m;
return {{std::move(shape_a)}};
auto A = inputs[0], B = inputs[1];
int dimA = A->getDims().size(), dimB = B->getDims().size();
if (dimA > 3 || dimB > 3) {
// no broadcast
auto shape_a = inputs[0]->getDims();
auto it = shape_a.rbegin();
*it++ = n;
*it++ = m;
return {{std::move(shape_a)}};
}
int b1 = dimA == 2 ? 1 : A->getDims()[0];
int b2 = dimB == 2 ? 1 : B->getDims()[0];
int b = std::max(b1, b2);
int m = transA ? A->getDims()[dimA - 1] : A->getDims()[dimA - 2];
int n = transB ? B->getDims()[dimB - 2] : B->getDims()[dimB - 1];
int kA = transA ? A->getDims()[dimA - 2] : A->getDims()[dimA - 1];
int kB = transB ? B->getDims()[dimB - 1] : B->getDims()[dimB - 2];
if ((dimA != 2 && dimA != 3) || (dimB != 2 && dimB != 3)) {
printf("Bad input dim: dimA = %d, dimB = %d\n", dimA, dimB);
return {};
}
if (b1 != 1 && b2 != 1 && b1 != b2) {
printf("Bad batch size b1 = %d, b2 = %d\n", b1, b2);
return {};
}
if (kA != kB) {
printf("Bad K: kA = %d, kB = %d\n", kA, kB);
return {};
}
if (dimA == 2 && dimB == 2) {
return {{{m, n}}};
} else {
return {{{b, m, n}}};
}
}
vector<int> MatmulObj::getWorkloadVector() const {

View File

@ -1,5 +1,8 @@
#include "operators/membound.h"
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/Visitor/Serializer.h"
namespace infini {
@ -10,6 +13,19 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
// fuse stages in nnet expr to reduce kernels generated by TVM
if (auto mergedExpr =
nnet::MergeMemboundMutator({expr}).merge(false, true)) {
simplifiedExpr = mergedExpr;
IT_ASSERT(!checkOOB(simplifiedExpr));
simplifiedHash = calcHash(simplifiedExpr);
} else {
simplifiedExpr = expr;
simplifiedHash = hash;
}
}
string MemBoundObj::toString() const {
@ -30,9 +46,16 @@ string MemBoundObj::toString() const {
os << "exec_time=" << exec_time << ", ";
os << "NNet Inputs=[";
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << "])";
os << "\n" << (expr ? expr->toReadable() : "Empty expression") << "\n";
os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
os << "]";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
os << ")\n";
os << ">>> Original expr\n"
<< (expr ? expr->toReadable() : "Empty expression") << "\n";
os << ">>> Simplified expr\n"
<< (simplifiedExpr ? simplifiedExpr->toReadable() : "Empty expression")
<< "\n";
return os.str();
}
@ -47,13 +70,23 @@ optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
}
vector<int> MemBoundObj::getWorkloadVector() const {
return {enum_to_underlying(type), (int)getHash()};
return {enum_to_underlying(type), (int)simplifiedHash};
}
vector<int> MemBoundObj::getOpAttrVector() const { return getWorkloadVector(); }
HashType MemBoundObj::getHash() const {
HashType MemBoundObj::calcHash(nnet::Expr expr) {
return nnet::HashVisitor().dispatch(expr);
}
bool MemBoundObj::checkOOB(nnet::Expr expr) {
return nnet::CheckOOBVisitor().checkRangeOp(
nnet::as<nnet::RangeOpNode>(expr));
}
void MemBoundObj::saveAsJson(string path) const {
bool status = nnet::Serializer().serialize(expr, path);
IT_ASSERT(status);
}
} // namespace infini

View File

@ -8,7 +8,7 @@ namespace infini {
TEST(Hash, OperatorHash) {
OpPerfKey key1(0, OpType::Unknown), key2(0, OpType::Unknown);
{ // build with addOpWithOutputs
Graph g = make_ref<GraphObj>(nullptr);
Graph g = make_ref<GraphObj>(NativeCpuRuntimeObj::getInstance());
Tensor i0 = g->addTensor({1, 2, 3}, DataType::UInt32);
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
@ -18,7 +18,7 @@ TEST(Hash, OperatorHash) {
EXPECT_GT(key1.attrs.size(), (size_t)5);
}
{ // build with addOp
Graph g = make_ref<GraphObj>(nullptr);
Graph g = make_ref<GraphObj>(NativeCpuRuntimeObj::getInstance());
Tensor i0 = g->addTensor({2, 2, 3}, DataType::UInt32);
Tensor w0 = g->addTensor({2, 3, 4}, DataType::UInt32);
auto matmul = g->addOp<MatmulObj>(i0, w0, nullptr);

View File

@ -1,4 +1,3 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
@ -51,26 +50,38 @@ TEST(cuBLAS_Matmul, run) {
Shape{2, 3, 4}, Shape{2, 3, 2},
ExpectOutput{40, 52, 46, 61, 52, 70, 58, 79, 400, 448, 424,
475, 448, 502, 472, 529});
testMatmulCuda(
IncrementalGenerator(), IncrementalGenerator(), false, false,
Shape{2, 3, 5}, Shape{5, 2},
ExpectOutput{60, 70, 160, 195, 260, 320, 360, 445, 460, 570, 560, 695});
testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), true, false,
Shape{2, 5, 3}, Shape{5, 2},
ExpectOutput{180, 210, 200, 235, 220, 260, 480, 585, 500,
610, 520, 635});
testMatmulCuda(IncrementalGenerator(), IncrementalGenerator(), false, false,
Shape{3, 5}, Shape{5, 2},
ExpectOutput{60, 70, 160, 195, 260, 320});
}
TEST(cuBLAS_Matmul, tune) {
auto cpuRuntime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
auto ACpu = gCpu->addTensor(Shape{1, 3, 5}, DataType::Float32);
auto BCpu = gCpu->addTensor(Shape{1, 5, 2}, DataType::Float32);
gCpu->dataMalloc();
ACpu->setData(IncrementalGenerator());
BCpu->setData(IncrementalGenerator());
// Matmul([A^T,B,act=0],A=597,B=595,C=598,bmnk=[1,4,4096,448])
const int B = 1, M = 4, N = 4096, K = 448;
const bool transA = true, transB = false;
auto cudaRuntime = make_ref<CudaRuntimeObj>();
auto gCuda = make_ref<GraphObj>(cudaRuntime);
auto ACuda = gCuda->cloneTensor(ACpu);
auto BCuda = gCuda->cloneTensor(BCpu);
auto matmul = gCuda->addOp<MatmulObj>(ACuda, BCuda, nullptr);
Graph g = make_ref<GraphObj>(cudaRuntime);
auto a = g->addTensor(transA ? Shape{B, K, M} : Shape{B, M, K});
auto b = g->addTensor(transB ? Shape{B, N, K} : Shape{B, K, N});
// allocate CUDA memory
gCuda->dataMalloc();
cudaRuntime->run(gCuda, true);
g->dataMalloc();
a->setData(IncrementalGenerator());
b->setData(IncrementalGenerator());
auto matmul = g->addOp<MatmulObj>(a, b, nullptr, transA, transB);
matmul->print();
double time = cudaRuntime->getPerfTime(g);
EXPECT_GT(time, 1e-3);
EXPECT_LT(time, 1);
cudaRuntime->run(g, true);
}
}; // namespace infini

View File

@ -0,0 +1,92 @@
import onnx
import torch
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import infinitensor as ft
def to_pytorch_tensor(tensor) -> torch.Tensor:
    """Copy an InfiniTensor tensor into a torch.Tensor of matching shape.

    `tensor` is duck-typed: it must provide copyout_float() returning the
    flat float data and shape() returning the dimension list.
    """
    flat = torch.tensor(tensor.copyout_float())
    return flat.reshape(tensor.shape())
def run_InfoGAN_return_tesnor(n_layers: int):
    """Run InfoGAN with `n_layers` layers and return torch tensors.

    NOTE(review): the 'tesnor' spelling is kept because callers use this name.
    """
    converted = []
    for raw in ft.runInfoGAN(n_layers):
        converted.append(to_pytorch_tensor(raw))
    return converted
def read_and_check():
    """Load previously saved tensor pairs and print a summary per layer count."""
    for n_layers in range(1, 6):
        # Files are written by run_e2e_InfoGAN: index 0 and 1 of its tensors.
        ans = torch.load(f'torch_{n_layers}layers_0.pt')
        x = torch.load(f'torch_{n_layers}layers_1.pt')
        print(f'=== {n_layers} layers ===')
        # NOTE(review): only x's max magnitude is printed; `ans` is loaded but
        # unused — presumably a comparison such as (ans - x) was intended.
        print(x.abs().max())
def run_e2e_InfoGAN():
    """Run InfoGAN end-to-end, save outputs, and report accuracy statistics.

    For each layer count, saves the returned tensors to torch_*{i}.pt files,
    prints the fraction of elements within tolerances 0.1^0 .. 0.1^9, and
    writes the per-tolerance table to a.csv.
    Fixes vs. original: np.product (deprecated, removed in NumPy 2) replaced
    by np.prod; DataFrame.set_axis(..., inplace=True) (removed in pandas 2)
    replaced by reassignment.
    """
    data = []
    for n_layers in range(5, 6):
        tensors = run_InfoGAN_return_tesnor(n_layers)
        for i, t in enumerate(tensors):
            torch.save(t, f'torch_{n_layers}layers_{i}.pt')
        print(f'============ {n_layers} layers = = =')
        ans, x = tensors
        print(f'Allclose {torch.allclose(ans, x)}')
        # Print error numbers: count elements within decreasing tolerances.
        tot = np.prod(ans.shape)
        data.append([])
        for i in range(0, 10):
            tol = 10**(-i)
            clo = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
            print(f'0.1^{i} close: {clo}/{tot} = {clo/tot}')
            data[-1].append(clo/tot)
        rel_err = torch.abs((ans-x)/ans)
        print(rel_err, rel_err.max())
        print(f'ans = {ans}')
        print(f'x = {x}')
    # # Plot CDF (kept commented-out by the original author)
    # fig, axes = plt.subplots(9,1)
    # for i, ax in enumerate(axes):
    #     ax.hist(torch.flatten(rel_err), density=True, cumulative=True,
    #             label='CDF', histtype='step', alpha=0.8, color='k')
    #     ax.set_xlim(0, 10**(-i))
    # plt.show()
    # plt.savefig('a.pdf')
    df = pd.DataFrame(data)
    print(df.to_string())
    # set_axis(..., inplace=True) was removed in pandas 2.x; reassign instead.
    df = df.set_axis([f'0.1^{i}' for i in range(0, 10)], axis=1)
    print(df.to_string())
    df.to_csv('a.csv')
def runSingleConvT():
    """Build one ConvTransposed (NHWC) graph on CUDA, optimize it, and export
    the optimized graph to convtransposed.onnx."""
    runtime = ft.cuda_runtime()
    # Input shape [1, 2, 2, 448]; second argument presumably a layer/op index
    # into the builder — TODO confirm against getConvtransposedNHWC.
    g = ft.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)
    opt_g = ft.optimizeGraph(g, runtime)
    ft.if_onnx.export_onnx(opt_g, 'convtransposed.onnx')
def run_InfoGAN_without_tuning(tuning: bool):
    """Build InfoGAN (5 layers) on CUDA, optimize, export ONNX, dump membound JSON.

    NOTE(review): despite the name, kernel tuning is controlled by the
    `tuning` argument.
    """
    runtime = ft.cuda_runtime()
    g = ft.getInfoGAN(1, runtime, 5)
    # g = ft.getInfoGAN(1, runtime, 1)
    opt_g = ft.optimizeGraph(g, runtime, tuning)
    ft.if_onnx.export_onnx(opt_g, 'infogan_transformed.onnx')
    # Dump each MemBound operator of the optimized graph as JSON into cwd.
    ft.NMutator.memboundToJson(opt_g, ".")
if __name__ == "__main__":
    # Entry point: run the optimized InfoGAN pipeline with tuning enabled.
    # Alternative experiments are kept commented out for manual switching.
    # run_e2e_InfoGAN()
    run_InfoGAN_without_tuning(True)
    # runSingleConvT()
    # read_and_check()

View File

@ -7,7 +7,8 @@
#include "nnet/routine.h"
#include "nnet/test.h"
#include "operators/matmul.h"
#include <chrono>
#include "operators/membound.h"
#include "test.h"
using namespace infini;
using namespace std;
@ -18,8 +19,8 @@ TEST(nnet, MemboundOpInterpretation) {
Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32);
Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32);
g->dataMalloc();
i0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyData(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
i0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
NMutator nmutator(NMutator::Mode::ToNaiveMembound);
auto mutations = nmutator.run(g);
@ -36,7 +37,7 @@ TEST(nnet, MemboundOpInterpretation) {
EXPECT_EQ(membound->getOpType(), OpType::MemBound);
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
ans->dataMalloc();
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
ans->copyin(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
EXPECT_TRUE(membound->getOutput()->equalData(ans));
}
@ -49,8 +50,8 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32);
Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32);
g->dataMalloc();
i0->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyData(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
NMutator nmutator(NMutator::Mode::ToNaiveMembound);
auto mutations = nmutator.run(g);
@ -67,7 +68,7 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
EXPECT_EQ(membound->getOpType(), OpType::MemBound);
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::Float32, cpu);
ans->dataMalloc();
ans->copyData(vector<float>{38, 44, 50, 56, 83, 98, 113, 128});
ans->copyin(vector<float>{38, 44, 50, 56, 83, 98, 113, 128});
auto oCpu = gCpu->cloneTensor(membound->getOutput());
oCpu->printData();
@ -77,3 +78,41 @@ TEST(nnet, MemboundOp_Ansor_Codegen) {
// double time = timeit([&]() { runtime->run(gNew, false); }); // tune
// kernels std::cout << "Time (ms):" << time << std::endl;
}
// Build the nnet expression PRelu(A[i] - B[i]) for i in [0, size).
// Returns the two input tensor nodes and the enclosing range expression.
pair<std::vector<nnet::Tensor>, nnet::Expr> getPReluExpr(int size) {
    using namespace nnet;
    using nnet::make_ref;
    DEFINE_VAR(i);
    auto A = make_ref<TensorNode>("A", vector{size});
    auto B = make_ref<TensorNode>("B", vector{size});
    auto diff = makeSubscript(A, {i}) - makeSubscript(B, {i});
    Expr body = make_ref<FuncNode>(diff, FuncType::PRelu);
    Expr rangeExpr = makeRangeOperator({{i, {0, size}}}, {}, body);
    return {{A, B}, rangeExpr};
}
// End-to-end check: build a MemBound op computing PRelu(i0 - w0) on CUDA via
// Ansor codegen, then compare against a hand-computed CPU answer.
TEST(nnet, PRelu_Ansor_Codegen) {
    auto cuda = make_ref<CudaRuntimeObj>();
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(cuda);
    Tensor i0 = g->addTensor(vector{12});
    Tensor w0 = g->addTensor(vector{12});
    Tensor o0 = g->addTensor(vector{12});
    auto [nnetInputs, expr] = getPReluExpr(12);
    // exec_time = -1: meaning not established here — TODO confirm semantics.
    g->addOpWithOutputs<MemBoundObj>(vector{i0, w0}, vector{o0}, nnetInputs,
                                     expr, -1);
    g->dataMalloc();
    // i0 = 0..11, w0 = 5 everywhere, so i0 - w0 ranges over -5..6.
    i0->setData(IncrementalGenerator());
    w0->setData(ValGenerator<5>());
    cuda->run(g, true); // tune kernels
    // check answer
    // Expected PRelu(i0 - w0); negatives scaled by 0.25 (e.g. 0.25*-5=-1.25).
    auto ans = make_ref<TensorObj>(Shape{12}, DataType::Float32, cpu);
    ans->dataMalloc();
    ans->copyin(
        vector<float>{-1.25, -1., -0.75, -0.5, -0.25, 0, 1, 2, 3, 4, 5, 6});
    Graph gCpu = make_ref<GraphObj>(cpu);
    auto oCpu = gCpu->cloneTensor(o0);
    EXPECT_TRUE(oCpu->equalData(ans));
}

View File

@ -4,19 +4,16 @@
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/expr.h"
#include "nnet/test.h"
#include "gtest/gtest.h"
using namespace nnet;
using namespace std;
#define DEFINE_VAR(name) auto name = make_ref<VarNode>(#name);
TEST(FuseMembound, Relu) {
const int n_heads = 8, seq_len = 10000, feat_len = 512;
// dilation_heads = 2;
const int Batch = n_heads, M = seq_len, K = feat_len, W = 32;
DEFINE_VAR(b);
DEFINE_VAR(m);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(b, m, w, k);
auto A = make_ref<TensorNode>("A", vector<int>({Batch, M, K}),
vector<int>{0, 0, 0});
@ -35,10 +32,7 @@ TEST(FuseMembound, MemMemFusion) {
const int n_heads = 8, seq_len = 100, feat_len = 100;
// dilation_heads = 2;
const int Batch = n_heads, M = seq_len, K = feat_len;
DEFINE_VAR(b);
DEFINE_VAR(m);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(b, m, w, k);
auto A = make_ref<TensorNode>("A", vector<int>({Batch, M, K}),
vector<int>{0, 0, 0});
auto B = make_ref<TensorNode>("B", vector<int>({Batch, K, M}),
@ -55,3 +49,25 @@ TEST(FuseMembound, MemMemFusion) {
{{k, {0, K}}}, makeSubscript(A, {b, m, k}));
EXPECT_EQ(HashVisitor().getHash(merged), HashVisitor().getHash(ans));
}
// Merging two nested RangeOp stages (outer iterates [f, i] over an inner
// stage that iterates [i, f]) should collapse into a single RangeOp that
// subscripts K directly. This pattern arises in ConvTranspose-to-Matmul.
TEST(FuseMembound, mergeNestedStagesInRangeOp) {
    // Case in ConvTranspose to Matmul
    // L<f:0:448><i39:0:4096>Sum ... [i39,f]
    // {L<i39:0:4096><f:0:448>Sum ... [f,(i39 / 1024),((i39 / 256) % 4),(i39
    // % 256)] {K}}
    DEFINE_VAR(f, i);
    const int I = 4096, F = 448;
    // K is the 4-D weight tensor; the flat index i is decomposed into its
    // last three axes via /1024, (/256)%4, %256.
    auto K = make_ref<TensorNode>("K", vector<int>({448, 4, 4, 256}));
    auto subA = makeSubscript(K, {f, i / 1024, (i / 256) % 4, i % 256});
    auto range = makeRangeOperator({{i, {0, I}}, {f, {0, F}}}, {}, subA);
    // Outer stage transposes the inner stage's iteration order.
    auto outerRange = makeRangeOperator({{f, {0, F}}, {i, {0, I}}}, {},
                                        makeSubscript(range, {i, f}));
    auto merged = MergeMemboundMutator({outerRange}).merge();
    // Compare the result with answer
    RangeOp ans = makeRangeOperator(
        {{f, {0, F}}, {i, {0, I}}}, {},
        makeSubscript(K, {f, i / 1024, (i / 256) % 4, i % 256}));
    EXPECT_EQ(HashVisitor().getHash(merged), HashVisitor().getHash(ans));
}

View File

@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
// verifyNaiveMembound True: subgraph after transformation
// verifyNaiveMembound False: subgraph of one single membound (eOP)
Runtime runtime = NativeCpuRuntimeObj::getInstance();
@ -55,46 +56,75 @@ TEST(Mutator, NaiveConvWithInterpreter) {
}
// FIXME: failed since implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
// verifyNaiveMembound True: subgraph after transformation
// verifyNaiveMembound False: subgraph of one single membound (eOP)
// const bool verifyNaiveMembound = false;
TEST(NMutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = false;
Runtime runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
Graph gCpu = make_ref<GraphObj>(cpu);
// {n, h, w, f} * {f, r, s, c}
auto i0 = g->addTensor({1, 2, 2, 448});
auto w0 = g->addTensor({448, 4, 4, 256});
const int n = 1, c = 256, h = 2, w = 2, f = 448, r = 4, s = 4;
// // Minimum config for test
// const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
// const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);
auto mutator = make_ref<NMutator>();
mutator->setToNaiveMembound();
SearchEngine searchEngine(runtime, mutator);
auto bestGraph = searchEngine.run(g);
bestGraph->print();
printf("--- SearchEngine Finished ---\n");
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
// // Translate OP to membound without derivation
// mutator->setToNaiveMembound();
vector<Graph> bestGraphs;
if (useMutatorDirectly) { // Use mutator results
bestGraphs = mutator->run(g);
} else { // Use search engine results
SearchEngine searchEngine(runtime, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
}
g->dataMalloc();
bestGraph->dataMalloc();
for (auto t : g->getTensors()) {
if (t->getFuid() <= 2)
t->setData(IncrementalGenerator());
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
EXPECT_EQ(fuidToInputTensor.count(t->getFuid()), 0);
fuidToInputTensor[t->getFuid()] = t;
}
for (auto t : bestGraph->getTensors()) {
if (t->getFuid() <= 2)
t->setData(IncrementalGenerator());
std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
auto gen = RandomGenerator(0.1, 0.1, i);
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
runtime->run(bestGraph, true); // Tune kernels
runtime->run(g);
runtime->run(bestGraph, false); // Execute transfomraed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
}
runtime->run(g);
runtime->run(bestGraph);
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
EXPECT_TRUE(go0->equalData(bgo0));
EXPECT_TRUE(g->getOutputs()[0]->getRawDataPtr<void *>() !=
bestGraph->getOutputs()[0]->getRawDataPtr<void *>());
}
// TEST(Mutator, Conv9x9) {

View File

@ -76,12 +76,14 @@ TEST(Serializer, CompareTwoExprs) {
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
auto funcA = make_ref<FuncNode>(subA, FuncType::Relu);
auto subB = makeSubscript(B, {b, ((i3 + (2500 * i4)) + w), k});
auto range = makeRangeOperator(
{{i3, {0, 2500}}, {i4, {0, 4}}, {b, {0, 8}}, {w, {0, 65}}},
{{k, {0, 512}}}, subA * subB);
{{k, {0, 512}}}, funcA * subB);
Serializer().serialize(range, "./test_serializer.json");
auto expr = Serializer().deserialize("./test_serializer.json");
dbg(expr);
EXPECT_EQ(range->toReadable(), expr->toReadable());
}
@ -90,11 +92,9 @@ TEST(Serializer, Serialization_NestedTensor) {
FullPrinterVisitor printer;
auto range = buildNestedExpr();
auto ans = printer.print(range);
dbg(ans);
auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
EXPECT_TRUE(isSuccessful);
auto exprDeserialized = Serializer().deserialize("./test_serializer.json");
auto output = printer.print(exprDeserialized);
dbg(output);
EXPECT_EQ(output, ans);
}