Compare commits

...

34 Commits

Author SHA1 Message Date
huangshuhong 8c91faa948 remove expect 2023-04-21 00:17:04 +08:00
huangshuhong c0ae03a2d7 fix tvm stream 2023-04-21 00:09:47 +08:00
Liyan Zheng 0cb8729bc1 Add: different ONNX names for inputs and weights 2023-04-20 21:51:47 +08:00
YdrMaster 8bc2d3e48d fix: test graph handler
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
YdrMaster 28b123753e feat: import the Tensor type
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 21:51:47 +08:00
Liyan Zheng 94730d93b5 Add: hash match for membound kernels 2023-04-20 17:16:01 +08:00
Liyan Zheng 6d17c4caa2 Add: getPerfTime in run_models_nnet 2023-04-20 10:54:49 +08:00
Liyan Zheng 15d0eb79cd Add: import ONNX with membound Op 2023-04-20 10:45:28 +08:00
Liyan Zheng 2a343e240e Add: shape of intermediate tensor in exported ONNX 2023-04-20 10:45:28 +08:00
Liyan Zheng 34ca6bf149 Fix: skip check when Graph is exported to ONNX 2023-04-20 10:45:28 +08:00
YdrMaster a6019e79e3 feat(py): support creating an OnnxStub directly from a Graph
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:45:28 +08:00
YdrMaster 4e1cc8d3e4 refactor(py): create OnnxStub via factory methods
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 725f9260cf feat: support exporting membound
Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
YdrMaster 0edd138919 feat: split (de)serialization into to-string and to-file variants
fix: set the `USE_CUDA` cfg correctly

todo: test_search does not pass

Signed-off-by: YdrMaster <ydrml@hotmail.com>
2023-04-20 10:44:39 +08:00
Liyan Zheng 0b23a065ca Add: debug hacks for InfoGAN 2023-04-20 10:42:56 +08:00
Liyan Zheng e86e993ed4 Add: CUDA graph stream capture (MemboundOp fails) 2023-04-19 16:32:16 +08:00
Liyan Zheng e4c20a9ae2 Add: warmup and repeat args in timeNonCtcOperators 2023-04-19 16:22:59 +08:00
Liyan Zheng 537b3b4ea4 Add: Membound operator serialization 2023-04-18 21:53:48 +08:00
Liyan Zheng 2812900ea2 Fix: OpType and print device tensors 2023-04-18 20:28:08 +08:00
Liyan Zheng 01fc19795d Add: time non-compile-time-computable operators 2023-04-18 17:21:16 +08:00
Liyan Zheng afc4123328 Chore: remove deprecated function 2023-04-18 17:21:16 +08:00
Liyan Zheng b981951a47 Add: NMutator::memboundToJson to export memboundOp 2023-04-18 17:21:16 +08:00
Liyan Zheng 99b5c95455 Add: nnet::Serializer supports FuncNode 2023-04-18 17:21:16 +08:00
Liyan Zheng 9d50b30af8 Chore: disable nnet_unimplemented_continue output 2023-04-18 17:21:16 +08:00
Liyan Zheng bc31219bde Add: exclude compile-time computable operator time 2023-04-18 17:21:16 +08:00
Liyan Zheng edf4e33353 Add: C++ callback to export ONNX 2023-04-18 17:19:05 +08:00
Liyan Zheng 872f3504a9 Add: RangeOpNode::getFullExpression() 2023-04-18 17:19:05 +08:00
Liyan Zheng da49e91ab0 Add: fuse membound operators 2023-04-18 17:19:05 +08:00
Liyan Zheng a6b8f344d4 Chore: simplify type names 2023-04-18 17:19:05 +08:00
Liyan Zheng 09293730ea Add: export to ONNX with custom operators 2023-04-18 17:19:05 +08:00
Liyan Zheng 307614d95d Add: infogan python interface 2023-04-18 17:16:25 +08:00
Liyan Zheng f14edcd52f Fix: avoid reload library 2023-04-18 17:16:25 +08:00
Liyan Zheng d2d49c5d4f Add: invoke TVM through pipe 2023-04-18 17:16:25 +08:00
Liyan Zheng e72fe79168 Add: search engine uses estimated time 2023-04-18 17:16:25 +08:00
51 changed files with 2931 additions and 466 deletions

View File

@ -129,7 +129,7 @@ if(BUILD_TEST_EINNET)
endif()
# Python bindings
file(GLOB_RECURSE FFIS src/ffi/ffi_infinitensor.cc)
file(GLOB_RECURSE FFIS src/ffi/ffi_callback.cc src/ffi/ffi_infinitensor.cc)
pybind11_add_module(backend MODULE ${FFIS})
target_link_libraries(backend PRIVATE InfiniTensor)
@ -168,6 +168,7 @@ endif()
if(USE_CUDA)
add_compile_definitions(USE_CUDA=1)
add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM=1) # Support CUDA graph stream capture
# Since enable_language only executes once, rerun cmake is required if CMAKE_CUDA_HOST_COMPILER is wrong
set(CMAKE_CUDA_HOST_COMPILER
${CMAKE_CXX_COMPILER}

View File

@ -81,7 +81,7 @@ import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend
stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime())
stub = OnnxStub.from_model(onnx.load("model_file"), backend.cpu_runtime())
```
[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) is the loader provided by onnx; it reads an onnx file into an onnx model held in memory.
@ -201,7 +201,7 @@ def infer(model: ModelProto, input) -> dict:
model0 = onnx.load(sys.argv[1])
model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new")
model1 = OnnxStub.from_model(model0, backend.cpu_runtime()).to_onnx("new")
input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim]
input = numpy.random.random(input_shape).astype(numpy.float32)
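
To make the new workflow concrete, here is a minimal sketch of the factory-method usage described above. It assumes the `from_onnx` factory and the `backend.cpu_runtime()` binding defined in the Python frontend later in this diff; the model file name is a placeholder.

```python
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

# Import an ONNX model through the factory method instead of the
# old OnnxStub(model, runtime) constructor.
stub = OnnxStub.from_onnx(onnx.load("model_file"), backend.cpu_runtime())

# Named boundary tensors are exposed as dictionaries on the stub.
print(list(stub.inputs.keys()), list(stub.outputs.keys()))

# Export the imported graph back to an in-memory ONNX model.
new_model = stub.to_onnx("new")
```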

View File

@ -16,7 +16,8 @@ class GraphObj : public Object {
string toString() const override;
Runtime getRuntime() const { return runtime; }
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
Tensor addTensor(Shape dim, DataType dtype = DataType::Float32,
TensorType tensorType = TensorType::Other);
Tensor addTensor(const Tensor &tensor);
TensorVec addTensor(const TensorVec &tensors);
/**

View File

@ -35,14 +35,22 @@ class GraphHandlerObj {
Graph g;
public:
GraphHandlerObj(Runtime runtime)
explicit GraphHandlerObj(Runtime runtime)
: g(make_ref<GraphObj>(std::move(runtime))) {}
Tensor tensor(Shape dims, int dtype);
explicit GraphHandlerObj(Graph g) : g(std::move(g)) {}
//------ tensors
vector<Tensor> inputs() { return g->getInputs(); }
vector<Tensor> outputs() { return g->getOutputs(); }
Tensor tensor(Shape dims, int dtype, TensorType ttype);
//------ operators
inline OpVec operators() { return g->getOperators(); }
OpVec operators() { return g->getOperators(); }
Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh, int sw, int dh, int dw);
@ -90,18 +98,22 @@ class GraphHandlerObj {
const optional<vector<int>> &steps);
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
/// @brief Import a memBound operator from a JSON string
TensorVec memBound(const TensorVec &inputs, const Tensor &outputs,
const string &jsonString);
//------ modifiers
inline bool topo_sort() { return g->topo_sort(); }
bool topo_sort() { return g->topo_sort(); }
inline void optimize() { g->optimize(); }
void optimize() { g->optimize(); }
//------ runtime
inline void data_malloc() { g->dataMalloc(); }
void data_malloc() { g->dataMalloc(); }
inline void run() { g->getRuntime()->run(g); }
void run() { g->getRuntime()->run(g); }
Graph getGraph() const { return g; }
};
} // namespace infini
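
A minimal sketch of the extended handler interface through the Python bindings, assuming the `backend` module exposes `GraphHandler`, `TensorType`, and `cpu_runtime` as it does in the frontend tests below:

```python
from pyinfinitensor import backend

handler = backend.GraphHandler(backend.cpu_runtime())

# tensor() now takes a TensorType in addition to shape and dtype
# (dtype code 12, as used in the frontend test below).
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Initialized)
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)

x = handler.add(a, b, None)        # build an Add operator
handler.reshape(x, y, [3, 2, 1])   # reshape its result into y

handler.data_malloc()

# Boundary tensors of the graph are now accessible directly.
print(len(handler.inputs()), len(handler.outputs()))
```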

View File

@ -30,6 +30,7 @@ class Mutator {
virtual bool isMultiBranchMergable(const Graph &in_graph) {
IT_TODO_HALT();
}
virtual Graph fuseVertically(const Graph &inputGraph) { IT_TODO_HALT(); }
};
} // namespace infini

View File

@ -121,6 +121,7 @@ class OpRegistry {
FOP(ConvBackwardData);
FOP(Matmul);
FOP(ConvTrans);
FOP(ConvTransNHWC);
FOP(G2BMM);
FOP(GBMM);
FOP(Pad);
@ -209,7 +210,8 @@ class OpRegistry {
//
FOP(MemBound);
default:
IT_ASSERT(false);
IT_ASSERT(false, "Unknown OpType " +
std::to_string(enum_to_underlying(opType)));
break;
}
#undef FOP

View File

@ -1,5 +1,6 @@
#pragma once
#include "core/common.h"
#include "core/object.h"
#include "core/ref.h"
#include <memory>
@ -59,10 +60,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
* execution happens.
*
* @param graph
* @param profiling Whether to print breakdown of time
* @param printProfiling Whether to print breakdown of time
* @return double Return the sum of perf time for each operator
*/
double getPerfTime(const Graph &graph, bool profiling = false) const;
double getPerfTime(const Graph &graph, bool printProfiling = false,
bool allowEstimation = false,
bool ignoreMemboundOp = false) const;
Blob allocBlob(size_t size);
bool isCpu() const {
return device == Device::CPU || device == Device::INTELCPU;
@ -77,6 +80,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
size_t bytes) const = 0;
virtual string toString() const = 0;
map<UidBaseType, bool>
getCompileTimeComputableAttribute(const Graph &graph) const;
double timeNonCtcOperators(const Graph &graph, int warmup = 1000,
int repeat = 1000) const;
protected:
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,

View File

@ -4,19 +4,15 @@
#include "graph.h"
#include "mutator.h"
#include <unordered_map>
namespace infini {
class SearchEngine {
private:
Runtime runtimeExec;
Ref<Mutator> mutator;
std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
public:
SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
runtimeExec = _runtime;
mutator = _mutator;
}
SearchEngine(Runtime runtime, Ref<Mutator> mutator);
~SearchEngine() {}
private: // Configurations
@ -24,24 +20,17 @@ class SearchEngine {
3; // cut nodes whose #in + #out >= partitionThreshold
size_t GRAPH_SIZE = 16; // num of best graphs.
private: // Composed objects
std::shared_ptr<Mutator> mutationEngine;
public:
std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
struct GroupEdge {
int v, next;
GroupEdge() = delete;
};
struct Candidate { // a graph with perf
std::shared_ptr<Graph> graph;
double perf = INFINITY;
};
class MetaGraph { // a graph of subgraphs, for searching.
public:
MetaGraph() {}
~MetaGraph() {}
// struct Candidate { // a graph with perf
// Graph graph;
// double perf = INFINITY;
// };
struct MetaGraphObj { // a graph of subgraphs, for searching.
struct Node {
Graph graph;
std::vector<int> suc;
@ -50,31 +39,33 @@ class SearchEngine {
};
std::vector<Node> nodes;
};
using MetaGraph = Ref<MetaGraphObj>;
Graph run(const Graph graph); // entrance of search engine.
Graph run(const Graph graph); // entrance to search engine.
std::vector<Graph> search(const Graph &graph); // search for a partition.
private:
std::vector<Graph> partitionGraph(const Graph graph);
std::shared_ptr<MetaGraph> buildMetaGraphWithGraph(const Graph graph);
std::shared_ptr<MetaGraph>
buildMetaGraphWithPlan(const std::shared_ptr<MetaGraph> metaGraph,
const std::vector<int> &plan);
MetaGraph buildMetaGraphWithGraph(const Graph graph);
MetaGraph buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan);
// search horizontal merges
std::vector<std::shared_ptr<MetaGraph>>
searchMerge(std::shared_ptr<MetaGraph> &metaGraph);
void searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan, std::vector<int> &frontier,
std::vector<MetaGraph> searchMerge(MetaGraph &metaGraph);
void searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet);
std::vector<Graph>
searchMutation(const std::shared_ptr<MetaGraph> &metaGraph);
std::vector<Graph> searchMutation(const MetaGraph &metaGraph);
void printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph);
void printMetaGraph(MetaGraph metaGraph);
/**
* @brief Check whether a multi-branch graph can be merged into a single
* branch.
*/
bool isMultiBranchMergable(const Graph graph);
Graph fuseVertically(const Graph &graph);
double getEstimatedGraphPerf(Graph graph);
};
} // namespace infini

View File

@ -12,13 +12,14 @@ namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
enum class TensorType { Error = 0, Input = 1, Initialized = 2, Other = 3 };
class TensorObj : public TensorBaseObj {
private:
Shape shape;
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType;
void copyin(const void *ptr, size_t size) {
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
}
@ -27,7 +28,8 @@ class TensorObj : public TensorBaseObj {
}
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
TensorObj(Shape shape, DataType dtype, Runtime runtime,
TensorType tensorType = TensorType::Other);
virtual ~TensorObj() {}
string toString() const override;
@ -39,6 +41,7 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
TensorType getTensorType() const { return tensorType; }
void load(std::string file_path);
void save(std::string file_path);
@ -74,6 +77,7 @@ class TensorObj : public TensorBaseObj {
// Thus the internal state of generator cannot be updated.
void setData(
std::function<void(void *, size_t, DataType)> const &generator) const;
void setData(const Blob &_blob) { data = _blob; }
Tensor clone() const {
auto obj = make_ref<TensorObj>(*this);
obj->freeData();
@ -106,13 +110,13 @@ class TensorObj : public TensorBaseObj {
size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const;
private:
template <class T> string dataToString() const {
template <class T> string dataToString(void *rawPtr) const {
std::stringstream builder;
builder << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = vector<int>(numDims, 1);
auto ptr = data->getPtr<T *>();
T *ptr = (T *)rawPtr;
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)

View File

@ -0,0 +1,14 @@
#pragma once
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw);
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw);
} // namespace infini

View File

@ -1,35 +1,22 @@
#pragma once
#include "core/runtime.h"
#include "cuda/cuda_common.h"
#include "nnet/dbg.h"
namespace infini {
class CudaRuntimeObj : public RuntimeObj {
private:
cudaStream_t stream;
cudnnHandle_t cudnn;
cublasHandle_t cublas;
CudaPtr workspace;
size_t workspaceSize;
bool cudaGraphStatus; // Whether CUDA graph stream capture is enabled
public:
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
virtual ~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
CudaRuntimeObj();
virtual ~CudaRuntimeObj();
string toString() const override;
void run(const Graph &graph, bool tune = false,
@ -69,7 +56,15 @@ class CudaRuntimeObj : public RuntimeObj {
void runWithoutSync(const Graph &graph) const;
bool isInCudaGraph() const { return cudaGraphStatus; }
cudaStream_t getStream() const { return stream; }
double timeWithCudaGraph(Graph graph);
private:
void tune(const Graph &graph, bool profiling) const;
void beginCudaGraphStreamCapture();
tuple<cudaGraphExec_t, size_t> endCudaGraphStreamCapture();
};
} // namespace infini

View File

@ -0,0 +1,9 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
namespace infini {
namespace callback {
void exportONNX(const Graph &graph, const string &path);
}
} // namespace infini

View File

@ -20,6 +20,7 @@ class Serializer : public Functor<string()> {
string visit_(const Subscript &c) override;
string visit_(const Var &c) override;
string visit_(const Tensor &c) override;
string visit_(const Func &c) override;
string dispatchRoutine(const Routine &c);
Expr buildExprTree(string key);
@ -29,16 +30,44 @@ class Serializer : public Functor<string()> {
Serializer(int _verobse = 0);
virtual ~Serializer();
/**
* @brief Serialize the given expression to string
*
* @param expr The expression to be serialized
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return The serialized string, or std::nullopt if serialization fails
*/
std::optional<std::string> toString(Expr const &expr,
const string &msg = "",
vector<Tensor> inputs = {},
double exec_time = -1e9,
string hint = "");
/**
* @brief Serialize the given expression to json file
*
* @param expr The expression to be serialized
* @param filePath The path of json file to be output
* @param msg Message of derivation
* @param inputs membound operator attributes
* @param exec_time membound operator attributes
* @param hint membound operator attributes
* @return bool Whether the serialization succeeds
*/
bool serialize(const Expr &expr, const string &filePath,
const string &msg = "");
bool toFile(const Expr &expr, const string &filePath,
const string &msg = "", vector<Tensor> inputs = {},
double exec_time = -1e9, string hint = "");
/**
* @brief Deserialize the given string to an expression
*
* @param text The text of the expr to be deserialized
* @return Expression deserialized from the given string
*/
Expr fromString(const string &text);
/**
* @brief Deserialize the given json file to expression
@ -46,7 +75,15 @@ class Serializer : public Functor<string()> {
* @param filePath The path to file to be deserialized
* @return Expression deserialized from the given json file
*/
Expr deserialize(const string &filePath);
Expr fromFile(const string &filePath);
tuple<Expr, vector<Tensor>, double, string>
deserializeAsMemobundOp(const string &filePath);
// FIXME: the order of elements in tuple is not consistent with memboundObj
// constructor
tuple<Expr, vector<Tensor>, double, string>
membundOpFromString(const string &data);
};
} // namespace nnet
} // namespace nnet

View File

@ -69,7 +69,8 @@ static inline HashType genhash(string s) {
{ IT_TODO_HALT(); }
#define nnet_unimplemented_continue() \
{ dbg("Unimplemented"); }
{}
// { dbg("Unimplemented"); }
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))

View File

@ -104,7 +104,7 @@ enum class NodeType {
FuncNodeType
};
enum class FuncType { Relu, Tanh, PRelu };
enum class FuncType { Relu = 1000, Tanh, PRelu };
#define DEFINE_GETTYPE(CLASS, isScalar_v) \
NodeType getType() const override { return NodeType::CLASS##Type; } \
@ -206,7 +206,8 @@ struct IterationType {
enum { Loop, Sum };
constexpr static int NumIterationType = 2;
};
class RangeOpNode : public OperatorNode {
class RangeOpNode : public OperatorNode,
public std::enable_shared_from_this<RangeOpNode> {
public:
enum { Summand, END_POS };
constexpr static int Loop = IterationType::Loop;
@ -230,6 +231,7 @@ class RangeOpNode : public OperatorNode {
return 0;
};
string toReadable() const override;
string getFullExpression();
const Expr &getSummand() const { return subExprs[Summand]; }
const vector<VarRangePair> &getVarRanges(int _index) const {
return vars[_index];

View File

@ -7,12 +7,13 @@ namespace infini {
class NMutator : public Mutator {
public:
enum class Mode { Normal, ToNaiveMembound, RuleBased };
using NameNToTensorT = map<string, Tensor>;
private:
// Suffix -N: NNet objects.
// Suffix -T: tpm objects.
// Map: NNet tensors -> tpm tensor.
std::map<std::string, Tensor> inputsNameNToTensorT;
NameNToTensorT inputsNameNToTensorT;
Mode mode;
const double bandwidth = double(200) * 1024 * 1024 * 1024;
// If in RuleBased mode, use derivationRules in derivator
@ -24,8 +25,9 @@ class NMutator : public Mutator {
~NMutator();
vector<Graph> run(const Graph &in_graph) override;
void setToNaiveMembound();
Graph fuseVertically(const Graph &in_graph) override;
void setToNaiveMembound();
void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
long long cntStates = 0;
long long cntCandidates = 0;
@ -33,6 +35,14 @@ class NMutator : public Mutator {
private:
int maxDepth = 8;
nnet::Expr opToExpression(Operator op);
/// @brief Extract the NNet expression and tensor mapping from an operator
/// @param op The operator to be converted
/// @return pair<Expr, map from NNet tensor names to InfiniTensor tensors>
static pair<nnet::Expr, NameNToTensorT> extractOp(Operator op);
static pair<nnet::Expr, NMutator::NameNToTensorT>
generateUnaryExpr(const Operator &op);
static pair<nnet::Expr, vector<nnet::Tensor>> generateRevert(Tensor in);
void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
/**
@ -48,8 +58,8 @@ class NMutator : public Mutator {
// TODO: recover these rules
// Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
// Graph transformTConv1x1(Operator op);
// Graph transformTConv3x3(Operator op);
Graph transformConvtransposed1x1(Operator _op);
// Graph transformConvtransposed(Operator op);
// Graph transformDialtedConv(Operator op);
// Graph transformConv1x1(Operator op);
// Graph transformConv1xk(Operator op);

View File

@ -6,12 +6,17 @@ namespace infini {
class MemBoundObj : public OperatorObj {
private:
std::vector<nnet::Tensor> nnetInputs;
nnet::Expr expr, simplifiedExpr;
nnet::Expr expr;
std::vector<nnet::Tensor>
nnetInputs; // The order of inputs in nnetInputs should be consistent
// with inputs in infinitensor
double exec_time;
std::string hint;
HashType hash, simplifiedHash;
int n, f, h, w;
// Generated attributes
HashType hash;
nnet::Expr simplifiedExpr;
HashType simplifiedHash;
public:
MemBoundObj(GraphObj *graph, const TensorVec &input,
@ -27,9 +32,12 @@ class MemBoundObj : public OperatorObj {
int numOutputs() const override { return outputs.size(); }
const vector<nnet::Tensor> &getNnetInputs() const { return nnetInputs; }
const nnet::Expr getNnetExpr() const { return expr; }
HashType getHash() const { return hash; }
pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
return {expr, hash};
}
double getEstimatedTime() const { return exec_time; }
string toJson() const;
private:
vector<int> getWorkloadVector() const override;

View File

@ -46,10 +46,13 @@ class RandomGenerator : public DataGenerator {
std::mt19937 e;
std::uniform_int_distribution<int> di;
std::uniform_real_distribution<float> dr;
bool generateInteger;
public:
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0)
: l(l), r(r), e(seed), di(l, r), dr(l, r) {}
RandomGenerator(double l = 0, double r = 1, unsigned int seed = 0,
bool generateInteger = false)
: l(l), r(r), e(seed), di(l, r), dr(l, r),
generateInteger(generateInteger) {}
virtual ~RandomGenerator() {}
private:
@ -60,7 +63,7 @@ class RandomGenerator : public DataGenerator {
}
void fill(float *data, size_t size) override {
for (size_t i = 0; i < size; i++) {
data[i] = dr(e);
data[i] = (generateInteger) ? di(e) : dr(e);
}
}
};

View File

@ -37,29 +37,48 @@ class OnnxStub:
outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
handler: backend.GraphHandler
disable_check: bool
def __init__(self, model: ModelProto, runtime):
model = infer_shapes(model)
self.handler = backend.GraphHandler(runtime)
@classmethod
def from_onnx(cls, model: ModelProto, runtime, enable_onnx_shape_infernce=True):
if enable_onnx_shape_infernce:
model = infer_shapes(model)
ans = OnnxStub()
ans.handler = backend.GraphHandler(runtime)
tensors: Dict[str, backend.Tensor] = dict()
data: Dict[str, TensorProto] = dict()
cnt_infini_inputs = 0
for input in model.graph.input:
dims = _take_shape_dim(input.type.tensor_type.shape)
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
if input.name.startswith('input'):
tensor_type = backend.TensorType.Input
cnt_infini_inputs += 1
else:
tensor_type = backend.TensorType.Initialized
tensors[input.name] = ans.handler.tensor(
dims,
input.type.tensor_type.elem_type,
tensor_type,
)
assert cnt_infini_inputs == 1, f'{cnt_infini_inputs} tensor names start with "input" found.'
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
dims, output.type.tensor_type.elem_type
tensors[output.name] = ans.handler.tensor(
dims,
output.type.tensor_type.elem_type,
backend.TensorType.Other,
)
for initializer in model.graph.initializer:
dims = [d for d in initializer.dims]
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
tensors[initializer.name] = ans.handler.tensor(
dims,
initializer.data_type,
backend.TensorType.Initialized,
)
data[initializer.name] = initializer
for node in model.graph.node:
@ -77,7 +96,7 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors[node.input[0]], None, p, [-2, -1]
)
p = [0, 0, 0, 0]
@ -87,7 +106,7 @@ class OnnxStub:
if len(node.input) > 2:
bias = "{}-bias".format(node.output[0])
reshape = "{}-reshape".format(node.output[0])
tensors[bias] = self.handler.conv(
tensors[bias] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
None,
@ -98,7 +117,7 @@ class OnnxStub:
d[0],
d[1],
)
tensors[reshape] = self.handler.reshape(
tensors[reshape] = ans.handler.reshape(
tensors[node.input[2]],
None,
[
@ -111,13 +130,13 @@ class OnnxStub:
1,
],
)
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[bias],
tensors[reshape],
tensors.get(node.output[0]),
)
else:
tensors[node.output[0]] = self.handler.conv(
tensors[node.output[0]] = ans.handler.conv(
tensors[adapt],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -142,7 +161,7 @@ class OnnxStub:
attributes[name]
for name in ["dilations", "pads", "strides", "output_padding"]
)
tensors[node.output[0]] = self.handler.convTransposed2d(
tensors[node.output[0]] = ans.handler.convTransposed2d(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -156,7 +175,7 @@ class OnnxStub:
op[1],
)
elif node.op_type == "MatMul":
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -175,7 +194,7 @@ class OnnxStub:
# FIXME unsupport attributes: `alpha` `beta`
assert alpha == 1.0
assert beta == 1.0
tensors[node.output[0]] = self.handler.matmul(
tensors[node.output[0]] = ans.handler.matmul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
@ -196,7 +215,7 @@ class OnnxStub:
attributes[name]
for name in ["momentum", "epsilon", "training_mode"]
)
tensors[node.output[0]] = self.handler.batchNorm(
tensors[node.output[0]] = ans.handler.batchNorm(
input, output, mean, var, scale, bias, momentum, eps, training != 0
)
elif node.op_type == "MaxPool":
@ -215,10 +234,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -231,7 +250,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.maxPool(
tensors[node.output[0]] = ans.handler.maxPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -257,10 +276,10 @@ class OnnxStub:
)
if p[0] != p[2] or p[1] != p[3]:
adapt = "{}-adapt".format(node.output[0])
tensors[adapt] = self.handler.pad(
tensors[adapt] = ans.handler.pad(
tensors.get(node.input[0]), None, p, [-2, -1]
)
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[adapt],
tensors.get(node.output[0]),
k[0],
@ -273,7 +292,7 @@ class OnnxStub:
s[1],
)
else:
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
k[0],
@ -287,7 +306,7 @@ class OnnxStub:
)
elif node.op_type == "GlobalAveragePool":
[_, _, h, w] = _search_shape(model, node.input[0])
tensors[node.output[0]] = self.handler.avgPool(
tensors[node.output[0]] = ans.handler.avgPool(
tensors[node.input[0]],
tensors.get(node.output[0]),
h,
@ -300,52 +319,52 @@ class OnnxStub:
1,
)
elif node.op_type == "Add":
tensors[node.output[0]] = self.handler.add(
tensors[node.output[0]] = ans.handler.add(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sub":
tensors[node.output[0]] = self.handler.sub(
tensors[node.output[0]] = ans.handler.sub(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Mul":
tensors[node.output[0]] = self.handler.mul(
tensors[node.output[0]] = ans.handler.mul(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Div":
tensors[node.output[0]] = self.handler.div(
tensors[node.output[0]] = ans.handler.div(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Pow":
tensors[node.output[0]] = self.handler.pow(
tensors[node.output[0]] = ans.handler.pow(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Relu":
tensors[node.output[0]] = self.handler.relu(
tensors[node.output[0]] = ans.handler.relu(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Sigmoid":
tensors[node.output[0]] = self.handler.sigmoid(
tensors[node.output[0]] = ans.handler.sigmoid(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Tanh":
tensors[node.output[0]] = self.handler.tanh(
tensors[node.output[0]] = ans.handler.tanh(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Softmax":
tensors[node.output[0]] = self.handler.softmax(
tensors[node.output[0]] = ans.handler.softmax(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(
@ -353,34 +372,34 @@ class OnnxStub:
),
)
elif node.op_type == "Abs":
tensors[node.output[0]] = self.handler.abs(
tensors[node.output[0]] = ans.handler.abs(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Shape":
tensors[node.output[0]] = self.handler.shape(
tensors[node.output[0]] = ans.handler.shape(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Identity":
tensors[node.output[0]] = self.handler.identity(
tensors[node.output[0]] = ans.handler.identity(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Flatten":
tensors[node.output[0]] = self.handler.flatten(
tensors[node.output[0]] = ans.handler.flatten(
tensors[node.input[0]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "PRelu":
tensors[node.output[0]] = self.handler.pRelu(
tensors[node.output[0]] = ans.handler.pRelu(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Clip":
tensors[node.output[0]] = self.handler.clip(
tensors[node.output[0]] = ans.handler.clip(
tensors[node.input[0]],
tensors.get(node.output[0]),
next(_parse_data(data[node.input[1]]).__iter__(), None)
@ -394,7 +413,7 @@ class OnnxStub:
perm = next(
(attr.ints for attr in node.attribute if attr.name == "perm"), None
)
tensors[node.output[0]] = self.handler.transpose(
tensors[node.output[0]] = ans.handler.transpose(
tensors[node.input[0]],
tensors.get(node.output[0]),
perm,
@ -409,7 +428,7 @@ class OnnxStub:
temp = reduce(lambda acc, x: acc * x, input_shape, 1)
if temp < 0:
input_shape[input_shape.index(-1)] = size // -temp
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
@ -426,7 +445,7 @@ class OnnxStub:
for i, x in enumerate(input_shape):
if i not in axes:
output_shape.append(x)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
output_shape,
@ -440,13 +459,13 @@ class OnnxStub:
)
for i in axes:
input_shape.insert(i, 1)
tensors[node.output[0]] = self.handler.reshape(
tensors[node.output[0]] = ans.handler.reshape(
tensors[node.input[0]],
tensors.get(node.output[0]),
input_shape,
)
elif node.op_type == "Concat":
tensors[node.output[0]] = self.handler.concat(
tensors[node.output[0]] = ans.handler.concat(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
@ -454,7 +473,7 @@ class OnnxStub:
elif node.op_type == "Split":
for name, tensor in zip(
node.output,
self.handler.split(
ans.handler.split(
tensors[node.input[0]],
None,
next(
@ -466,14 +485,14 @@ class OnnxStub:
):
tensors[name] = tensor
elif node.op_type == "Gather":
tensors[node.output[0]] = self.handler.gather(
tensors[node.output[0]] = ans.handler.gather(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "axis")),
)
elif node.op_type == "ReduceMean":
tensors[node.output[0]] = self.handler.reduce_mean(
tensors[node.output[0]] = ans.handler.reduce_mean(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors[node.input[1]] if len(node.input) > 1 else None,
@ -481,7 +500,7 @@ class OnnxStub:
!= 0,
)
elif node.op_type == "Slice":
tensors[node.output[0]] = self.handler.slice(
tensors[node.output[0]] = ans.handler.slice(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -490,7 +509,7 @@ class OnnxStub:
_parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
)
elif node.op_type == "Pad":
tensors[node.output[0]] = self.handler.pad(
tensors[node.output[0]] = ans.handler.pad(
tensors[node.input[0]],
tensors.get(node.output[0]),
_parse_data(data[node.input[1]]),
@ -499,7 +518,7 @@ class OnnxStub:
elif node.op_type == "Dropout":
for name, tensor in zip(
node.output,
self.handler.dropout(
ans.handler.dropout(
tensors[node.input[0]],
tensors.get(node.output[0]),
tensors.get(node.output[1]) if len(node.output) > 1 else None,
@ -512,18 +531,33 @@ class OnnxStub:
),
):
tensors[name] = tensor
elif node.op_type == "MemBound":
attributes = _parse_attribute(node, {"expr": None})
expr: str = attributes["expr"]
assert expr is not None
assert (
len(node.output) == 1
), """MemBound with multiple
outputs requires rewrite the logic of tensor creation"""
outputs = ans.handler.memBound(
[tensors[name] for name in node.input],
tensors.get(node.output[0]),
expr,
)
for name, tensor in zip(node.output, outputs):
tensors[name] = tensor
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
self.handler.data_malloc()
ans.handler.data_malloc()
for name, obj in tensors.items():
tensor = data.get(name)
if tensor == None:
if any(input.name == name for input in model.graph.input):
self.inputs[name] = obj
ans.inputs[name] = obj
else:
self.initializer[obj.fuid()] = tensor
ans.initializer[obj.fuid()] = tensor
if tensor.data_type == TensorProto.INT32:
obj.copyin_int32(_parse_data(tensor))
elif tensor.data_type == TensorProto.INT64:
@ -533,8 +567,19 @@ class OnnxStub:
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
for output in model.graph.output:
self.outputs[output.name] = tensors[output.name]
return ans
@classmethod
def from_graph(cls, g: backend.Graph):
ans = OnnxStub()
handler = backend.GraphHandler(g)
for i, tensor in enumerate(handler.inputs()):
ans.inputs["input{}".format(i)] = tensor
for i, tensor in enumerate(handler.outputs()):
ans.inputs["output{}".format(i)] = tensor
ans.handler = handler
ans.disable_check = True
return ans
def to_onnx(self, name: str) -> ModelProto:
class Context:
@ -552,6 +597,13 @@ class OnnxStub:
outputs: List[ValueInfoProto] = []
# saves global input tensors
initializers: List[TensorProto] = []
# saves shape information of intermediate (local) tensors
value_info: List[ValueInfoProto] = []
enable_check = False
def __init__(self, enable_check):
self.enable_check = enable_check
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
@ -562,12 +614,15 @@ class OnnxStub:
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
if not tensor.has_target(): # if this output is a global output
self.outputs.append(value_info)
else: # if this output is a local output
self.value_info.append(value_info)
return name
def push_input(
@ -577,7 +632,10 @@ class OnnxStub:
# means that this input is a global input
if name is None:
self.count_in += 1
name = "input{}".format(self.count_in)
if tensor.getTensorType() == backend.TensorType.Input:
name = "input{}".format(self.count_in)
else:
name = "weight{}".format(self.count_in)
self.names[tensor] = name
if init != None:
init.name = name
@ -605,17 +663,25 @@ class OnnxStub:
return name
def push_node(self, node: NodeProto) -> None:
check_node(node)
if self.enable_check:
check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
self.nodes,
name,
self.inputs,
self.outputs,
self.initializers,
value_info=self.value_info,
)
check_graph(graph)
if self.enable_check:
check_graph(graph)
model = make_model(graph)
check_model(model)
if self.enable_check:
check_model(model)
return model
@ -625,7 +691,7 @@ class OnnxStub:
ops = self.handler.operators() # all operators (nodes) in the graph
ctx = Context()
ctx = Context(not self.disable_check)
for op in ops:
ty, name = ctx.name_op(op)
@ -812,6 +878,18 @@ class OnnxStub:
ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.MemBound:
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
domain="nnet",
expr=backend.membound_expr_of(op),
hash=str(backend.membound_hash_of(op)),
)
)
else:
raise Exception("Unsupported OpType", ty)
@ -828,7 +906,7 @@ class OnnxStub:
def from_onnx(model: ModelProto, runtime):
stub = OnnxStub(model, runtime)
stub = OnnxStub.from_onnx(model, runtime)
return stub.inputs, stub.outputs, stub.handler
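
A hedged sketch of the two factory constructors added in this change set; `optimized_graph` is a placeholder for a `backend.Graph` produced elsewhere (e.g. by the C++ search engine):

```python
import onnx
from pyinfinitensor.onnx import OnnxStub
from pyinfinitensor import backend

# Import an ONNX model; this replaces the old OnnxStub(model, runtime) call.
stub = OnnxStub.from_onnx(onnx.load("model.onnx"), backend.cpu_runtime())

# Export the imported graph back to an ONNX ModelProto.
model = stub.to_onnx("new")

# Wrap an existing backend.Graph without going through ONNX; the ONNX checker
# calls are skipped for such stubs because the graph may contain custom
# operators such as MemBound.
# stub2 = OnnxStub.from_graph(optimized_graph)
# model2 = stub2.to_onnx("optimized")
```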

View File

@ -40,9 +40,9 @@ class TestStringMethods(unittest.TestCase):
file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
)
)
model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx(
"new"
)
model = OnnxStub.from_onnx(
onnx.load(model_file), backend.cpu_runtime()
).to_onnx("new")
model = infer_shapes(model)
def test_tensor(self):
@ -304,16 +304,16 @@ class TestStringMethods(unittest.TestCase):
def test_frontend(self):
handler = backend.GraphHandler(backend.cpu_runtime())
a = handler.tensor([1, 2, 3], 12)
b = handler.tensor([1, 2, 3], 12)
c = handler.tensor([1, 2, 3], 12)
d = handler.tensor([1, 2, 3], 12)
e = handler.tensor([1, 2, 3], 12)
a = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
b = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
c = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
d = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
e = handler.tensor([1, 2, 3], 12, backend.TensorType.Input)
x = handler.add(
handler.add(handler.add(handler.add(a, b, None), c, None), d, None), e, None
)
y = handler.tensor([3, 2, 1], 12)
y = handler.tensor([3, 2, 1], 12, backend.TensorType.Other)
handler.reshape(x, y, [3, 2, 1])

View File

@ -1,106 +1,131 @@
import re
import os
import sys
import json
from contextlib import redirect_stdout
import time
import logging
import numpy as np
import tvm
from tvm import te, tir, auto_scheduler, topi
import os
import json
import logging
USE_CACHE = True
logging.basicConfig()
logger = logging.getLogger('InfiniTensor')
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
tvm_code, func_name, nnet_expression: str,
nnet_simplified_expression: str, hash_code=None):
nnet_simplified_expression: str, hash_code: str = None):
assert len(input_tensors) == len(input_dtypes)
logging.debug(f'Work on hash {hash_code}')
logger.debug(f'Work on hash {hash_code}')
dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
if not os.path.exists(dir_name):
os.makedirs(dir_name)
so_fn = os.path.join(dir_name, f"{func_name}.so")
config_fn = os.path.join(dir_name, "config_so.json")
print("Generating Ansor op: ")
print(tvm_code)
print("Input shape: ")
print(input_tensors)
print("Output shape: ")
print(output_tensor)
desc_fn = os.path.join(dir_name, "desc.txt")
log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
out_fn = os.path.join(dir_name, "out.txt")
logger.debug(f"Generating Ansor op: {tvm_code}")
logger.debug(f"Input shape: {input_tensors}")
logger.debug(f"Output shape: {output_tensor}")
if USE_CACHE and hash_code is not None:
if os.path.exists(dir_name) and \
os.path.exists(so_fn) and \
os.path.exists(config_fn):
os.path.exists(so_fn) and \
os.path.exists(config_fn):
print(f"Use cache in {dir_name}")
with open(config_fn, "r") as config_fin:
config = json.loads(config_fin.read().strip())
conv_time = config["conv_time"]
logger.debug(f'Find tuning log for {hash_code}')
logger.info(f'Find tuning log for {hash_code} in {so_fn}')
return so_fn, conv_time
logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
time_start = time.perf_counter()
# Print descriptions of the task
if USE_CACHE and hash_code is not None:
with redirect_stdout(open(desc_fn, "w")):
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====TVM compute====")
print(tvm_code+"\n")
print("Input shape: ", input_tensors)
print("Output shape: ", output_tensor)
@auto_scheduler.register_workload(func_name)
def compute():
_locals = locals()
exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
exec(tvm_code, {'tvm': tvm, 'te': te,
'tir': tir, 'topi': topi}, _locals)
return _locals['ret']
target = tvm.target.Target("cuda")
task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
with redirect_stdout(open(out_fn, 'w')):
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
log_file = f"ansor_{func_name}_log.json"
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
verbose=2,
)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
verbose=2,
)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_fn)
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
time_end = time.perf_counter()
# Kill the measurement process
del measure_ctx
func = tvm.build(sch, args, target, name=func_name)
func.export_library(so_fn)
ctx = tvm.cuda(0)
input_a = []
for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
a_np = np.random.uniform(size=shape).astype(dtype)
input_a.append(tvm.nd.array(a_np, ctx))
a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
func(a_out, *input_a)
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
conv_time = evaluator(a_out, *input_a).mean * 1e3
print("====NNET tensor expression====")
print(nnet_expression+"\n")
print("====NNET simplified tensor expression====")
print(nnet_simplified_expression+"\n")
print("====Time====")
print(conv_time)
if USE_CACHE and hash_code is not None:
with open(config_fn, "w") as config_fout:
config_fout.write(json.dumps({
"conv_time": conv_time,
"tuning_time": time_end - time_start,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
}, ensure_ascii=False, indent=2))
return so_fn, conv_time
# Read arguments from pipe, which is redirected to stdin.
# Write generated library path to pipe.
def pipe_gen(fd: int):
args = json.load(sys.stdin) # read from pipe
# print(args, f'fd={fd}')
ret = gen_ansor_so(**args)
with os.fdopen(fd, 'w') as f:
print(ret[0], file=f, end='') # write to pipe
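
For reference, a hedged sketch of the argument object that `pipe_gen` reads from the pipe and expands into `gen_ansor_so(**args)`; the shapes and the TVM compute string are illustrative placeholders, and actually running the tuning requires TVM with a CUDA device:

```python
import json

# Keys mirror the keyword parameters of gen_ansor_so above.
args = {
    "input_tensors": [[1, 3, 224, 224]],   # one shape per input tensor
    "input_dtypes": ["float32"],
    "output_tensor": [1, 3, 224, 224],
    "output_dtype": "float32",
    # The compute body is exec-ed with tvm/te/tir/topi in scope and must
    # bind the list of TE tensors to a variable named `ret`.
    "tvm_code": (
        "A = te.placeholder((1, 3, 224, 224), name='A')\n"
        "B = topi.nn.relu(A)\n"
        "ret = [A, B]"
    ),
    "func_name": "relu_example",
    "nnet_expression": "<nnet expr>",
    "nnet_simplified_expression": "<simplified nnet expr>",
    "hash_code": "example_hash",
}

# The caller writes this object to the pipe as JSON; pipe_gen then calls
# gen_ansor_so(**args) and writes the generated .so path back.
print(json.dumps(args, indent=2))
```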

View File

@ -0,0 +1,7 @@
import backend
from backend import *
import sys
sys.path.extend(__path__)
print("import backend: {}".format(backend))

View File

@ -0,0 +1,941 @@
import backend
import onnx
from onnx import (
ModelProto,
TensorProto,
NodeProto,
AttributeProto,
TensorShapeProto,
ValueInfoProto,
)
from onnx.helper import (
make_node,
make_tensor_value_info,
make_tensor,
make_graph,
make_model,
)
from onnx.checker import (
check_graph,
check_model,
check_node,
check_value_info,
check_tensor,
)
from onnx.shape_inference import infer_shapes
from onnx.numpy_helper import to_array
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
class OnnxStub:
"""
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
# inputs: Dict[str, backend.Tensor] = {}
# outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
# handler: backend.GraphHandler
# def __init__(self, model: ModelProto, runtime):
# model = infer_shapes(model)
# self.handler = backend.GraphHandler(runtime)
# tensors: Dict[str, backend.Tensor] = dict()
# data: Dict[str, TensorProto] = dict()
# for input in model.graph.input:
# dims = _take_shape_dim(input.type.tensor_type.shape)
# tensors[input.name] = self.handler.tensor(
# dims, input.type.tensor_type.elem_type
# )
# for output in model.graph.output:
# dims = _take_shape_dim(output.type.tensor_type.shape)
# tensors[output.name] = self.handler.tensor(
# dims, output.type.tensor_type.elem_type
# )
# for initializer in model.graph.initializer:
# dims = [d for d in initializer.dims]
# tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
# data[initializer.name] = initializer
# for node in model.graph.node:
# if node.op_type == "Conv":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (d, p, s) = (
# attributes[name] for name in ["dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors[node.input[0]], None, p, [-2, -1]
# )
# p = [0, 0, 0, 0]
# else:
# adapt = node.input[0]
# if len(node.input) > 2:
# bias = "{}-bias".format(node.output[0])
# reshape = "{}-reshape".format(node.output[0])
# tensors[bias] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# None,
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# tensors[reshape] = self.handler.reshape(
# tensors[node.input[2]],
# None,
# [
# 1,
# reduce(
# lambda acc, x: acc * x,
# _search_shape(model, node.input[2]),
# ),
# 1,
# 1,
# ],
# )
# tensors[node.output[0]] = self.handler.add(
# tensors[bias],
# tensors[reshape],
# tensors.get(node.output[0]),
# )
# else:
# tensors[node.output[0]] = self.handler.conv(
# tensors[adapt],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# )
# elif node.op_type == "ConvTranspose":
# attributes = _parse_attribute(
# node,
# {
# "dilations": [1, 1],
# "pads": [0, 0],
# "strides": [1, 1],
# "output_padding": [0, 0],
# },
# )
# (d, p, s, op) = (
# attributes[name]
# for name in ["dilations", "pads", "strides", "output_padding"]
# )
# tensors[node.output[0]] = self.handler.convTransposed2d(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# p[0],
# p[1],
# s[0],
# s[1],
# d[0],
# d[1],
# op[0],
# op[1],
# )
# elif node.op_type == "MatMul":
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# False,
# False,
# None,
# backend.ActType.Linear,
# )
# elif node.op_type == "Gemm":
# attributes = _parse_attribute(
# node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
# )
# (alpha, beta, transA, transB) = (
# attributes[name] for name in ["alpha", "beta", "transA", "transB"]
# )
# # FIXME unsupport attributes: `alpha` `beta`
# assert alpha == 1.0
# assert beta == 1.0
# tensors[node.output[0]] = self.handler.matmul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# transA == 1,
# transB == 1,
# tensors[node.input[2]] if len(node.input) > 2 else None,
# backend.ActType.Linear,
# )
# elif node.op_type == "BatchNormalization":
# (input, mean, var, scale, bias) = (
# tensors[node.input[i]] for i in [0, 3, 4, 1, 2]
# )
# output = tensors.get(node.output[0])
# attributes = _parse_attribute(
# node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0}
# )
# (momentum, eps, training) = (
# attributes[name]
# for name in ["momentum", "epsilon", "training_mode"]
# )
# tensors[node.output[0]] = self.handler.batchNorm(
# input, output, mean, var, scale, bias, momentum, eps, training != 0
# )
# elif node.op_type == "MaxPool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, d, p, s) = (
# attributes[name]
# for name in ["kernel_shape", "dilations", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.maxPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# d[0],
# d[1],
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "AveragePool":
# attributes = _parse_attribute(
# node,
# {
# "kernel_shape": None,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1],
# },
# )
# (k, p, s) = (
# attributes[name] for name in ["kernel_shape", "pads", "strides"]
# )
# if p[0] != p[2] or p[1] != p[3]:
# adapt = "{}-adapt".format(node.output[0])
# tensors[adapt] = self.handler.pad(
# tensors.get(node.input[0]), None, p, [-2, -1]
# )
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[adapt],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# 0,
# 0,
# s[0],
# s[1],
# )
# else:
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# k[0],
# k[1],
# 1,
# 1,
# p[0],
# p[1],
# s[0],
# s[1],
# )
# elif node.op_type == "GlobalAveragePool":
# [_, _, h, w] = _search_shape(model, node.input[0])
# tensors[node.output[0]] = self.handler.avgPool(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# h,
# w,
# 1,
# 1,
# 0,
# 0,
# 1,
# 1,
# )
# elif node.op_type == "Add":
# tensors[node.output[0]] = self.handler.add(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sub":
# tensors[node.output[0]] = self.handler.sub(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Mul":
# tensors[node.output[0]] = self.handler.mul(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Div":
# tensors[node.output[0]] = self.handler.div(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Pow":
# tensors[node.output[0]] = self.handler.pow(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Relu":
# tensors[node.output[0]] = self.handler.relu(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Sigmoid":
# tensors[node.output[0]] = self.handler.sigmoid(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Tanh":
# tensors[node.output[0]] = self.handler.tanh(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Softmax":
# tensors[node.output[0]] = self.handler.softmax(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Abs":
# tensors[node.output[0]] = self.handler.abs(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Shape":
# tensors[node.output[0]] = self.handler.shape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Identity":
# tensors[node.output[0]] = self.handler.identity(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Flatten":
# # FIXME axis must be 1
# axis = next(
# (attr.i for attr in node.attribute if attr.name == "axis"), None
# )
# assert axis == None or axis == 1
# tensors[node.output[0]] = self.handler.flatten(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "PRelu":
# tensors[node.output[0]] = self.handler.pRelu(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# )
# elif node.op_type == "Clip":
# tensors[node.output[0]] = self.handler.clip(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(_parse_data(data[node.input[1]]).__iter__(), None)
# if len(node.input) > 1
# else None,
# next(_parse_data(data[node.input[2]]).__iter__(), None)
# if len(node.input) > 2
# else None,
# )
# elif node.op_type == "Transpose":
# perm = next(
# (attr.ints for attr in node.attribute if attr.name == "perm"), None
# )
# tensors[node.output[0]] = self.handler.transpose(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# perm,
# )
# elif node.op_type == "Reshape":
# dims = _search_shape(model, node.input[0])
# size = reduce(lambda acc, x: acc * x, dims)
# input_shape = _parse_data(data[node.input[1]])
# for i, x in enumerate(input_shape):
# if x == 0:
# input_shape[i] = dims[i]
# temp = reduce(lambda acc, x: acc * x, input_shape, 1)
# if temp < 0:
# input_shape[input_shape.index(-1)] = size // -temp
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Squeeze":
# input_shape = _search_shape(model, node.input[0])
# axes = set(
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# assert all(input_shape[d] == 1 for d in axes)
# output_shape = []
# for i, x in enumerate(input_shape):
# if i not in axes:
# output_shape.append(x)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# output_shape,
# )
# elif node.op_type == "Unsqueeze":
# input_shape = _search_shape(model, node.input[0])
# axes = (
# [int(i) for i in data[node.input[1]].int64_data]
# if len(node.input) > 1
# else _parse_attribute(node, {"axes": None})["axes"]
# )
# for i in axes:
# input_shape.insert(i, 1)
# tensors[node.output[0]] = self.handler.reshape(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# input_shape,
# )
# elif node.op_type == "Concat":
# tensors[node.output[0]] = self.handler.concat(
# [tensors[name] for name in node.input],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "Split":
# for name, tensor in zip(
# node.output,
# self.handler.split(
# tensors[node.input[0]],
# None,
# next(
# (attr.i for attr in node.attribute if attr.name == "axis"),
# 0,
# ),
# len(node.output),
# ),
# ):
# tensors[name] = tensor
# elif node.op_type == "Gather":
# tensors[node.output[0]] = self.handler.gather(
# tensors[node.input[0]],
# tensors[node.input[1]],
# tensors.get(node.output[0]),
# next((attr.i for attr in node.attribute if attr.name == "axis")),
# )
# elif node.op_type == "ReduceMean":
# tensors[node.output[0]] = self.handler.reduce_mean(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# next(
# (attr.ints for attr in node.attribute if attr.name == "axes"),
# None,
# ),
# next((attr.i for attr in node.attribute if attr.name == "keepdims"))
# != 0,
# )
# elif node.op_type == "Slice":
# tensors[node.output[0]] = self.handler.slice(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[2]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# _parse_data(data[node.input[4]]) if len(node.input) > 4 else None,
# )
# elif node.op_type == "Pad":
# tensors[node.output[0]] = self.handler.pad(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# _parse_data(data[node.input[1]]),
# _parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
# )
# elif node.op_type == "Dropout":
# for name, tensor in zip(
# node.output,
# self.handler.dropout(
# tensors[node.input[0]],
# tensors.get(node.output[0]),
# tensors.get(node.output[1]) if len(node.output) > 1 else None,
# _parse_data(data[node.input[1]])[0]
# if len(node.input) > 1
# else 0.5,
# _parse_data(data[node.input[2]])[0]
# if len(node.input) > 2
# else False,
# ),
# ):
# tensors[name] = tensor
# else:
# raise Exception('Unsupported operator "{}"'.format(node.op_type))
# self.handler.data_malloc()
# for name, obj in tensors.items():
# tensor = data.get(name)
# if tensor == None:
# if any(input.name == name for input in model.graph.input):
# self.inputs[name] = obj
# else:
# self.initializer[obj.fuid()] = tensor
# if tensor.data_type == TensorProto.INT32:
# obj.copyin_int32(_parse_data(tensor))
# elif tensor.data_type == TensorProto.INT64:
# obj.copyin_int64(_parse_data(tensor))
# elif tensor.data_type == TensorProto.FLOAT:
# obj.copyin_float(_parse_data(tensor))
# else:
# assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
# for output in model.graph.output:
# self.outputs[output.name] = tensors[output.name]
def to_onnx(self, g: backend.Graph, path: str, name: str = 'my_onnx') -> ModelProto:
class Context:
# saves object names, including tensors and operators
names: Dict[Union[backend.Tensor, backend.Operator], str] = dict()
# counts the occurrence times of each operator for naming
count_op: Dict[backend.OpType, int] = dict()
# counts input and output tensors for naming
count_in, count_out = 0, 0
# saves nodes (operators)
nodes: List[NodeProto] = []
# saves global input tensors
inputs: List[ValueInfoProto] = []
# saves global output tensors
outputs: List[ValueInfoProto] = []
# saves weight/initializer tensors
initializers: List[TensorProto] = []
def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]:
ty = op.op_type()
name = "{}_{}".format(ty.name, op.guid())
self.names[op] = name
self.count_op[ty] = self.count_op.get(ty, 0) + 1
return ty, name
def push_output(self, name: str, tensor: backend.Tensor) -> str:
self.names[tensor] = name
if not tensor.has_target():
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.outputs.append(value_info)
return name
def push_input(
self, tensor: backend.Tensor, init: Optional[TensorProto]
) -> str:
name = self.names.get(tensor)
# a tensor without a recorded name is either a graph-level input or an initializer
if name is None:
self.count_in += 1
name = "input_{}".format(tensor.guid())
self.names[tensor] = name
if init != None:
init.name = name
self.initializers.append(init)
else:
shape = tensor.shape()
dtype = backend.tensor_dtype(tensor)
value_info = make_tensor_value_info(name, dtype, shape)
check_value_info(value_info)
self.inputs.append(value_info)
return name
def push_data_input(
self,
node_name: str,
attr_name: str,
elem_type: int,
shape: Sequence[int],
vals: Any,
) -> str:
name = "{}_{}".format(node_name, attr_name)
tensor = make_tensor(name, elem_type, shape, vals)
check_tensor(tensor)
self.initializers.append(tensor)
return name
def push_node(self, node: NodeProto) -> None:
# check_node(node)
self.nodes.append(node)
def build(self, name: str) -> ModelProto:
graph = make_graph(
self.nodes, name, self.inputs, self.outputs, self.initializers
)
# check_graph(graph)
model = make_model(graph)
# check_model(model)
return model
# topological sort
if not g.topo_sort():
raise Exception("Sorting fails")
ops = g.operators()  # all operators (nodes) in the graph
ctx = Context()
for op in ops:
ty, name = ctx.name_op(op)
inputs = [
ctx.push_input(it, self.initializer.get(it.fuid()))
for it in op.inputs()
]
outputs = [
ctx.push_output("{}_{}_{}".format(
name, i, tensor.guid()), tensor)
for (i, tensor) in enumerate(op.outputs())
]
if ty == backend.OpType.Conv:
ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
pads=[ph, pw, ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
group=op.inputs()[0].shape()[
1] // op.inputs()[1].shape()[1],
)
)
elif ty == backend.OpType.ConvTrans:
ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(
op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
pads=[ph, pw],
strides=[sh, sw],
dilations=[dh, dw],
output_padding=[oph, opw],
)
)
elif ty == backend.OpType.ConvTransNHWC:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"ConvTranspose",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.MemBound:
# ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op)
ctx.push_node(
make_node(
"Membound",
inputs,
outputs,
name,
domain="nnet",
# pads=[ph, pw],
# strides=[sh, sw],
# dilations=[dh, dw],
# output_padding=[oph, opw],
)
)
elif ty == backend.OpType.Matmul:
# transA, transB = backend.matmul_attrs_of(op)
# HACK: recover this
transA, transB = False, False
ctx.push_node(
make_node(
"Gemm", inputs, outputs, name, transA=transA, transB=transB
)
)
elif ty == backend.OpType.BatchNorm:
inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
momentum, eps, training = backend.batch_norm_attrs_of(op)
ctx.push_node(
make_node(
"BatchNormalization",
inputs,
outputs,
name,
epsilon=eps,
momentum=momentum,
training_mode=training,
)
)
elif ty == backend.OpType.MaxPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
dilations=[dh, dw],
strides=[sh, sw],
)
)
elif ty == backend.OpType.AvgPool:
kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op)
ctx.push_node(
make_node(
"AveragePool",
inputs,
outputs,
name,
kernel_shape=[kh, kw],
pads=[ph, pw, ph, pw],
strides=[sh, sw],
)
)
elif ty in [
backend.OpType.Add,
backend.OpType.Sub,
backend.OpType.Mul,
backend.OpType.Div,
backend.OpType.Pow,
backend.OpType.Relu,
backend.OpType.Sigmoid,
backend.OpType.Tanh,
backend.OpType.Softmax,
backend.OpType.Abs,
backend.OpType.Identity,
backend.OpType.PRelu,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Flatten:
raise Exception("TODO")
elif ty == backend.OpType.Transpose:
perm = backend.transpose_permute_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, perm=perm))
elif ty == backend.OpType.Reshape:
shape = backend.reshape_shape_of(op)
inputs.append(
ctx.push_data_input(
name,
"shape",
TensorProto.INT64,
[len(shape)],
shape,
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpType.Concat:
axis = backend.concat_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.Split:
axis = backend.split_axis_of(op)
num_outputs = len(outputs)
split = op.inputs()[0].shape()[axis] // num_outputs
inputs.append(
ctx.push_data_input(
name,
"split",
TensorProto.INT64,
[len(outputs)],
[split for _ in range(0, num_outputs)],
)
)
ctx.push_node(
make_node(
ty.name,
inputs,
outputs,
name,
axis=axis,
)
)
elif ty == backend.OpType.Gather:
axis = backend.gather_axis_of(op)
ctx.push_node(make_node(ty.name, inputs,
outputs, name, axis=axis))
elif ty == backend.OpType.ReduceMean:
axes, keepdims = backend.reduce_mean_attrs_of(op)
inputs.append(
ctx.push_data_input(
name, "axes", TensorProto.INT64, [len(axes)], axes
)
)
ctx.push_node(
make_node(ty.name, inputs, outputs,
name, keepdims=keepdims)
)
elif ty == backend.OpType.Slice:
raise Exception("TODO")
elif ty == backend.OpType.Pad:
pads = backend.pad_pads_of(op)
inputs.append(
ctx.push_data_input(
name, "pads", TensorProto.INT64, [len(pads)], pads
)
)
ctx.push_node(make_node(ty.name, inputs, outputs, name))
# elif ty == backend.OpType.Clip:
# min, max = backend.clip_attrs_of(op)
# if min != None:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [])
# )
# if max != None:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max])
# )
# else:
# inputs.append(
# ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [])
# )
# ctx.push_node(make_node(ty.name, inputs, outputs, name))
else:
raise Exception("Unsupported OpType", ty)
model = ctx.build(name)
onnx.save(model, path)
return model
# def init(self) -> None:
# self.handler.data_malloc()
# def optimize(self) -> None:
# self.handler.optimize()
# def run(self) -> None:
# self.handler.run()
# def from_onnx(model: ModelProto, runtime):
# stub = OnnxStub(model, runtime)
# return stub.inputs, stub.outputs, stub.handler
# def _search_shape(model: ModelProto, name: str) -> List[int]:
# ans = (
# next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.value_info
# if tensor.name == name
# ),
# None,
# )
# or next(
# (
# [
# (d.dim_value if d.dim_value > 0 else 1)
# for d in tensor.type.tensor_type.shape.dim
# ]
# for tensor in model.graph.input
# if tensor.name == name
# ),
# None,
# )
# or next(
# [int(d) for d in tensor.dims]
# for tensor in model.graph.initializer
# if tensor.name == name
# )
# )
# return ans
# def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
# for attr in node.attribute:
# if attr.name in attrs:
# if attr.type == AttributeProto.INT:
# attrs[attr.name] = attr.i
# elif attr.type == AttributeProto.INTS:
# attrs[attr.name] = attr.ints
# elif attr.type == AttributeProto.FLOAT:
# attrs[attr.name] = attr.f
# elif attr.type == AttributeProto.STRING:
# attrs[attr.name] = attr.s
# elif attr.type == AttributeProto.TENSOR:
# attrs[attr.name] = attr.t
# else:
# assert False, "Unsupported Attribute Type: {}".format(attr.type)
# return attrs
# def _parse_data(tensor: TensorProto) -> List[Any]:
# return to_array(tensor).flatten().tolist()
# def _take_shape_dim(shape: TensorShapeProto) -> List[int]:
# return [(d.dim_value if d.dim_value > 0 else 1) for d in shape.dim]
def export_onnx(g: backend.Graph, path: str) -> None:
stub = OnnxStub()
stub.to_onnx(g, path)
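A minimal usage sketch of the export helper above (hedged: it assumes the package is importable as pyinfinitensor, that the backend module exposes cuda_runtime() alongside the cpu_runtime() binding shown later in this diff, and a CUDA build so that getInfoGAN is compiled in):

from pyinfinitensor import backend
from pyinfinitensor.onnx import export_onnx   # module path assumed from the repo layout

runtime = backend.cuda_runtime()              # assumed binding, mirrors cpu_runtime()
g = backend.getInfoGAN(1, runtime, 5)         # InfoGAN test graph added later in this diff
export_onnx(g, "infogan.onnx")                # wraps OnnxStub().to_onnx(g, path)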

View File

@ -129,8 +129,9 @@ void GraphObj::dataMalloc() {
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
}
Tensor GraphObj::addTensor(const Tensor &tensor) {

View File

@ -1,10 +1,12 @@
#include "core/graph_handler.h"
#include "nnet/Visitor/Serializer.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -19,8 +21,8 @@ namespace infini {
static DataType dtype_repr_convert(int);
Tensor GraphHandlerObj::tensor(Shape dims, int dtype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype));
Tensor GraphHandlerObj::tensor(Shape dims, int dtype, TensorType ttype) {
return g->addTensor(std::move(dims), dtype_repr_convert(dtype), ttype);
}
Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph,
@ -291,6 +293,22 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
}
}
TensorVec GraphHandlerObj::memBound(const TensorVec &inputs,
const Tensor &output,
const string &jsonString) {
const auto &[expr, nnetInputs, execTime, hint] =
nnet::Serializer().membundOpFromString(jsonString);
if (output) {
g->addOpWithOutputs<MemBoundObj>(std::move(inputs), TensorVec{output},
nnetInputs, expr, execTime, hint);
return {output};
} else
return g
->addOp<MemBoundObj>(std::move(inputs), TensorVec{nullptr},
nnetInputs, expr, execTime, hint)
->getOutputs();
}
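The handler-level entry point above takes the operator's nnet expression as a JSON string, which is how the ONNX importer with membound support (see the "Add: import ONNX with membound Op" commit) is expected to call it; a hedged Python-side sketch:

# json_str comes from the node's membound attribute (exact attribute name not shown in this
# diff), or from an existing operator via backend.membound_expr_of(op).
outputs = handler.memBound(
    [tensors[name] for name in node.input],  # input tensors already registered with the handler
    tensors.get(node.output[0]),             # pre-created output tensor, or None to let the handler infer it
    json_str,
)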
static DataType dtype_repr_convert(int dtype) {
switch ((OnnxDType)dtype) {
case OnnxDType::FLOAT:

View File

@ -1,6 +1,7 @@
#include "core/operator.h"
#include "core/graph.h"
#include "core/hash.h"
#include "nnet/dbg.h"
namespace infini {
@ -34,7 +35,8 @@ bool OperatorObj::isReshapeOp() const { return type == OpType::Reshape; }
bool OperatorObj::isMemBoundOp() const {
return type == OpType::MemBound || type == OpType::Activation ||
type == OpType::Transpose;
type == OpType::Transpose || type == OpType::Relu ||
type == OpType::Tanh;
}
void OperatorObj::removePredecessors(const Operator &op) {
@ -83,22 +85,29 @@ HashType OperatorObj::hash() const {
bool OperatorObj::checkValid(GraphObj *graph) {
auto optShapes = inferShape();
IT_ASSERT(optShapes);
if (!optShapes) // shape inference failed
return false;
const vector<Shape> &shapes = *optShapes;
IT_ASSERT(shapes.size() == outputs.size());
if (shapes.size() != outputs.size())
return false;
if (graph) { // if graph != nullptr, outputs should be created
auto dataTypes = inferDataType();
for (size_t i = 0; i < outputs.size(); i++) {
IT_ASSERT(!outputs[i], "Find empty output while operator creation");
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
outputs[i] =
graph->addTensor(shapes[i], dataTypes[i], TensorType::Other);
}
} else { // if outputs have been created, check their shapes
for (size_t i = 0; i < shapes.size(); ++i) {
if (shapes[i] != outputs[i]->getDims())
IT_ASSERT(shapes[i] == outputs[i]->getDims());
if (shapes[i] != outputs[i]->getDims()) {
dbg(shapes[i], outputs[i]->getDims());
return false;
}
IT_ASSERT(outputs[i]->getTensorType() == TensorType::Other);
}
}
return true;

View File

@ -2,9 +2,15 @@
#include "core/blob.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "operators/membound.h"
#include "utils/data_generator.h"
#include <chrono>
#include <cstring>
#ifdef USE_CUDA
#include "cuda_profiler_api.h"
#endif
namespace infini {
void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
if (!tune && profiling)
@ -56,13 +62,36 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
printProfilingData(totalTime, opTime, opCnt);
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
map<UidBaseType, bool>
RuntimeObj::getCompileTimeComputableAttribute(const Graph &graph) const {
map<UidBaseType, bool> ctcMap; // compile-time computable
// Skip static computation
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
bool compileTimeComputable = true;
for (auto input : op->getInputs()) {
// FIXME: propagate the tensor type. Currently only the first operator
// after the weights is treated as compile-time computable.
if (input->getTensorType() != TensorType::Initialized)
compileTimeComputable = false;
}
ctcMap[op->getGuid()] = compileTimeComputable;
}
return ctcMap;
}
double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
bool allowEstimation,
bool ignoreMemboundOp) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// Statistics
double totalTime = 0;
std::map<OpType, double> opTime;
std::map<OpType, int> opCnt;
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
for (auto &op : graph->getOperators()) {
auto kernelAttrs = KernelAttrs{device, op->getOpType(), op->getDType()};
@ -70,11 +99,21 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
PerfRecord record;
// Tune the kernel if there is no record
if (!perfData) {
double time = -1e9;
if (ctcMap[op->getGuid()]) { // Compile-time computable operators
time = 0;
} else if (op->getOpType() == OpType::Reshape) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && ignoreMemboundOp) {
time = 0;
} else if (op->getOpType() == OpType::MemBound && allowEstimation) {
time = as<MemBoundObj>(op)->getEstimatedTime();
} else if (perfData) { // Use the cached performance record
time = perfData->time;
} else {
// TODO: should tensors allocate memory automatically when their data is accessed?
// allocate memory for empty tensors and release it after profiling
// allocate memory for empty tensors and release it after
// profiling
TensorVec allocatedTensors;
for (auto t : op->getInputs())
if (!t->hasData())
@ -88,21 +127,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
}
// Profile operators and record the results
record = kernel->tune(op, this);
PerfRecord record = kernel->tune(op, this);
time = record->time;
perfEngine.setPerfData(perfKey, record);
// Free allocated memory
for (auto t : allocatedTensors)
t->freeData();
} else
record = perfData;
}
double t = record->time;
totalTime += t;
totalTime += time;
if (profiling) {
op->print();
printf(" op_time %lf\n", t);
opTime[op->getOpType()] += t;
printf(" op_time %lf\n", time);
opTime[op->getOpType()] += time;
opCnt[op->getOpType()]++;
}
}
@ -160,4 +198,44 @@ void CpuRuntimeObj::copyBlobInsideRuntime(void *dst, const void *src,
string NativeCpuRuntimeObj::toString() const { return "CPU Runtime"; }
double RuntimeObj::timeNonCtcOperators(const Graph &graph, int warmup,
int repeat) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
double ret = timeit(
[&]() {
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
},
[&]() { sync(); }, warmup, repeat);
return ret;
}
} // namespace infini
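Both timing entry points are exposed to Python further down in this diff; a minimal sketch of how they are meant to be combined (assuming g is a backend.Graph and runtime a CUDA runtime):

# Estimated time: 0 for compile-time computable ops and Reshape, analytical estimates
# for MemBound ops, cached or tuned kernel times for everything else.
est = runtime.getPerfTime(g, False, True, False)     # profiling, allowEstimation, ignoreMemboundOp

# Measured wall-clock time of the non-compile-time-computable operators only.
measured = runtime.timeNonCtcOperators(g, 100, 1000)  # warmup, repeat (values are illustrative)
print(est, measured)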

View File

@ -1,6 +1,7 @@
#include "core/search_engine.h"
#include "core/hash.h"
#include "core/runtime.h"
#include "nnet/dbg.h"
#include <algorithm>
#include <iostream>
@ -8,7 +9,17 @@
namespace infini {
void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
using MetaGraph = SearchEngine::MetaGraph;
SearchEngine::SearchEngine(Runtime runtime, Ref<Mutator> mutator)
: runtimeExec(runtime), mutator(mutator) {
// Compare graphs by estimated execution time
graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
};
}
void SearchEngine::printMetaGraph(MetaGraph metaGraph) {
for (size_t i = 0; i < metaGraph->nodes.size(); i++) {
auto &node = metaGraph->nodes[i];
std::cout << "id: " << i << std::endl;
@ -32,8 +43,7 @@ Graph SearchEngine::run(const Graph graph) {
IT_ASSERT(runtimeExec == graph->getRuntime());
std::cout << "[INFO] original graph: " << std::endl;
std::cout << graph->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
<< std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
std::vector<Graph> partitions = partitionGraph(graph);
@ -45,7 +55,6 @@ Graph SearchEngine::run(const Graph graph) {
std::vector<Graph> candidates = search(subGraph);
std::cout << "[INFO] size: " << candidates.size() << std::endl;
IT_ASSERT(candidates.size() > 0);
std::cout << subGraph->toString() << std::endl;
std::vector<Graph> nextGraphs;
for (auto lastGraph : bestGraphs) {
for (auto thisGraph : candidates) {
@ -65,9 +74,9 @@ Graph SearchEngine::run(const Graph graph) {
nextGraphs.emplace_back(tmp);
}
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -81,10 +90,22 @@ Graph SearchEngine::run(const Graph graph) {
for (size_t i = 0; i < bestGraphs.size(); i++) {
std::cout << "bestGraph " << i << ":" << std::endl;
std::cout << bestGraphs[i]->toString();
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
<< std::endl;
}
// Fuse vertically and sort according to performance
for (size_t i = 0; i < bestGraphs.size(); ++i) {
// Debug
bestGraphs[i] = fuseVertically(bestGraphs[i]);
}
std::sort(bestGraphs.begin(), bestGraphs.end(), graphTimeComparer);
std::cout << "[INFO] best fused graph: " << std::endl;
std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[0])
<< std::endl;
std::cout << bestGraphs[0] << std::endl;
return bestGraphs[0];
}
@ -102,9 +123,9 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
}
sort(results.begin(), results.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
}); // compare with perf time
// compare with perf time
dbg("===Num" + std::to_string(results.size()));
std::sort(results.begin(), results.end(), graphTimeComparer);
if (results.size() > GRAPH_SIZE) {
results.resize(GRAPH_SIZE);
}
@ -112,9 +133,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
}
// Build a metagraph from a graph; each operator becomes a node.
std::shared_ptr<SearchEngine::MetaGraph>
SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = std::make_shared<MetaGraph>();
MetaGraph SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
auto metaGraph = make_ref<MetaGraphObj>();
int numOps = graph->getOperators().size();
std::vector<int> cnt(numOps, 0);
@ -123,7 +143,7 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
std::vector<int> q(0);
for (size_t i = 0; i < graph->getOperators().size(); i++) {
auto &op = graph->getOperators()[i];
MetaGraph::Node node;
MetaGraphObj::Node node;
std::vector<Operator> ops;
ops.emplace_back(op);
node.graph = make_ref<GraphObj>(runtimeExec, ops);
@ -157,9 +177,8 @@ SearchEngine::buildMetaGraphWithGraph(const Graph graph) {
// Build a metagraph from a graph and a plan; a plan specifies which ops are
// grouped into a node.
std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
const std::shared_ptr<SearchEngine::MetaGraph> metaGraph,
const std::vector<int> &plan) {
MetaGraph SearchEngine::buildMetaGraphWithPlan(const MetaGraph metaGraph,
const std::vector<int> &plan) {
int numGroups = 0;
for (auto i : plan) {
if (i > numGroups) {
@ -172,12 +191,12 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
groups[plan[i]].emplace_back(i);
}
auto resultMetaGraph = make_ref<MetaGraph>();
auto resultMetaGraph = make_ref<MetaGraphObj>();
for (auto &group : groups) {
std::vector<Operator> ops;
std::unordered_set<int> preSet, sucSet;
for (auto id : group) {
MetaGraph::Node node;
MetaGraphObj::Node node;
for (auto op : metaGraph->nodes[id].graph->getOperators()) {
ops.emplace_back(op);
}
@ -204,8 +223,7 @@ std::shared_ptr<SearchEngine::MetaGraph> SearchEngine::buildMetaGraphWithPlan(
}
// Search how to merge multiple ops.
std::vector<std::shared_ptr<SearchEngine::MetaGraph>>
SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
vector<MetaGraph> SearchEngine::searchMerge(MetaGraph &metaGraph) {
IT_ASSERT(metaGraph != nullptr);
std::vector<int> plan(metaGraph->nodes.size());
for (size_t i = 0; i < plan.size(); i++) {
@ -222,7 +240,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::unordered_set<HashType> planSet;
searchMergeDfs(metaGraph, plan, frontier, plans, planSet);
std::vector<std::shared_ptr<SearchEngine::MetaGraph>> metaGraphs;
vector<MetaGraph> metaGraphs;
for (auto &curPlan : plans) {
metaGraphs.emplace_back(buildMetaGraphWithPlan(metaGraph, curPlan));
}
@ -230,8 +248,7 @@ SearchEngine::searchMerge(std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
}
// DFS impl for search merge.
void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
std::vector<int> &plan,
void SearchEngine::searchMergeDfs(MetaGraph &metaGraph, std::vector<int> &plan,
std::vector<int> &frontier,
std::vector<std::vector<int>> &plans,
std::unordered_set<uint64_t> &planSet) {
@ -320,14 +337,16 @@ void SearchEngine::searchMergeDfs(std::shared_ptr<MetaGraph> &metaGraph,
}
// Search mutation for each compute op.
std::vector<Graph> SearchEngine::searchMutation(
const std::shared_ptr<SearchEngine::MetaGraph> &metaGraph) {
std::vector<Graph> SearchEngine::searchMutation(const MetaGraph &metaGraph) {
std::vector<Graph> graphs = {nullptr};
// Append a node to all existing candidates
for (auto &node : metaGraph->nodes) {
std::vector<Graph> nextGraphs;
if (node.type == 1) { // If it has computing OPs
auto mutatedGraphs = mutator->run(node.graph);
// HACK: only keep the first two mutated graphs for debugging
if (mutatedGraphs.size() > 2)
mutatedGraphs.resize(2);
for (auto graph : graphs) {
for (auto mutatedGraph : mutatedGraphs) {
std::vector<Operator> ops;
@ -360,9 +379,8 @@ std::vector<Graph> SearchEngine::searchMutation(
for (auto g : nextGraphs) {
g->dataMalloc();
}
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
});
dbg("===Num" + std::to_string(nextGraphs.size()));
std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
if (nextGraphs.size() > GRAPH_SIZE) {
nextGraphs.resize(GRAPH_SIZE);
}
@ -372,7 +390,7 @@ std::vector<Graph> SearchEngine::searchMutation(
}
bool SearchEngine::isMultiBranchMergable(const Graph graph) {
return mutationEngine->isMultiBranchMergable(graph);
return mutator->isMultiBranchMergable(graph);
}
// Split a graph into multiple independent graphs. Search engine will search for
@ -438,4 +456,60 @@ std::vector<Graph> SearchEngine::partitionGraph(const Graph graph) {
return partitions;
}
double SearchEngine::getEstimatedGraphPerf(Graph graph) {
return runtimeExec->getPerfTime(graph, false, true, true);
}
Graph SearchEngine::fuseVertically(const Graph &graph) {
std::unordered_map<UidBaseType, int> visitTime;
std::vector<Operator> ops;
graph->topo_sort();
int cnt = 0;
for (auto op : graph->getOperators()) {
// Skip visited OP
if (visitTime.find(op->getGuid()) != visitTime.end()) {
continue;
}
// Skip compute OP and multi-input/output OP
if (!op->isMemBoundOp() || (op->getPredecessors().size() != 1 &&
op->getSuccessors().size() != 1)) {
visitTime.emplace(op->getGuid(), ++cnt);
ops.emplace_back(op);
continue;
}
vector<Operator> chainOps;
visitTime.emplace(op->getGuid(), ++cnt);
vector<Operator> tmp;
auto cur = op;
while (cur->getPredecessors().size() == 1 &&
cur->getPredecessors()[0]->isMemBoundOp()) {
cur = cur->getPredecessors()[0];
tmp.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
for (int i = tmp.size() - 1; i >= 0; i--) {
chainOps.emplace_back(tmp[i]);
}
chainOps.emplace_back(op);
cur = op;
while (cur->getSuccessors().size() == 1 &&
cur->getSuccessors()[0]->isMemBoundOp()) {
cur = cur->getSuccessors()[0];
chainOps.emplace_back(cur);
visitTime.emplace(cur->getGuid(), cnt);
}
make_ref<GraphObj>(runtimeExec, chainOps)->print();
Graph optGraph =
mutator->fuseVertically(make_ref<GraphObj>(runtimeExec, chainOps));
for (auto op : optGraph->getOperators()) {
ops.emplace_back(op);
}
}
return make_ref<GraphObj>(runtimeExec, ops);
}
} // namespace infini
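With the bindings added later in this diff, the search pipeline above can be driven from Python; a hedged sketch (the rule list is the one used by runInfoGAN below, cuda_runtime() is an assumed binding mirroring cpu_runtime()):

runtime = backend.cuda_runtime()
mutator = backend.NMutator(backend.NMutatorMode.RuleBased,
                           [3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90])  # derivation rules
engine = backend.SearchEngine(runtime, mutator)
g = backend.getInfoGAN(1, runtime, 5)
best = engine.run(g)                                   # returns the best, vertically fused graph
print(runtime.getPerfTime(best, False, True, True))    # estimated time, membound ops ignored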

View File

@ -8,12 +8,14 @@
namespace infini {
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime)
TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
TensorType tensorType)
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
[](auto acc, auto x) { return acc * x; })) {}
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}
string TensorObj::toString() const {
// Convert data pointer to string
@ -24,8 +26,8 @@ string TensorObj::toString() const {
ss << "nullptr data";
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString() + ", " + runtime->toString() +
", " + ss.str() + "\n";
", dtype " + dtype.toString() + ", tensorType " +
std::to_string(enum_to_underlying(tensorType));
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());
@ -34,6 +36,7 @@ string TensorObj::toString() const {
else
ret += ", source None";
ret += ", targets " + vecToString(targetGuids);
ret += ", " + runtime->toString() + ", " + ss.str();
return ret;
}
@ -64,12 +67,19 @@ vector<size_t> TensorObj::getStride() const {
void TensorObj::printData() const {
IT_ASSERT(data != nullptr);
if (!runtime->isCpu())
IT_TODO_HALT();
void *ptr = nullptr;
Blob buffer;
if (!runtime->isCpu()) { // copy data to main memory
buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes());
runtime->copyBlobToCPU(buffer->getPtr<void *>(),
getRawDataPtr<void *>(), getBytes());
ptr = buffer->getPtr<void *>();
} else
ptr = data->getPtr<float *>();
#define TRY_PRINT(N) \
if (dtype == DataType(N)) \
std::cout << dataToString<DT<N>::t>() << std::endl;
std::cout << dataToString<DT<N>::t>(ptr) << std::endl;
TRY_PRINT(0) // fmt: new line
else TRY_PRINT(1) //

View File

@ -2,10 +2,56 @@
#include "core/kernel.h"
#include "core/perf_engine.h"
#include "core/runtime.h"
#include "cuda_profiler_api.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef INFINI_USE_TVM
#include "tvm/runtime/device_api.h"
#endif
namespace infini {
CudaRuntimeObj::CudaRuntimeObj()
: RuntimeObj(Device::CUDA), stream(cudaStreamPerThread),
cudaGraphStatus(false) {
checkCudnnError(cudnnCreate(&cudnn));
checkCublasError(cublasCreate(&cublas));
checkCudnnError(cudnnSetStream(cudnn, stream));
checkCublasError(cublasSetStream(cublas, stream));
// 10GB for Longformer
// size_t longformerNum = 3lu * (1 << 30);
workspaceSize = 7ll << 30; // 7 GB
workspace = alloc(workspaceSize);
}
CudaRuntimeObj::~CudaRuntimeObj() {
try {
dealloc(workspace);
checkCudnnError(cudnnDestroy(cudnn));
checkCublasError(cublasDestroy(cublas));
} catch (const std::exception &e) {
std::cerr << "Error in ~CudaRuntimeObj: " << e.what() << std::endl;
}
}
void CudaRuntimeObj::beginCudaGraphStreamCapture() {
enum cudaStreamCaptureStatus pCaptureStatus;
checkCudaError(cudaStreamIsCapturing(stream, &pCaptureStatus));
IT_ASSERT(pCaptureStatus == cudaStreamCaptureStatusNone);
cudaGraphStatus = true;
checkCudaError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
}
tuple<cudaGraphExec_t, size_t> CudaRuntimeObj::endCudaGraphStreamCapture() {
cudaGraph_t cudaGraph;
cudaGraphExec_t instance;
checkCudaError(cudaStreamEndCapture(stream, &cudaGraph));
cudaGraphStatus = false;
size_t numCudaGraphNodes;
checkCudaError(cudaGraphGetNodes(cudaGraph, nullptr, &numCudaGraphNodes));
checkCudaError(cudaGraphInstantiate(&instance, cudaGraph, NULL, NULL, 0));
return {instance, numCudaGraphNodes};
}
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
@ -75,4 +121,57 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable
map<UidBaseType, bool> ctcMap = getCompileTimeComputableAttribute(graph);
vector<tuple<Operator, Kernel *, PerfRecord>> kernels;
bool status = graph->topo_sort();
IT_ASSERT(status, "Topological sort failed");
for (auto &op : graph->getOperators()) {
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
auto perfData = perfEngine.getPerfData(perfKey);
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
if (!ctcMap.at(op->getGuid()) && op->getOpType() != OpType::Reshape)
kernels.emplace_back(op, kernel, perfData);
}
for (auto &[op, kernel, perfData] : kernels) {
dbg(op);
}
// Init tvm stream
#ifdef INFINI_USE_TVM
DLDevice tvm_device_id = {kDLCUDA, 0};
auto tvm_device = tvm::runtime::DeviceAPI::Get(tvm_device_id);
tvm_device->SetStream(tvm_device_id, getStream());
#endif
beginCudaGraphStreamCapture();
for (auto &[op, kernel, perfData] : kernels) {
if (perfData)
kernel->compute(op, perfData, this);
else
kernel->compute(op, this);
}
auto [cudaGraphInstance, numCudaGraphNodes] = endCudaGraphStreamCapture();
IT_ASSERT(numCudaGraphNodes == kernels.size(),
std::to_string(numCudaGraphNodes) +
" != " + std::to_string(kernels.size()));
return timeit(
[&, cudaGraphInstance = cudaGraphInstance, stream = getStream()]() {
checkCudaError(cudaGraphLaunch(cudaGraphInstance, stream));
},
[&, stream = getStream()]() { cudaStreamSynchronize(stream); }, 1000,
1000);
}
} // namespace infini
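timeWithCudaGraph is also exported to Python (see the pybind changes below); a minimal sketch, assuming cuda_rt is a CudaRuntime and g already has its memory allocated:

cuda_rt.run(g, True)              # tune kernels so every op has a PerfRecord
t = cuda_rt.timeWithCudaGraph(g)  # captures non-CTC kernels into one CUDA graph and replays it
print("CUDA-graph time:", t)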

22
src/ffi/ffi_callback.cc Normal file
View File

@ -0,0 +1,22 @@
#include "core/graph.h"
#include <pybind11/stl.h>
namespace py = pybind11;
namespace infini {
namespace callback {
using namespace py::literals;
static std::function<void(const Graph &, string)> exportONNXImpl;
void exportONNX(const Graph &graph, const string &path) {
IT_ASSERT(Py_IsInitialized(), "Python interpreter is not running.");
static auto exportONNXImpl =
py::module_::import("infinitensor.if_onnx").attr("export_onnx");
exportONNXImpl(graph, path);
}
} // namespace callback
} // namespace infini
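The callback above expects an importable module infinitensor.if_onnx that provides export_onnx(graph, path). A plausible minimal implementation is just a re-export of the helper defined in the Python file earlier in this diff (the module and package names here are assumptions taken from the import string above):

# infinitensor/if_onnx.py (sketch)
from pyinfinitensor.onnx import export_onnx  # re-export the helper defined earlier in this diff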

View File

@ -1,9 +1,13 @@
#include "core/graph_handler.h"
#include "core/mutator.h"
#include "core/search_engine.h"
#include "nnet/nmutator.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
@ -63,6 +67,7 @@ void export_values(py::module &m) {
.VALUE(OpType, Conv)
.VALUE(OpType, Matmul)
.VALUE(OpType, ConvTrans)
.VALUE(OpType, ConvTransNHWC)
.VALUE(OpType, G2BMM)
.VALUE(OpType, GBMM)
.VALUE(OpType, Pad)
@ -94,8 +99,13 @@ void export_values(py::module &m) {
.VALUE(OpType, Abs)
.VALUE(OpType, Resize)
.VALUE(OpType, Dropout)
.VALUE(OpType, MemBound)
.export_values();
py::enum_<TensorType>(m, "TensorType")
.VALUE(TensorType, Input)
.VALUE(TensorType, Initialized)
.VALUE(TensorType, Other);
#undef VALUE
}
@ -224,6 +234,10 @@ static vector<int> transpose_permute_of(Operator op) {
return dynamic_cast<const TransposeObj *>(op.get())->getPermute();
}
static string membound_expr_of(Operator op) {
return as<MemBoundObj>(op)->toJson();
}
void export_functions(py::module &m) {
#define FUNCTION(NAME) def(#NAME, &NAME)
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
@ -252,25 +266,37 @@ void export_functions(py::module &m) {
.FUNCTION(transpose_permute_of)
.FUNCTION(concat_axis_of)
.FUNCTION(split_axis_of)
.FUNCTION(gather_axis_of);
.FUNCTION(gather_axis_of)
.FUNCTION(membound_expr_of)
.def("membound_hash_of",
[](Operator op) { return as<MemBoundObj>(op)->getHash(); });
#undef FUNCTION
}
void init_graph_builder(py::module &m) {
using Handler = GraphHandlerObj;
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "Runtime");
py::class_<Object, Ref<Object>>(m, "_Object")
.def("__str__", &Object::toString)
.def("guid", &Object::getGuid);
py::class_<RuntimeObj, Ref<RuntimeObj>>(m, "Runtime")
.def("run", &RuntimeObj::run, "graph"_a, "tune"_a = false,
"profiling"_a = false)
.def("getPerfTime", &RuntimeObj::getPerfTime, "graph"_a, "profiling"_a,
"allowEstimation"_a, "ignoreMemboundOp"_a)
.def("timeNonCtcOperators", &RuntimeObj::timeNonCtcOperators);
py::class_<NativeCpuRuntimeObj, std::shared_ptr<NativeCpuRuntimeObj>,
RuntimeObj>(m, "CpuRuntime");
#ifdef USE_CUDA
py::class_<CudaRuntimeObj, std::shared_ptr<CudaRuntimeObj>, RuntimeObj>(
m, "CudaRuntime");
py::class_<CudaRuntimeObj, Ref<CudaRuntimeObj>, RuntimeObj>(m,
"CudaRuntime")
.def("timeWithCudaGraph", &CudaRuntimeObj::timeWithCudaGraph);
#endif
#ifdef USE_BANG
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
m, "BangRuntime");
#endif
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
py::class_<TensorObj, std::shared_ptr<TensorObj>, Object>(m, "Tensor")
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
@ -281,8 +307,10 @@ void init_graph_builder(py::module &m) {
.def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("print_data", &TensorObj::printData)
.def("data_malloc", &TensorObj::dataMalloc)
.def("getTensorType", &TensorObj::getTensorType);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>, Object>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
policy::reference)
@ -291,6 +319,9 @@ void init_graph_builder(py::module &m) {
policy::reference);
py::class_<Handler>(m, "GraphHandler")
.def(py::init<Runtime>())
.def(py::init<Graph>())
.def("inputs", &Handler::inputs, policy::move)
.def("outputs", &Handler::outputs, policy::move)
.def("tensor", &Handler::tensor, policy::move)
.def("conv", &Handler::conv, policy::move)
.def("convTransposed2d", &Handler::convTransposed2d, policy::move)
@ -321,13 +352,47 @@ void init_graph_builder(py::module &m) {
.def("reduce_mean", &Handler::reduceMean, policy::move)
.def("slice", &Handler::slice, policy::move)
.def("pad", &Handler::pad, policy::move)
.def("memBound", &Handler::memBound, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, policy::automatic)
.def("run", &Handler::run, policy::automatic);
.def("run", &Handler::run, policy::automatic)
.def("getGraph", &Handler::getGraph);
py::class_<Mutator, Ref<Mutator>>(m, "Mutator").def("run", &Mutator::run);
py::enum_<NMutator::Mode>(m, "NMutatorMode")
.value("RuleBased", NMutator::Mode::RuleBased);
py::class_<NMutator, Ref<NMutator>, Mutator>(m, "NMutator")
.def(py::init<NMutator::Mode>())
.def(py::init<NMutator::Mode, vector<int>>())
.def("run", &NMutator::run);
py::class_<SearchEngine>(m, "SearchEngine")
.def(py::init<Runtime, Ref<Mutator>>())
.def("run", &SearchEngine::run);
py::class_<GraphObj, Ref<GraphObj>, Object>(m, "Graph")
.def("tensors", &GraphObj::getTensors)
.def("operators", &GraphObj::getOperators)
.def("inputs", &GraphObj::getInputs)
.def("outputs", &GraphObj::getOutputs)
.def("print", &GraphObj::print)
.def("topo_sort", &GraphObj::topo_sort);
}
#ifdef USE_CUDA
Graph getInfoGAN(int batch, Runtime runtime, int nLayers);
vector<Tensor> runInfoGAN(int nLayers);
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId);
Graph optimizeGraph(Graph g, Runtime runtime, bool tuning);
void export_test_model(py::module &m) {
m.def("runInfoGAN", &runInfoGAN);
m.def("getInfoGAN", &getInfoGAN);
m.def("getConvtransposedNHWC", &getConvtransposedNHWC);
m.def("optimizeGraph", &optimizeGraph, "graph"_a, "runtime"_a,
"tuning"_a = false);
}
#endif
} // namespace infini
PYBIND11_MODULE(backend, m) {
@ -335,4 +400,5 @@ PYBIND11_MODULE(backend, m) {
infini::export_values(m);
infini::export_functions(m);
infini::init_graph_builder(m);
infini::export_test_model(m);
}
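Taken together, the new bindings let the InfoGAN experiments run entirely from Python; a hedged end-to-end sketch (CUDA build only, cuda_runtime() assumed to mirror the cpu_runtime() binding above):

from pyinfinitensor import backend

runtime = backend.cuda_runtime()
g = backend.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)  # single ConvTransposed layer
best = backend.optimizeGraph(g, runtime, tuning=False)         # search + vertical fusion, no kernel tuning
for op in best.operators():
    print(op)                                                   # Object.__str__ is bound above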

View File

@ -0,0 +1,171 @@
#include "cuda/cuda_common.h"
using dtype = float;
__global__ void conv2dreduce_kernel_(float *__restrict__ input,
float *__restrict__ bias,
float *__restrict__ output,
const bool PReLU, const int n, const int f,
const int h, const int w, const int oh,
const int ow, const int r, const int s,
const int ph, const int pw, const int dh,
const int dw, const int sh, const int sw) {
// output shape: (n, oh, ow, f)
// input shape: (n, h, w, f, r, s)
int nid = blockIdx.x, fid = blockIdx.y;
int hid = threadIdx.x, wid = threadIdx.y;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = n * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid * sh - ph;
int iwst = wid * sw - pw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + ri * dh;
int iwid = iwst + si * dw;
if (ihid >= 0 && ihid < h && iwid >= 0 && iwid < w) {
imm += *(nfinput + ihid * hchunk + iwid * wchunk + ri * s +
si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
__global__ void convTranspose2dreduce_kernel_(
float *__restrict__ input, float *__restrict__ bias,
float *__restrict__ output, const bool PReLU, const int n, const int f,
const int h, const int w, const int oh, const int ow, const int r,
const int s, const int ph, const int pw, const int dh, const int dw,
const int sh, const int sw) {
// assert dh = dw = 1
int nid = blockIdx.x, fid = blockIdx.y;
int hid = threadIdx.x, wid = threadIdx.y;
const int fchunck = r * s, wchunk = f * fchunck, hchunk = w * wchunk,
nchunck = n * hchunk;
float *nfinput = input + nid * nchunck + fid * fchunck;
// Viewed as a convolution: the effective ph and pw
int tph = r - ph - 1, tpw = s - pw - 1;
int th = (h - 1) * sh + 1, tw = (w - 1) * sw + 1;
if (nid < n && fid < f && hid < oh && wid < ow) {
float imm = 0.0;
int ihst = hid - tph;
int iwst = wid - tpw;
for (int ri = 0; ri < r; ++ri) {
for (int si = 0; si < s; ++si) {
int ihid = ihst + r - ri - 1;
int iwid = iwst + s - si - 1;
if (ihid >= 0 && ihid < th && iwid >= 0 && iwid < tw &&
(ihid % sh == 0) && (iwid % sw == 0)) {
imm += *(nfinput + (ihid / sh) * hchunk +
(iwid / sw) * wchunk + ri * s + si);
}
}
}
if (bias) {
imm += bias[fid];
}
if (PReLU) {
imm = imm > 0.0 ? imm : 0.0;
}
output[nid * (oh * ow * f) + hid * (ow * f) + wid * f + fid] = imm;
}
}
// nhwrsc -> nhwc
__global__ void reduce_4x4(dtype *in, dtype *out, int act, const int N,
const int F, const int H, const int W, const int IH,
const int IW) {
// #define in_index(n, h, w, r, s, f) \
// ((((((n)*IH + h) * IW + w) * R + r) * S + s) * F + f)
#define in_index(n, h, w, f, r, s) \
((((((n)*IH + h) * IW + w) * F + f) * R + r) * S + s)
#define out_index(n, h, w, f) (((((n)*H) + (h)) * W + (w)) * F + (f))
const int R = 4, S = 4;
const int n_tasks = N * F * H * W;
int start = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = start; i < n_tasks; i += stride) {
int t = i, n, f, h, w;
f = t % F;
t /= F;
w = t % W;
t /= W;
h = t % H;
t /= H;
n = t;
// unroll this 2-iter loop
float sum = 0;
int x, y;
for (int r = (h + 1) & 1; r < R; r += 2) {
x = (h + 1 - r) / 2;
if (x >= 0 && x < IH) {
for (int s = (w + 1) & 1; s < S; s += 2) {
y = (w + 1 - s) / 2;
if (y >= 0 && y < IW) {
sum += in[in_index(n, x, y, f, r, s)];
// if (i==0)
// printf("TTT nhwf= %d,%d,%d,%d x=%d y=%d, v=%f,
// index=%d, rsf %d %d %d\n", n, h, w,
// f, x, y, in[in_index(n, x, y, r, s, f)],
// in_index(n, x, y, r, s, f), r,s,f);
}
}
}
}
if (act == 0) {
out[out_index(n, h, w, f)] = sum;
} else if (act == 1) { // Relu
out[out_index(n, h, w, f)] = sum > 0 ? sum : 0;
} else if (act == 2) {
out[out_index(n, h, w, f)] = tanhf(sum);
}
}
#undef in_index
#undef out_index
}
namespace infini {
void conv2dreduce_kernel(float *input, float *bias, float *output, bool PReLU,
int n, int h, int w, int f, int r, int s, int oh,
int ow, int ph, int pw, int sh, int sw, int dh,
int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
conv2dreduce_kernel_<<<grid, block, 0>>>(input, bias, output, PReLU, n, f,
h, w, oh, ow, r, s, ph, pw, dh, dw,
sh, sw);
}
void convTranspose2dreduce_kernel(float *input, float *bias, float *output,
int act, int n, int h, int w, int f, int r,
int s, int oh, int ow, int ph, int pw, int sh,
int sw, int dh, int dw) {
dim3 grid(n, f);
dim3 block(oh, ow);
// cudaStream_t stream(cudaStreamPerThread);
// puts("convTranspose2dreduce_kernel is executed");
if (r == 4 && s == 4 && sh == 2 && sw == 2) {
const int M = r * s * f, N = n * h * w;
reduce_4x4<<<(M * N + 127) / 128, 128>>>(input, output, act, n, f, oh,
ow, h, w);
} else {
puts("why use this conv2dreduce");
convTranspose2dreduce_kernel_<<<grid, block, 0>>>(
input, bias, output, (bool)act, n, f, h, w, oh, ow, r, s, ph, pw,
dh, dw, sh, sw);
}
}
} // namespace infini
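For reference, the reduction performed by conv2dreduce_kernel_ can be written in a few lines of NumPy; a sketch under the layout stated in the kernel comments (input (n, h, w, f, r, s), output (n, oh, ow, f); note the kernel's PReLU flag actually applies a plain ReLU), useful as a correctness oracle in tests:

import numpy as np

def conv2dreduce_ref(x, bias, oh, ow, ph, pw, sh, sw, dh, dw, relu=False):
    """Reference for conv2dreduce_kernel_: x has shape (n, h, w, f, r, s)."""
    n, h, w, f, r, s = x.shape
    y = np.zeros((n, oh, ow, f), dtype=x.dtype)
    for hid in range(oh):
        for wid in range(ow):
            for ri in range(r):
                for si in range(s):
                    ih = hid * sh - ph + ri * dh
                    iw = wid * sw - pw + si * dw
                    if 0 <= ih < h and 0 <= iw < w:
                        y[:, hid, wid, :] += x[:, ih, iw, :, ri, si]
    if bias is not None:
        y += bias                      # broadcast over the channel (f) dimension
    return np.maximum(y, 0.0) if relu else y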

View File

@ -49,7 +49,7 @@ class matmulCublas : public Kernel {
const float alpha = 1.f, beta = 0.f;
// TODO:use compute type
cublasStatus_t stat;
if (b > 1) {
if (b >= 1) {
// Support batch broadcast with zero stride
int dimA = op->getInputs(0)->getDims().size();
int dimB = op->getInputs(1)->getDims().size();
@ -63,6 +63,11 @@ class matmulCublas : public Kernel {
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: n * k;
// printf("cublasGemmStridedBatchedEx %d%d, mnk %d %d %d, alpha %f,
// B "
// "%d %lld, A %d %lld, C %d %d, b %d %d\n",
// opB, opA, n, m, k, alpha, ldb, strideB, lda, strideA, ldc,
// m * n, b, record->algo);
stat = cublasGemmStridedBatchedEx(
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,

View File

@ -1,5 +1,6 @@
#ifdef INFINI_USE_TVM
#include "core/kernel.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_runtime.h"
#include "dlpack/dlpack.h"
#include "ffi/ffi_embed.h"
@ -8,6 +9,13 @@
#include "operators/pooling.h"
#include "tvm/runtime/module.h"
#include "tvm/runtime/packed_func.h"
#include <nlohmann/json.hpp>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
using json = nlohmann::json;
namespace py = pybind11;
@ -22,6 +30,8 @@ class TVMRecordObj : public PerfRecordObj {
std::string dllPath;
std::string funcName;
std::vector<int> inputIdx;
tvm::runtime::PackedFunc packedFunc;
bool useExistingKernel = false;
};
using TVMRecord = Ref<TVMRecordObj>;
@ -33,9 +43,15 @@ class MemboundTVMPackedFunction : public Kernel {
auto op = as<MemBoundObj>(_op);
// auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto tvmRecord = std::dynamic_pointer_cast<TVMRecordObj>(record);
tvm::runtime::PackedFunc packedFunc =
getPackedFunction(tvmRecord->dllPath, tvmRecord->funcName);
IT_ASSERT(packedFunc != nullptr);
// Use user-defined kernels
if (tvmRecord->useExistingKernel) {
bool success = useExistingKernels(op);
IT_ASSERT(success);
return;
}
tvm::runtime::PackedFunc packedFunc = tvmRecord->packedFunc;
// prepare inputs and outputs
vector<DLTensorHolder> inputsHolder;
@ -63,10 +79,18 @@ class MemboundTVMPackedFunction : public Kernel {
// Premise: op is idempotent since it is called multiple times.
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
TVMRecord ret = std::make_shared<TVMRecordObj>();
auto op = as<MemBoundObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
// If hash matches, use user-defined kernels
if (useExistingKernels(op)) {
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { useExistingKernels(op); },
[&]() { context->sync(); });
ret->useExistingKernel = true;
return ret;
}
// invoke Ansor to tune a membound kernel
auto [expr, hash] = op->getSimplifiedNnetExpr();
nnet::AsTVMVisitor visitor;
@ -93,6 +117,7 @@ class MemboundTVMPackedFunction : public Kernel {
if (inputName == op->getNnetInputs()[j]->getName())
break;
}
IT_ASSERT(j < numInputs, "Cannot find input name: " + inputName);
inputIdx.emplace_back(j);
}
@ -114,29 +139,41 @@ class MemboundTVMPackedFunction : public Kernel {
tvm::runtime::TVMArgs args(preArgs.first.data(), preArgs.second.data(),
preArgs.first.size());
TVMRecord ret = std::make_shared<TVMRecordObj>();
ret->time = timeit([&]() { packedFunc.CallPacked(args, &rv); },
[&]() { context->sync(); });
ret->kernelName = kernelName;
ret->dllPath = dllPath;
ret->funcName = func;
ret->inputIdx = inputIdx;
ret->packedFunc = packedFunc;
return std::dynamic_pointer_cast<PerfRecordObj>(ret);
return ret;
}
std::string serializeTVMArgs(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
const std::string &outDType,
const std::string &lambda,
const std::string &funcName,
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
json j;
// Consistent with the Python API interface
j["input_tensors"] = inDims;
j["input_dtypes"] = inDTypes;
j["output_tensor"] = outDims;
j["output_dtype"] = outDType;
j["tvm_code"] = lambda;
j["func_name"] = funcName;
j["nnet_expression"] = nnetExprString;
j["nnet_simplified_expression"] = nnetSimplifiedExprString;
j["hash_code"] = std::to_string(hashCode);
return j.dump();
}
/// @brief Generate an Ansor-tuned kernel DLL in a separate Python process.
/// @param inDims Shapes of the input tensors
/// @param inDTypes Data types of the input tensors
/// @param outDims Shape of the output tensor
/// @param outDType Data type of the output tensor
/// @param lambda TVM compute expression to be tuned
/// @param funcName Generated function name
/// @param nnetExpressionString Save expr in string for logging.
/// @param nnetSimplifiedExprString Save simplified expr in string for
/// logging.
/// @param hashCode (optional) Hash code of the input expression for kernel
/// cache.
/// @return Path of the generated DLL
std::string getAnsorDLL(const std::vector<std::vector<int>> &inDims,
const std::vector<std::string> &inDTypes,
const std::vector<int> &outDims,
@ -146,29 +183,60 @@ class MemboundTVMPackedFunction : public Kernel {
const std::string &nnetExprString,
const std::string &nnetSimplifiedExprString,
const HashType hashCode) const {
std::string dllPath;
try {
start_interpreter();
// Use static to avoid re-importing the module. Re-importing results
// in cuBLAS failure, whose root cause is not identified yet.
static auto func =
py::module::import("cpp_plugin").attr("gen_ansor_so");
py::tuple code =
func(inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString,
std::to_string(hashCode));
dllPath = py::str(code[0]);
} catch (py::error_already_set &e) {
if (e.matches(PyExc_ImportError)) {
std::cerr << "Import Error. Don't forget to set environment "
"variable PYTHONPATH to contain "
"<repo-root>/python"
<< std::endl;
}
throw;
int fdP2C[2], fdC2P[2];
for (auto fd : {fdP2C, fdC2P}) {
int status = pipe(fd);
IT_ASSERT(status == 0, "pipe failed");
}
pid_t pid = fork();
IT_ASSERT(pid >= 0, "fork failed");
if (pid == 0) { // Child process
close(fdP2C[1]);
close(fdC2P[0]);
return dllPath;
dup2(fdP2C[0], STDIN_FILENO);
close(fdP2C[0]);
string cmd =
"from cpp_plugin.gen_ansor_so import pipe_gen; pipe_gen(+" +
std::to_string(fdC2P[1]) + ")";
const char *const argv[] = {"python3", "-c", cmd.data(), NULL};
execvp("python3", const_cast<char *const *>(argv));
} else { // Parent process
close(fdP2C[0]);
close(fdC2P[1]);
// Write to pipe
string serializedArgs = serializeTVMArgs(
inDims, inDTypes, outDims, outDType, lambda, funcName,
nnetExprString, nnetSimplifiedExprString, hashCode);
int status = -1;
status =
write(fdP2C[1], serializedArgs.data(), serializedArgs.size());
IT_ASSERT((size_t)status == serializedArgs.size(),
"Failed to write to pipe");
close(fdP2C[1]);
// Wait for TVM
waitpid(pid, &status, 0);
IT_ASSERT(WIFEXITED(status), "TVM process was terminated");
const int es = WEXITSTATUS(status);
IT_ASSERT(es == 0,
"TVM process exit with code " + std::to_string(es));
// Read from pipe
FILE *stream;
stream = fdopen(fdC2P[0], "r");
char buf_read[257] = {0};
status = std::fscanf(stream, "%256c", buf_read);
IT_ASSERT(status == 1, "Failed to read from pipe");
IT_ASSERT(buf_read[256] == 0, "Pipe buffer overflow");
fclose(stream);
close(fdC2P[0]);
return buf_read;
}
IT_ASSERT(false, "Should not reach here");
return "";
}
tvm::runtime::PackedFunc getPackedFunction(string path,
@ -214,6 +282,34 @@ class MemboundTVMPackedFunction : public Kernel {
return {values, type_codes};
}
bool useExistingKernels(Ref<MemBoundObj> op) const {
const map<HashType, tuple<int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int>>
hashMap = {
// clang-format off
{18446744073661354550ULL, {1, 1, 2, 2, 256, 4, 4, 4, 4, 1, 1, 2, 2, 1, 1}},
{124145340ULL, {1, 1, 4, 4, 128, 4, 4, 8, 8, 1, 1, 2, 2, 1, 1}},
{18446744073695718019ULL, {1, 1, 8, 8, 64, 4, 4, 16, 16, 1, 1, 2, 2, 1, 1}},
{515085072ULL, {2, 1, 16, 16, 3, 4, 4, 32, 32, 1, 1, 2, 2, 1, 1}}
}; // clang-format on
float *input = op->getInputs(0)->getRawDataPtr<float *>();
float *bias = nullptr;
float *output = op->getOutput()->getRawDataPtr<float *>();
if (auto it = hashMap.find(op->getHash()); it != hashMap.end()) {
auto &[PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw] =
it->second;
IT_ASSERT(op->getInputs(0)->size() ==
size_t(n) * h * w * f * r * s);
IT_ASSERT(op->getOutput()->size() == size_t(n) * oh * ow * f);
convTranspose2dreduce_kernel(input, bias, output, PReLU, n, h, w, f,
r, s, oh, ow, ph, pw, sh, sw, dh, dw);
return true;
}
// conv2dreduce_kernel(input, bias, output, PReLU, n, h, w, f, r, s,
// oh, ow, ph, pw, sh, sw, dh, dw);
return false;
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,

View File

@ -4,10 +4,13 @@ namespace infini {
class CopyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &op,
const RuntimeObj *_context) const override {
auto inData = op->getInputs(0)->getRawDataPtr<void *>();
auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
cudaMemcpyDeviceToDevice);
// auto inData = op->getInputs(0)->getRawDataPtr<void *>();
// auto outData = op->getOutputs()[0]->getRawDataPtr<void *>();
// cudaMemcpyAsync(outData, inData, op->getInputs(0)->getBytes(),
// cudaMemcpyDeviceToDevice);
// HACK: optimization: reuse the input blob instead of copying
op->getOutputs()[0]->setData(op->getInputs(0)->getDataBlob());
}
};
// reshape/flatten/identity all act as copying from input to output.

239
src/nnet/App/test_models.cc Normal file
View File

@ -0,0 +1,239 @@
#ifdef USE_CUDA
#include "core/blob.h"
#include "core/dummy_mutator.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "ffi/ffi_callback.h"
#include "nnet/nmutator.h"
#include "operators/conv.h"
#include "operators/unary.h"
#include "test.h"
#include <pybind11/stl.h>
namespace infini {
// NHWC format
Graph getInfoGAN(int batch, Runtime runtime, int nLayers) {
IT_ASSERT(1 <= nLayers && nLayers <= 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
vector<tuple<int, int, int, int, bool>> cs{
// Channel, kernelSize, pad, stride, isTanh
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
};
Tensor input =
g->addTensor({batch, 1, 1, 228}, DataType::Float32, TensorType::Input);
for (int i = 0; i < (int)cs.size() && i < nLayers; ++i) {
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
int f = input->getDims()[3]; // n, h, w, f
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (tanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
Graph getConvtransposedNHWC(Runtime runtime, Shape shape, int layerId) {
IT_ASSERT(0 <= layerId && layerId < 5);
Graph g = make_ref<GraphObj>(runtime);
vector<Tensor> weights;
vector<tuple<int, int, int, int, bool>> cs{
// Channel, kernelSize, pad, stride, isTanh
{448, 2, 0, 1, false}, {256, 4, 1, 2, false}, {128, 4, 1, 2, false},
{64, 4, 1, 2, false}, {3, 4, 1, 2, true},
};
Tensor input = g->addTensor(shape, DataType::Float32, TensorType::Input);
for (int i = layerId; i < layerId + 1; ++i) {
auto [channel, kernelSize, pad, stride, tanh] = cs[i];
int f = input->getDims()[3]; // n, h, w, f
auto weight = g->addTensor({f, kernelSize, kernelSize, channel},
DataType::Float32,
TensorType::Initialized); // f, r, s, c
input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
pad, stride, stride, 1, 1)
->getOutput();
if (tanh) {
input = g->addOp<TanhObj>(input, nullptr)->getOutput();
} else {
input = g->addOp<ReluObj>(input, nullptr)->getOutput();
}
}
return g;
}
void printGraph(Graph g) {
g->print();
puts("============ Data ============");
for (auto t : g->getTensors()) {
dbg(t);
t->printData();
}
}
Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning) {
auto runtime = as<CudaRuntimeObj>(_runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
vector<Graph> bestGraphs;
SearchEngine searchEngine(runtime, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
runtime->run(g);
dbg("Baseline graph");
printGraph(g);
// dbg(runtime->getPerfTime(g, true));
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
bestGraph->topo_sort();
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
dbg(bestGraph);
dbg(bestGraph->getOutputs());
if (tuning) {
runtime->run(bestGraph, true); // Tune kernels
runtime->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
dbg(go0->equalData(bgo0, 1e-3));
dbg(runtime->getPerfTime(bestGraph, true));
dbg(runtime->timeNonCtcOperators(bestGraph));
// dbg(runtime->timeWithCudaGraph(bestGraph));
}
dbg("Best graph");
printGraph(bestGraph);
return bestGraph;
}
return nullptr;
}
vector<Tensor> runInfoGAN(int nLayers) {
auto cuda = make_ref<CudaRuntimeObj>();
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Graph g = getInfoGAN(1, cuda, nLayers);
auto mutator =
make_ref<NMutator>(NMutator::Mode::RuleBased,
vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
// // Translate OP to membound without derivation
// mutator->setToNaiveMembound();
vector<Graph> bestGraphs;
SearchEngine searchEngine(cuda, mutator);
bestGraphs.emplace_back(searchEngine.run(g));
g->topo_sort();
dbg(g, bestGraphs[0], bestGraphs.size());
g->print();
g->dataMalloc();
map<UidBaseType, Tensor> fuidToInputTensor;
for (auto t : g->getInputs()) {
IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
fuidToInputTensor[t->getFuid()] = t;
}
auto gen = RandomGenerator(-0.1, 0.1, 0);
// auto gen = RandomGenerator(-5, 5, 0, true);
for (auto t : g->getInputs()) {
t->setData(gen);
}
for (auto t : g->getOutputs()) {
t->setData(ZeroGenerator());
}
cuda->run(g);
dbg("Baseline graph");
printGraph(g);
dbg(cuda->getPerfTime(g, true));
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
bestGraph->topo_sort();
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : bestGraph->getInputs()) {
t->copyData(fuidToInputTensor[t->getFuid()]);
}
// Initialize outputs with zeros
for (auto t : bestGraph->getOutputs()) {
t->setData(ZeroGenerator());
}
dbg(bestGraph);
dbg(bestGraph->getOutputs());
cuda->run(bestGraph, true); // Tune kernels
cuda->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
bgo0->printData();
go0->printData();
dbg(cuda->getPerfTime(bestGraph, true));
dbg("Best graph");
printGraph(bestGraph);
callback::exportONNX(bestGraph, "best_graph.onnx"); // Debug
return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
}
return {};
}
} // namespace infini
#endif

View File

@ -13,7 +13,8 @@ string FullPrinterVisitor::print(const Expr &root) {
oss << "==> ROOT\n" << root->toReadable() << "\n";
for (size_t i = 0; i < q.size(); ++i) {
const auto &[name, routine, tensor] = q[i];
oss << "==> " << name << " : ";
oss << "==> " << name << " " << infini::vecToString(tensor->getShape())
<< " : ";
if (routine) {
oss << routine->toReadable() << "\n";
if (routine->getExpr()) {

View File

@ -73,17 +73,52 @@ string Serializer::visit_(const Tensor &c) {
return key;
}
bool Serializer::serialize(const Expr &expr, const string &filePath,
const string &msg) {
string Serializer::visit_(const Func &c) {
const string key = std::to_string(id++);
j[key]["type"] = c->getType();
j[key]["funcType"] = c->getFuncType();
j[key]["object"] = dispatch(c->getObject());
return key;
}
std::optional<std::string> Serializer::toString(const Expr &expr,
const string &msg,
vector<Tensor> inputs,
double exec_time, string hint) {
// Metadata
j["Version"] = VERSION;
j["Msg"] = msg;
j["exec_time"] = exec_time;
j["hint"] = hint;
// Expressions and routines
id = 0;
dispatch(expr);
std::ofstream fout(filePath);
fout << std::setw(4) << j << std::endl;
return true;
// Input tensors
vector<string> inputsIndices;
for (const auto &tensor : inputs) {
inputsIndices.emplace_back(std::to_string(id));
dispatch(tensor);
}
j["nnetInputs"] = inputsIndices;
// To string
std::stringstream ss;
ss << std::setw(4) << j << std::endl;
return {ss.str()};
}
bool Serializer::toFile(const Expr &expr, const string &filePath,
const string &msg, vector<Tensor> inputs,
double exec_time, string hint) {
if (auto s = toString(expr, msg, inputs, exec_time, hint)) {
// Write to file
std::ofstream fout(filePath);
fout << *s;
return true;
} else {
return false;
}
}
string Serializer::dispatchRoutine(const Routine &c) {
@ -129,7 +164,15 @@ string Serializer::dispatchRoutine(const Routine &c) {
return key;
}
Expr Serializer::deserialize(const string &filePath) {
Expr Serializer::fromString(const string &text) {
std::stringstream str;
str << text;
str >> j;
assert(j["Version"] == VERSION);
return buildExprTree("0");
}
Expr Serializer::fromFile(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
@ -180,6 +223,10 @@ Expr Serializer::buildExprTree(string key) {
return make_ref<TensorNode>(j[key]["name"], j[key]["shape"],
j[key]["paddings"], source);
}
case NodeType::FuncNodeType: {
auto object = buildExprTree(j[key]["object"]);
return make_ref<FuncNode>(object, j[key]["funcType"]);
}
default: {
nnet_unimplemented_halt();
break;
@ -242,4 +289,25 @@ Routine Serializer::buildRoutine(string key) {
return nullptr;
}
} // namespace nnet
tuple<Expr, vector<Tensor>, double, string>
Serializer::deserializeAsMemobundOp(const string &filePath) {
std::ifstream fin(filePath);
fin >> j;
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
tuple<Expr, vector<Tensor>, double, string>
Serializer::membundOpFromString(const string &data) {
j = json::parse(data);
assert(j["Version"] == VERSION);
vector<Tensor> inputs;
for (const auto &input : j["nnetInputs"])
inputs.emplace_back(as<TensorNode>(buildExprTree(input)));
return {buildExprTree("0"), inputs, j["exec_time"], j["hint"]};
}
} // namespace nnet
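A minimal round-trip sketch using the interfaces added above; expr, A, and B stand for any expression and its input TensorNodes, and the message, time and hint values are illustrative:
nnet::Serializer ser;
// Serialize the expression, its inputs, and the metadata to a JSON string ...
if (auto s = ser.toString(expr, "demo", {A, B}, /*exec_time=*/1.0, /*hint=*/"demo")) {
    // ... and rebuild them; a fresh Serializer is used since `j` is member state.
    auto [exprLoaded, inputsLoaded, timeLoaded, hintLoaded] =
        nnet::Serializer().membundOpFromString(*s);
    assert(expr->toReadable() == exprLoaded->toReadable());
}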

View File

@ -499,8 +499,8 @@ void Derivator::printIntermediateStates() {
std::cout << FullPrinterVisitor().print(intermediateStates[i]) << endl;
if (logMode == LogMode::DumpFristCandiate) {
Serializer serializer;
serializer.serialize(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
serializer.toFile(intermediateStates[i],
logFnPrefix + to_string(i) + ".expr", msg);
}
}
for (size_t i = 0; i < intermediateStates.size(); ++i) {

View File

@ -1,4 +1,5 @@
#include "nnet/expr.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
namespace nnet {
@ -463,4 +464,9 @@ void FuncNode::setObject(Expr e) {
object = e;
}
string RangeOpNode::getFullExpression() {
FullPrinterVisitor printer;
return printer.print(this->shared_from_this());
}
} // namespace nnet

View File

@ -1,13 +1,16 @@
#include "nnet/nmutator.h"
#include "core/graph.h"
#include "ffi/ffi_callback.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/GetTensorsVisitor.h"
#include "nnet/Visitor/MatchReshapeVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/derivator.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/reshape.h"
#include "operators/unary.h"
namespace infini {
@ -49,9 +52,9 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
assert(computeOps.size() == 1);
const auto &computeOp = computeOps[0];
auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
auto expr = opToExpression(computeOp);
nnet::Expr expr = opToExpression(computeOp);
auto inputsN = nnet::GetTensorsVisitor().get(expr);
dbg(inputsN, expr);
// dbg(inputsN, expr);
IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
"Which one is the second input tensor?");
vector<nnet::Tensor> inputsVectorN = {inputsN.at("A")};
@ -74,17 +77,18 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
OpVec computeOps = in_graph->getComputeOps();
IT_ASSERT(computeOps.size() == 1);
// HACK: remove this
if (auto op = as<ConvTransposed2dNHWCObj>(computeOps[0]); !op)
return;
/* if (infini::Graph g = transformTConv1x1(computeOps[0])) {
// if (infini::Graph g = transformTConv1x1(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
// }
// // Commented for debug, not implemented yet
// // if (infini::Graph g = transformTConv3x3(computeOps[0])) {
// // Graph graph = new Graph(g->getOperators());
// // out_graphs.emplace_back(graph);
// // return;
// // }
if (Graph g = transformConvtransposed1x1(computeOps[0])) {
out_graphs.emplace_back(g);
return;
}
// if (infini::Graph g = transformDialtedConv(computeOps[0])) {
// out_graphs.emplace_back(g);
// return;
@ -99,7 +103,6 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
// // out_graphs.emplace_back(graph);
// // return;
// // }
*/
auto expr = opToExpression(computeOps[0]);
if (!expr)
@ -107,18 +110,15 @@ void NMutator::runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs) {
nnet::Derivator derivator(maxDepth);
nnet::Formula conv_9x9(expr, 0);
// const std::vector<int> rules{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90};
// ConvTranspose
// const std::vector<int> rules{1, 7, 7, 2, 8, 6, 6}; // G2BMM
if (mode == Mode::Normal) {
derivator.search(conv_9x9, 0);
} else if (mode == Mode::RuleBased) {
dbg(derivationRules);
// dbg(derivationRules);
derivator.ruleBasedDFS(conv_9x9, 0, derivationRules);
} else
IT_TODO_HALT_MSG("Unknown NMutator search mode.");
const auto &candidates = derivator.getCandidates();
dbg(candidates.size());
// dbg(candidates.size());
// derivator.print();
for (const auto &candidate : candidates) {
// dbg(nnet::FullPrinterVisitor().print(candidate.root));
@ -233,31 +233,38 @@ void NMutator::runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs) {
// }
// }
nnet::Expr NMutator::opToExpression(Operator op) {
// IT_TODO_HALT();
if (auto convOp = as<ConvObj>(op)) {
nnet::Expr NMutator::opToExpression(Operator opT) {
auto [expr, mapNameNToTensorT] = extractOp(opT);
for (auto &[name, tensorT] : mapNameNToTensorT) {
IT_ASSERT(inputsNameNToTensorT.count(name) == 0);
inputsNameNToTensorT[name] = tensorT;
}
return expr;
}
pair<nnet::Expr, NMutator::NameNToTensorT> NMutator::extractOp(Operator opT) {
if (auto convOp = as<ConvObj>(opT)) {
const auto &inputs = convOp->getInputs();
const auto &AT = inputs[0];
const auto &KT = inputs[1];
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
if (!(sh == 1 && sw == 1 && dh == 1 && dw == 1))
return nullptr;
return {};
assert(sh == 1 && sw == 1 && dh == 1 && dw == 1);
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto A = nnet::makeTensor("A", AT->getDims(),
std::vector<int>{0, 0, ph, pw});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s);
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(op)) {
return {nnet::ConvPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
} else if (auto convOp = as<ConvTransposed2dNHWCObj>(opT)) {
const auto &AT = convOp->getInputs()[0];
const auto &KT = convOp->getInputs()[1];
inputsNameNToTensorT["A"] = AT;
inputsNameNToTensorT["K"] = KT;
const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
IT_ASSERT_TODO(convOp->getNumGroups() == 1);
if (r != 4)
return {};
IT_ASSERT_TODO(r == 4);
IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
@ -269,8 +276,9 @@ nnet::Expr NMutator::opToExpression(Operator op) {
const auto A = nnet::makeTensor(
"A", AT->getDims(), std::vector<int>{0, padding, padding, 0});
const auto K = nnet::makeTensor("K", KT->getDims());
return nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s);
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(op)) {
return {nnet::ConvTransPattern::getExpr(A, K, n, c, h, w, f, r, s),
{{"A", AT}, {"K", KT}}};
// } else if (auto g2bmmOp = dynamic_cast<G2BMMOp *>(opT)) {
// const auto &AT = g2bmmOp->getInputs()[0];
// const auto &BT = g2bmmOp->getInputs()[1];
// const auto [b, m, k, width, dilation] = g2bmmOp->getArgs();
@ -280,7 +288,7 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.first->getName()] = AT;
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// return expr;
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(op)) {
// } else if (auto gbmmlOp = dynamic_cast<GBMMLOp *>(opT)) {
// const auto &AT = gbmmlOp->getInputs()[0];
// const auto &BT = gbmmlOp->getInputs()[1];
// const auto [b, m, w, k, dilation] = gbmmlOp->getArgs();
@ -290,22 +298,30 @@ nnet::Expr NMutator::opToExpression(Operator op) {
// inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, w, k, dilation, expr);
// return expr;
} else if (auto matmulOp = as<MatmulObj>(op)) {
} else if (auto matmulOp = as<MatmulObj>(opT)) {
const auto &AT = matmulOp->getInputs()[0];
const auto &BT = matmulOp->getInputs()[1];
const auto [b, m, n, k, transA, transB] = matmulOp->getBMNKTransAB();
const auto &[expr, inputsN] =
nnet::MatmulPattern::getExpr(transA, transB, b, m, n, k);
inputsNameNToTensorT[inputsN.first->getName()] = AT;
inputsNameNToTensorT[inputsN.second->getName()] = BT;
// dbg(b, m, n, k, expr);
return expr;
return {
expr,
{{inputsN.first->getName(), AT}, {inputsN.second->getName(), BT}}};
} else if (auto op = as<MemBoundObj>(opT)) {
NameNToTensorT m;
for (int i = 0; i < op->numInputs(); ++i)
m[op->getNnetInputs()[i]->getName()] = opT->getInputs()[i];
return {op->getNnetExpr(), m};
} else if (opT->getOpType() == OpType::Relu ||
opT->getOpType() == OpType::Tanh) {
return generateUnaryExpr(opT);
}
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(op)) {
// // else if (auto transposeOp = dynamic_cast<TransposeOp *>(opT)) {
// // return transposeOpToExpression(transposeOp);
// // }
nnet_unimplemented_continue();
return nullptr;
IT_TODO_HALT_MSG("Cannot convert " + opT->toString() +
" to an NNet expression");
return {};
}
infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
@ -313,7 +329,7 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
nnet::FullPrinterVisitor fullVisitor;
// Get tensors in the reversed topological order
const auto &tensorQueueN = fullVisitor.traverse(expr);
dbg(fullVisitor.print(expr));
// dbg(fullVisitor.print(expr));
// Build a map: name in nnet -> tensors in infini
// Add input tensors to the map
@ -381,8 +397,8 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
cnt += tensor->size();
for (const auto &tensor : outputsPET)
cnt += tensor->size();
dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
memboundTime(cnt));
// dbg(inputsPET, outputsPET, op->getInputs(), op->getExpr(),
// memboundTime(cnt));
g->addOpWithOutputs<MemBoundObj>(inputsPET, outputsPET,
op->getInputs(), op->getExpr(),
memboundTime(cnt));
@ -395,6 +411,9 @@ infini::Graph NMutator::expressionToGraph(nnet::Expr expr, Graph in_graph) {
nameNToTensorT.at(BN->getName())};
TensorVec outputsPET = {nameNToTensorT.at(outputNameN)};
const auto &[b, m, n, k, transa, transb] = op->getArgs();
// // HACK: pruning for debug
if (!((transa == 0) && (transb == 1)))
return nullptr;
g->addOpWithOutputs<MatmulObj>(inputsPET[0], inputsPET[1],
outputsPET[0], transa, transb);
}
@ -500,43 +519,86 @@ double NMutator::memboundTime(const Shape &dims) {
// return nullptr;
// }
// Graph NMutator::transformTConv3x3(Operator op) {
// if (auto tconvOp = dynamic_cast<ConvTransOp *>(op)) {
// dbg(tconvOp->getInputs()[1]->getDims());
// if (tconvOp->getPh() == 1 && tconvOp->getSh() == 2 &&
// tconvOp->getInputs()[1]->getDims()[0] == 3 &&
// tconvOp->getInputs()[1]->getDims()[1] == 3) {
// auto g = new infini::Graph();
// auto inputDims = tconvOp->getInputs(0)->getDims();
// auto weightDims = tconvOp->getInputs(1)->getDims();
// auto outputDims = tconvOp->getOutput()->getDims();
// // NHWF
// auto newA = g->tensor(
// {inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3],
// weightDims[2]});
// auto newO =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
Graph NMutator::transformConvtransposed1x1(Operator _op) {
auto op = as<ConvTransposed2dNHWCObj>(_op);
if (!op)
return nullptr;
const auto &A = op->getInputs()[0];
const auto &W = op->getInputs()[1];
const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
const Shape inputDims = op->getInputs(0)->getDims();
const Shape weightDims = op->getInputs(1)->getDims();
const Shape outputDims = op->getOutput()->getDims();
const DataType dtype = A->getDType();
IT_ASSERT_TODO(op->getNumGroups() == 1);
if (h != 1 || w != 1)
return {};
IT_ASSERT_TODO(ph == pw);
IT_ASSERT_TODO(tie(sh, sw) == tuple(1, 1));
IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
auto g = make_ref<GraphObj>(runtime);
// NHWF
auto newA = g->addTensor(
{inputDims[0] * inputDims[1] * inputDims[2], inputDims[3]}, dtype);
// FRSC
// auto newW = g->addTensor(
// {weightDims[0], weightDims[1] * weightDims[2] * weightDims[3]},
// dtype);
// HACK: without transpose
auto newW = g->addTensor(
{weightDims[1] * weightDims[2] * weightDims[3], weightDims[0]}, dtype);
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(A), newA, newA->getDims());
g->addOpWithOutputs<ReshapeObj>(g->cloneTensor(W), newW, newW->getDims());
Tensor newO = g->addOp<MatmulObj>(newA, newW, nullptr, 0, 1)->getOutput();
g->addOpWithOutputs<ReshapeObj>(newO, g->cloneTensor(op->getOutput()),
op->getOutput()->getDims());
return g;
}
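// Shape walk-through for the 1x1-input case, using getInfoGAN's first layer as an example:
//   A: {n, h, w, f} = {1, 1, 1, 228}    -> newA: {n*h*w, f} = {1, 228}
//   W: {f, r, s, c} = {228, 2, 2, 448}  -> newW: {r*s*c, f} = {1792, 228}  (the "HACK" layout)
//   Matmul(newA, newW, transA=0, transB=1): {1, 228} x {1792, 228}^T = {1, 1792}
//   Reshape: {1, 1792} -> output {n, oh, ow, c} = {1, 2, 2, 448}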
// Graph NMutator::transformConvtransposed(Operator _op) {
// auto op = as<ConvTransposed2dNHWCObj>(_op);
// if (!op)
// return nullptr;
// const auto &AT = op->getInputs()[0];
// const auto &KT = op->getInputs()[1];
// const auto &[n, c, h, w, f, r, s] = op->getNCHWFRS();
// const auto &[ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
// IT_ASSERT_TODO(op->getNumGroups() == 1);
// if (r != 4)
// return {};
// IT_ASSERT_TODO(ph == pw);
// IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));
// IT_ASSERT_TODO(tie(dh, dw) == tuple(1, 1));
// auto g = make_ref<Graph>();
// // TODO: implement transformation rules
// // How to efficiently write an expression...
// auto inputDims = op->getInputs(0)->getDims();
// auto weightDims = op->getInputs(1)->getDims();
// auto outputDims = op->getOutput()->getDims();
// // NHWF
// auto newA =
// g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// inputDims[3]});
// // RSFC
// auto newW = g->tensor(
// {weightDims[0] * weightDims[1] * weightDims[3], weightDims[2]});
// auto newO = g->tensor({inputDims[0] * inputDims[1] * inputDims[2],
// weightDims[0] * weightDims[1] * weightDims[3]});
// g->reshape(tconvOp->getInputs(0), newA);
// g->reshape(tconvOp->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound(
// {newO}, {tconvOp->getOutput()}, {}, nullptr,
// g->reshape(op->getInputs(0), newA);
// g->reshape(op->getInputs(1), newW);
// g->matmul(newA, newW, newO, 0, 1);
// // g->reshape(newO, tconvOp->getOutput());
// tconvOp->print();
// dbg(newO->size() * 4, tconvOp->getOutput()->size() * 9);
// assert(newO->size() * 4 == tconvOp->getOutput()->size() * 9);
// g->membound({newO}, {tconvOp->getOutput()}, {}, nullptr,
// memboundTime(newO->size() + tconvOp->getOutput()->size()),
// "TConv3x3 reduce");
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// }
// return nullptr;
// g->updateConnection();
// Graph graph = new Graph(g->getOperators());
// return graph;
// }
// Graph NMutator::transformTConv1x1(Operator op) {
@ -632,4 +694,86 @@ double NMutator::memboundTime(const Shape &dims) {
// return graph;
// }
Graph NMutator::fuseVertically(const Graph &inputGraph) {
Graph optGraph = make_ref<GraphObj>(runtime);
auto chainOps = inputGraph->getOperators();
IT_ASSERT(!chainOps.empty());
for (auto &op : chainOps) {
IT_ASSERT(op->isMemBoundOp());
IT_ASSERT_TODO(op->getInputs().size() == 1);
IT_ASSERT(op->getOutputs().size() == 1);
}
if (chainOps.size() == 1) {
return make_ref<GraphObj>(runtime, chainOps);
}
std::vector<nnet::Expr> exprs;
for (const auto &op : chainOps) {
auto [expr, _] = extractOp(op);
exprs.emplace_back(expr);
// dbg(op, infini::as<nnet::RangeOpNode>(expr)->getFullExpression());
}
// double maxTime = getMaxPerf(std::make_shared<SubGraph>(chainOps));
// Fuse a MemboundOp chain
auto expr = nnet::MergeMemboundMutator(exprs).merge(true);
auto inputNMap = nnet::GetTensorsVisitor().get(exprs.front());
IT_ASSERT(inputNMap.size() == 1);
vector<nnet::Tensor> inputsN;
for (const auto &[name, t] : inputNMap) {
inputsN.emplace_back(t);
}
optGraph->addOpWithOutputs<MemBoundObj>(chainOps.front()->getInputs(),
chainOps.back()->getOutputs(),
inputsN, expr, 0);
// TODO: set time
return optGraph;
}
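// Illustration (hypothetical expressions, not taken from this repository) of the fusion
// MergeMemboundMutator is expected to perform on a two-op chain:
//   op0:   T1[i] = Relu(T0[i])         for i in [0, n)
//   op1:   T2[i] = Tanh(T1[i])         for i in [0, n)
//   fused: T2[i] = Tanh(Relu(T0[i]))   for i in [0, n)
// The fused MemBoundObj keeps chainOps.front()'s inputs and chainOps.back()'s outputs,
// as wired up above; its exec_time is left at 0 pending the TODO.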
pair<nnet::Expr, NMutator::NameNToTensorT>
NMutator::generateUnaryExpr(const Operator &op) {
using namespace nnet;
const map<OpType, nnet::FuncType> opTToFuncN = {
{OpType::PRelu, nnet::FuncType::PRelu},
{OpType::Relu, nnet::FuncType::Relu},
{OpType::Tanh, nnet::FuncType::Tanh}};
Shape shape = op->getInputs()[0]->getDims();
nnet::FuncType type = opTToFuncN.at(op->getOpType());
auto T = make_ref<TensorNode>("T", shape);
VecExpr indices;
for (size_t i = 0; i < shape.size(); ++i) {
indices.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
}
auto sub = makeSubscript(T, indices);
auto func = nnet::make_ref<FuncNode>(sub, type);
vector<VarRangePair> varRanges;
for (size_t i = 0; i < shape.size(); ++i) {
varRanges.emplace_back(nnet::as<VarNode>(indices[i]),
Range{0, shape[i]});
}
return {makeRangeOperator(varRanges, {}, func),
NameNToTensorT{{"T", op->getInputs()[0]}}};
}
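// For a Relu over a {2, 3} input, generateUnaryExpr builds roughly (readable form is
// illustrative): a RangeOpNode over i0 in [0, 2) and i1 in [0, 3) whose body is a
// FuncNode(Relu) wrapping the subscript {T}[i0, i1], with T bound to the op's input tensor.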
pair<nnet::Expr, vector<nnet::Tensor>> NMutator::generateRevert(Tensor in) {
using namespace nnet;
using infini::make_ref;
const Shape &originalShape = in->getDims();
auto tensor = makeTensor("T", in->getDims());
VecExpr iters;
for (size_t i = 0; i < originalShape.size(); ++i) {
iters.emplace_back(make_ref<VarNode>("i" + std::to_string(i)));
}
Shape newShape = originalShape;
std::reverse(newShape.begin(), newShape.end());
auto sub = makeSubscript(tensor, iters);
vector<VarRangePair> loopIters;
for (int i = originalShape.size() - 1; i >= 0; --i) {
loopIters.emplace_back(infini::as<VarNode>(iters[i]),
Range{0, originalShape[i]});
}
auto range = makeRangeOperator(loopIters, {}, sub);
return {range, {tensor}};
}
} // namespace infini

View File

@ -6,7 +6,7 @@
namespace nnet {
int matchExprResult(Derivator &derivator, string fn) {
auto ans = Serializer().deserialize(fn);
auto ans = Serializer().fromFile(fn);
auto hashAns = HashVisitor()(ans);
int match = 0;
for (const auto &candidate : derivator.getCandidates()) {
@ -19,14 +19,14 @@ int matchExprResult(Derivator &derivator, string fn) {
bool checkExprLogSame(string fnPrefix, int start, int end) {
Serializer serializer;
string fn0 = fnPrefix + to_string(start) + ".expr";
Expr expr0 = serializer.deserialize(fn0);
Expr expr0 = serializer.fromFile(fn0);
RangeOp range0 = as<RangeOpNode>(expr0);
Interpreter interpreter(range0);
auto ans0 = interpreter.interpretUniformSample(range0);
dbg(expr0, ans0);
for (int i = start + 1; i < end; ++i) {
string fn1 = fnPrefix + to_string(i) + ".expr";
Expr expr1 = serializer.deserialize(fn1);
Expr expr1 = serializer.fromFile(fn1);
RangeOp range1 = as<RangeOpNode>(expr1);
dbg(fn1, expr1);
auto ans1 = interpreter.interpretUniformSample(range1);
@ -67,4 +67,4 @@ bool checkExprsEquvivalence(VecExpr exprs) {
return true;
}
} // namespace nnet
} // namespace nnet

View File

@ -2,6 +2,7 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/HashVisitor.h"
#include "nnet/Visitor/MergeMemboundMutator.h"
#include "nnet/Visitor/Serializer.h"
namespace infini {
@ -9,8 +10,8 @@ MemBoundObj::MemBoundObj(GraphObj *graph, const TensorVec &input,
const TensorVec &output,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time, std::string hint)
: OperatorObj(OpType::MemBound, input, output), nnetInputs(nnetInputs),
expr(expr), exec_time(exec_time), hint(hint) {
: OperatorObj(OpType::MemBound, input, output), expr(expr),
nnetInputs(nnetInputs), exec_time(exec_time), hint(hint) {
IT_ASSERT(checkValid(graph));
IT_ASSERT(!checkOOB(expr));
hash = calcHash(expr);
@ -45,7 +46,7 @@ string MemBoundObj::toString() const {
os << "exec_time=" << exec_time << ", ";
os << "NNet Inputs=[";
for (const auto &tensor : nnetInputs)
os << tensor->toReadable() << ",";
os << tensor->toReadable() << vecToString(tensor->getShape()) << ",";
os << "]";
os << ", ExprHash=" << hash;
os << ", SimplifiedExprHash=" << simplifiedHash;
@ -60,11 +61,18 @@ string MemBoundObj::toString() const {
optional<vector<Shape>> MemBoundObj::inferShape(const TensorVec &inputs) const {
// inputs have to match nnetInputs exactly
if (inputs.size() != nnetInputs.size())
if (inputs.size() != nnetInputs.size()) {
std::cout << "Num mismatch" << inputs.size() << " "
<< nnetInputs.size();
return {};
}
for (size_t i = 0; i < inputs.size(); ++i)
if (inputs[i]->getDims() != nnetInputs[i]->getShape())
if (inputs[i]->getDims() != nnetInputs[i]->getShape()) {
std::cout << "Shape mismatch " << inputs[i]
<< vecToString(inputs[i]->getDims()) << " "
<< vecToString(nnetInputs[i]->getShape());
return {};
}
return {{nnet::as<nnet::RangeOpNode>(expr)->getOutputShape()}};
}
@ -83,4 +91,9 @@ bool MemBoundObj::checkOOB(nnet::Expr expr) {
nnet::as<nnet::RangeOpNode>(expr));
}
string MemBoundObj::toJson() const {
return *nnet::Serializer().toString(expr, "MemBoundObj::toJson", nnetInputs,
exec_time, hint);
}
} // namespace infini

View File

@ -7,9 +7,10 @@ namespace infini {
TEST(Handler, matmul) {
auto runtime = NativeCpuRuntimeObj::getInstance();
auto handler = make_ref<GraphHandlerObj>(runtime);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32);
auto w = handler->tensor({1, 3, 4}, OnnxDType::UINT32);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32);
auto i = handler->tensor({1, 2, 3}, OnnxDType::UINT32, TensorType::Input);
auto w =
handler->tensor({1, 3, 4}, OnnxDType::UINT32, TensorType::Initialized);
auto o = handler->tensor({1, 2, 4}, OnnxDType::UINT32, TensorType::Other);
handler->matmul(i, w, o, false, false, nullptr, ActType::None);
}

View File

@ -0,0 +1,58 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/conv.h"
#include "nnet/nmutator.h"
#include "operators/matmul.h"
#include "test.h"
namespace infini {
TEST(TestCudaRuntime, CudaGraph) {
auto runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
Graph gCpu = make_ref<GraphObj>(cpu);
const int n = 2, c = 256, h = 2, w = 2, f = 448, r = 3, s = 2;
auto i0 = g->addTensor({n, c, h, w}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, c, r, s}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
g->dataMalloc();
runtime->run(g, true);
runtime->run(g, false);
runtime->getPerfTime(g);
auto time = runtime->timeWithCudaGraph(g);
EXPECT_GE(time, 0.01);
}
TEST(TestCudaRuntime, CudaGraphMembound) {
auto runtime = make_ref<CudaRuntimeObj>();
Runtime cpu = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(cpu);
Graph g = make_ref<GraphObj>(runtime);
Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
Tensor w0 = g->addTensor({1, 3, 4}, DataType::Float32);
Tensor o0 = g->addTensor({1, 2, 4}, DataType::Float32);
g->dataMalloc();
i0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
w0->copyin(vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
NMutator nmutator(NMutator::Mode::ToNaiveMembound);
auto mutations = nmutator.run(g);
ASSERT_EQ(mutations.size(), 2u);
Graph gNew = mutations[1];
gNew->print();
gNew->dataMalloc();
runtime->run(gNew, true); // tune kernels
runtime->run(gNew, false);
runtime->getPerfTime(gNew);
runtime->timeWithCudaGraph(gNew);
}
} // namespace infini
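timeWithCudaGraph presumably relies on CUDA's stream-capture API; for reference, a minimal standalone sketch of that pattern with the plain CUDA runtime (error checking omitted, not InfiniTensor code):
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaGraph_t graph;
cudaGraphExec_t instance;
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
// ... enqueue the kernels to be timed on `stream` ...
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&instance, graph, /*pErrorNode=*/nullptr, /*pLogBuffer=*/nullptr, 0);
cudaGraphLaunch(instance, stream); // replays the whole captured sequence as one graph launch
cudaStreamSynchronize(stream);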

View File

@ -10,8 +10,8 @@ int main(int argc, char *argv[]) {
printf("Usage: %s <log>\n", argv[0]);
return 1;
}
auto expr = Serializer().deserialize(argv[1]);
auto expr = Serializer().fromFile(argv[1]);
cout << FullPrinterVisitor().print(expr);
cout << endl << "Hash = " << HashVisitor().getHash(expr) << endl;
return 0;
}
}

View File

@ -0,0 +1,113 @@
import onnx
import torch
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pyinfinitensor as pit
from pyinfinitensor import backend as ft
from pyinfinitensor.onnx import OnnxStub
def to_pytorch_tensor(tensor) -> torch.Tensor:
data = tensor.copyout_float()
tt = torch.tensor(data)
return tt.reshape(tensor.shape())
def run_InfoGAN_return_tensor(n_layers: int):
if_tensors = ft.runInfoGAN(n_layers)
tensors = [to_pytorch_tensor(t) for t in if_tensors]
return tensors
def read_and_check():
for n_layers in range(1, 6):
ans = torch.load(f'torch_{n_layers}layers_0.pt')
x = torch.load(f'torch_{n_layers}layers_1.pt')
print(f'=== {n_layers} layers ===')
print(x.abs().max())
def run_e2e_InfoGAN():
data = []
for n_layers in range(5, 6):
tensors = run_InfoGAN_return_tensor(n_layers)
for i, t in enumerate(tensors):
torch.save(t, f'torch_{n_layers}layers_{i}.pt')
print(f'============ {n_layers} layers = = =')
ans, x = tensors
print(f'Allclose {torch.allclose(ans, x)}')
# Print error numbers
tot = np.prod(ans.shape)
data.append([])
for i in range(0, 10):
tol = 10**(-i)
clo = torch.isclose(ans, x, atol=tol, rtol=tol).sum().item()
print(f'0.1^{i} close: {clo}/{tot} = {clo/tot}')
data[-1].append(clo/tot)
rel_err = torch.abs((ans-x)/ans)
print(rel_err, rel_err.max())
print(f'ans = {ans}')
print(f'x = {x}')
# # Plot CDF
# fig, axes = plt.subplots(9,1)
# print(axes)
# for i, ax in enumerate(axes):
# print(i)
# ax:plt.Axes
# ax.hist(torch.flatten(rel_err), density=True, cumulative=True, label='CDF',
# histtype='step', alpha=0.8, color='k')
# ax.set_xlim(0, 10**(-i))
# # ax.set_title('')
# plt.show()
# plt.savefig('a.pdf')
df = pd.DataFrame(data)
print(df.to_string())
df = df.set_axis([f'0.1^{i}' for i in range(0, 10)], axis=1)
print(df.to_string())
df.to_csv('a.csv')
def runSingleConvT():
runtime = ft.cuda_runtime()
g = ft.getConvtransposedNHWC(runtime, [1, 2, 2, 448], 1)
opt_g = ft.optimizeGraph(g, runtime)
ft.if_onnx.export_onnx(opt_g, 'convtransposed.onnx')
def run_InfoGAN_without_tuning(runtime, tuning: bool):
g = ft.getInfoGAN(1, runtime, 5)
# g = ft.getInfoGAN(1, runtime, 1)
opt_g = ft.optimizeGraph(g, runtime, tuning)
stub = OnnxStub.from_graph(opt_g)
with open("optimized.onnx", "wb") as f:
f.write(stub.to_onnx("optimized").SerializeToString())
return opt_g
def load_onnx(runtime) -> ft.Graph:
stub = OnnxStub.from_onnx(onnx.load("optimized.onnx"), runtime, False)
return stub.handler.getGraph()
def run_and_evaluate(runtime, g):
runtime.run(g, True)
print(f'getPerfTime = {runtime.getPerfTime(g, True, False, False)}')
print(f'Non-ctc time = {runtime.timeNonCtcOperators(g, 1000, 1000)}')
print(f'Cuda graph time = {runtime.timeWithCudaGraph(g)}')
if __name__ == "__main__":
# run_e2e_InfoGAN()
# runSingleConvT()
# read_and_check()
runtime = ft.cuda_runtime()
if True:
g = run_InfoGAN_without_tuning(runtime, False)
else:
g = load_onnx(runtime)
run_and_evaluate(runtime, g)

View File

@ -388,4 +388,4 @@ TEST(Conv2conv, InfoGAN_ConvTranspose_3_OOB_Test) {
dbg(expr);
Derivator derivator;
derivator.checkOOB(as<RangeOpNode>(expr));
}
}

View File

@ -5,12 +5,13 @@
#include "core/search_engine.h"
#include "cuda/cuda_runtime.h"
#include "nnet/nmutator.h"
#include "nnet/test.h"
#include "operators/conv.h"
#include "test.h"
namespace infini {
TEST(Mutator, NaiveConvWithInterpreter) {
TEST(NMutator, NaiveConvWithInterpreter) {
// verifyNaiveMembound True: subgraph after transformation
// verifyNaiveMembound False: subgraph of one single membound (eOP)
Runtime runtime = NativeCpuRuntimeObj::getInstance();
@ -54,9 +55,8 @@ TEST(Mutator, NaiveConvWithInterpreter) {
gg->getOutputs()[0]->getRawDataPtr<void *>());
}
// FIXME: fails due to the implicit transpose for DLT
TEST(Mutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = true;
TEST(NMutator, InfoGAN_TConv_3_correctness) {
const bool useMutatorDirectly = false;
Runtime runtime = make_ref<CudaRuntimeObj>();
Graph g = make_ref<GraphObj>(runtime);
Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
@ -67,8 +67,9 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
// const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
// const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
auto i0 = g->addTensor({n, h, w, f});
auto w0 = g->addTensor({f, r, s, c});
auto i0 = g->addTensor({n, h, w, f}, DataType::Float32, TensorType::Input);
auto w0 =
g->addTensor({f, r, s, c}, DataType::Float32, TensorType::Initialized);
g->addOp<ConvTransposed2dNHWCObj>(i0, w0, nullptr, 1, 1, 2, 2, 1, 1);
auto mutator =
@ -91,12 +92,13 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
fuidToInputTensor[t->getFuid()] = t;
}
std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
auto bestGraph =
make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
auto gen = RandomGenerator(0, 1, i);
auto gen = RandomGenerator(0.1, 0.1, i);
bestGraph->dataMalloc();
// Initialize inputs with random data
for (auto t : g->getInputs()) {
@ -120,6 +122,7 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
dbg(runtime->getPerfTime(bestGraph, true));
EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
}
}

View File

@ -1,9 +1,11 @@
#include "core/graph.h"
#include "nnet/Visitor/FullPrinterVisitor.h"
#include "nnet/Visitor/Serializer.h"
#include "nnet/test.h"
#include "operators/membound.h"
#include "gtest/gtest.h"
using namespace nnet;
using namespace std;
#define DEFINE_VAR(name) auto name = make_ref<VarNode>(#name);
//{L<i3:0:2500><i4:0:4><b:0:8><w:0:65>Sum<k:0:512>
//{({A}[b, (i3 + (2500 * i4)), k] * {B<pad=0,128,0>}[b, ((i3 + (2500 * i4)) +
@ -12,11 +14,7 @@ using namespace std;
// ==> B : Input Tensor shape=[8,10000,512] pad=[0,128,0]
Expr buildSimpleExpr() {
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
@ -28,9 +26,7 @@ Expr buildSimpleExpr() {
}
Expr buildNestedExpr() {
DEFINE_VAR(j1);
DEFINE_VAR(j2);
DEFINE_VAR(j3);
DEFINE_VAR(j1, j2, j3);
// Build a Matmul to verify.
const int M = 10000, N = 512, K = 3;
auto C = make_ref<TensorNode>("C", vector<int>({M, K}));
@ -46,11 +42,7 @@ Expr buildNestedExpr() {
auto E = make_ref<TensorNode>("E", shapeE, shapeE, ele2);
auto ele1 = make_ref<ElementWiseNode>(expr, vector{E}, shapeE);
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0}, matmul);
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0}, ele1);
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
@ -63,25 +55,23 @@ Expr buildNestedExpr() {
TEST(Serializer, Serialization) {
auto range = buildSimpleExpr();
auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
EXPECT_TRUE(isSuccessful);
}
TEST(Serializer, CompareTwoExprs) {
DEFINE_VAR(b);
DEFINE_VAR(w);
DEFINE_VAR(k);
DEFINE_VAR(i3);
DEFINE_VAR(i4);
DEFINE_VAR(b, w, k, i3, i4);
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
auto subA = makeSubscript(A, {b, (i3 + (2500 * i4)), k});
auto funcA = make_ref<FuncNode>(subA, FuncType::Relu);
auto subB = makeSubscript(B, {b, ((i3 + (2500 * i4)) + w), k});
auto range = makeRangeOperator(
{{i3, {0, 2500}}, {i4, {0, 4}}, {b, {0, 8}}, {w, {0, 65}}},
{{k, {0, 512}}}, subA * subB);
Serializer().serialize(range, "./test_serializer.json");
auto expr = Serializer().deserialize("./test_serializer.json");
{{k, {0, 512}}}, funcA * subB);
Serializer().toFile(range, "./test_serializer.json");
auto expr = Serializer().fromFile("./test_serializer.json");
dbg(expr);
EXPECT_EQ(range->toReadable(), expr->toReadable());
}
@ -90,11 +80,37 @@ TEST(Serializer, Serialization_NestedTensor) {
FullPrinterVisitor printer;
auto range = buildNestedExpr();
auto ans = printer.print(range);
dbg(ans);
auto isSuccessful = Serializer().serialize(range, "./test_serializer.json");
auto isSuccessful = Serializer().toFile(range, "./test_serializer.json");
EXPECT_TRUE(isSuccessful);
auto exprDeserialized = Serializer().deserialize("./test_serializer.json");
auto exprDeserialized = Serializer().fromFile("./test_serializer.json");
auto output = printer.print(exprDeserialized);
dbg(output);
EXPECT_EQ(output, ans);
}
}
TEST(Serializer, Serialization_memboundOp) {
auto expr = buildSimpleExpr();
auto A = makeTensor("A", {8, 10000, 512}, {0, 0, 0});
auto B = makeTensor("B", {8, 10000, 512}, {0, 128, 0});
// using namespace infini;
auto runtime = infini::NativeCpuRuntimeObj::getInstance();
auto g = infini::make_ref<infini::GraphObj>(runtime);
auto AT = g->addTensor({8, 10000, 512});
auto BT = g->addTensor({8, 10000, 512});
auto CT = g->addTensor({2500, 4, 8, 65});
vector<Tensor> nnetInputs{A, B};
double execTime = 1;
string hint = "test";
infini::MemBoundObj memboundOp(nullptr, {AT, BT}, {CT}, nnetInputs, expr,
execTime, hint);
auto str = memboundOp.toJson();
auto [exprLoaded, nnetInputsLoaded, execTimeLoaded, hintLoaded] =
Serializer().membundOpFromString(str);
EXPECT_EQ(expr->toReadable(), exprLoaded->toReadable());
EXPECT_EQ(execTime, execTimeLoaded);
EXPECT_EQ(nnetInputs.size(), nnetInputsLoaded.size());
for (size_t i = 0; i < nnetInputs.size(); ++i)
EXPECT_EQ(nnetInputs[i]->toReadable(),
nnetInputsLoaded[i]->toReadable());
EXPECT_EQ(hint, hintLoaded);
}