diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57dec16a..f1079f65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,6 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
 include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
 
 if(BUILD_TEST)
-    # TODO: chekc set
     set(BUILD_GMOCK OFF CACHE BOOL "Do not build gmock" FORCE)
diff --git a/include/core/common.h b/include/core/common.h
index 0c69ef35..0fe7344e 100644
--- a/include/core/common.h
+++ b/include/core/common.h
@@ -16,8 +16,6 @@
 namespace infini {
 using std::list;
 using std::map;
-using std::nullopt;
-using std::optional;
 using std::pair;
 using std::set;
 using std::string;
@@ -29,6 +27,7 @@ using std::vector;
 
 // Aliases
 using dtype = float;
+using HashType = size_t; // compatible with std::hash
 
 // Metaprogramming utilities
 #define _CAT(A, B) A##B
diff --git a/include/core/graph.h b/include/core/graph.h
index 2f72bb94..9c87310a 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -4,6 +4,7 @@
 
 namespace infini {
 
+// TODO: graph should be attached to a context
 class GraphNode : public Object {
   protected:
     TensorVec tensors;
@@ -29,12 +30,18 @@ class GraphNode : public Object {
         return tensor;
     }
 
-    void updateConnection();
     void dataMalloc();
 
+  private:
+    // TODO: updateConnection
+    /**
+     * @brief Add reverse connections and Op relationship in ctor.
+     */
+    void updateConnection();
+
     // TODO: move to another class
     // bool exportOnnx(const char *path);
     // bool importOnnx(const char *net);
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/kernel.h b/include/core/kernel.h
index 437884e2..6c0c5677 100644
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@@ -23,38 +23,46 @@ class Kernel {
 };
 
 class KernelRegistry {
+  public:
+    using KernelRecord =
+        tuple<Kernel *, string, int>; // Kernel, name, ID
+
+  private:
+    std::map<KernelAttrs, KernelRecord> kernels;
+    int nKernels = 0;
+
   public:
     ~KernelRegistry() {
         for (auto &[k, v] : kernels)
-            delete v;
+            delete std::get<0>(v);
     }
     static KernelRegistry &getInstance() {
         static KernelRegistry instance;
         return instance;
     }
-    bool registerKernel(const KernelAttrs &key, Kernel *kernel) {
-        // TODO: kernels with priority
+    bool registerKernel(const KernelAttrs &key, Kernel *kernel, string name) {
+        // TODO: multiple kernels support: priority and check name
         IT_ASSERT(kernels.find(key) == kernels.end(),
                   "Kernel already registered");
-        kernels.emplace(key, kernel);
+        kernels.emplace(key, KernelRecord{kernel, name, ++nKernels});
         return true;
     }
     Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
+        return std::get<0>(kernels.at(kernelAttrs));
+    }
+    const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
         return kernels.at(kernelAttrs);
     }
-
-  private:
-    std::map<KernelAttrs, Kernel *> kernels;
 };
 
-#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, cnt)             \
+#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt)       \
     namespace infini {                                                        \
     static const bool _CAT(_register_kernel_, cnt) =                          \
         KernelRegistry::getInstance().registerKernel(                         \
-            KernelAttrs{device, opType, dataType}, new kernel());             \
+            KernelAttrs{device, opType, dataType}, new kernel(), name);       \
     }
 
-#define REGISTER_KERNEL(device, opType, dataType, kernel)                     \
-    _REGISTER_KERNEL_1(device, opType, dataType, kernel, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, dataType, kernel, name)               \
+    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
 
 } // namespace infini
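The KernelRegistry change above replaces the bare Kernel* map with a (kernel, name, ID) record and threads a human-readable name through REGISTER_KERNEL. A minimal standalone sketch of the same pattern, assuming hypothetical stand-ins for KernelAttrs and Kernel (a plain tuple and an empty base class rather than the repo's real types):

    #include <cassert>
    #include <map>
    #include <string>
    #include <tuple>

    // Hypothetical stand-ins; the real KernelAttrs/Kernel live in the repo headers.
    using KernelAttrs = std::tuple<int, int, int>; // (device, opType, dataType)
    struct Kernel {
        virtual ~Kernel() = default;
    };

    class KernelRegistry {
      public:
        using KernelRecord = std::tuple<Kernel *, std::string, int>; // kernel, name, ID
        static KernelRegistry &getInstance() {
            static KernelRegistry instance; // Meyers singleton, as in the patch
            return instance;
        }
        bool registerKernel(const KernelAttrs &key, Kernel *k, std::string name) {
            assert(kernels.find(key) == kernels.end()); // one kernel per key, for now
            kernels.emplace(key, KernelRecord{k, name, ++nKernels});
            return true;
        }
        Kernel *getKernel(const KernelAttrs &key) const {
            return std::get<0>(kernels.at(key));
        }

      private:
        std::map<KernelAttrs, KernelRecord> kernels;
        int nKernels = 0;
    };

    // File-scope registration: the static bool's initializer runs before main,
    // which is all the REGISTER_KERNEL macro expansion does.
    static const bool registered = KernelRegistry::getInstance().registerKernel(
        KernelAttrs{0, 0, 0}, new Kernel(), "demo_kernel");

    int main() { return KernelRegistry::getInstance().getKernel({0, 0, 0}) ? 0 : 1; }

The static-bool trick means every translation unit's kernels register themselves as a side effect of program startup; __COUNTER__ only serves to give each registration variable a unique name.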
diff --git a/include/core/mutator.h b/include/core/mutator.h
index ac2c169d..42402151 100644
--- a/include/core/mutator.h
+++ b/include/core/mutator.h
@@ -4,11 +4,16 @@
 namespace infini {
 
 class Mutator {
+  private:
+    int candidatesLimit;
+    // // Statistical data
+    // int numTotalCandidates;
+
   public:
-    Mutator(){};
+    Mutator(int candidatesLimit) : candidatesLimit(candidatesLimit){};
     virtual ~Mutator(){};
 
     virtual vector<Graph> run(const Graph &in_graph) = 0;
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/operator.h b/include/core/operator.h
index bf54737f..9d04aed2 100644
--- a/include/core/operator.h
+++ b/include/core/operator.h
@@ -94,18 +94,42 @@ enum class ActType {
     Tanh,
 };
 
-struct OpAttrs {
+struct OpPerfKey {
+    HashType hash;
+    OpType opType;
+    vector<int> attrs;
+
   public:
-    virtual bool operator<(const OpAttrs &rhs) const {
-        IT_ASSERT(typeid(*this) == typeid(rhs), "OpAttrs type mismatch.");
-        // Empty OpAttrs are equal
+    OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
+        : hash(hash), opType(opType), attrs(attrs) {}
+    bool operator==(const OpPerfKey &rhs) const {
+        if (hash != rhs.hash)
+            return false;
+        if (opType != rhs.opType)
+            return false;
+        if (attrs != rhs.attrs)
+            return false;
+        return true;
+    }
+
+    // TODO: remove this function after we use unordered_map in PerfEngine
+    bool operator<(const OpPerfKey &rhs) const {
+        if (hash != rhs.hash)
+            return hash < rhs.hash;
+        if (opType != rhs.opType)
+            return opType < rhs.opType;
+        if (attrs.size() != rhs.attrs.size())
+            return attrs.size() < rhs.attrs.size();
+        for (size_t i = 0; i < attrs.size(); ++i)
+            if (attrs[i] != rhs.attrs[i])
+                return attrs[i] < rhs.attrs[i];
         return false;
     }
-    virtual ~OpAttrs() {}
 };
 
 class OperatorNode : public Object {
-  public:
+    friend class Kernel;
+
   protected:
     OpType type;
     TensorVec inputs;
@@ -117,7 +141,7 @@ class OperatorNode : public Object {
     OperatorNode(OpType opType, TensorVec inputs, TensorVec outputs)
         : type(opType), inputs(inputs), outputs(outputs) {}
     virtual vector<Shape> computeShape() const = 0;
-    virtual OpAttrs getOpAttrs() const = 0;
+    virtual OpPerfKey getOpAttrs() const = 0;
 
   public: // check Op type
     bool isLinearOp() const;
@@ -143,6 +167,14 @@ class OperatorNode : public Object {
 
     virtual int numInputs() const = 0;
     virtual int numOutputs() const = 0;
+    virtual HashType hash() const { IT_TODO_HALT(); }
+    virtual HashType hashWithShape() const { IT_TODO_HALT(); }
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
+
+namespace std {
+template <> struct hash<infini::OpPerfKey> {
+    size_t operator()(const infini::OpPerfKey &key) const { return key.hash; }
+};
+} // namespace std
\ No newline at end of file
diff --git a/include/core/perf_engine.h b/include/core/perf_engine.h
index b55baf26..563ad704 100644
--- a/include/core/perf_engine.h
+++ b/include/core/perf_engine.h
@@ -6,7 +6,9 @@ namespace infini {
 
 class PerfEngine {
   public:
-    using Key = std::pair<KernelAttrs, OpAttrs>;
+    // TODO: Key should be OpPerfKey + Context (maybe implicit) to support
+    // multiple candidate kernels.
+    using Key = std::pair<KernelAttrs, OpPerfKey>;
 
   private:
     map<Key, PerfRecord> data;
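The std::hash specialization added to operator.h simply forwards the precomputed hash member, which is what will let OpPerfKey key an unordered_map once PerfEngine drops the ordered map (the TODOs above). A minimal sketch of that end state, assuming a pared-down OpPerfKey (int in place of OpType, made-up values):

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    // Pared-down OpPerfKey: the hash field is precomputed once at construction.
    struct OpPerfKey {
        std::size_t hash;
        int opType;
        std::vector<int> attrs;
        bool operator==(const OpPerfKey &rhs) const {
            return hash == rhs.hash && opType == rhs.opType && attrs == rhs.attrs;
        }
    };

    namespace std {
    template <> struct hash<OpPerfKey> {
        size_t operator()(const OpPerfKey &key) const { return key.hash; }
    };
    } // namespace std

    int main() {
        // The table reuses the stored hash; operator== resolves collisions.
        std::unordered_map<OpPerfKey, double> perfData; // key -> measured time (ms)
        perfData[{42, /*opType=*/1, {1, 64, 64, 32}}] = 0.37;
        return perfData.count({42, 1, {1, 64, 64, 32}}) == 1 ? 0 : 1;
    }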
diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h
index da08e118..cafea062 100644
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@@ -34,22 +34,13 @@ class TensorBaseNode : public Object {
     //     NotCounted,
     // };
 
-    // // TODO: is more compute state needed?
-    // enum ComputeState {
-    //     NotComputed,
-    //     // Allocated,
-    //     // Initialized,
-    //     // ComputedPartial,
-    //     ComputedFull,
-    // };
-
   protected:
     int dim;
 
     DataType dtype;
     vector<WRef<OperatorNode>> inputOf;
     WRef<OperatorNode> outputOf;
-    // TODO: use a blob instead of vector
+    // TODO: Ref<VType[]> -> Ref<Blob>
     Ref<VType[]> data;
     // ComputeState computed;
     // static int random_seed[256 * 16];
@@ -267,4 +258,4 @@ class TensorBaseNode : public Object {
     // void printShape();
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/operators/matmul.h b/include/operators/matmul.h
index 3a70920b..405b3f76 100644
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@@ -4,30 +4,15 @@
 namespace infini {
 
 class MatmulNode : public OperatorNode {
-  public:
-    struct MatmulArgs : public OpAttrs {
-        int b, m, n, k;
-        // PET assume a row-major tensor layout. transA=false means default
-        // dims, true means A should be transposed before matmul. This is in
-        // oppsite to column-major BLAS.
-        bool transA, transB;
-        ActType act;
-
-        MatmulArgs(int b, int m, int n, int k, bool transA, bool transB,
-                   ActType act)
-            : b(b), m(m), n(n), k(k), transA(transA), transB(transB), act(act) {
-        }
-
-        bool operator<(const OpAttrs &rhsGeneric) {
-            auto rhs = dynamic_cast<const MatmulArgs &>(rhsGeneric);
-            return std::tie(b, m, n, k, transA, transB, act) <
-                   std::tie(rhs.b, rhs.m, rhs.n, rhs.k, rhs.transA, rhs.transB,
-                            rhs.act);
-        }
-    };
-
   private:
-    MatmulArgs args;
+    // InfiniTensor assumes a row-major tensor layout. transA=false means
+    // default dims, true means A should be transposed before matmul. This is
+    // opposite to column-major BLAS.
+    bool transA, transB;
+    ActType act;
+
+    // Auxiliary attributes
+    int b, m, n, k;
 
   public:
     MatmulNode(Tensor A, Tensor B, Tensor C, bool transA = false,
@@ -41,19 +26,22 @@ class MatmulNode : public OperatorNode {
     int numOutputs() const override { return 1; }
 
     Tensor getBias() const { return inputs[2]; }
-    void setAct(ActType act) { this->args.act = act; }
-    ActType getAct() const { return args.act; }
-    bool getTransA() const { return args.transA; }
-    bool getTransB() const { return args.transB; }
+    ActType getAct() const { return act; }
+    bool getTransA() const { return transA; }
+    bool getTransB() const { return transB; }
+    int getB() const { return b; }
+    int getM() const { return m; }
+    int getN() const { return n; }
+    int getK() const { return k; }
 
-    MatmulArgs getArgs() const { return args; }
-    OpAttrs getOpAttrs() const override { return args; }
+    HashType hashWithShape() const override;
+    OpPerfKey getOpAttrs() const override;
 
   private:
     // Q: whether to check the output? Since we can build an Op first and then
     // assure output.
-    // Fix 1: make shape inference a static method. But OpAttrs are required.
+    // Fix 1: make shape inference a static method. But OpPerfKey is required.
     bool checkValid(const TensorVec &inputs) const;
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
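The auxiliary b, m, n, k members that replace MatmulArgs are derived from the input shapes in the constructor (see src/operators/matmul.cc below). A standalone sketch of that derivation, assuming plain std::vector<int> dims in place of Tensor and a hypothetical inferDims helper:

    #include <cassert>
    #include <vector>

    // Hypothetical free function mirroring the constructor's initializer list.
    struct MatmulDims {
        int b, m, n, k;
    };

    MatmulDims inferDims(const std::vector<int> &A, const std::vector<int> &B,
                         bool transA, bool transB) {
        assert(A.size() == 3 && B.size() == 3 && A[0] == B[0]);
        MatmulDims d;
        d.b = A[0];                 // batch
        d.m = transA ? A[2] : A[1]; // rows of op(A)
        d.k = transA ? A[1] : A[2]; // reduction dim, taken from A
        d.n = transB ? B[1] : B[2]; // cols of op(B)
        // checkValid's shape test: reduction dims of op(A) and op(B) must match.
        assert(d.k == (transB ? B[2] : B[1]));
        return d;
    }

    int main() {
        MatmulDims d = inferDims({1, 3, 5}, {1, 5, 2}, false, false);
        return (d.b == 1 && d.m == 3 && d.n == 2 && d.k == 5) ? 0 : 1;
    }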
diff --git a/src/kerels/cpu/matmul.cc b/src/kerels/cpu/matmul.cc
index 45c46eab..e8ae5c7e 100644
--- a/src/kerels/cpu/matmul.cc
+++ b/src/kerels/cpu/matmul.cc
@@ -9,10 +9,9 @@ template <typename T> class NaiveMatmul : public Kernel {
         T *A = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
         T *B = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
         T *C = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
-        const auto args = op->getArgs();
-        IT_ASSERT(args.transA == false && args.transB == false);
-        IT_ASSERT(args.act == ActType::None);
-        const int M = args.m, N = args.n, K = args.k;
+        IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
+        IT_ASSERT(op->getAct() == ActType::None);
+        const int M = op->getM(), N = op->getN(), K = op->getK();
         for (int i = 0; i < M; i++) {
             for (int j = 0; j < N; j++) {
                 C[i * N + j] = 0;
@@ -33,8 +32,8 @@ template <typename T> class NaiveMatmul : public Kernel {
 };
 
 REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Int32,
-                NaiveMatmul<uint32_t>);
+                NaiveMatmul<uint32_t>, "MatmulNaive_CPU_uint32");
 REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Float32,
-                NaiveMatmul<float>);
+                NaiveMatmul<float>, "MatmulNaive_CPU_float32");
 
 } // namespace infini
\ No newline at end of file
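Only the argument plumbing changes in this kernel; the triple loop itself is untouched. A self-contained version of the same loop for sanity-checking the math, detached from the Tensor/Kernel machinery (single batch, row-major, no transpose):

    #include <vector>

    // Same inner loops as compute() above, over raw row-major buffers.
    template <typename T>
    void naiveMatmul(const T *A, const T *B, T *C, int M, int N, int K) {
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                C[i * N + j] = 0;
                for (int k = 0; k < K; k++)
                    C[i * N + j] += A[i * K + k] * B[k * N + j];
            }
        }
    }

    int main() {
        // (2x3) * (3x2) -> (2x2); expected result {4, 5, 10, 11}.
        std::vector<float> A{1, 2, 3, 4, 5, 6}, B{1, 0, 0, 1, 1, 1}, C(4);
        naiveMatmul(A.data(), B.data(), C.data(), 2, 2, 3);
        return (C[0] == 4 && C[1] == 5 && C[2] == 10 && C[3] == 11) ? 0 : 1;
    }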
diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc
index bdd11e2f..2f9666a2 100644
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@@ -2,27 +2,24 @@
 
 namespace infini {
 
-vector<Shape> MatmulNode::computeShape() const {
-    Shape ret{args.b, args.m, args.n};
-    return {ret};
-}
+vector<Shape> MatmulNode::computeShape() const { return {{b, m, n}}; }
 
 MatmulNode::MatmulNode(Tensor A, Tensor B, Tensor C, bool transA, bool transB,
                        Tensor bias, ActType act)
-    : OperatorNode(OpType::Matmul, {A, B, bias}, {C}),
-      args(A->getDims()[0], transA ? A->getDims()[2] : A->getDims()[1],
-           transB ? B->getDims()[1] : B->getDims()[2],
-           transA ? A->getDims()[1] : A->getDims()[2], transA, transB, act) {
+    : OperatorNode(OpType::Matmul, {A, B, bias}, {C}), transA(transA),
+      transB(transB), act(act), b(A->getDims()[0]),
+      m(transA ? A->getDims()[2] : A->getDims()[1]),
+      n(transB ? B->getDims()[1] : B->getDims()[2]),
+      k(transA ? A->getDims()[1] : A->getDims()[2]) {
     IT_ASSERT(checkValid(inputs));
 }
 
 string MatmulNode::toString() const {
     std::ostringstream os;
-    MatmulArgs args = getArgs();
-    os << "Matmul([" << (args.transA ? "A^T" : "A") << ","
-       << (args.transB ? "B^T" : "B") << ",act=" << (int)args.act
-       << "],A=" << inputs[0]->getGuid() << ",B=" << inputs[1]->getGuid()
-       << ",C=" << outputs[0]->getGuid() << ")";
+    os << "Matmul([" << (transA ? "A^T" : "A") << "," << (transB ? "B^T" : "B")
+       << ",act=" << enum_to_underlying(act) << "],A=" << inputs[0]->getGuid()
+       << ",B=" << inputs[1]->getGuid() << ",C=" << outputs[0]->getGuid()
+       << ")";
     return os.str();
 }
 
@@ -32,8 +29,8 @@ bool MatmulNode::checkValid(const TensorVec &inputs) const {
     //     return false;
     IT_ASSERT(A->getDims().size() == 3 && B->getDims().size() == 3);
     IT_ASSERT(A->getDims()[0] == B->getDims()[0]);
-    IT_ASSERT((args.transA ? A->getDims()[1] : A->getDims()[2]) ==
-              (args.transB ? B->getDims()[2] : B->getDims()[1]));
+    IT_ASSERT((transA ? A->getDims()[1] : A->getDims()[2]) ==
+              (transB ? B->getDims()[2] : B->getDims()[1]));
     //     if (A->getDims().size() != 3 || B->getDims().size() != 3) {
     //         return false;
     //     }
@@ -46,4 +43,14 @@ bool MatmulNode::checkValid(const TensorVec &inputs) const {
     //     }
     return true;
 }
+
+HashType MatmulNode::hashWithShape() const {
+    // TODO: use a real hash
+    return b + m + n + k + transA + transB + enum_to_underlying(act);
+}
+
+OpPerfKey MatmulNode::getOpAttrs() const {
+    return OpPerfKey(hashWithShape(), type,
+                     {b, m, n, k, transA, transB, enum_to_underlying(act)});
+}
 } // namespace infini
\ No newline at end of file
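hashWithShape() is explicitly a placeholder: a plain sum over the attributes, so permuted shapes collide. Correctness then rests on OpPerfKey::operator== comparing the full attrs vector, since getOpAttrs() stores every field that distinguishes a matmul. A tiny self-contained check with made-up shape values:

    #include <vector>

    int main() {
        // Placeholder hash: a plain sum over (b, m, n, k, transA, transB, act).
        auto sumHash = [](const std::vector<int> &v) {
            long s = 0;
            for (int x : v)
                s += x;
            return s;
        };
        std::vector<int> a{1, 64, 128, 32, 0, 0, 0};
        std::vector<int> b{1, 128, 64, 32, 0, 0, 0}; // m and n swapped
        bool sameHash = sumHash(a) == sumHash(b);    // true: hashes collide
        bool sameKey = (a == b);                     // false: attrs still differ
        return (sameHash && !sameKey) ? 0 : 1;
    }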