Merge pull request #1 from InfiniTensor/init

Initialization
This commit is contained in:
Haojie Wang 2022-08-09 20:21:20 +08:00 committed by GitHub
commit eda41b06a7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
270 changed files with 71286 additions and 0 deletions

90
.clang-format Normal file
View File

@ -0,0 +1,90 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: false
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IndentCaseLabels: false
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

311
.cmake-format.json Normal file
View File

@ -0,0 +1,311 @@
{
"_help_parse": "Options affecting listfile parsing",
"parse": {
"_help_additional_commands": [
"Specify structure for custom cmake functions"
],
"additional_commands": {
"foo": {
"flags": [
"BAR",
"BAZ"
],
"kwargs": {
"HEADERS": "*",
"SOURCES": "*",
"DEPENDS": "*"
}
}
},
"_help_override_spec": [
"Override configurations per-command where available"
],
"override_spec": {},
"_help_vartags": [
"Specify variable tags."
],
"vartags": [],
"_help_proptags": [
"Specify property tags."
],
"proptags": []
},
"_help_format": "Options affecting formatting.",
"format": {
"_help_disable": [
"Disable formatting entirely, making cmake-format a no-op"
],
"disable": false,
"_help_line_width": [
"How wide to allow formatted cmake files"
],
"line_width": 1500,
"_help_tab_size": [
"How many spaces to tab for indent"
],
"tab_size": 2,
"_help_use_tabchars": [
"If true, lines are indented using tab characters (utf-8",
"0x09) instead of <tab_size> space characters (utf-8 0x20).",
"In cases where the layout would require a fractional tab",
"character, the behavior of the fractional indentation is",
"governed by <fractional_tab_policy>"
],
"use_tabchars": false,
"_help_fractional_tab_policy": [
"If <use_tabchars> is True, then the value of this variable",
"indicates how fractional indentions are handled during",
"whitespace replacement. If set to 'use-space', fractional",
"indentation is left as spaces (utf-8 0x20). If set to",
"`round-up` fractional indentation is replaced with a single",
"tab character (utf-8 0x09) effectively shifting the column",
"to the next tabstop"
],
"fractional_tab_policy": "use-space",
"_help_max_subgroups_hwrap": [
"If an argument group contains more than this many sub-groups",
"(parg or kwarg groups) then force it to a vertical layout."
],
"max_subgroups_hwrap": 2,
"_help_max_pargs_hwrap": [
"If a positional argument group contains more than this many",
"arguments, then force it to a vertical layout."
],
"max_pargs_hwrap": 6,
"_help_max_rows_cmdline": [
"If a cmdline positional group consumes more than this many",
"lines without nesting, then invalidate the layout (and nest)"
],
"max_rows_cmdline": 2,
"_help_separate_ctrl_name_with_space": [
"If true, separate flow control names from their parentheses",
"with a space"
],
"separate_ctrl_name_with_space": false,
"_help_separate_fn_name_with_space": [
"If true, separate function names from parentheses with a",
"space"
],
"separate_fn_name_with_space": false,
"_help_dangle_parens": [
"If a statement is wrapped to more than one line, than dangle",
"the closing parenthesis on its own line."
],
"dangle_parens": false,
"_help_dangle_align": [
"If the trailing parenthesis must be 'dangled' on its on",
"line, then align it to this reference: `prefix`: the start",
"of the statement, `prefix-indent`: the start of the",
"statement, plus one indentation level, `child`: align to",
"the column of the arguments"
],
"dangle_align": "prefix",
"_help_min_prefix_chars": [
"If the statement spelling length (including space and",
"parenthesis) is smaller than this amount, then force reject",
"nested layouts."
],
"min_prefix_chars": 4,
"_help_max_prefix_chars": [
"If the statement spelling length (including space and",
"parenthesis) is larger than the tab width by more than this",
"amount, then force reject un-nested layouts."
],
"max_prefix_chars": 10,
"_help_max_lines_hwrap": [
"If a candidate layout is wrapped horizontally but it exceeds",
"this many lines, then reject the layout."
],
"max_lines_hwrap": 2,
"_help_line_ending": [
"What style line endings to use in the output."
],
"line_ending": "unix",
"_help_command_case": [
"Format command names consistently as 'lower' or 'upper' case"
],
"command_case": "canonical",
"_help_keyword_case": [
"Format keywords consistently as 'lower' or 'upper' case"
],
"keyword_case": "unchanged",
"_help_always_wrap": [
"A list of command names which should always be wrapped"
],
"always_wrap": [],
"_help_enable_sort": [
"If true, the argument lists which are known to be sortable",
"will be sorted lexicographicall"
],
"enable_sort": true,
"_help_autosort": [
"If true, the parsers may infer whether or not an argument",
"list is sortable (without annotation)."
],
"autosort": false,
"_help_require_valid_layout": [
"By default, if cmake-format cannot successfully fit",
"everything into the desired linewidth it will apply the",
"last, most aggressive attempt that it made. If this flag is",
"True, however, cmake-format will print error, exit with non-",
"zero status code, and write-out nothing"
],
"require_valid_layout": false,
"_help_layout_passes": [
"A dictionary mapping layout nodes to a list of wrap",
"decisions. See the documentation for more information."
],
"layout_passes": {}
},
"_help_markup": "Options affecting comment reflow and formatting.",
"markup": {
"_help_bullet_char": [
"What character to use for bulleted lists"
],
"bullet_char": "*",
"_help_enum_char": [
"What character to use as punctuation after numerals in an",
"enumerated list"
],
"enum_char": ".",
"_help_first_comment_is_literal": [
"If comment markup is enabled, don't reflow the first comment",
"block in each listfile. Use this to preserve formatting of",
"your copyright/license statements."
],
"first_comment_is_literal": false,
"_help_literal_comment_pattern": [
"If comment markup is enabled, don't reflow any comment block",
"which matches this (regex) pattern. Default is `None`",
"(disabled)."
],
"literal_comment_pattern": ".*",
"_help_fence_pattern": [
"Regular expression to match preformat fences in comments",
"default= ``r'^\\s*([`~]{3}[`~]*)(.*)$'``"
],
"fence_pattern": "^\\s*([`~]{3}[`~]*)(.*)$",
"_help_ruler_pattern": [
"Regular expression to match rulers in comments default=",
"``r'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'``"
],
"ruler_pattern": "^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$",
"_help_explicit_trailing_pattern": [
"If a comment line matches starts with this pattern then it",
"is explicitly a trailing comment for the preceding argument.",
"Default is '#<'"
],
"explicit_trailing_pattern": "#<",
"_help_hashruler_min_length": [
"If a comment line starts with at least this many consecutive",
"hash characters, then don't lstrip() them off. This allows",
"for lazy hash rulers where the first hash char is not",
"separated by space"
],
"hashruler_min_length": 10,
"_help_canonicalize_hashrulers": [
"If true, then insert a space between the first hash char and",
"remaining hash chars in a hash ruler, and normalize its",
"length to fill the column"
],
"canonicalize_hashrulers": true,
"_help_enable_markup": [
"enable comment markup parsing and reflow"
],
"enable_markup": true
},
"_help_lint": "Options affecting the linter",
"lint": {
"_help_disabled_codes": [
"a list of lint codes to disable"
],
"disabled_codes": [],
"_help_function_pattern": [
"regular expression pattern describing valid function names"
],
"function_pattern": "[0-9a-z_]+",
"_help_macro_pattern": [
"regular expression pattern describing valid macro names"
],
"macro_pattern": "[0-9A-Z_]+",
"_help_global_var_pattern": [
"regular expression pattern describing valid names for",
"variables with global (cache) scope"
],
"global_var_pattern": "[A-Z][0-9A-Z_]+",
"_help_internal_var_pattern": [
"regular expression pattern describing valid names for",
"variables with global scope (but internal semantic)"
],
"internal_var_pattern": "_[A-Z][0-9A-Z_]+",
"_help_local_var_pattern": [
"regular expression pattern describing valid names for",
"variables with local scope"
],
"local_var_pattern": "[a-z][a-z0-9_]+",
"_help_private_var_pattern": [
"regular expression pattern describing valid names for",
"privatedirectory variables"
],
"private_var_pattern": "_[0-9a-z_]+",
"_help_public_var_pattern": [
"regular expression pattern describing valid names for public",
"directory variables"
],
"public_var_pattern": "[A-Z][0-9A-Z_]+",
"_help_argument_var_pattern": [
"regular expression pattern describing valid names for",
"function/macro arguments and loop variables."
],
"argument_var_pattern": "[a-z][a-z0-9_]+",
"_help_keyword_pattern": [
"regular expression pattern describing valid names for",
"keywords used in functions or macros"
],
"keyword_pattern": "[A-Z][0-9A-Z_]+",
"_help_max_conditionals_custom_parser": [
"In the heuristic for C0201, how many conditionals to match",
"within a loop in before considering the loop a parser."
],
"max_conditionals_custom_parser": 2,
"_help_min_statement_spacing": [
"Require at least this many newlines between statements"
],
"min_statement_spacing": 1,
"_help_max_statement_spacing": [
"Require no more than this many newlines between statements"
],
"max_statement_spacing": 2,
"max_returns": 6,
"max_branches": 12,
"max_arguments": 5,
"max_localvars": 15,
"max_statements": 50
},
"_help_encode": "Options affecting file encoding",
"encode": {
"_help_emit_byteorder_mark": [
"If true, emit the unicode byte-order mark (BOM) at the start",
"of the file"
],
"emit_byteorder_mark": false,
"_help_input_encoding": [
"Specify the encoding of the input file. Defaults to utf-8"
],
"input_encoding": "utf-8",
"_help_output_encoding": [
"Specify the encoding of the output file. Defaults to utf-8.",
"Note that cmake only claims to support utf-8 so be careful",
"when using anything else"
],
"output_encoding": "utf-8"
},
"_help_misc": "Miscellaneous configurations options.",
"misc": {
"_help_per_command": [
"A dictionary containing any per-command configuration",
"overrides. Currently only `command_case` is supported."
],
"per_command": {}
}
}

View File

@ -0,0 +1,19 @@
name: clang-format Check
on: [pull_request]
jobs:
formatting-check:
name: Formatting Check
runs-on: ubuntu-latest
strategy:
matrix:
path:
- 'include'
- 'src'
- 'test'
steps:
- uses: actions/checkout@v2
- name: Run clang-format style check for C/C++/Protobuf programs.
uses: jidicula/clang-format-action@v4.8.0
with:
clang-format-version: '14'
check-path: ${{ matrix.path }}

5
.gitignore vendored
View File

@ -30,3 +30,8 @@
*.exe
*.out
*.app
build/
build_debug/
.vscode/

9
.gitmodules vendored Normal file
View File

@ -0,0 +1,9 @@
[submodule "3rd-party/pybind11"]
path = 3rd-party/pybind11
url = git@github.com:pybind/pybind11.git
[submodule "3rd-party/nlohmann_json_cmake_fetchcontent"]
path = 3rd-party/nlohmann_json_cmake_fetchcontent
url = git@github.com:ArthurSonzogni/nlohmann_json_cmake_fetchcontent.git
[submodule "3rd-party/googletest"]
path = 3rd-party/googletest
url = git@github.com:google/googletest.git

1
3rd-party/googletest vendored Submodule

@ -0,0 +1 @@
Subproject commit e2239ee6043f73722e7aa812a459f54a28552929

@ -0,0 +1 @@
Subproject commit 6aebf09233951e4ce30a63919186a70b2b195756

1
3rd-party/pybind11 vendored Submodule

@ -0,0 +1 @@
Subproject commit 1e3400b6742288429f2069aaf5febf92d0662dae

91
CMakeLists.txt Normal file
View File

@ -0,0 +1,91 @@
# TODO: check the minimum cmake version
cmake_minimum_required(VERSION 3.9) # Required by find_package(OpenMP)
include(CMakeDependentOption)
project(InfiniTensor C CXX)
# Build options.
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
option(BUILD_TEST "Build tests" ON)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Compiler settings: C++17, no GNU extensions, warnings as errors.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wno-error=deprecated-declarations")
# -UNDEBUG keeps assert() active even in optimized builds.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
find_package(
Python
COMPONENTS Interpreter Development
REQUIRED)
find_package(CUDA REQUIRED)
# OpenMP
find_package(OpenMP)
if(OpenMP_C_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
endif()
if(OpenMP_CXX_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
include_directories(include)
# # Pybind11
# add_subdirectory(3rd-party/pybind11)
# include_directories(3rd-party/pybind11/include)
# nlohmann_json
add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
if(BUILD_TEST)
set(BUILD_GMOCK
OFF
CACHE BOOL "Do not build gmock" FORCE)
set(INSTALL_GTEST
OFF
CACHE BOOL "Do not install gtest" FORCE)
add_subdirectory(3rd-party/googletest)
include_directories(3rd-party/googletest/googletest/include)
endif()
file(GLOB_RECURSE SRC src/*.cc src/*.cu)
# file(GLOB_RECURSE FFI src/ffi/ffi_pet.cc)
# list(REMOVE_ITEM SRC ${TEST} ${FFI})
add_library(InfiniTensor SHARED ${SRC})
# Target
# cuda_add_library(it SHARED ${SRC})
# cuda_add_cublas_to_target(it) # cublas
# # target_link_libraries(infini_cpp cudnn curand nlohmann_json::nlohmann_json pybind11::embed)
# # Python bindings
# pybind11_add_module(infini MODULE ${FFI})
# target_link_libraries(infini PRIVATE infini_cpp)
# build_test(<glob>): create one gtest executable (and ctest entry) per
# matching source file, linked against InfiniTensor.
function(build_test files)
# Non-recursive glob so failed/excluded tests can be skipped per-directory
file(GLOB TEST_SOURCES ${files})
foreach(testsourcefile ${TEST_SOURCES})
get_filename_component(testname ${testsourcefile} NAME_WE)
add_executable(${testname} ${testsourcefile})
target_link_libraries(${testname} InfiniTensor GTest::gtest_main)
add_test(NAME ${testname} COMMAND ${testname})
endforeach(testsourcefile ${TEST_SOURCES})
endfunction()
if(BUILD_TEST)
enable_testing()
if(BUILD_TEST_CORE)
build_test(test/core/*.cc)
endif()
if(BUILD_TEST_PET)
build_test(test/pet/*.cc)
endif()
if(BUILD_TEST_EINNET)
build_test(test/nnet/*.cc)
endif()
endif()

61
include/core/common.h Normal file
View File

@ -0,0 +1,61 @@
#pragma once
#include <cassert>
#include <cstdio> // puts (used by IT_TODO_SKIP)
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <optional>
#include <set>
#include <sstream>
#include <stdexcept> // std::runtime_error (used by IT_ASSERT)
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace infini {
using std::list;
using std::map;
using std::pair;
using std::set;
using std::string;
using std::tie;
using std::to_string;
using std::tuple;
using std::unordered_map;
using std::vector;

// Aliases
using dtype = float;
using HashType = size_t; // compatible with std::hash

// Metaprogramming utilities
#define _CAT(A, B) A##B
#define _SELECT(NAME, NUM) _CAT(NAME##_, NUM)
#define _GET_COUNT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, ...) COUNT
#define _VA_SIZE(...) _GET_COUNT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
#define _VA_SELECT(NAME, ...) _SELECT(NAME, _VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
// Two-level stringization so macro arguments (e.g. __LINE__) are expanded
// before being turned into a string literal.
#define _IT_STRINGIZE(x) #x
#define _IT_TOSTRING(x) _IT_STRINGIZE(x)

// Assert: conditions should have no side effect.
// Throws std::runtime_error with file:line and the failed expression.
#define _IT_ASSERT_2(name, info)                                               \
    (static_cast<bool>(name)                                                   \
         ? void(0)                                                             \
         : throw std::runtime_error(                                           \
               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
               "] Assertion failed (" + #name + "): " + #info))
// No trailing semicolon: the caller supplies it, and an embedded one would
// break use inside expressions or unbraced if/else.
#define _IT_ASSERT_1(name) _IT_ASSERT_2(name, "")
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)

#define IT_TODO_HALT() IT_ASSERT(false, "Unimplemented")
// Fixed: __LINE__ expands to an integer token, so it must be stringized
// before concatenation with the surrounding string literals; the previous
// form ("..." __FILE__ ":" __LINE__) failed to compile wherever it was used.
#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" _IT_TOSTRING(__LINE__))

// Other utilities

// std::to_underlying is available since C++23; this is a local stand-in.
template <typename T> auto enum_to_underlying(T e) {
    return static_cast<std::underlying_type_t<T>>(e);
}

// Average wall-clock time of func in milliseconds (defined in a .cc file).
double timeit(const std::function<void()> &func);

} // namespace infini

47
include/core/graph.h Normal file
View File

@ -0,0 +1,47 @@
#pragma once
#include "core/operator.h"
#include "core/tensor.h"
namespace infini {
// TODO: graph should be attached to a context
// A computation graph: owns its tensors and operators.
// NOTE(review): addTensor/addOp never populate `inputs`/`outputs` or any
// connectivity; presumably updateConnection (TODO below) will — confirm
// before relying on getInputs()/getOutputs().
class GraphNode : public Object {
protected:
TensorVec tensors; // every tensor created via addTensor
TensorVec inputs; // graph-level inputs (currently never filled)
TensorVec outputs; // graph-level outputs (currently never filled)
OpVec ops; // operators in insertion order
public:
// Graph(OpVec oplist);
string toString() const override;
// Append an operator; no connectivity bookkeeping is performed here.
void addOp(Operator op) { ops.push_back(op); };
const TensorVec &getTensors() const { return tensors; }
const TensorVec &getInputs() const { return inputs; }
const TensorVec &getOutputs() const { return outputs; }
const OpVec &getOperators() const { return ops; }
// TensorVec &getInputs();
// TensorVec &getOutputs();
// Create a tensor owned by this graph and return a handle to it.
// NOTE(review): default dtype is Int32 while the project-wide `dtype`
// alias is float — confirm this default is intended.
Tensor addTensor(Shape dim, DataType dtype = DataType::Int32) {
Tensor tensor = make_ref<TensorNode>(dim, dtype);
tensors.emplace_back(tensor);
return tensor;
}
// Allocates backing storage for the graph's tensors (defined in .cc).
void dataMalloc();
private:
// TODO: updateConnection
/**
* @brief Add reverse connections and Op relationship in ctor.
*/
void updateConnection();
// TODO: move to another class
// bool exportOnnx(const char *path);
// bool importOnnx(const char *net);
};
} // namespace infini

76
include/core/kernel.h Normal file
View File

@ -0,0 +1,76 @@
#pragma once
#include "core/common.h"
#include "core/operator.h"
#include "core/tensor.h"
namespace infini {
// Result of tuning/benchmarking one kernel invocation.
struct PerfRecord {
double time; // in milliseconds
};
// Abstract interface every backend kernel implements.
class Kernel {
public:
Kernel() {}
virtual ~Kernel() {}
/**
* @param op The operator to be executed.
* @param record The parameters for kernel execution. If extra parameters
* are required, inherit from PerfRecord and add extra parameters.
* Otherwise, use PerfRecord directly.
*/
virtual void compute(const Operator &op,
const PerfRecord &record) const = 0;
/**
* @brief Executes an op with a default parameter.
*/
virtual void compute(const Operator &op) const = 0;
// Premise: op is idempotent since it is called multiple times.
virtual PerfRecord tune(const Operator &op) const = 0;
};
// Process-wide registry mapping KernelAttrs -> kernel implementation.
// Owns the registered Kernel objects (deleted in the destructor).
class KernelRegistry {
public:
using KernelRecord =
tuple<Kernel *const, const string, const int>; // Kernel, name, ID
private:
std::map<KernelAttrs, KernelRecord> kernels;
int nKernels = 0; // monotonically increasing, used as the registration ID
public:
~KernelRegistry() {
for (auto &[k, v] : kernels)
delete std::get<0>(v);
}
// Singleton accessor.
static KernelRegistry &getInstance() {
static KernelRegistry instance;
return instance;
}
// Registers `kernel` (taking ownership); asserts on duplicate keys.
bool registerKernel(const KernelAttrs &key, Kernel *kernel, string name) {
// TODO: multiple kernels support: priority and check name
IT_ASSERT(kernels.find(key) == kernels.end(),
"Kernel already registered");
kernels.emplace(key, KernelRecord{kernel, name, ++nKernels});
return true;
}
// Throws (std::out_of_range from map::at) if the key is unregistered.
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
return std::get<0>(kernels.at(kernelAttrs));
}
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
return kernels.at(kernelAttrs);
}
};
} // namespace infini
// Registers a kernel at static-initialization time via a file-local bool.
// __COUNTER__ (passed as `cnt`) makes each registration variable unique.
#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
namespace infini { \
static const bool _CAT(_register_kernel_, cnt) = \
KernelRegistry::getInstance().registerKernel( \
KernelAttrs{device, opType, dataType}, new kernel(), name); \
}
#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
_REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)

19
include/core/mutator.h Normal file
View File

@ -0,0 +1,19 @@
#pragma once
#include "core/graph.h"
namespace infini {
/// Abstract base for graph mutators: subclasses generate mutated
/// (transformed) variants of an input graph.
class Mutator {
  private:
    // Upper bound on the number of candidates a run may generate.
    int candidatesLimit;
    // // Statistical data
    // int numTotalCandidates;

  public:
    /// @param candidatesLimit cap on generated mutation candidates.
    Mutator(int candidatesLimit) : candidatesLimit(candidatesLimit) {}
    virtual ~Mutator() = default;
    /// Produce mutated variants of `in_graph`.
    virtual vector<Graph> run(const Graph &in_graph) = 0;
};
} // namespace infini

54
include/core/object.h Normal file
View File

@ -0,0 +1,54 @@
#pragma once
#include "core/common.h"
#include "ref.h"
namespace infini {
using GuidBaseType = int;
// Process-unique object id drawn from a monotonically increasing counter.
// Copying or assigning deliberately generates a FRESH id instead of copying
// the source's, so every live object keeps a distinct guid.
// NOTE(review): the counter is an unsynchronized function-local static —
// not thread-safe; confirm objects are only constructed single-threaded.
class Guid {
private:
GuidBaseType guid;
private:
// Returns the next id; the first id handed out is 1 (0 never occurs).
GuidBaseType generateGuid() {
static GuidBaseType guidCnt = 0;
return ++guidCnt;
}
public:
Guid() { guid = generateGuid(); }
// Intentionally does NOT copy rhs.guid (see class comment).
Guid(const Guid &rhs) { guid = generateGuid(); }
// Assignment also regenerates; even self-assignment yields a new id.
Guid &operator=(const Guid &rhs) {
guid = generateGuid();
return *this;
}
operator GuidBaseType() const { return guid; }
};
// Common base class: provides a guid and a printable representation.
class Object {
protected:
Guid guid;
public:
virtual ~Object(){};
// Human-readable description, implemented by each subclass.
virtual string toString() const = 0;
void print() { std::cout << toString() << std::endl; }
Guid getGuid() const { return guid; }
};
inline std::ostream &operator<<(std::ostream &os, const Object &obj) {
os << obj.toString();
return os;
}
// Overload for Ref-wrapped Object
template <typename T,
typename std::enable_if_t<std::is_base_of_v<Object, T>> * = nullptr>
inline std::ostream &operator<<(std::ostream &os, const Ref<T> &obj) {
os << obj->toString();
return os;
}
} // namespace infini

180
include/core/operator.h Normal file
View File

@ -0,0 +1,180 @@
#pragma once
#include "core/tensor.h"
namespace infini {
// Operator kinds. Numeric ranges group coarse categories (per the inline
// section comments): 100+ "linear" ops, 200+ element-wise ops, 300+
// memory-bound ops.
enum class OpType {
Unknown = 0,
// linear
Conv = 100,
Matmul,
ConvTrans,
G2BMM,
GBMML,
Pad,
Slice,
Concat,
Split,
Transpose,
Extend,
MaxPool,
AvgPool,
Add,
Sub,
Mul,
Div,
Pow,
Gather,
ReduceMean,
Reshape,
Identity,
// element wise
BatchNorm = 200,
Softmax,
Activation,
Resize,
//
MemBound = 300,
};
// Execution backends a kernel can target.
enum class Device { CPU = 1, CUDA };
// Kernel lookup key: (backend, operator kind, element data type).
using KernelAttrs = std::tuple<Device, OpType, DataType>;
class OpRegistry {
  public:
    /**
     * @brief Map an OpType to its human-readable name (e.g. "Conv").
     * Asserts (throws) on enumerator values not listed below.
     */
    static std::string getOpName(OpType opType) {
#define FOP(op)                                                                \
    case OpType::op:                                                           \
        return #op
        switch (opType) {
            FOP(Unknown);
            // linear
            FOP(Conv);
            FOP(Matmul);
            FOP(ConvTrans);
            FOP(G2BMM);
            FOP(GBMML);
            FOP(Pad);
            FOP(Slice);
            FOP(Concat);
            FOP(Split);
            FOP(Transpose);
            FOP(Extend);
            FOP(MaxPool);
            FOP(AvgPool);
            FOP(Add);
            FOP(Sub);
            FOP(Mul);
            FOP(Div);
            FOP(Pow);
            FOP(Gather);
            FOP(ReduceMean);
            FOP(Reshape);
            FOP(Identity);
            // element wise
            FOP(BatchNorm);
            FOP(Softmax);
            FOP(Activation);
            FOP(Resize); // was missing: OpType::Resize previously hit the
                         // assert below instead of returning its name
            //
            FOP(MemBound);
        default:
            IT_ASSERT(false);
            break;
        }
#undef FOP
        // Unreachable (every path above returns or throws); present to
        // silence "control reaches end of non-void function" warnings.
        return "Unknown";
    }
};
// Activation fused into (or attached to) an operator.
enum class ActType {
None,
Relu,
Sigmoid,
Tanh,
};
// Key identifying an operator configuration for perf-data lookup.
struct OpPerfKey {
HashType hash; // precomputed hash of (opType, attrs)
OpType opType;
vector<int> attrs; // operator-specific attributes (shapes, strides, ...)
public:
OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
: hash(hash), opType(opType), attrs(attrs) {}
bool operator==(const OpPerfKey &rhs) const {
if (hash != rhs.hash)
return false;
if (opType != rhs.opType)
return false;
if (attrs != rhs.attrs)
return false;
return true;
}
// TODO: remove this function after we use unordered_map in PerfEngine
// Strict weak ordering for std::map keys. Note: attrs are ordered by
// SIZE first, then element-wise — intentionally not plain lexicographic
// vector comparison; keep in sync with operator== above.
bool operator<(const OpPerfKey &rhs) const {
if (hash != rhs.hash)
return hash < rhs.hash;
if (opType != rhs.opType)
return opType < rhs.opType;
if (attrs.size() != rhs.attrs.size())
return attrs.size() < rhs.attrs.size();
for (size_t i = 0; i < attrs.size(); ++i)
if (attrs[i] != rhs.attrs[i])
return attrs[i] < rhs.attrs[i];
return false;
}
};
// Base class of all operators: holds the op kind plus input/output tensors.
class OperatorNode : public Object {
friend class Kernel;
protected:
OpType type;
TensorVec inputs;
TensorVec outputs;
// vector<WRef<Operator>> predecessors;
// vector<WRef<Operator>> successors;
public:
OperatorNode(OpType opType, TensorVec inputs, TensorVec outputs)
: type(opType), inputs(inputs), outputs(outputs) {}
// Infer the output shapes from the (already-set) input tensors.
virtual vector<Shape> computeShape() const = 0;
// Key used by PerfEngine to cache tuning results for this op.
virtual OpPerfKey getOpPerfKey() const = 0;
public: // check Op type
bool isLinearOp() const;
bool isElementWiseOp() const;
bool isSplitOp() const;
bool isConcatOp() const;
bool isComputeOp() const;
bool isTransposeOp() const;
bool isReshapeOp() const;
bool isMemBoundOp() const;
public: // getter and setter
// TensorVec getInputs() { return inputs; }
const TensorVec &getInputs() const { return inputs; }
// TensorVec getOutputs() { return outputs; }
const TensorVec &getOutputs() const { return outputs; }
// i-th input tensor; throws (vector::at) when out of range.
Tensor getInputs(size_t i) { return inputs.at(i); }
// Sole output; asserts that the op has exactly one output.
Tensor getOutput() const {
IT_ASSERT(outputs.size() == 1, "Unimplemented");
return outputs[0];
}
OpType getOpType() const { return type; }
virtual int numInputs() const = 0;
virtual int numOutputs() const = 0;
// Default implementations throw "Unimplemented" via IT_TODO_HALT.
virtual HashType hash() const { IT_TODO_HALT(); }
virtual HashType hashWithShape() const { IT_TODO_HALT(); }
};
} // namespace infini
namespace std {
// Hash support: reuses the precomputed OpPerfKey::hash field.
template <> struct hash<infini::OpPerfKey> {
size_t operator()(const infini::OpPerfKey &key) const { return key.hash; }
};
} // namespace std

View File

@ -0,0 +1,36 @@
#pragma once
#include "core/graph.h"
#include "core/kernel.h"
namespace infini {
// Process-wide cache of kernel tuning results, keyed by
// (kernel attributes, operator perf key).
class PerfEngine {
  public:
    // TODO: Key should be OpPerfKey + Context (maybe implicit) to support
    // multiple candidate kernels.
    using Key = std::pair<KernelAttrs, OpPerfKey>;

  private:
    map<Key, PerfRecord> data;

  public:
    /// Singleton accessor.
    static PerfEngine &getInstance() {
        static PerfEngine instance;
        return instance;
    }

    /**
     * @brief Look up a previously recorded perf result.
     * @return The record, or std::nullopt if this key was never tuned.
     */
    std::optional<PerfRecord> getPerfData(const Key &key) {
        auto it = data.find(key);
        if (it != data.end()) // find previous evaluating results
            return it->second; // reuse iterator: avoids a second map lookup
        return std::nullopt;
    }

    /// Record a result; each key may be recorded at most once (asserts).
    void setPerfData(const Key &key, const PerfRecord &record) {
        IT_ASSERT(data.find(key) == data.end(), "Perf data already exist");
        data.emplace(key, record);
    }
};
} // namespace infini

35
include/core/ref.h Normal file
View File

@ -0,0 +1,35 @@
#pragma once
#include <functional> // hash
#include <memory>
#include <type_traits>
namespace infini {
// Owning, reference-counted handle (shared ownership).
template <typename T> using Ref = std::shared_ptr<T>;
// Non-owning observer of a Ref-managed object.
template <typename T> using WRef = std::weak_ptr<T>;

// Trait: true iff the type is itself a Ref/WRef wrapper.
template <typename T> struct is_ref : std::false_type {};
template <typename T> struct is_ref<Ref<T>> : std::true_type {};
template <typename T> struct is_ref<WRef<T>> : std::true_type {};

// Construct a T in place and return an owning Ref to it.
template <typename T, typename... Args> Ref<T> make_ref(Args &&...args) {
    static_assert(!is_ref<T>::value, "Ref should not be nested");
    return std::make_shared<T>(std::forward<Args>(args)...);
}

// Checked downcast between Ref-managed types; yields nullptr on failure.
template <class T, class U,
          typename std::enable_if_t<std::is_base_of_v<U, T>> * = nullptr>
Ref<T> as(const Ref<U> &ref) {
    return std::dynamic_pointer_cast<T>(ref);
}

// Convert a vector of owning Refs into a vector of weak observers.
template <typename T>
std::vector<WRef<T>> get_wref_vec(const std::vector<Ref<T>> &vec) {
    return std::vector<WRef<T>>(vec.begin(), vec.end());
}
} // namespace infini

26
include/core/run_enigne.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include "core/graph.h"
#include "core/kernel.h"
#include "core/perf_engine.h"
namespace infini {
// Executes a graph's operators on a single device, optionally tuning
// kernels and collecting per-op timing. All members defined in a .cc file.
// NOTE(review): this header's filename ("run_enigne.h") misspells
// "engine" — consider renaming before it is widely included.
class RunEngine {
private:
Device device; // backend all kernels are dispatched to
public:
RunEngine(Device device) : device(device) {}
~RunEngine() {}
// Run every operator in `graph`. `tune` forces kernel tuning;
// `profiling` gathers and prints per-op timing.
void run(const Graph &graph, bool tune = false,
bool profiling = false) const;
// Total graph time from recorded perf data; presumably milliseconds
// (PerfRecord::time is documented as ms) — confirm in the .cc file.
double getPerfTime(const Graph &graph, bool profiling = false) const;
private:
// Pretty-print accumulated per-OpType time and invocation counts.
void printProfilingData(double totTime,
const std::map<OpType, double> &opTime,
const std::map<OpType, int> &opCnt) const;
};
} // namespace infini

180
include/core/tensor.h Normal file
View File

@ -0,0 +1,180 @@
#pragma once
#include "core/tensor_base.h"
namespace infini {
// TODO: how to deal with this
using ShapeElem = int;
using Shape = vector<ShapeElem>;
// An n-dimensional tensor: TensorBaseNode plus a concrete shape.
// Non-inline members are defined in a .cc file.
class TensorNode : public TensorBaseNode {
private:
Shape shape; // extent of each dimension
public:
TensorNode(const Shape &shape, DataType dtype);
virtual ~TensorNode() {}
string toString() const override;
// Total number of elements (product of the dims).
size_t size() const;
// Allocate the backing data buffer.
void dataMalloc();
// Returns the shape by value (copy).
Shape getDims() const { return shape; }
// Linear offset of the element at multi-dimensional position `ds`.
size_t getOffset(const Shape &ds) const;
using TensorBaseNode::getData;
// Element at multi-dimensional position `pos`.
VType getData(const Shape &pos) const;
// Copy `size()` elements from `dptr` into this tensor's buffer.
void copyData(VType *dptr);
void printData() const;
// Element-wise comparison against another tensor.
bool equalData(const Tensor &rhs) const;
// ---- Legacy code from the previous codebase, kept commented out for
// ---- reference during the port; not part of the current interface.
// void setDims(const Dim &dms) { dims = dms; }
// bool dataRand(int seed = 0) {
// if (data == nullptr)
// data = new VType[size()];
// if (!random_inited)
// initFastrand();
// // srand(seed);
// // faster rand generator; parallel
// size_t iEnd = size();
// // std::cerr << "Init beginned " << std::endl;
// #pragma omp parallel for
// for (size_t i = 0; i < iEnd; ++i)
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
// 10000;
// // std::cerr << "Init finished" << std::endl;
// computed = ComputedFull;
// return true;
// }
// bool setScalar(VType val) {
// if (data == nullptr || !dims.empty())
// return false;
// data[0] = val;
// return true;
// }
// bool setData(const Dim &ds, VType val) {
// if (data == nullptr || ds.size() != dims.size())
// return false;
// data[getOffset(ds)] = val;
// return true;
// }
// bool setData(size_t pos, VType val) {
// if (data == nullptr || pos >= size())
// return false;
// data[pos] = val;
// return true;
// }
// VType getScalar() { return data == nullptr ? 0 : data[0]; }
// VType getBroadcastData(const Dim &ds) {
// assert(data != nullptr);
// auto offset = getBroadcastOffset(ds);
// return offset == (size_t)-1 ? 0 : data[getOffset(ds)];
// }
// VType getBroadcastData(size_t pos) {
// assert(data != nullptr);
// return data[pos % size()];
// }
// size_t getBroadcastOffset(const Dim &ds) {
// assert(ds.size() >= dims.size());
// auto nDim = dims.size();
// auto nBroadcastDim = ds.size() - nDim;
// for (size_t i = 0; i < nDim; ++i)
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
// dims[i])
// return (size_t)-1;
// size_t idx = 0;
// for (size_t i = 0; i < nDim; ++i)
// idx = idx * dims[i] + ds[nBroadcastDim + i];
// return idx;
// }
// void itInit() { it = Dim(dims.size(), 0); }
// void itReset() {
// itInit();
// for (size_t i = 0, iEnd = it.size(); i < iEnd; ++i)
// it[i] = 0;
// }
// bool itValid() {
// if (it.size() != dims.size())
// return false;
// for (size_t i = 0, iEnd = it.size(); i < iEnd; ++i)
// if (it[i] >= dims[i])
// return false;
// return true;
// }
// const Dim &itGet() { return it; }
// void itNext() {
// auto p = it.size() - 1;
// it[p] += 1;
// while (p >= 1) {
// if (it[p] == dims[p]) {
// it[p] = 0;
// it[--p] += 1;
// } else
// break;
// }
// }
// TensorType getType() const { return type; }
// void setType(TensorType ty) { type = ty; }
// static inline void initFastrand() {
// assert(omp_get_max_threads() <= 256);
// // srand(0); // constant seed for test
// // align random_seed to avoid false sharing
// for (int i = 0; i < 256 * 16; ++i) {
// // random_seed[i] = rand();
// // constant random seed for test
// random_seed[i] = i;
// }
// random_inited = true;
// }
// static inline int fastrand(int &g_seed) {
// g_seed = (214013 * g_seed + 2531011);
// return (g_seed >> 16) & 0x7FFF;
// }
// std::vector<std::vector<int>> const *getSplittingPoints() const {
// assert(!splittingPoints.empty());
// return &splittingPoints;
// }
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
// assert(!value.empty());
// splittingPoints = value;
// return true;
// }
// void printSplittingPoints() {
// if (splittingPoints.empty())
// printf("Empty SplittingPoints");
// else {
// printf("[");
// for (auto &vs : splittingPoints) {
// printf("[");
// for (auto v : vs)
// printf("%2d,", v);
// printf("],");
// }
// printf("]");
// }
// }
// void initSplittingPoints() {
// splittingPoints.resize(getDims().size()); }
// void printShape();
};
} // namespace infini

261
include/core/tensor_base.h Normal file
View File

@ -0,0 +1,261 @@
#pragma once
#include "core/object.h"
#include "core/ref.h"
namespace infini {
// Forward declarations of the core graph entities.
// class Tensor;
class TensorBaseNode;
class TensorNode;
class OperatorNode;
class GraphNode;
// Reference-counted handles used throughout the framework.
using TensorBase = Ref<TensorBaseNode>;
using Tensor = Ref<TensorNode>;
using Operator = Ref<OperatorNode>;
using Graph = Ref<GraphNode>;
using TensorVec = vector<Tensor>;
using OpVec = vector<Operator>;
// Raw element type of the tensor data buffer (see TensorBaseNode::data).
using VType = uint32_t;
// Element data types a tensor may hold.
enum class DataType {
    Float32,
    Int32,
};
/**
 * @brief Base node for tensors: holds the rank, element type,
 * producer/consumer links and the raw data buffer. The large commented-out
 * section below is legacy API preserved for reference while the class is
 * being ported; it is not part of the current interface.
 */
class TensorBaseNode : public Object {
  public:
    // enum TensorType {
    //     Input,
    //     Weight,
    //     Invalid,
    //     NotCounted,
    // };
  protected:
    int dim;        // number of dimensions (rank) — presumably; confirm against TensorNode
    DataType dtype; // element type of the stored data
    // NOTE(review): element type is TensorBaseNode, but the commented-out
    // accessors below treat these as Operator links — confirm intended type.
    vector<WRef<TensorBaseNode>> inputOf; // ops that consume this tensor
    WRef<TensorBaseNode> outputOf;        // op that produces this tensor
    // TODO: Ref<void> -> Ref<Blob>
    Ref<VType[]> data; // raw buffer; may be null before allocation
    // ComputeState computed;
    // static int random_seed[256 * 16];
    // static bool random_inited;
  public:
    TensorBaseNode(int dim, DataType dtype);
    virtual ~TensorBaseNode() {}
    // Returns the (possibly null) shared data buffer.
    Ref<VType[]> getDataPtr() const { return data; }
    // Returns the element at linear offset `offset`; defined out of line.
    VType getData(size_t offset) const;
    DataType getDType() const { return dtype; }
    // uint64_t getHash() const { return hash; }
    // ---- Legacy API kept for reference during the port ----
    // void setInputOf(const OpVec &ops) {
    //     inputOf.clear();
    //     for (const auto &op : ops)
    //         inputOf.emplace_back(op);
    // }
    // void addInputOf(Operator op) { inputOf.emplace_back(op); }
    // void setOutputOf(Operator op) { outputOf = op; }
    // const OpVec &getInputOf() { return inputOf; }
    // Operator *getOutputOf() { return outputOf; }
    // std::pair<Operator *, int> getOutputOfWithIndex();
    // const Dim &getDims() const { return dims; }
    // void setDims(const Dim &dms) { dims = dms; }
    // bool dataRand(int seed = 0) {
    //     if (data == nullptr)
    //         data = new VType[size()];
    //     if (!random_inited)
    //         initFastrand();
    //     // srand(seed);
    //     // faster rand generator; parallel
    //     size_t iEnd = size();
    //     // std::cerr << "Init beginned " << std::endl;
    // #pragma omp parallel for
    //     for (size_t i = 0; i < iEnd; ++i)
    //         data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
    //         10000;
    //     // std::cerr << "Init finished" << std::endl;
    //     computed = ComputedFull;
    //     return true;
    // }
    // bool setScalar(VType val) {
    //     if (data == nullptr || !dims.empty())
    //         return false;
    //     data[0] = val;
    //     return true;
    // }
    // bool setData(const Dim &ds, VType val) {
    //     if (data == nullptr || ds.size() != dims.size())
    //         return false;
    //     data[getOffset(ds)] = val;
    //     return true;
    // }
    // bool setData(size_t pos, VType val) {
    //     if (data == nullptr || pos >= size())
    //         return false;
    //     data[pos] = val;
    //     return true;
    // }
    // VType getScalar() { return data == nullptr ? 0 : data[0]; }
    // VType getData(const Dim &ds) {
    //     assert(data != nullptr);
    //     auto offset = getOffset(ds);
    //     return offset == (size_t)-1 ? 0 : data[getOffset(ds)];
    // }
    // VType getData(size_t pos) {
    //     assert(data != nullptr);
    //     assert(pos < size());
    //     return data[pos];
    // }
    // VType *getDataPtr() const { return data; }
    // size_t getOffset(const Dim &ds) {
    //     auto nDim = ds.size();
    //     assert(dims.size() == nDim);
    //     if (ds.empty())
    //         return 0;
    //     for (size_t i = 0; i < nDim; ++i)
    //         if (ds[i] < 0 || ds[i] >= dims[i])
    //             return (size_t)-1;
    //     size_t idx = ds[0];
    //     size_t dm = 0;
    //     while (++dm < nDim)
    //         idx = idx * dims[dm] + ds[dm];
    //     return idx;
    // }
    // VType getBroadcastData(const Dim &ds) {
    //     assert(data != nullptr);
    //     auto offset = getBroadcastOffset(ds);
    //     return offset == (size_t)-1 ? 0 : data[getOffset(ds)];
    // }
    // VType getBroadcastData(size_t pos) {
    //     assert(data != nullptr);
    //     return data[pos % size()];
    // }
    // size_t getBroadcastOffset(const Dim &ds) {
    //     assert(ds.size() >= dims.size());
    //     auto nDim = dims.size();
    //     auto nBroadcastDim = ds.size() - nDim;
    //     for (size_t i = 0; i < nDim; ++i)
    //         if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
    //         dims[i])
    //             return (size_t)-1;
    //     size_t idx = 0;
    //     for (size_t i = 0; i < nDim; ++i)
    //         idx = idx * dims[i] + ds[nBroadcastDim + i];
    //     return idx;
    // }
    // void itInit() { it = Dim(dims.size(), 0); }
    // void itReset() {
    //     itInit();
    //     for (size_t i = 0, iEnd = it.size(); i < iEnd; ++i)
    //         it[i] = 0;
    // }
    // bool itValid() {
    //     if (it.size() != dims.size())
    //         return false;
    //     for (size_t i = 0, iEnd = it.size(); i < iEnd; ++i)
    //         if (it[i] >= dims[i])
    //             return false;
    //     return true;
    // }
    // const Dim &itGet() { return it; }
    // void itNext() {
    //     auto p = it.size() - 1;
    //     it[p] += 1;
    //     while (p >= 1) {
    //         if (it[p] == dims[p]) {
    //             it[p] = 0;
    //             it[--p] += 1;
    //         } else
    //             break;
    //     }
    // }
    // size_t size() const {
    //     size_t sz = 1;
    //     auto dm = dims.size();
    //     while (dm > 0)
    //         sz *= dims[--dm];
    //     return sz;
    // }
    // TensorType getType() const { return type; }
    // void setType(TensorType ty) { type = ty; }
    // static inline void initFastrand() {
    //     assert(omp_get_max_threads() <= 256);
    //     // srand(0); // constant seed for test
    //     // align random_seed to avoid false sharing
    //     for (int i = 0; i < 256 * 16; ++i) {
    //         // random_seed[i] = rand();
    //         // constant random seed for test
    //         random_seed[i] = i;
    //     }
    //     random_inited = true;
    // }
    // static inline int fastrand(int &g_seed) {
    //     g_seed = (214013 * g_seed + 2531011);
    //     return (g_seed >> 16) & 0x7FFF;
    // }
    // std::vector<std::vector<int>> const *getSplittingPoints() const {
    //     assert(!splittingPoints.empty());
    //     return &splittingPoints;
    // }
    // bool setSplittingPoints(std::vector<std::vector<int>> value) {
    //     assert(!value.empty());
    //     splittingPoints = value;
    //     return true;
    // }
    // void printSplittingPoints() {
    //     if (splittingPoints.empty())
    //         printf("Empty SplittingPoints");
    //     else {
    //         printf("[");
    //         for (auto &vs : splittingPoints) {
    //             printf("[");
    //             for (auto v : vs)
    //                 printf("%2d,", v);
    //             printf("],");
    //         }
    //         printf("]");
    //     }
    // }
    // void initSplittingPoints() {
    //     splittingPoints.resize(getDims().size()); }
    // void printShape();
};
} // namespace infini

View File

@ -0,0 +1,15 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Derivation pass that tries to match the current expression against
 * known computation kernels (see transform for the matching entry point).
 */
class MatchComputationKernel : public Pass {
  public:
    MatchComputationKernel(Derivator &derivator)
        : Pass(derivator, "MatchComputationKernel") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
};
} // namespace nnet

View File

@ -0,0 +1,15 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Derivation pass that tries to match the current expression against
 * memory-bound kernels.
 */
class MatchMemBoundKernel : public Pass {
  public:
    MatchMemBoundKernel(Derivator &derivator)
        : Pass(derivator, "MatchMemBoundKernel") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
};
} // namespace nnet

41
include/nnet/Pass/Pass.h Normal file
View File

@ -0,0 +1,41 @@
#pragma once
#include "nnet/derivator.h"
namespace nnet {
/**
 * @brief Abstract base class of all derivation passes. A pass inspects the
 * current expression, records candidate transformations, and reports each
 * derivation step back to the Derivator via nextStep().
 */
class Pass {
  private:
    VecExpr transformations; // candidate expressions produced by this pass

  protected:
    Derivator &derivator;
    string passName; // human-readable name, used in derivation logs
    /**
     * @brief Whether to record logs in the Derivator. Should be false when
     * testing a single Pass to avoid a mismatch between passInfos and
     * passMsgs caused by differing numbers of "run" and "nextStep" calls.
     */
    bool enableLogging, enableDebug;

    // Core hook implemented by each concrete pass.
    virtual void transform(Formula &origin, int depth, Expr &rCur) = 0;
    // Registers a derivation step (optionally with a rule description).
    void nextStep(Formula &origin, int depth, Expr &rCur, Expr newCur,
                  const string &ruleInfo = "");
    Var getNewVar();        // fresh iterator variable from the derivator
    string newTensorName(); // fresh unique tensor name from the derivator

  private:
    void initialize(Formula &origin, const Expr &rCur);
    void finalize();

  public:
    Pass(Derivator &derivator, const string &passName);
    virtual ~Pass();
    // Entry point: runs initialize(), transform(), finalize().
    void run(Formula &origin, int dfsDepth, Expr &rCur);
    void setEnableLogging(bool value);
    void setEnableDebug(bool value);
    const VecExpr &getTransformations();
};
} // namespace nnet

View File

@ -0,0 +1,18 @@
#pragma once
#include "nnet/Pass/Pass.h"
#include "nnet/ReplaceKit.h"
namespace nnet {
/**
 * @brief Rule 1: split a loop variable into multiple variables and rewrite
 * the iterators of the current RangeOp accordingly.
 */
class Rule1VariableSplit : public Pass {
  public:
    Rule1VariableSplit(Derivator &derivator)
        : Pass(derivator, "Rule1VariableSplit") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    // Enumerates variables of rangeOp that can be split.
    vector<Replace> getSplitableVar(const RangeOp &rangeOp);
    // Applies a Replace to the iterators inside cur.
    Expr replaceIters(Expr cur, const Replace &replace);
};
} // namespace nnet

View File

@ -0,0 +1,29 @@
#pragma once
#include "nnet/Pass/Pass.h"
#include "nnet/ReplaceKit.h"
namespace nnet {
/**
 * @brief Rule 2: merge two loop iterators into one variable when their
 * access strides permit it.
 */
class Rule2VariableMerging : public Pass {
  private:
    map<int, vector<Var>> substituteRules;

  public:
    Rule2VariableMerging(Derivator &derivator)
        : Pass(derivator, "Rule2VariableMerging") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    // Enumerates iterator pairs of rangeOp that can be merged.
    vector<Replace> getMergableReplaces(RangeOp rangeOp, int depth);
    // Builds a Replace merging two loop iterators (pairs of <iter, stride>).
    optional<Replace> getReplaceMergingTwoLoopIters(const RangeOp &rangeOp,
                                                    pair<Iterator, int> pairA,
                                                    pair<Iterator, int> pairB,
                                                    const IteratorTable &exprIT,
                                                    int tensorID);
    // Builds a Replace mapping two loop iterators onto a single new one.
    optional<Replace> getReplaceMappingTwoLoopIters(const RangeOp &rangeOp,
                                                    pair<Iterator, int> pa,
                                                    pair<Iterator, int> pb);
};
} // namespace nnet

View File

@ -0,0 +1,19 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 3: split one stage into two by partitioning its summation
 * iterators.
 */
class Rule3StageSplit : public Pass {
  private:
    map<int, vector<Var>> substituteRules;

  public:
    Rule3StageSplit(Derivator &derivator)
        : Pass(derivator, "Rule3StageSplit") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    // Enumerates ways to partition the summation iterators of rangeOp.
    vector<vector<Var>> getSplitSummationIters(RangeOp rangeOp);
};
} // namespace nnet

View File

@ -0,0 +1,22 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 4: merge two adjacent stages into one.
 */
class Rule4StageMerging : public Pass {
    bool success, mergeStageWithCalc; // last-run status / merge-policy flag

  public:
    Rule4StageMerging(Derivator &derivator)
        : Pass(derivator, "Rule4StageMerging"), success(false),
          mergeStageWithCalc(false) {}
    // Standalone entry point; returns whether a merge was performed.
    bool rule4StageMerging(Formula &origin, int depth, Expr &rCur,
                           bool mergeStageWithCalc = false);
    // Whether the last run merged successfully.
    bool isSuccessful();
    // Allow merging stages that contain calculations.
    void setMergeStageWithCalc(bool value);

  private:
    virtual void transform(Formula &origin, int depth, Expr &rCur) override;
};
} // namespace nnet

View File

@ -0,0 +1,16 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 5: relax the iteration ranges of the current RangeOp.
 */
class Rule5RangeRelaxation : public Pass {
  public:
    Rule5RangeRelaxation(Derivator &derivator)
        : Pass(derivator, "Rule5RangeRelaxation") {}
    // Standalone entry point; returns the relaxed expression.
    Expr rule5RangeRelaxation(Formula &origin, int depth, Expr &rCur);

  private:
    virtual void transform(Formula &origin, int depth, Expr &rCur) override;
};
} // namespace nnet

View File

@ -0,0 +1,17 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 6: match the current stage against known kernels.
 * NOTE(review): class name misspells "Kernel"; kept as-is since renaming
 * would break source compatibility with other translation units.
 */
class Rule6KenerlMatching : public Pass {
  public:
    Rule6KenerlMatching(Derivator &derivator)
        : Pass(derivator, "Rule6KenerlMatching") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    // RE: separating this func is a choice.
    VecExpr matchElementWise(const RangeOp &rangeOp);
};
} // namespace nnet

View File

@ -0,0 +1,16 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 7: apply data layout transformation (DLT) to the current
 * stage.
 */
class Rule7DLT : public Pass {
  public:
    Rule7DLT(Derivator &derivator) : Pass(derivator, "Rule7DLT") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    // Wraps newSummand in a RangeOp mirroring the original's ranges.
    Expr buildDLTSingleRangeOp(const RangeOp &original, const Expr &newSummand);
    // Candidate split factors for the layout transformation.
    vector<int> getFactors();
};
} // namespace nnet

View File

@ -0,0 +1,48 @@
#pragma once
#include "nnet/Pass/Pass.h"
#include "nnet/ReplaceKit.h"
namespace nnet {
/**
 * @brief Rule 8: guided data layout transformation, driven by the mismatch
 * between the current expression and a target kernel pattern.
 */
class Rule8GuidedDLT : public Pass {
  public:
    Rule8GuidedDLT(Derivator &derivator) : Pass(derivator, "Rule8GuidedDLT") {}
    // Standalone entry point returning all guided-DLT candidates.
    VecExpr guidedDLT(Formula &origin, int depth, Expr &rCur,
                      bool debug = false);

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    /**
     * @brief If only one row miss match (more iterators mismatch), directly do
     * data layout construction according to the IT.
     *
     * @return Expr Return nullptr if failed.
     */
    Expr guidedDLTMoreVar2(const RangeOp &cur, const Mismatch &mismatch,
                           const IteratorTable &exprIT, const Pattern &pattern);
    /**
     * @brief Check whether two iterators overlap each other. If overlapping, we
     * cannot simply reconstruct the tensor into a new one by separating all
     * iterators into different dimensions.
     */
    bool checkElementsHaveOnlyOneAccessIteratorSet(const IteratorTable &exprIT,
                                                   int tensorID);
    /**
     * @brief Only product of two tensors can be guided DLTed.
     *
     * @param cur
     * @return true
     * @return false
     */
    bool statisfyGuidedDLT(RangeOp cur) const;
    /**
     * @brief Deal with output DLT mismatch only.
     */
    Expr guidedDLTDLMismatch(const RangeOp &cur, const Mismatch &mismatch,
                             const IteratorTable &exprIT,
                             const Pattern &pattern);
    // Builds the re-laid-out source tensor for the guided DLT.
    Expr buildGuidedDLTSource(const Subscript &originalSub, Replace replace,
                              vector<Var> tensorDimAxes, vector<int> newShape);
};
} // namespace nnet

View File

@ -0,0 +1,16 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 90: match a two-stage element-wise computation.
 */
class Rule90TwoStageElementWise : public Pass {
  public:
    Rule90TwoStageElementWise(Derivator &derivator)
        : Pass(derivator, "Rule90TwoStageElementWise") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
    VecExpr matchTwoStageElementWise(const RangeOp &rangeOp);
};
} // namespace nnet

View File

@ -0,0 +1,15 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 91: merge stages whose inner stage contains a summation.
 */
class Rule91MergeStagesWithSum : public Pass {
  public:
    Rule91MergeStagesWithSum(Derivator &derivator)
        : Pass(derivator, "Rule91MergeStagesWithSum") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
};
} // namespace nnet

View File

@ -0,0 +1,15 @@
#pragma once
#include "nnet/Pass/Pass.h"
namespace nnet {
/**
 * @brief Rule 9: magnify iteration ranges of the current RangeOp.
 */
class Rule9RangeMagnify : public Pass {
  public:
    Rule9RangeMagnify(Derivator &derivator)
        : Pass(derivator, "Rule9RangeMagnify") {}

  private:
    virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override;
};
} // namespace nnet

46
include/nnet/ReplaceKit.h Normal file
View File

@ -0,0 +1,46 @@
#pragma once
#include "nnet/expr.h"
namespace nnet {
/**
 * @brief Describes an iterator substitution: a set of old iterators is
 * replaced by a set of new ones, related by the forward (phi) and backward
 * (psi) mapping expressions.
 */
struct Replace {
    int iteratorType;      // loop or summation iterator (see IteratorType)
    vector<Var> oldIters;  // i_1, ...
    vector<Var> newIters;  // j_1, ...
    VecExpr phis;          // j_1=\phi_1(i_1, ...), not necessary for Sum iter
    VecExpr psis;          // i_1=\psi_1(j_1, ...)
    vector<VarRangePair> newVarRanges; // ranges of the new iterators

    // True iff var is one of the iterators being replaced.
    bool isReplaced(Var var) const {
        for (const auto &iter : oldIters)
            if (iter->equal(var))
                return true;
        return false;
    }
    // Human-readable summary, for logging/debugging.
    string toReadable() const {
        string ret = "Old iters: " + serializeVec(oldIters) +
                     ", new iters: " + serializeVec(newIters);
        ret += " phis: " + serializeVec(phis) + " psis: " + serializeVec(psis);
        return ret;
    }
};
/**
 * @brief Stateless helpers for applying a Replace to expressions and
 * rebuilding the surrounding RangeOp/Subscript structure.
 */
class ReplaceKit {
  public:
    // Rebuilds rangeOp with its iterators substituted per replace.
    static RangeOp replaceRangeOpIterator(const RangeOp &rangeOp,
                                          const Replace &replace,
                                          const Expr &replacedSummand);
    // NOTE(review): "Subscirpt" misspells "Subscript"; kept for ABI/source
    // compatibility with existing callers.
    static Subscript buildSubscirptForLoopVarReplace(const RangeOp &inner,
                                                     const Replace &replace);
    // Wraps a DLT-ed inner stage in an outer RangeOp matching the original.
    static RangeOp buildDLTOuterRangeOp(const RangeOp &original,
                                        const Subscript &subscriptedNewRangeOp);
    // Substitutes each pattern with its replacement inside cur.
    static Expr replaceMultipleExprs(const Expr &cur,
                                     const vector<Var> &patterns,
                                     const VecExpr &replacements,
                                     bool simplify = true);
    // Substitutes a single pattern with replacement inside cur.
    static Expr replaceExpr(const Expr &cur, const Expr &pattern,
                            const Expr &replacement);
};
} // namespace nnet

View File

@ -0,0 +1,38 @@
#pragma once
#include "nnet/Visitor/StrideVisitor.h"
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that lowers an NNet expression tree to TVM-compatible
 * Python statements, collecting input/output names and shapes along the way.
 */
class AsTVMVisitor : public Functor<std::string(void)> {
  private:
    int nStage = 0, curStage = -1;
    std::unordered_map<std::string, int> offset; // per-name index offsets
    std::vector<std::string> inputs;             // input tensor names
    std::string output;                          // output tensor name
    std::vector<std::string> pythonVars;         // emitted Python variables
    std::vector<std::vector<int>> inputShapes;
    std::vector<int> outputShape;
    std::string stmts; // accumulated generated statements

  public:
    std::string getStmts() const;
    const std::vector<std::string> &getInputs() const { return inputs; }
    const std::string &getOutput() const { return output; }
    const std::vector<std::vector<int>> &getInputShapes() const {
        return inputShapes;
    }
    const std::vector<int> &getOutputShape() const { return outputShape; }
    std::string visit_(const Constant &c) override;
    std::string visit_(const BinaryOp &c) override;
    std::string visit_(const Func &c) override;
    std::string visit_(const RangeOp &c) override;
    std::string visit_(const Subscript &c) override;
    std::string visit_(const Var &c) override;
    std::string visit_(const Tensor &c) override;
};
} // namespace nnet

View File

@ -0,0 +1,22 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that detects out-of-bound (OOB) subscript accesses inside
 * a RangeOp.
 */
class CheckOOBVisitor : public ExprTreeVisitor {
    RangeOp rangeOp;
    bool detect = false; // set once an OOB access is found

  public:
    CheckOOBVisitor(int _verobse = 0) : ExprTreeVisitor(1, 1, 0, 0, _verobse) {}
    void visit_(const Subscript &c) override;
    /**
     * @brief Checks all subscripts of _rangeOp for out-of-bound accesses.
     * @return true If there is OOB
     * @return false If there is no OOB
     */
    bool checkRangeOp(const RangeOp &_rangeOp);
};
} // namespace nnet

View File

@ -0,0 +1,16 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Clone ExprNodes in a stage except Tensor, Var, and Constant nodes, which
// are shared rather than deep-copied.
class CloneMutator : public Mutator {
  public:
    CloneMutator() : Mutator(false) {}
    Expr visit_(const Constant &c) override;
    Expr visit_(const Var &c) override;
    Expr visit_(const Tensor &c) override;
    // Entry point: returns a clone of c per the policy above.
    Expr clone(const Expr &c) { return dispatch(c); }
};
} // namespace nnet

View File

@ -0,0 +1,15 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that compares several formula roots for equivalence.
 */
class CompareMultiFormulasVisitor : public ExprTreeVisitor {
    vector<VarRangePair> newSumVarRanges;
    RangeOp newRangeOp;

  public:
    CompareMultiFormulasVisitor() : ExprTreeVisitor() {}
    // True iff all roots compare as equivalent.
    bool compare(const VecExpr &roots);
};
} // namespace nnet

View File

@ -0,0 +1,18 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that counts the routine kinds (Matmul, Conv, element-wise,
 * Sg2bmm, LongformerGBMM) appearing in an expression tree.
 */
class CountRoutineVisitor : public ExprTreeVisitor {
  private:
    vector<int> cnts; // per-routine-type counters

  public:
    CountRoutineVisitor(int _verobse = 0)
        : ExprTreeVisitor(1, 1, 1, 1, _verobse) {}
    void visit_(const Tensor &c) override;
    // Returns the routine counts found under root.
    vector<int> count(const Expr &root);
    // True iff the counts under root equal the expected numbers.
    bool match(const Expr &root, int nMatmul = 0, int nConv = 0,
               int nElement = 0, int nSg2bmm = 0, int nLongformerGBMM = 0);
};
} // namespace nnet

View File

@ -0,0 +1,25 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that prints a whole expression tree including all nested
 * stages and their producing routines.
 */
class FullPrinterVisitor : public ExprTreeVisitor {
  private:
    // Worklist of <output tensor name, routine, output tensor> tuples.
    vector<tuple<string, Routine, Tensor>> q;

  public:
    FullPrinterVisitor(int _verobse = 0)
        : ExprTreeVisitor(1, 1, 1, 0, _verobse) {}
    void visit_(const Tensor &c) override;
    // Returns a printable dump of root and everything it references.
    string print(const Expr &root);
    /**
     * @brief Get all tensors & OPs in a reversed order
     *
     * @param root
     * @return vector<<Output TensorName, RoutineNode, output tensor in NNet>>
     */
    const vector<tuple<string, Routine, Tensor>> &traverse(const Expr &root);
};
} // namespace nnet

View File

@ -0,0 +1,22 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Get all tensors in the stage, keyed by tensor name.
class GetTensorsVisitor : public ExprTreeVisitor {
  private:
    unordered_map<string, Tensor> tensors; // collected tensors by name

    void visit_(const Tensor &c) override;

  public:
    GetTensorsVisitor(int _verobse = 0)
        : ExprTreeVisitor(1, 1, 1, 0, _verobse) {}
    // Dispatches over c and returns the tensors found.
    auto get(const Expr &c) {
        dispatch(c);
        return tensors;
    }
};
} // namespace nnet

View File

@ -0,0 +1,31 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Calculate hash for a normal form, starting at a RangeOp.
class HashVisitor : public Functor<HashType(void)> {
    // Seeds for the BKDR string-hash variants used per node kind.
    inline const static HashType BKDR_SEED[] = {131, 313, 10007, 65599};

    PtrUmap<Iterator, int> varHash; // hash assigned to each iterator
    int nLoopVars = 0;
    PtrUmap<Iterator, int> name2id; // iterator -> stable id
    vector<int> rootId;
    vector<bool> haveAlias;
    int nVars = 0;
    vector<HashType> power; // precomputed seed powers

  private:
    HashType visit_(const Constant &c) override;
    HashType visit_(const BinaryOp &c) override;
    HashType visit_(const RangeOp &c) override;
    HashType visit_(const Subscript &c) override;
    HashType visit_(const Tensor &c) override;
    HashType visit_(const Var &c) override;

  public:
    HashVisitor(int _verobse = 0) : Functor(_verobse) {}
    // Entry point: returns the hash of the normal form rooted at c.
    HashType getHash(const Expr &c);
};
} // namespace nnet

View File

@ -0,0 +1,23 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that collects all input tensors of an expression tree.
 */
class InputVisitor : public ExprTreeVisitor {
    vector<Tensor> inputs; // collected input tensors

  public:
    int nInputs = 0;
    InputVisitor(int _verobse = 0) : ExprTreeVisitor(1, 1, 1, 0, _verobse) {}
    void visit_(const Tensor &c) override;
    /**
     * @brief Get all the inputs in the nested stages
     */
    vector<Tensor> getInputs(const RangeOp &_rangeOp) {
        dispatch(_rangeOp);
        return inputs;
    }
};
} // namespace nnet

View File

@ -0,0 +1,55 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Evaluates an NNet expression numerically at given output positions,
 * using integer test data supplied (or generated) for every input tensor.
 */
class Interpreter : public Functor<int()> {
  public:
    using ttype = int; // Test data type
    using rtype = int; // Return data type
    using Position = vector<int>;
    using Inputs = unordered_map<string, Ref<vector<ttype>>>;
    using Iteration = PtrUmap<Var, int>;

  private:
    // cache the input value
    Inputs inputs;
    vector<Iteration> iterations; // iterator bindings, one frame per stage
    vector<Position> positions;   // output positions being evaluated

    rtype visit_(const Constant &c) override;
    rtype visit_(const BinaryOp &c) override;
    rtype visit_(const RangeOp &c) override;
    rtype visit_(const Subscript &c) override;
    rtype visit_(const Var &c) override;
    rtype visit_(const Tensor &c) override;
    // int visit_(const Func &c); // Future work

    // Generates deterministic test inputs (0, 1, 2, ...) for range's tensors.
    static Inputs genInputStartingFromZero(const RangeOp &range);

  public:
    Interpreter(Inputs _inputs, int _verbose = 0)
        : Functor(_verbose), inputs(_inputs) {}
    Interpreter(RangeOp range, int _verbose = 0);
    /**
     * @brief Calculate the output at specified positions
     *
     * @param expr The expression to be calculated.
     * @param poses Positions of output.
     * @return vector<int> Value of output.
     */
    vector<rtype> interpret(const Expr &expr, const vector<Position> &poses);
    /**
     * @brief Calculate the output at equally spaced positions
     *
     * @param expr The expression to be calculated.
     * @param nPoses The number of calculated output positions.
     * @return vector<int> Value of output.
     */
    vector<rtype> interpretUniformSample(const RangeOp &range,
                                         int nPoses = 100);
    vector<rtype> interpretAllOutput(const RangeOp &range);
};
} // namespace nnet

View File

@ -0,0 +1,14 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that decides whether a RangeOp is equivalent to a pure
 * reshape of its input.
 */
class MatchReshapeVisitor : public Functor<bool(void)> {
  private:
    PtrMap<Iterator, int> _coefficient; // per-iterator linear coefficients

  public:
    bool visit_(const RangeOp &c) override;
};
} // namespace nnet

View File

@ -0,0 +1,60 @@
#pragma once
#include "nnet/Visitor/StrideVisitor.h"
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Visitor that builds the iterator/tensor appearance and stride
 * tables used for kernel pattern matching. Only summands of the form
 * lhs * rhs (a product of two subscripted tensors) are supported.
 */
class MatchTableVisitor
    : public Functor<void(const Tensor &, int dim, optional<int> stride)> {
  private:
    // Var -> {(tensor, dim)}
    Appearance appearance;
    vector<Tensor> tensors;
    vector<Subscript> subscripts;
    StrideTable strideTable;
    PtrMap<Iterator, vector<vector<int>>>
        strideInDim; // [Iterator][tensorID][dim]=stride

    // Intermediate variable
    // product of a sub-exprtree: Stride has to be done in two DFS
    SubexprSride subexprStride;
    bool hasUnsupportedOp = false; // set when an unsupported op is seen

  public:
    MatchTableVisitor(int _verobse = 0) : Functor(_verobse) {}
    void visit_(const BinaryOp &c, const Tensor &tensor, int dim,
                optional<int> stride) override;
    void visit_(const Subscript &c, const Tensor &tensor, int dim,
                optional<int> stride) override;
    void visit_(const Var &c, const Tensor &tensor, int dim,
                optional<int> stride) override;
    void visit_(const Constant &c, const Tensor &tensor, int dim,
                optional<int> stride) override;
    // void visit_(const Tensor &c, const Tensor &tensor) override;

    // Entry point: fills the tables for e; false if e is unsupported.
    [[nodiscard]] bool operator()(const RangeOp &e) {
        hasUnsupportedOp = false;
        // get the location and stride of each iterator
        auto mulOp = as<BinaryOpNode>(e->getSummand());
        // TODO [feature]: support complex index exprs
        if (!mulOp || mulOp->getOpType() != OpType::Mul) {
            nnet_unimplemented_continue();
            return false;
        }
        StrideVisitor strideVisitor(0);
        subexprStride = strideVisitor.getFormulaStride(e);
        dispatch(mulOp->getLhs(), nullptr, 0, 0);
        dispatch(mulOp->getRhs(), nullptr, 0, 0);
        subscripts.emplace_back(as<SubscriptNode>(mulOp->getLhs()));
        subscripts.emplace_back(as<SubscriptNode>(mulOp->getRhs()));
        assert(tensors.size() == subscripts.size());
        assert(tensors.size() < 5);
        return !hasUnsupportedOp;
    }
    auto getResult() const {
        return tuple(appearance, tensors, strideTable, subscripts);
    }
};
} // namespace nnet

View File

@ -0,0 +1,18 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Mutator that enumerates transposed variants of a Matmul tensor.
 */
class MatmulTransposeMutator : public Mutator {
    Derivator &derivator;

  public:
    MatmulTransposeMutator(Derivator &derivator)
        : Mutator(1), derivator(derivator) {}
    // Returns the transposed candidate expressions for tensor.
    VecExpr transpose(const Tensor &tensor);

  private:
    // Builds the transposed version of a Matmul input tensor.
    Tensor transposeInput(const Tensor &tensor);
};
} // namespace nnet

View File

@ -0,0 +1,20 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Mutator that merges a chain of membound kernels into a single
 * expression, processing from the last kernel back to the first.
 */
class MergeMemboundMutator : public Mutator {
    VecExpr kernels;
    int curDepth; // from the last one to the first one

    Expr visit_(const Tensor &c) override;
    // FIXME: duplicate code
    Expr rule4StageMerging(Expr &rCur, bool mergeStageWithCalc);
    // True iff any kernel in the chain is empty.
    bool checkEmpty();

  public:
    MergeMemboundMutator(const VecExpr &kernels)
        : Mutator(), kernels(kernels), curDepth(kernels.size() - 1) {}
    // Entry point: returns the merged expression.
    Expr merge(bool allowEmptyMembound = false);
};
} // namespace nnet

View File

@ -0,0 +1,43 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Since the output positions of operators always start from 0, we
 * have to offset them if the boundary expressions of the current RangeOp do
 * not start at 0.
 */
class PatternMatcher : public Functor<void(void)> {
  private:
    Derivator &derivator;
    bool hasNonZeroRange; // whether any loop range starts off 0
    const RangeOp originalCur;

  public:
    PatternMatcher(Derivator &derivator, const RangeOp &cur);
    /**
     * @brief Get the Cur whose loop vars are all offset to [0, x). Since
     * operator outputs start from 0, RangeOp has to be aligned.
     */
    RangeOp getOffsetCur();
    /**
     * @brief Add outer RangeOp to map the original positions to the new
     * positions starting from 0.
     *
     * @param exprs Tensors from matched exprs
     */
    VecExpr applyWrapper(const VecExpr &exprs);
    VecExpr matchWithPattern(const RangeOp &rangeOp, const Pattern &pattern);

  private:
    VecExpr matchKernel(const Pattern &pattern, const RangeOp &rangeOp,
                        IteratorTable &exprIT);
    // get reverse tensor and iterator map ([pattern tensor/iter ID] ->
    // real)
    Expr matchKernelWithTensorMap(const Pattern &pattern,
                                  const RangeOp &rangeOp,
                                  IteratorTable &exprIT);
};
} // namespace nnet

View File

@ -0,0 +1,25 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Mutator that enlarges summation ranges of a RangeOp and rewrites
 * the affected subscripts accordingly.
 */
class RangeMagnifyVisitor : public Mutator {
    vector<VarRangePair> newSumVarRanges; // target enlarged sum ranges
    RangeOp newRangeOp;                   // result under construction

  public:
    RangeMagnifyVisitor() : Mutator(0) {}
    Expr visit_(const RangeOp &c) override;
    Expr visit_(const Subscript &c) override;
    /**
     * @brief Magnifies root's summation ranges to _newSumVarRanges.
     *
     * @param root
     * @param _newSumVarRanges
     * @return RangeOp nullptr if failed to magnify
     */
    RangeOp magnify(const RangeOp &root,
                    const vector<VarRangePair> &_newSumVarRanges);
};
} // namespace nnet

View File

@ -0,0 +1,18 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Map from an iterator to its (possibly relaxed) value range.
using RangeMap = PtrMap<Iterator, Range>;
/**
 * @brief Functor that computes relaxed iterator ranges for a RangeOp by
 * intersecting the constraints gathered across its sub-expressions.
 */
class RangeRelaxFunctor : public Functor<RangeMap()> {
    RangeOp rangeOp;

  public:
    RangeRelaxFunctor(RangeOp _rangeOp) : Functor(false), rangeOp(_rangeOp) {}
    RangeMap visit_(const BinaryOp &c) override;
    RangeMap visit_(const RangeOp &c) override;
    RangeMap visit_(const Subscript &c) override;
    // Intersects the per-iterator ranges of a and b.
    RangeMap intersectRangeMaps(const RangeMap &a, const RangeMap &b);
};
} // namespace nnet

View File

@ -0,0 +1,20 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Replace a node in the summand, identified by its address.
// Only subscript and tensor are supported now.
class ReplaceNodeMutator : public Mutator {
    int nSubscripts = 0;
    ExprNode *target; // address of the node to replace
    Expr replacement;

  public:
    ReplaceNodeMutator() : Mutator(0) {}
    Expr visit_(const Subscript &c) override;
    Expr visit_(const Tensor &c) override;
    // Entry point: returns root with _target replaced by _replace.
    Expr replace(const Expr &root, ExprNode *_target, const Expr &_replace);
};
} // namespace nnet

View File

@ -0,0 +1,33 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
/**
 * @brief Mutator that substitutes variables (matched by hash) with
 * replacement expressions within the current stage only.
 */
class ReplaceVariable : public Mutator {
    VecExpr patterns, replacements;
    map<HashType, int> patternHash; // pattern hash -> index into patterns

  public:
    // Replace a single pattern with a single replacement.
    ReplaceVariable(Expr _pattern, Expr _replacement) : Mutator(false) {
        set({_pattern}, {_replacement});
    }
    // Replace many patterns at once; mapping values are <pattern, replacement>.
    ReplaceVariable(const map<string, pair<Expr, Expr>> &mapping)
        : Mutator(false) {
        VecExpr _patterns, _replacements;
        for (const auto &[_, v] : mapping) {
            _patterns.emplace_back(v.first);
            _replacements.emplace_back(v.second);
        }
        set(_patterns, _replacements);
    }
    Expr visit_(const BinaryOp &c) override;
    // NOT recur to the next stage
    Expr visit_(const RangeOp &c) override;
    Expr visit_(const Var &c) override;

  private:
    void set(VecExpr _pattern, VecExpr _replacement);
    // Returns the replacement for c, or nullptr when c matches no pattern.
    Expr match(const Expr &c);
};
} // namespace nnet

View File

@ -0,0 +1,52 @@
#pragma once
#include "nlohmann/json_fwd.hpp"
#include "nnet/visitor.h"
#include <memory>
namespace nnet {
/**
 * @brief Serializes NNet expressions to, and deserializes them from, JSON
 * files (using nlohmann::ordered_json to keep key order stable).
 */
class Serializer : public Functor<string()> {
    using json = nlohmann::ordered_json;

  private:
    static constexpr int VERSION{1}; // file-format version written out
    std::unique_ptr<json> jPtr;
    json &j;          // alias of *jPtr for convenience
    static int id;    // running counter used to key serialized nodes

    string visit_(const Constant &c) override;
    string visit_(const BinaryOp &c) override;
    string visit_(const RangeOp &c) override;
    string visit_(const Subscript &c) override;
    string visit_(const Var &c) override;
    string visit_(const Tensor &c) override;
    string dispatchRoutine(const Routine &c);

    // Reconstructs the expression / routine stored under `key` in j.
    Expr buildExprTree(string key);
    Routine buildRoutine(string key);

  public:
    Serializer(int _verobse = 0);
    virtual ~Serializer();
    /**
     * @brief Serialize the given expression to json file
     *
     * @param expr The expression to be serialized
     * @param filePath The path of json file to be output
     * @param msg Message of derivation
     * @return bool Whether the serialization succeed
     */
    bool serialize(const Expr &expr, const string &filePath,
                   const string &msg = "");
    /**
     * @brief Deserialize the given json file to expression
     *
     * @param filePath The path to file to be deserialized
     * @return Expression deserialized from the given json file
     */
    Expr deserialize(const string &filePath);
};
} // namespace nnet

View File

@ -0,0 +1,38 @@
#pragma once
#include "nnet/Visitor/StrideVisitor.h"
#include "nnet/visitor.h"
namespace nnet {
// Simplify an index expression tree by collecting per-iterator strides and
// the constant term.
class SimplifyExprVisitor : public Functor<void(optional<int> stride)> {
  private:
    SubexprSride subexprStride;
    int constant;                  // accumulated constant term
    PtrMap<Iterator, int> strides; // [var]=strides
    map<pair<Iterator, int>, int, RefValueLess<pair<Iterator, int>>> divStrides,
        modStrides; // 3*(i%8): [<i,8>]=3

    // For divde and modulo with expr as dividend: 3*((i+1)%8): [<i+1,8>]=3
    map<pair<Expr, int>, int, RefAddrLess<pair<Expr, int>>> divExprStrides,
        modExprStrides;

  public:
    SimplifyExprVisitor() : Functor(0) {}
    void visit_(const BinaryOp &c, optional<int> stride) override;
    void visit_(const Var &c, optional<int> stride) override;
    void visit_(const Constant &c, optional<int> stride) override;
    // Collects and returns the per-iterator strides of expr.
    PtrMap<Iterator, int> getStrides(const Expr &expr);
    // TODO [refactor]: move this to SimplifyFormulaMutator as a member func
    // this class should be get coefficients in a expr
    Expr simplify(const Expr &expr);
    // Returns only the constant term of expr.
    int getConstant(const Expr &expr);
    pair<PtrMap<Iterator, int>, int> getStridesConstant(const Expr &expr);
    // Value range of expr under rangeOp's iterator ranges, if derivable.
    optional<Range> getExprRange(const Expr &expr, const RangeOp &rangeOp);
    PtrMap<Iterator, int> getStrides() { return strides; }
    const auto &getDivStrides() { return divStrides; }
    const auto &getModStrides() { return modStrides; }
};
} // namespace nnet

View File

@ -0,0 +1,18 @@
#pragma once
#include "nnet/Visitor/StrideVisitor.h"
#include "nnet/visitor.h"
namespace nnet {
// Simplify all indexes in subscripts in an expression tree.
class SimplifyFormulaMutator : public Mutator {
    int nSubscripts = 0; // number of subscripts visited

  public:
    SimplifyFormulaMutator() : Mutator(0) {}
    Expr visit_(const Subscript &c) override;
    // Expr visit_(const BinaryOp &c) override;
    // Entry point: returns expr with all subscript indexes simplified.
    Expr simplify(const Expr &expr);
};
} // namespace nnet

View File

@ -0,0 +1,38 @@
#pragma once
#include "nnet/visitor.h"
namespace nnet {
// Map from each visited sub-expression to its stride (nullopt = unknown).
// NOTE(review): "SubexprSride" is a typo for "SubexprStride"; it is also used
// by SimplifyExprVisitor.h, so rename both files together.
using SubexprSride = map<const ExprNode *, optional<int>>;

// Computes the stride of every sub-expression of an index expression.
class StrideVisitor : public Functor<optional<int>(void)> {
  private:
    SubexprSride subexprStride;

  public:
    // NOTE(review): "_verobse" is a typo for "_verbose".
    StrideVisitor(int _verobse = 0) : Functor(_verobse) {}
    optional<int> visit_(const BinaryOp &c) override;
    optional<int> visit_(const Subscript &c) override;
    optional<int> visit_(const Var &c) override;
    optional<int> visit_(const Constant &c) override;
    // void visit_(const Tensor &c, const Tensor &tensor) override;

    // Strides for the summand of a RangeOp. Only a multiplication summand is
    // supported; other shapes are reported as unimplemented.
    auto getFormulaStride(const RangeOp &e) {
        subexprStride.clear();
        // get the location and stride of each iterator
        auto mulOp = as<BinaryOpNode>(e->getSummand());
        // TODO [feature]: support complex index exprs
        if (!mulOp) {
            // Fix: as<> returns a null Ref when the summand is not a
            // BinaryOp; the old code fell through and dereferenced it.
            nnet_unimplemented_continue();
            return subexprStride;
        }
        if (mulOp->getOpType() != OpType::Mul)
            nnet_unimplemented_continue();
        dispatch(mulOp->getLhs());
        dispatch(mulOp->getRhs());
        return subexprStride;
    }

    // Strides for an arbitrary expression tree.
    [[nodiscard]] auto getExprStride(const Expr &e) {
        subexprStride.clear();
        dispatch(e);
        return subexprStride;
    }
};
} // namespace nnet

77
include/nnet/common.h Normal file
View File

@ -0,0 +1,77 @@
#pragma once
#include "dbg.h"
#include <cassert>
#include <functional>
#include <list>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace nnet {
using std::dynamic_pointer_cast;
using std::endl;
using std::list;
using std::make_pair;
using std::make_shared;
using std::make_tuple;
using std::map;
using std::max;
using std::min;
using std::nullopt;
using std::optional;
using std::pair;
using std::set;
using std::shared_ptr;
using std::string;
using std::tie;
using std::to_string;
using std::tuple;
using std::unique_ptr;
using std::unordered_map;
template <typename T> using uset = std::unordered_set<T>;
using std::vector;
using std::weak_ptr;
// Aliases
using dtype = float;
using HashType = int;

// Comparator / hash / equality functors that act on the *pointees* rather
// than the pointer values, for smart-pointer keys in (un)ordered containers.
template <typename T> struct ptr_less {
    bool operator()(const T &lhs, const T &rhs) const { return *lhs < *rhs; }
};
template <typename T> struct ptr_hash {
    size_t operator()(const T &lhs) const {
        // Fix: decltype(*lhs) is a reference type (e.g. int &) and std::hash
        // has no specialization for references, so instantiating it failed to
        // compile; decay to the value type first.
        return std::hash<std::decay_t<decltype(*lhs)>>()(*lhs);
    }
};
template <typename T> struct ptr_equal {
    bool operator()(const T &lhs, const T &rhs) const { return *lhs == *rhs; }
};

// Combine two hash values modulo the prime 1e9+7.
// Fix: a can be up to ~1e9, so a * 10007 overflows 32-bit int (UB);
// do the arithmetic in 64 bits and narrow only at the end.
static inline HashType genhash(HashType a, HashType b) {
    return static_cast<HashType>(
        (static_cast<long long>(a) * 10007 + b + 12345) % 1000000007);
}
// Hash a string by folding its characters with genhash.
static inline HashType genhash(const std::string &s) {
    HashType ret = 0;
    for (auto c : s)
        ret = genhash(ret, c);
    return ret;
}
// Abort (via assert) when an unimplemented code path is reached.
#define nnet_unimplemented_halt()                                              \
    { assert(!"Unimplemented"); }
// Log (via dbg) an unimplemented case and keep going.
#define nnet_unimplemented_continue()                                          \
    { dbg("Unimplemented"); }
// assert with an attached human-readable message; msg is discarded via the
// comma operator, so only expr is evaluated for the check.
#define nnet_assert(expr, msg) assert(((void)(msg), (expr)))
// Render a pointer as a hex string (declared here, defined elsewhere).
std::string pointer_to_hex(void *i);
} // namespace nnet

858
include/nnet/dbg.h Normal file
View File

@ -0,0 +1,858 @@
/*****************************************************************************
dbg(...) macro
License (MIT):
Copyright (c) 2019 David Peter <mail@david-peter.de>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*****************************************************************************/
#ifndef DBG_MACRO_DBG_H
#define DBG_MACRO_DBG_H
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
#define DBG_MACRO_UNIX
#elif defined(_MSC_VER)
#define DBG_MACRO_WINDOWS
#endif
// #ifndef DBG_MACRO_NO_WARNING
// #pragma message("WARNING: the 'dbg.h' header is included in your code base")
// #endif // DBG_MACRO_NO_WARNING
#include <algorithm>
#include <chrono>
#include <ctime>
#include <iomanip>
#include <ios>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>
#ifdef DBG_MACRO_UNIX
#include <unistd.h>
#endif
#if __cplusplus >= 201703L
#define DBG_MACRO_CXX_STANDARD 17
#elif __cplusplus >= 201402L
#define DBG_MACRO_CXX_STANDARD 14
#else
#define DBG_MACRO_CXX_STANDARD 11
#endif
#if DBG_MACRO_CXX_STANDARD >= 17
#include <optional>
#include <variant>
#endif
namespace dbg {
#ifdef DBG_MACRO_UNIX
inline bool isColorizedOutputEnabled() { return isatty(fileno(stderr)); }
#else
inline bool isColorizedOutputEnabled() { return true; }
#endif
struct time {};
namespace pretty_function {
// Compiler-agnostic version of __PRETTY_FUNCTION__ and constants to
// extract the template argument in `type_name_impl`
#if defined(__clang__)
#define DBG_MACRO_PRETTY_FUNCTION __PRETTY_FUNCTION__
static constexpr size_t PREFIX_LENGTH =
sizeof("const char *dbg::type_name_impl() [T = ") - 1;
static constexpr size_t SUFFIX_LENGTH = sizeof("]") - 1;
#elif defined(__GNUC__) && !defined(__clang__)
#define DBG_MACRO_PRETTY_FUNCTION __PRETTY_FUNCTION__
static constexpr size_t PREFIX_LENGTH =
sizeof("const char* dbg::type_name_impl() [with T = ") - 1;
static constexpr size_t SUFFIX_LENGTH = sizeof("]") - 1;
#elif defined(_MSC_VER)
#define DBG_MACRO_PRETTY_FUNCTION __FUNCSIG__
static constexpr size_t PREFIX_LENGTH =
sizeof("const char *__cdecl dbg::type_name_impl<") - 1;
static constexpr size_t SUFFIX_LENGTH = sizeof(">(void)") - 1;
#else
#error "This compiler is currently not supported by dbg_macro."
#endif
} // namespace pretty_function
// Formatting helpers: tag an integral value with the numeric base it should
// be printed in; dbg::hex/oct/bin construct the tag.
template <typename T> struct print_formatted {
    static_assert(std::is_integral<T>::value,
                  "Only integral types are supported.");

    print_formatted(T value, int numeric_base)
        : inner(value), base(numeric_base) {}

    operator T() const { return inner; }

    // Literal prefix announcing the base: "0x" (16), "0o" (8), "0b" (2),
    // empty for anything else.
    const char *prefix() const {
        if (base == 16)
            return "0x";
        if (base == 8)
            return "0o";
        if (base == 2)
            return "0b";
        return "";
    }

    T inner;
    int base;
};

// Convenience constructors for the three supported bases.
template <typename T> print_formatted<T> hex(T value) {
    return {value, 16};
}
template <typename T> print_formatted<T> oct(T value) {
    return {value, 8};
}
template <typename T> print_formatted<T> bin(T value) {
    return {value, 2};
}
// Implementation of 'type_name<T>()'
template <typename T> const char *type_name_impl() {
return DBG_MACRO_PRETTY_FUNCTION;
}
template <typename T> struct type_tag {};
template <int &...ExplicitArgumentBarrier, typename T>
std::string get_type_name(type_tag<T>) {
namespace pf = pretty_function;
std::string type = type_name_impl<T>();
return type.substr(pf::PREFIX_LENGTH,
type.size() - pf::PREFIX_LENGTH - pf::SUFFIX_LENGTH);
}
template <typename T> std::string type_name() {
if (std::is_volatile<T>::value) {
if (std::is_pointer<T>::value) {
return type_name<typename std::remove_volatile<T>::type>() +
" volatile";
} else {
return "volatile " +
type_name<typename std::remove_volatile<T>::type>();
}
}
if (std::is_const<T>::value) {
if (std::is_pointer<T>::value) {
return type_name<typename std::remove_const<T>::type>() + " const";
} else {
return "const " + type_name<typename std::remove_const<T>::type>();
}
}
if (std::is_pointer<T>::value) {
return type_name<typename std::remove_pointer<T>::type>() + "*";
}
if (std::is_lvalue_reference<T>::value) {
return type_name<typename std::remove_reference<T>::type>() + "&";
}
if (std::is_rvalue_reference<T>::value) {
return type_name<typename std::remove_reference<T>::type>() + "&&";
}
return get_type_name(type_tag<T>{});
}
inline std::string get_type_name(type_tag<short>) { return "short"; }
inline std::string get_type_name(type_tag<unsigned short>) {
return "unsigned short";
}
inline std::string get_type_name(type_tag<long>) { return "long"; }
inline std::string get_type_name(type_tag<unsigned long>) {
return "unsigned long";
}
inline std::string get_type_name(type_tag<std::string>) {
return "std::string";
}
template <typename T>
std::string get_type_name(type_tag<std::vector<T, std::allocator<T>>>) {
return "std::vector<" + type_name<T>() + ">";
}
template <typename T1, typename T2>
std::string get_type_name(type_tag<std::pair<T1, T2>>) {
return "std::pair<" + type_name<T1>() + ", " + type_name<T2>() + ">";
}
template <typename... T> std::string type_list_to_string() {
std::string result;
auto unused = {(result += type_name<T>() + ", ", 0)..., 0};
static_cast<void>(unused);
#if DBG_MACRO_CXX_STANDARD >= 17
if constexpr (sizeof...(T) > 0) {
#else
if (sizeof...(T) > 0) {
#endif
result.pop_back();
result.pop_back();
}
return result;
}
template <typename... T> std::string get_type_name(type_tag<std::tuple<T...>>) {
return "std::tuple<" + type_list_to_string<T...>() + ">";
}
template <typename T>
inline std::string get_type_name(type_tag<print_formatted<T>>) {
return type_name<T>();
}
// Implementation of 'is_detected' to specialize for container-like types
namespace detail_detector {
struct nonesuch {
nonesuch() = delete;
~nonesuch() = delete;
nonesuch(nonesuch const &) = delete;
void operator=(nonesuch const &) = delete;
};
template <typename...> using void_t = void;
template <class Default, class AlwaysVoid, template <class...> class Op,
class... Args>
struct detector {
using value_t = std::false_type;
using type = Default;
};
template <class Default, template <class...> class Op, class... Args>
struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
using value_t = std::true_type;
using type = Op<Args...>;
};
} // namespace detail_detector
template <template <class...> class Op, class... Args>
using is_detected =
typename detail_detector::detector<detail_detector::nonesuch, void, Op,
Args...>::value_t;
namespace detail {
namespace {
using std::begin;
using std::end;
#if DBG_MACRO_CXX_STANDARD < 17
template <typename T> constexpr auto size(const T &c) -> decltype(c.size()) {
return c.size();
}
template <typename T, std::size_t N>
constexpr std::size_t size(const T (&)[N]) {
return N;
}
#else
using std::size;
#endif
} // namespace
template <typename T>
using detect_begin_t = decltype(detail::begin(std::declval<T>()));
template <typename T>
using detect_end_t = decltype(detail::end(std::declval<T>()));
template <typename T>
using detect_size_t = decltype(detail::size(std::declval<T>()));
template <typename T> struct is_container {
static constexpr bool value =
is_detected<detect_begin_t, T>::value &&
is_detected<detect_end_t, T>::value &&
is_detected<detect_size_t, T>::value &&
!std::is_same<std::string,
typename std::remove_cv<typename std::remove_reference<
T>::type>::type>::value;
};
template <typename T>
using ostream_operator_t =
decltype(std::declval<std::ostream &>() << std::declval<T>());
template <typename T>
struct has_ostream_operator : is_detected<ostream_operator_t, T> {};
} // namespace detail
// Helper to dbg(…)-print types
template <typename T> struct print_type {};
template <typename T> print_type<T> type() { return print_type<T>{}; }
// Forward declarations of "pretty_print"
template <typename T>
inline void pretty_print(std::ostream &stream, const T &value, std::true_type);
template <typename T>
inline void pretty_print(std::ostream &, const T &, std::false_type);
template <typename T>
inline typename std::enable_if<!detail::is_container<const T &>::value &&
!std::is_enum<T>::value,
bool>::type
pretty_print(std::ostream &stream, const T &value);
inline bool pretty_print(std::ostream &stream, const bool &value);
inline bool pretty_print(std::ostream &stream, const char &value);
template <typename P>
inline bool pretty_print(std::ostream &stream, P *const &value);
template <typename T, typename Deleter>
inline bool pretty_print(std::ostream &stream,
std::unique_ptr<T, Deleter> &value);
// template <typename T>
// inline bool pretty_print(std::ostream& stream, std::shared_ptr<T>& value);
template <size_t N>
inline bool pretty_print(std::ostream &stream, const char (&value)[N]);
template <>
inline bool pretty_print(std::ostream &stream, const char *const &value);
template <typename... Ts>
inline bool pretty_print(std::ostream &stream, const std::tuple<Ts...> &value);
template <>
inline bool pretty_print(std::ostream &stream, const std::tuple<> &);
template <> inline bool pretty_print(std::ostream &stream, const time &);
template <typename T>
inline bool pretty_print(std::ostream &stream, const print_formatted<T> &value);
template <typename T>
inline bool pretty_print(std::ostream &stream, const print_type<T> &);
template <typename Enum>
inline typename std::enable_if<std::is_enum<Enum>::value, bool>::type
pretty_print(std::ostream &stream, Enum const &value);
inline bool pretty_print(std::ostream &stream, const std::string &value);
#if DBG_MACRO_CXX_STANDARD >= 17
inline bool pretty_print(std::ostream &stream, const std::string_view &value);
#endif
template <typename T1, typename T2>
inline bool pretty_print(std::ostream &stream, const std::pair<T1, T2> &value);
#if DBG_MACRO_CXX_STANDARD >= 17
template <typename T>
inline bool pretty_print(std::ostream &stream, const std::optional<T> &value);
template <typename... Ts>
inline bool pretty_print(std::ostream &stream,
const std::variant<Ts...> &value);
#endif
template <typename Container>
inline typename std::enable_if<detail::is_container<const Container &>::value,
bool>::type
pretty_print(std::ostream &stream, const Container &value);
// Specializations of "pretty_print"
template <typename T>
inline void pretty_print(std::ostream &stream, const T &value, std::true_type) {
stream << value;
}
template <typename T>
inline void pretty_print(std::ostream &, const T &, std::false_type) {
static_assert(detail::has_ostream_operator<const T &>::value,
"Type does not support the << ostream operator");
}
template <typename T>
inline typename std::enable_if<!detail::is_container<const T &>::value &&
!std::is_enum<T>::value,
bool>::type
pretty_print(std::ostream &stream, const T &value) {
pretty_print(stream, value,
typename detail::has_ostream_operator<const T &>::type{});
return true;
}
// Print a bool as "true"/"false". Fix: restore the stream's format flags so
// boolalpha does not leak into subsequent prints on the same stream.
inline bool pretty_print(std::ostream &stream, const bool &value) {
    const auto flags = stream.flags();
    stream << std::boolalpha << value;
    stream.flags(flags);
    return true;
}
// Print a printable char quoted ('a'), otherwise as an escaped hex code
// ('\x0A'). Fix: the escape path set hex/uppercase/fill on the shared stream
// and never reset them, corrupting later values printed into the same stream
// (e.g. other elements of a tuple/container); save and restore the state.
inline bool pretty_print(std::ostream &stream, const char &value) {
    const bool printable = value >= 0x20 && value <= 0x7E;
    if (printable) {
        stream << "'" << value << "'";
    } else {
        const auto flags = stream.flags();
        const char fill = stream.fill('0');
        stream << "'\\x" << std::setw(2) << std::hex << std::uppercase
               << (0xFF & value) << "'";
        stream.flags(flags);
        stream.fill(fill);
    }
    return true;
}
// Print a raw pointer: the address when non-null, "nullptr" otherwise.
template <typename P>
inline bool pretty_print(std::ostream &stream, P *const &value) {
    if (value != nullptr) {
        stream << value;
    } else {
        stream << "nullptr";
    }
    return true;
}
template <typename T, typename Deleter>
inline bool pretty_print(std::ostream &stream,
std::unique_ptr<T, Deleter> &value) {
pretty_print(stream, value.get());
return true;
}
// template <typename T>
// inline bool pretty_print(std::ostream& stream, std::shared_ptr<T>& value) {
// pretty_print(stream, value.get());
// stream << " (use_count = " << value.use_count() << ")";
// return true;
// }
template <size_t N>
inline bool pretty_print(std::ostream &stream, const char (&value)[N]) {
stream << value;
return false;
}
template <>
inline bool pretty_print(std::ostream &stream, const char *const &value) {
stream << '"' << value << '"';
return true;
}
template <size_t Idx> struct pretty_print_tuple {
template <typename... Ts>
static void print(std::ostream &stream, const std::tuple<Ts...> &tuple) {
pretty_print_tuple<Idx - 1>::print(stream, tuple);
stream << ", ";
pretty_print(stream, std::get<Idx>(tuple));
}
};
template <> struct pretty_print_tuple<0> {
template <typename... Ts>
static void print(std::ostream &stream, const std::tuple<Ts...> &tuple) {
pretty_print(stream, std::get<0>(tuple));
}
};
template <typename... Ts>
inline bool pretty_print(std::ostream &stream, const std::tuple<Ts...> &value) {
stream << "{";
pretty_print_tuple<sizeof...(Ts) - 1>::print(stream, value);
stream << "}";
return true;
}
template <>
inline bool pretty_print(std::ostream &stream, const std::tuple<> &) {
stream << "{}";
return true;
}
// Print the current wall-clock time as "current time = HH:MM:SS.uuuuuu".
// Returns false so the caller does not also print the expression and type.
template <> inline bool pretty_print(std::ostream &stream, const time &) {
    using namespace std::chrono;
    const auto now = system_clock::now();
    // Microseconds within the current second.
    const auto us =
        duration_cast<microseconds>(now.time_since_epoch()).count() % 1000000;
    const auto hms = system_clock::to_time_t(now);
    // NOTE(review): std::localtime returns a pointer to shared static storage
    // and is not thread-safe; consider localtime_r where available.
    const std::tm *tm = std::localtime(&hms);
    stream << "current time = " << std::put_time(tm, "%H:%M:%S") << '.'
           << std::setw(6) << std::setfill('0') << us;
    return false;
}
// Render an integer as a fixed-width binary string: one character per bit of
// T, most significant bit first.
template <typename T> std::string decimalToBinary(T n) {
    constexpr size_t nbits = 8 * sizeof(T);
    std::string bits(nbits, '0');
    for (size_t i = 0; i < nbits; ++i) {
        if ((n >> i) & 1) {
            bits[nbits - 1 - i] = '1';
        }
    }
    return bits;
}
template <typename T>
inline bool pretty_print(std::ostream &stream,
const print_formatted<T> &value) {
if (value.inner < 0) {
stream << "-";
}
stream << value.prefix();
// Print using setbase
if (value.base != 2) {
stream << std::setw(sizeof(T)) << std::setfill('0')
<< std::setbase(value.base) << std::uppercase;
if (value.inner >= 0) {
// The '+' sign makes sure that a uint_8 is printed as a number
stream << +value.inner;
} else {
using unsigned_type = typename std::make_unsigned<T>::type;
stream << +(static_cast<unsigned_type>(-(value.inner + 1)) + 1);
}
} else {
// Print for binary
if (value.inner >= 0) {
stream << decimalToBinary(value.inner);
} else {
using unsigned_type = typename std::make_unsigned<T>::type;
stream << decimalToBinary<unsigned_type>(
static_cast<unsigned_type>(-(value.inner + 1)) + 1);
}
}
return true;
}
template <typename T>
inline bool pretty_print(std::ostream &stream, const print_type<T> &) {
stream << type_name<T>();
stream << " [sizeof: " << sizeof(T) << " byte, ";
stream << "trivial: ";
if (std::is_trivial<T>::value) {
stream << "yes";
} else {
stream << "no";
}
stream << ", standard layout: ";
if (std::is_standard_layout<T>::value) {
stream << "yes";
} else {
stream << "no";
}
stream << "]";
return false;
}
// Print an enum (scoped or unscoped) as its underlying integral value.
template <typename Enum>
inline typename std::enable_if<std::is_enum<Enum>::value, bool>::type
pretty_print(std::ostream &stream, Enum const &value) {
    stream << static_cast<typename std::underlying_type<Enum>::type>(value);
    return true;
}
// Print strings double-quoted; embedded characters are not escaped.
inline bool pretty_print(std::ostream &stream, const std::string &value) {
    stream << '"';
    stream << value;
    stream << '"';
    return true;
}
#if DBG_MACRO_CXX_STANDARD >= 17
inline bool pretty_print(std::ostream &stream, const std::string_view &value) {
    stream << '"' << std::string(value) << '"';
    return true;
}
#endif
template <typename T1, typename T2>
inline bool pretty_print(std::ostream &stream, const std::pair<T1, T2> &value) {
stream << "{";
pretty_print(stream, value.first);
stream << ", ";
pretty_print(stream, value.second);
stream << "}";
return true;
}
#if DBG_MACRO_CXX_STANDARD >= 17
template <typename T>
inline bool pretty_print(std::ostream &stream, const std::optional<T> &value) {
if (value) {
stream << '{';
pretty_print(stream, *value);
stream << '}';
} else {
stream << "nullopt";
}
return true;
}
template <typename... Ts>
inline bool pretty_print(std::ostream &stream,
const std::variant<Ts...> &value) {
stream << "{";
std::visit([&stream](auto &&arg) { pretty_print(stream, arg); }, value);
stream << "}";
return true;
}
#endif
template <typename Container>
inline typename std::enable_if<detail::is_container<const Container &>::value,
bool>::type
pretty_print(std::ostream &stream, const Container &value) {
stream << "{";
const size_t size = detail::size(value);
const size_t n = std::min(size_t{10}, size);
size_t i = 0;
using std::begin;
using std::end;
for (auto it = begin(value); it != end(value) && i < n; ++it, ++i) {
pretty_print(stream, *it);
if (i != n - 1) {
stream << ", ";
}
}
if (size > n) {
stream << ", ...";
stream << " size:" << size;
}
stream << "}";
return true;
}
// Metafunction yielding the last type of a non-empty parameter pack.
template <typename T, typename... U> struct last {
    using type = typename last<U...>::type;
};
template <typename T> struct last<T> {
    using type = T;
};
template <typename... T> using last_t = typename last<T...>::type;
class DebugOutput {
public:
// Helper alias to avoid obscure type `const char* const*` in signature.
using expr_t = const char *;
DebugOutput(const char *filepath, int line, const char *function_name)
: m_use_colorized_output(isColorizedOutputEnabled()) {
std::string path = filepath;
const std::size_t path_length = path.length();
if (path_length > MAX_PATH_LENGTH) {
path = ".." +
path.substr(path_length - MAX_PATH_LENGTH, MAX_PATH_LENGTH);
}
std::stringstream ss;
ss << ansi(ANSI_DEBUG) << "[" << path << ":" << line << " ("
<< function_name << ")] " << ansi(ANSI_RESET);
m_location = ss.str();
}
template <typename... T>
auto print(std::initializer_list<expr_t> exprs,
std::initializer_list<std::string> types, T &&...values)
-> last_t<T...> {
if (exprs.size() != sizeof...(values)) {
std::cerr << m_location << ansi(ANSI_WARN)
<< "The number of arguments mismatch, please check "
"unprotected comma"
<< ansi(ANSI_RESET) << std::endl;
}
return print_impl(exprs.begin(), types.begin(),
std::forward<T>(values)...);
}
private:
template <typename T>
T &&print_impl(const expr_t *expr, const std::string *type, T &&value) {
const T &ref = value;
std::stringstream stream_value;
const bool print_expr_and_type = pretty_print(stream_value, ref);
std::stringstream output;
output << m_location;
if (print_expr_and_type) {
output << ansi(ANSI_EXPRESSION) << *expr << ansi(ANSI_RESET)
<< " = ";
}
output << ansi(ANSI_VALUE) << stream_value.str() << ansi(ANSI_RESET);
if (print_expr_and_type) {
output << " (" << ansi(ANSI_TYPE) << *type << ansi(ANSI_RESET)
<< ")";
}
output << std::endl;
std::cerr << output.str();
return std::forward<T>(value);
}
template <typename T, typename... U>
auto print_impl(const expr_t *exprs, const std::string *types, T &&value,
U &&...rest) -> last_t<T, U...> {
print_impl(exprs, types, std::forward<T>(value));
return print_impl(exprs + 1, types + 1, std::forward<U>(rest)...);
}
const char *ansi(const char *code) const {
if (m_use_colorized_output) {
return code;
} else {
return ANSI_EMPTY;
}
}
const bool m_use_colorized_output;
std::string m_location;
static constexpr std::size_t MAX_PATH_LENGTH = 20;
static constexpr const char *const ANSI_EMPTY = "";
static constexpr const char *const ANSI_DEBUG = "\x1b[02m";
static constexpr const char *const ANSI_WARN = "\x1b[33m";
static constexpr const char *const ANSI_EXPRESSION = "\x1b[36m";
static constexpr const char *const ANSI_VALUE = "\x1b[01m";
static constexpr const char *const ANSI_TYPE = "\x1b[32m";
static constexpr const char *const ANSI_RESET = "\x1b[0m";
};
// Identity function to suppress "-Wunused-value" warnings in DBG_MACRO_DISABLE
// mode
template <typename T> T &&identity(T &&t) { return std::forward<T>(t); }
template <typename T, typename... U>
auto identity(T &&, U &&...u) -> last_t<U...> {
return identity(std::forward<U>(u)...);
}
} // namespace dbg
#ifndef DBG_MACRO_DISABLE
// Force expanding argument with commas for MSVC, ref:
// https://stackoverflow.com/questions/35210637/macro-expansion-argument-with-commas
// Note that "args" should be a tuple with parentheses, such as "(e1, e2, ...)".
#define DBG_IDENTITY(x) x
#define DBG_CALL(fn, args) DBG_IDENTITY(fn args)
#define DBG_CAT_IMPL(_1, _2) _1##_2
#define DBG_CAT(_1, _2) DBG_CAT_IMPL(_1, _2)
#define DBG_16TH_IMPL(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \
_14, _15, _16, ...) \
_16
#define DBG_16TH(args) DBG_CALL(DBG_16TH_IMPL, args)
#define DBG_NARG(...) \
DBG_16TH( \
(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
// DBG_VARIADIC_CALL(fn, data, e1, e2, ...) => fn_N(data, (e1, e2, ...))
#define DBG_VARIADIC_CALL(fn, data, ...) \
DBG_CAT(fn##_, DBG_NARG(__VA_ARGS__))(data, (__VA_ARGS__))
// (e1, e2, e3, ...) => e1
#define DBG_HEAD_IMPL(_1, ...) _1
#define DBG_HEAD(args) DBG_CALL(DBG_HEAD_IMPL, args)
// (e1, e2, e3, ...) => (e2, e3, ...)
#define DBG_TAIL_IMPL(_1, ...) (__VA_ARGS__)
#define DBG_TAIL(args) DBG_CALL(DBG_TAIL_IMPL, args)
#define DBG_MAP_1(fn, args) DBG_CALL(fn, args)
#define DBG_MAP_2(fn, args) fn(DBG_HEAD(args)), DBG_MAP_1(fn, DBG_TAIL(args))
#define DBG_MAP_3(fn, args) fn(DBG_HEAD(args)), DBG_MAP_2(fn, DBG_TAIL(args))
#define DBG_MAP_4(fn, args) fn(DBG_HEAD(args)), DBG_MAP_3(fn, DBG_TAIL(args))
#define DBG_MAP_5(fn, args) fn(DBG_HEAD(args)), DBG_MAP_4(fn, DBG_TAIL(args))
#define DBG_MAP_6(fn, args) fn(DBG_HEAD(args)), DBG_MAP_5(fn, DBG_TAIL(args))
#define DBG_MAP_7(fn, args) fn(DBG_HEAD(args)), DBG_MAP_6(fn, DBG_TAIL(args))
#define DBG_MAP_8(fn, args) fn(DBG_HEAD(args)), DBG_MAP_7(fn, DBG_TAIL(args))
#define DBG_MAP_9(fn, args) fn(DBG_HEAD(args)), DBG_MAP_8(fn, DBG_TAIL(args))
#define DBG_MAP_10(fn, args) fn(DBG_HEAD(args)), DBG_MAP_9(fn, DBG_TAIL(args))
#define DBG_MAP_11(fn, args) fn(DBG_HEAD(args)), DBG_MAP_10(fn, DBG_TAIL(args))
#define DBG_MAP_12(fn, args) fn(DBG_HEAD(args)), DBG_MAP_11(fn, DBG_TAIL(args))
#define DBG_MAP_13(fn, args) fn(DBG_HEAD(args)), DBG_MAP_12(fn, DBG_TAIL(args))
#define DBG_MAP_14(fn, args) fn(DBG_HEAD(args)), DBG_MAP_13(fn, DBG_TAIL(args))
#define DBG_MAP_15(fn, args) fn(DBG_HEAD(args)), DBG_MAP_14(fn, DBG_TAIL(args))
#define DBG_MAP_16(fn, args) fn(DBG_HEAD(args)), DBG_MAP_15(fn, DBG_TAIL(args))
// DBG_MAP(fn, e1, e2, e3, ...) => fn(e1), fn(e2), fn(e3), ...
#define DBG_MAP(fn, ...) DBG_VARIADIC_CALL(DBG_MAP, fn, __VA_ARGS__)
#define DBG_STRINGIFY_IMPL(x) #x
#define DBG_STRINGIFY(x) DBG_STRINGIFY_IMPL(x)
#define DBG_TYPE_NAME(x) dbg::type_name<decltype(x)>()
#define dbg(...) \
dbg::DebugOutput(__FILE__, __LINE__, __func__) \
.print({DBG_MAP(DBG_STRINGIFY, __VA_ARGS__)}, \
{DBG_MAP(DBG_TYPE_NAME, __VA_ARGS__)}, __VA_ARGS__)
#else
#define dbg(...) dbg::identity(__VA_ARGS__)
#endif // DBG_MACRO_DISABLE
#endif // DBG_MACRO_DBG_H

156
include/nnet/derivator.h Normal file
View File

@ -0,0 +1,156 @@
#pragma once
#include "common.h"
#include "expr.h"
#include "iterator_table.h"
#include "routine.h"
#include <iostream>
#include <sstream>
#include <unordered_set>
namespace nnet {
// A formula under derivation: an expression tree plus the search depth at
// which it was produced.
class Formula {
  public:
    Expr root;          // root of the expression tree
    const int bfsDepth; // search depth this formula was generated at

  public:
    Formula(Expr _root, int _bfsDepth) : root(_root), bfsDepth(_bfsDepth) {}
    string toReadable() const;
    friend std::ostream &operator<<(std::ostream &ios, const Formula &expr) {
        ios << expr.toReadable();
        return ios;
    }
    // True when the formula has been reduced to a single variable node.
    bool isVariable() const { return as<VarNode>(root) != nullptr; }
};
// A set of formulas handled together (one root per formula).
class MultiFormulas {
  public:
    VecExpr roots;
    const int bfsDepth; // search depth the bundle was generated at

  public:
    // NOTE(review): the parameter shadows the member and is copied twice;
    // consider taking it by const reference or moving it.
    MultiFormulas(VecExpr roots, int _bfsDepth)
        : roots(roots), bfsDepth(_bfsDepth) {}
    // string toReadable() const;
    // friend std::ostream &operator<<(std::ostream &ios, const Formula &expr) {
    //     ios << expr.toReadable();
    //     return ios;
    // }
};
// Drives the derivation search: repeatedly rewrites a Formula with the
// rule1..rule91 transformations (DFS, rule-guided, or both) and collects the
// results in `candidates`.
class Derivator {
  public:
    // NOTE(review): "DumpFristCandiate" is a typo for "DumpFirstCandidate";
    // renaming is a cross-file interface change, so it is only flagged here.
    enum class LogMode { Normal, DumpFristCandiate, NoLog };
    enum class PassMode { Debug, Full };

  private:
    list<Formula> candidates; // accepted derivation results
    const int maxDepth;       // search depth limit
    int nIteratorNames = 0;   // counter presumably backing getNewVar()
    int nTensorNames = 0;     // counter presumably backing newTensorName()
    vector<vector<int>> rulesOverall;
    enum class Strategy { DFS, Rule, RuleAndDFS } searchStrategy;
    LogMode logMode;
    PassMode passMode;
    bool enableEquivalenceCheck = false;
    string logFnPrefix; // output filename prefix (see setDumpFirstSuccess)
    const bool enableHashPruning;
    int searchedMaxDepth = 0;
    RoutineType targetOp = RoutineType::NoneType;
    map<int, vector<Var>> substituteRules;
    vector<int> cntAppliedRules; // statistics: how often each rule fired
    int cntRule3 = 0;
    std::unordered_set<HashType> visited; // hashes of already-seen states
    VecExpr intermediateStates;
    vector<string> ruleStates, ruleMsgs;
    int cntStates = 0;   // the number of intermediate states
    int searchState = 0; // search state in guided search

  public:
    Derivator(int maxDepth = 8, bool enableHashPruning = true,
              LogMode mode = LogMode::NoLog,
              PassMode passMode = PassMode::Debug);
    void search(Formula &origin, int depth);
    void ruleBasedDFS(Formula &origin, int depth, vector<int> _rules,
                      map<int, vector<Var>> _substituteRules = {},
                      bool searchAfterRules = false);
    void guidedSearch(Formula &origin, int depth);
    void print();
    int getNumCandidates() const { return candidates.size(); }
    const auto &getCandidates() const { return candidates; }
    // NOTE(review): "appendCanddiate" is a typo for "appendCandidate" (public
    // interface; not renamed here).
    void appendCanddiate(const Tensor &tensor, int depth);
    int getSearchedMaxDepth() const { return searchedMaxDepth; };
    bool stageCombination(MultiFormulas &origin, int depth);
    // Out-of-bound check for the iterator ranges of rangeOp.
    bool checkOOB(const RangeOp &rangeOp, bool halt = true);
    string newTensorName();
    Var getNewVar();
    Expr mergeMemboundStages(VecExpr stages);

  private:
    void dfs(Formula &origin, int depth);
    void ruleBasedDerivate(Formula &origin, int depth);
    // Derivation rules: each inspects/rewrites the sub-expression rCur of
    // origin at the given search depth.
    void rule1VariableSplit(Formula &origin, int depth, Expr &rCur);
    void rule2VariableMerging(Formula &origin, int depth, Expr &rCur);
    void rule3StageSplit(Formula &origin, int dfsDepth, Expr &rCur);
    void rule5RangeRelaxation(Formula &origin, int depth, Expr &rCur);
    bool rule4StageMerging(Formula &origin, int depth, Expr &rCur,
                           bool mergeStageWithCalc = false);
    // NOTE(review): "rule6KenerlMatching" is a typo for "rule6KernelMatching".
    void rule6KenerlMatching(Formula &origin, int depth, Expr &rCur);
    void rule7DLT(Formula &origin, int depth, Expr &rCur);
    // Rule 8: guidedDLT
    void rule8GuidedDLT(Formula &origin, int depth, Expr &rCur);
    void rule9RangeMagnify(Formula &origin, int depth, Expr &rCur);
    void rule90TwoStageElementWise(Formula &origin, int depth, Expr &rCur);
    void rule91MergeStagesWithSum(Formula &origin, int depth, Expr &rCur);
    /**
     * @brief For searchState=2, wrap the RangeOp to add offset, if the boundary
     * does not start from 0. Then match the inner offset RangeOp.
     */
    void matchComputationKernel(Formula &origin, int depth, Expr &rcur);
    /**
     * @brief For searchState=3, the Formula must be a MemBound kernel?
     */
    void matchMemBoundKernel(Formula &origin, int depth, Expr &rcur);
    /**
     * @brief Check the equivalence for exprs in intermediateStates.
     */
    void checkDerivationEquivalence();

  public:
    // Bookkeeping of the rule/state trace used for logging and dumping.
    void pushIntermediateState(const Expr &expr);
    void pushRuleState(const string &state);
    void pushRuleMsg(const string &state);
    void popIntermediateState();
    void popRuleState();
    void popRuleMsg();
    // void pushTransformInfo(const Expr &expr, const string &state,
    //                        const string &msg);
    // Recurse into the next search step with rCur replaced by newCur.
    void nextStep(Formula &origin, int depth, Expr &rCur, Expr newCur);
    RoutineType getTargetOp();
    void setTargetOp(RoutineType _targetOp);
    int getSearchState();
    void setSearchState(int _searchState);
    int getNumIntermediateStates();
    void printStatistics();
    void printIntermediateStates();
    /**
     * @brief Enable dumping the first results. Verification is enabled so it is
     * slow.
     *
     * @param _logFnPrefix Prefix of output filename
     */
    void setDumpFirstSuccess(const string &_logFnPrefix);
    void setEquivalenceCheck();
    PassMode getPassMode();
    LogMode getLogMode();
};
} // namespace nnet

53
include/nnet/dlt.h Normal file
View File

@ -0,0 +1,53 @@
#pragma once
#include "common.h"
#include "expr.h"
#include <iostream>
namespace nnet {
// enum class DLTType { Split, Merge, Reorder };
// Base class for data-layout-transformation steps, deleted polymorphically.
struct DLTOperation {
    // DLTType type;
    virtual ~DLTOperation() = default;
};
// Split dimension `dim` by `factor`.
struct DLTSplit : DLTOperation {
    int dim, factor;
    DLTSplit(int _dim, int _factor) : dim(_dim), factor(_factor) {}
};
// Merge dimension `dim1` into `dim0`.
struct DLTMerge : DLTOperation {
    int dim0, dim1;
    DLTMerge(int _dim0, int _dim1) : dim0(_dim0), dim1(_dim1) {}
};
// Permute dimensions: dims[new_dim] = old_dim.
struct DLTReorder : DLTOperation {
    std::vector<int> dims;
    DLTReorder(std::vector<int> _dims) : dims(_dims) {}
};
// Builder for a sequence of data-layout transformations, applied to a tensor
// access via apply().
class DLT {
    vector<Ref<DLTOperation>> ops; // queued steps, applied in order

  public:
    /**
     * @brief dim -> (dim/factor, factor)
     */
    void split(int dim, int factor);
    /**
     * @brief Merge dim1 into dim0 -> (dim0, dim1)
     */
    void merge(int dim0, int dim1);
    /**
     * @brief Reorder the dimensions.
     *
     * @param dims dims[new_dim]=old_dim
     */
    void reorder(vector<int> dims);
    // Apply the queued operations to the subscripted tensor inside rangeOp,
    // introducing a tensor named newTensorName; presumably returns nullopt
    // when the transformation is not applicable — TODO confirm.
    optional<Expr> apply(const RangeOp &rangeOp, const Subscript &subscript,
                         string newTensorName);

  private:
    optional<pair<Expr, Expr>> splitIndex(Expr expr, int factor,
                                          RangeOp rangeOp);
};
} // namespace nnet

416
include/nnet/expr.h Normal file
View File

@ -0,0 +1,416 @@
#pragma once
#include "common.h"
#include "ref.h"
#include <iostream>
#include <numeric>
#include <type_traits>
namespace nnet {
// Forward declarations of every AST node kind defined in this header.
class ExprNode;
class VarNode;
class TensorNode;
class OperatorNode;
class RangeOpNode;
class SubscriptNode;
class BinaryOpNode;
class ConstantNode;
class FuncNode;
// Ref-counted handles are the canonical way to hold and share AST nodes.
using Expr = Ref<ExprNode>;
using Var = Ref<VarNode>;
using Tensor = Ref<TensorNode>;
using Operator = Ref<OperatorNode>;
using RangeOp = Ref<RangeOpNode>;
using Subscript = Ref<SubscriptNode>;
using BinaryOp = Ref<BinaryOpNode>;
using Constant = Ref<ConstantNode>;
using Func = Ref<FuncNode>;
// A Routine records which kernel produced a tensor (see routine.h).
class RoutineNode;
using Routine = Ref<RoutineNode>;
// Kernels an expression may be matched to. NoneType is a sentinel base
// value; the matchable kernels follow it with consecutive values so they
// can be mapped to dense IDs in [0, RoutineTypeCnt).
enum class RoutineType {
    NoneType = 100,
    MatmulNodeType,
    ConvNodeType,
    G2bmmNodeType,
    GbmmNodeType,
    ElementWiseNodeType // unmatchable
};
constexpr inline int MatchableRoutineTypeCnt = 4;
constexpr inline int RoutineTypeCnt = MatchableRoutineTypeCnt + 1;
// Dense ID in [0, RoutineTypeCnt) -> RoutineType (inverse of routineTypeToId).
inline RoutineType idToRoutineType(int i) {
    const int firstMatchable = static_cast<int>(RoutineType::NoneType) + 1;
    return static_cast<RoutineType>(firstMatchable + i);
}
// RoutineType -> dense ID in [0, RoutineTypeCnt) (inverse of idToRoutineType).
inline int routineTypeToId(const RoutineType &routineType) {
    const int firstMatchable = static_cast<int>(RoutineType::NoneType) + 1;
    return static_cast<int>(routineType) - firstMatchable;
}
using VecExpr = vector<Expr>;
// common data structure
using Iterator = Var; // RE: remove this alias
// When keys are pointers, compare keys according to the pointed-to value
// instead of the address. Specially, Vars compare by name due to the
// overloads of operator== and hash.
template <typename T, typename U> using PtrMap = std::map<T, U, ptr_less<T>>;
template <typename T, typename U>
using PtrUmap = std::unordered_map<T, U, ptr_hash<T>, ptr_equal<T>>;
template <typename T>
using PtrUset = std::unordered_set<T, ptr_hash<T>, ptr_equal<T>>;
// Var -> its occurrences, as (tensor, dimension) pairs.
using Appearance = PtrMap<Var, vector<pair<Tensor, int>>>;
using StrideTable =
    PtrMap<Var, vector<tuple<TensorNode *, int, int>>>; // Tensor, dim, stride
// AST node operator overloads (defined in the implementation file).
// NOTE(review): some overloads return Expr while others return BinaryOp —
// presumably the Expr-returning ones may simplify; confirm in implementation.
bool operator==(const Var &lhs, const string &rhs);
bool operator==(const string &lhs, const Var &rhs);
Expr operator+(const Expr &lhs, const Expr &rhs);
BinaryOp operator-(const Expr &lhs, const Expr &rhs);
BinaryOp operator*(const Expr &lhs, const Expr &rhs);
BinaryOp operator/(const Expr &lhs, const Expr &rhs);
BinaryOp operator%(const Expr &lhs, const Expr &rhs);
Expr operator+(const Expr &lhs, const int &rhs);
Expr operator+(const int &lhs, const Expr &rhs);
Expr operator-(const Expr &lhs, const int &rhs);
Expr operator-(const int &lhs, const Expr &rhs);
Expr operator*(const Expr &lhs, const int &rhs);
Expr operator*(const int &lhs, const Expr &rhs);
Expr operator%(const Expr &lhs, const int rhs);
Expr operator/(const Expr &lhs, const int rhs);
// Serializers for node vectors (non-template overloads, defined elsewhere).
string serializeVec(vector<Expr> v);
string serializeVec(vector<Var> v);
/**
 * @brief Render a vector as "[v0,v1,...]" via std::to_string.
 *
 * Generic fallback for arithmetic element types; Expr/Var vectors use the
 * dedicated overloads declared above.
 *
 * @param v Values to serialize; an empty vector yields "[]".
 * @return The bracketed, comma-separated string.
 */
template <typename T> inline std::string serializeVec(std::vector<T> v) {
    if (v.empty())
        return "[]";
    // Fold the tail onto the stringified head. The accumulator lambda takes
    // `const T &` (the original took `int`, silently truncating non-int
    // element types such as double/int64_t).
    return "[" +
           std::accumulate(v.begin() + 1, v.end(), std::to_string(v[0]),
                           [](const std::string &a, const T &b) {
                               return a + ',' + std::to_string(b);
                           }) +
           "]";
}
// For RTTI and visitor pattern
enum class NodeType {
    ConstantNodeType,
    BinaryOpNodeType,
    RangeOpNodeType,
    SubscriptNodeType,
    TensorNodeType,
    VarNodeType,
    FuncNodeType
};
// Functions representable by FuncNode.
enum class FuncType { Relu, Tanh };
// Generates the NodeType tag getter for a concrete node class;
// #undef'ed at the end of this header.
#define DEFINE_GETTYPE(CLASS)                                                  \
    NodeType getType() const override { return NodeType::CLASS##Type; }
/**
 * @brief Abstract base of all AST nodes. Nodes are shared via Ref<> and
 * cannot be reassigned (operator= is deleted).
 */
class ExprNode {
  public:
    virtual ~ExprNode() {}
    ExprNode &operator=(const ExprNode &rhs) = delete;
    virtual HashType hash() const = 0; // RE: remove?
    // Human-readable rendering of the subtree rooted at this node.
    virtual string toReadable() const = 0;
    friend std::ostream &operator<<(std::ostream &ios, const ExprNode &expr);
    virtual NodeType getType() const = 0; // RTTI tag, see DEFINE_GETTYPE
};
// An iterator/variable identified solely by its name: two VarNodes with the
// same name compare equal and hash identically.
class VarNode : public ExprNode {
    std::string name;
  public:
    VarNode(std::string _name) : name(_name){};
    virtual ~VarNode() {}
    DEFINE_GETTYPE(VarNode);
    const std::string &getName() const { return name; }
    HashType hash() const override { return genhash(name); };
    string toReadable() const override { return name; };
    // Name-based comparisons used by the PtrMap/PtrUmap comparators.
    bool equal(const Var &rhs) const { return name == rhs->getName(); }
    bool neq(const Var &rhs) const { return !equal(rhs); }
    bool less(const Var &rhs) const { return name < rhs->getName(); }
    bool equal(const string &rhs) const { return name == rhs; }
    bool operator==(const VarNode &rhs) const { return name == rhs.getName(); }
    bool operator<(const VarNode &rhs) const { return name < rhs.getName(); }
};
enum class TensorType { Input, Weight, Intermediate };
/**
 * @brief A named tensor with shape, optional per-dimension paddings, and an
 * optional source Routine recording which kernel produced it.
 */
class TensorNode : public ExprNode {
    string name;
    vector<int> shape, paddings; // paddings may be empty (i.e., no padding)
    TensorType type;
    Routine source; // if NO source, then this is a input/weight tensor
  public:
    TensorNode(string _name, vector<int> _shape, vector<int> _paddings = {},
               Routine _source = nullptr);
    virtual ~TensorNode() {}
    DEFINE_GETTYPE(TensorNode);
    // Tensors compare to strings by name.
    bool operator==(const string &rhs) { return name == rhs; }
    friend bool operator==(const string &lhs, const TensorNode &rhs) {
        return lhs == rhs.name;
    }
    HashType hash() const override { return genhash(name); }
    string toReadable() const override;
    string toOutputShape() const;
    const std::string &getName() const { return name; }
    std::vector<int> &getPadding() { return paddings; }
    int getPadding(int i) const { return paddings[i]; }
    const vector<int> &getPaddings() const { return paddings; }
    void setPadding(int i, int p) { paddings[i] = p; }
    const vector<int> &getShape() const { return shape; }
    int getShape(int i) const { return shape[i]; }
    int64_t getSize() const;
    int getDims() const { return shape.size(); }
    const Routine &getSource() const { return source; }
    // Read data[...] at multi-dimensional index idx (see getOffset).
    int getData(const Ref<vector<int>> &data, const vector<int> &idx);
    // Flatten a multi-dimensional index into a linear offset.
    size_t getOffset(const vector<int> &idx);
};
enum class OpType { Range, Add, Mul, Div, Mod, Sub };
// One display symbol per OpType, in enum order ('#' marks Range).
// inline constexpr (C++17, already used in this header) gives a single
// shared constant across translation units instead of one internal-linkage
// copy per including TU.
inline constexpr char opSymbols[] = "#+*/%-";
// Base of nodes owning sub-expressions (binary ops and RangeOp). The layout
// of subExprs is defined by each derived class.
class OperatorNode : public ExprNode {
  protected:
    const OpType opType;
    VecExpr subExprs; // operands, indexed by the derived class's enum
  public:
    OperatorNode(OpType _opType) : opType(_opType){};
    OperatorNode(OpType _opType, VecExpr _subExprs)
        : opType(_opType), subExprs(_subExprs){};
    int getSubExprsNum() { return subExprs.size(); };
    const VecExpr &getSubExprs() { return subExprs; }
    const Expr &getSubExprs(int i) const { return subExprs[i]; }
    OpType getOpType() const { return opType; };
    void setOperands(int i, Expr e) { subExprs[i] = e; }
};
using Range = pair<int, int>;
using VarRangePair = pair<Var, Range>;
inline int getLength(const Range &range) { return range.second - range.first; }
struct IterationType {
enum { Loop, Sum };
constexpr static int NumIterationType = 2;
};
/**
 * @brief A loop nest: iterate the Loop (output) iterators, reduce over the
 * Sum iterators, and evaluate the summand. Output dimensions may carry
 * paddings.
 */
class RangeOpNode : public OperatorNode {
  public:
    enum { Summand, END_POS }; // subExprs layout: only the summand
    constexpr static int Loop = IterationType::Loop;
    constexpr static int Sum = IterationType::Sum;
  private:
    // vars[Loop]: output iterators; vars[Sum]: reduction iterators.
    vector<VarRangePair> vars[IterationType::NumIterationType];
    vector<int> paddings; // per output dimension; may be empty
  public:
    RangeOpNode(Expr _summand) : OperatorNode(OpType::Range, {_summand}){};
    RangeOpNode(const vector<VarRangePair> &_loopIters,
                const vector<VarRangePair> &_sumIters, Expr _summand,
                const vector<int> &paddings)
        : OperatorNode(OpType::Range, {_summand}), vars{_loopIters, _sumIters},
          paddings(paddings){};
    DEFINE_GETTYPE(RangeOpNode);
    virtual HashType hash() const override {
        nnet_unimplemented_halt(); // hashing RangeOps is not supported yet
        return 0;
    };
    string toReadable() const override;
    const Expr &getSummand() const { return subExprs[Summand]; }
    // _index selects Loop or Sum iterators.
    const vector<VarRangePair> &getVarRanges(int _index) const {
        return vars[_index];
    }
    const vector<VarRangePair> &getLoopVarRanges() const {
        return vars[IterationType::Loop];
    }
    const vector<VarRangePair> &getSumVarRanges() const {
        return vars[IterationType::Sum];
    }
    int getNumOutputDims() const;
    bool hasVar(int index, Var name) const;
    bool hasLoopVar(Var name) const { return hasVar(Loop, name); }
    bool hasSumVar(Var name) const { return hasVar(Sum, name); }
    // String overloads wrap the name in a fresh VarNode (Vars compare by
    // name, so this finds the existing iterator).
    bool hasLoopVar(string name) const {
        return hasVar(Loop, make_ref<VarNode>(name));
    }
    bool hasSumVar(string name) const {
        return hasVar(Sum, make_ref<VarNode>(name));
    }
    int getVarIndex(int type, string name);
    void setSummand(Expr e) { subExprs[Summand] = e; }
    void setLoopIterator(const vector<VarRangePair> &vecExpr) {
        vars[Loop] = vecExpr;
    }
    void setSumIterator(const vector<VarRangePair> &vecExpr) {
        vars[Sum] = vecExpr;
    }
    void setIterator(const vector<VarRangePair> &loop,
                     const vector<VarRangePair> &sum) {
        setLoopIterator(loop);
        setSumIterator(sum);
    }
    const VarRangePair &getVarRange(int _index, int i) const {
        return vars[_index][i];
    }
    const Var &getLoopVar(int i) const { return vars[Loop][i].first; }
    Range getRange(const Var &var) const;
    VarRangePair getVarRange(const Var &var) const;
    bool hasPaddings() const;
    int getPaddings(int dim) const;
    vector<int> getPaddings() const;
    void setPaddings(vector<int> _paddings);
    void setVarRange(int _index, int i, VarRangePair pair) {
        vars[_index][i] = pair;
    }
    int64_t getFlops() const;
    int64_t getInputSize(const RangeOp &self) const;
    int64_t getOutputSize() const;
    vector<int> getOutputShape() const;
    // Including paddings
    vector<Range> getOutputRanges() const;
};
// A binary arithmetic expression; operands live in subExprs[LHS]/[RHS].
class BinaryOpNode : public OperatorNode {
    enum { LHS, RHS, END_POS };
  public:
    BinaryOpNode(OpType _opType, Expr _lhs, Expr _rhs)
        : OperatorNode(_opType, {_lhs, _rhs}){};
    virtual ~BinaryOpNode() {}
    DEFINE_GETTYPE(BinaryOpNode);
    // Hash combines the op kind with both operand hashes.
    virtual HashType hash() const override {
        return genhash((HashType)opType,
                       genhash(subExprs[LHS]->hash(), subExprs[RHS]->hash()));
    };
    virtual string toReadable() const override;
    const Expr &getLhs() const { return getSubExprs(LHS); };
    const Expr &getRhs() const { return getSubExprs(RHS); };
    void setLhs(Expr e) { setOperands(LHS, e); };
    void setRhs(Expr e) { setOperands(RHS, e); };
    // If Var/constant, use this one
    optional<pair<Var, int>> getModDivParameter() const;
    // If (Var+constant)/constant, use this one
    pair<Expr, int> getModDivExpr() const;
    // NOTE(review): presumably whether operands may be exchanged
    // (commutative op) — confirm in the implementation.
    bool isSwapable() const;
};
// An integer literal in the AST.
class ConstantNode : public ExprNode {
    int val; // the literal's value
  public:
    ConstantNode(int _val) : val(_val){};
    ConstantNode(const ConstantNode &rhs) : ExprNode(rhs), val(rhs.val){};
    virtual ~ConstantNode() {}
    DEFINE_GETTYPE(ConstantNode);
    int getValue() const { return val; }
    // Salted hash so constants do not collide with other node kinds.
    virtual HashType hash() const override { return genhash(val, 6214587); };
    // Renders the value in decimal, e.g. "42".
    virtual string toReadable() const override { return std::to_string(val); };
};
// A subscripted access `indexed[subExprs...]`; the indexed object is either
// a TensorNode or a RangeOpNode (see isRangeOpSubscripted).
class SubscriptNode : public ExprNode {
  protected:
    Expr indexed;     // object being indexed
    VecExpr subExprs; // one index expression per dimension
  public:
    SubscriptNode(Expr _indexed, vector<Expr> _subExprs) : subExprs(_subExprs) {
        setObject(_indexed);
    };
    DEFINE_GETTYPE(SubscriptNode);
    virtual HashType hash() const override {
        nnet_unimplemented_continue(); // hashing not implemented yet
        return -1;
    };
    virtual string toReadable() const override;
    size_t getDims() const { return subExprs.size(); }
    const VecExpr &getIndex() const { return subExprs; }
    const Expr &getIndex(size_t i) const { return subExprs[i]; }
    void setIndex(size_t i, Expr e) { subExprs[i] = e; }
    Expr *getObjectPtr() { return &indexed; }
    Expr getObject() const { return indexed; }
    void setObject(Expr e);
    bool isRangeOpSubscripted() const;
    bool isTensorSubscripted() const { return !isRangeOpSubscripted(); }
    // Get the ranges of objects including paddings
    vector<Range> getObjectRangesWithPaddings() const;
    vector<Range> getObjectRangesWithoutPaddings() const;
};
// Application of a function (FuncType: Relu/Tanh) to a subscripted operand.
class FuncNode : public ExprNode {
  protected:
    Subscript object; // operand, stored as a Subscript (see setObject)
    FuncType funcType;
  public:
    FuncNode(Expr object, FuncType funcType) : funcType(funcType) {
        setObject(object);
    }
    DEFINE_GETTYPE(FuncNode);
    virtual HashType hash() const override {
        nnet_unimplemented_continue(); // hashing not implemented yet
        return -1;
    };
    virtual string toReadable() const override;
    const Subscript &getObject() const { return object; }
    void setObject(Expr e);
    FuncType getFuncType() const { return funcType; }
};
// Wrappers for type deduction
// Factory helpers returning the concrete Ref alias (defined elsewhere).
Subscript makeSubscript(const Expr &tensor, const VecExpr &subscripts);
RangeOp makeRangeOperator(const vector<VarRangePair> &_loopIters,
                          const vector<VarRangePair> &_sumIters, Expr _summand,
                          const vector<int> &paddings = {});
Tensor makeTensor(const string &name, const vector<int> &shape,
                  const vector<int> &paddings = {},
                  const Routine &source = nullptr);
// Pretty debug output for shared_ptr-held expression nodes; a null handle
// prints as "nullptr".
template <typename T, typename std::enable_if_t<std::is_base_of_v<ExprNode, T>>
                          *_ = nullptr>
std::ostream &operator<<(std::ostream &os, const shared_ptr<T> &a) {
    if (!a)
        os << "nullptr";
    else
        os << a->toReadable();
    return os;
}
// Pretty debug output for Ref-held expression nodes; a null handle prints as
// "nullptr".
template <typename T, typename std::enable_if_t<std::is_base_of_v<ExprNode, T>>
                          *_ = nullptr>
std::ostream &operator<<(std::ostream &os, const Ref<T> &a) {
    if (!a)
        os << "nullptr";
    else
        os << a->toReadable();
    return os;
}
#undef DEFINE_GETTYPE
} // namespace nnet
namespace std {
// Hash VarNode by name, consistent with VarNode's name-based operator==.
template <> struct hash<nnet::VarNode &> {
    size_t operator()(const nnet::VarNode &t) const {
        return std::hash<string>()(t.getName());
    }
};
} // namespace std

View File

@ -0,0 +1,234 @@
#pragma once
#include "common.h"
#include "expr.h"
#include <iostream>
namespace nnet {
// Pattern tensor ID -> matched real tensor.
using PatternTensorMap = vector<Tensor>;
// Pattern iterator -> matched (expression iterator, range).
using PatternIterRangeMap = PtrMap<Iterator, VarRangePair>;
// Reasons an expression can fail to match a pattern.
enum class MismatchType {
    // Search required (undetermined)
    MoreVar,
    LessVar,
    StrideMismatch,
    // guided DLT (determined)
    DLMismatch,
    OutputDLMismatch,
    OutputDimismatch
};
// One concrete mismatch found while matching an expression to a pattern.
struct Mismatch {
    MismatchType type;
    int bitmap; // Row ID of IT
    PtrMap<Iterator, Iterator>
        mappingIter_r; // For DLT mismatch, iters are mapped
    Mismatch(MismatchType _type, int _bitmap) : type(_type), bitmap(_bitmap) {}
    Mismatch(MismatchType _type, int _bitmap,
             PtrMap<Iterator, Iterator> _mappingIter_r)
        : type(_type), bitmap(_bitmap), mappingIter_r(_mappingIter_r) {}
};
class Pattern;
/**
 * @brief Tabulates, for one RangeOp, which iterator appears in which tensor
 * dimension and with which stride. Matching an expression against a kernel
 * Pattern compares these tables (see matchPatternIT).
 */
class IteratorTable {
  protected:
    // using Appearance = map<string, vector<pair<Tensor, int>>>;
    // using StrideTable = map<TensorNode *, vector<tuple<string, int, int>>>;
    // // Var, dim, stride
    RangeOp rangeOp; // the analyzed expression
    // To real tensor
    // FIXME: redundent
    Appearance appearance;
    vector<Tensor> tensors;       // original tensor sequence
    vector<Subscript> subscripts; // original subscripts sequence
    StrideTable strideTable;      // TODO [Refactor]: rename strideTable
    PatternIterRangeMap iterToRange;
    // mapping
    vector<int> tensorMap; // [index for tensors] -> tensorID in pattern
    PtrMap<Iterator, Iterator> iterMap; // [expr iter] -> pattern iter
    // final data
    vector<vector<Iterator>> posTable; // [Tensor bitmap]=[Iterator]
    vector<vector<vector<Iterator>>>
        iterInTensorDim; // [tensorID][dimOfTensor]=[Iterator],
                         // stride in each dim may be add
    vector<vector<PtrMap<Iterator, int>>>
        strideInDim; // [tensorID][dimOfTensor][Iterator]=stride,
                     // stride in each dim may be add
    PtrMap<Iterator, vector<int>> strideInTensor; // [Iterator][tensorID]=stride
    // final data: auxiliary data
    vector<int> tensorIDMap_r; // [pattern tensor ID] -> original tensor index
    PatternTensorMap tensorMap_r;
    PatternIterRangeMap iterToRange_r;
  public:
    virtual ~IteratorTable() {}
    IteratorTable() {}
    IteratorTable(const IteratorTable &) = delete;
    // Populate the table from rangeOp; false signals an unanalyzable expr.
    [[nodiscard]] bool analyzeExpr(const RangeOp &rangeOp);
    // mapTensors
    void buildTable(const vector<int> &_tensorMap);
    void buildTableWithDefaultMap();
    /**
     * @brief Check whether the expression matches a pattern. If not, return
     * the detailed reasons for guided search.
     *
     * @param patternIT
     * @return vector<Mismatch> mismatched IT rows/tensors for guided DLT.
     */
    vector<Mismatch> matchPatternIT(const Pattern &patternIT);
    void matchIterators();
    int getNumInputs() const { return tensors.size(); }
    int getNumTensors() const { return tensors.size() + 1; } // plus output
    int getNumRows() const { return 1 << getNumTensors(); }
    int getNumIterators() const { return strideTable.size(); }
    // vector<Tensor> tensorMap_r(
    //     pattern.nInputs); // [pattern tensor ID] -> real tensor
    // map<string, VarRangePair> iterToRange_r; // [pattern iter] -> iter &
    // range
    auto getTables() const {
        return tuple(posTable, iterInTensorDim, strideInTensor);
    }
    const auto &getStrideInDim() const { return strideInDim; }
    vector<vector<Iterator>> getIterInTensorDim(int tensorID) const {
        return iterInTensorDim[tensorID];
    }
    const vector<Iterator> &getPosTable(int bitmap) const {
        return posTable[bitmap];
    }
    pair<PatternTensorMap, PatternIterRangeMap> getReverseMap() const;
    int getStridesInTensor(Iterator iter, int tensorID) const;
    vector<int> getIterDimInTensor(int tensorID, const Iterator &iter) const;
    Tensor getTensor(int tensorID) const { return tensorMap_r[tensorID]; }
    Subscript getSubscript(int tensorID) const {
        return subscripts[tensorIDMap_r[tensorID]];
    }
    Range getIterRange(const Iterator &iter) const {
        return rangeOp->getRange(iter);
    }
    /**
     * @brief Check strides of each iterator and their positions in tensors.
     * Since many-to-many iterator matchings exist, this is a separate
     * procedure so different iterator mapping solutions can be tried.
     *
     * @param patternIT
     * @param mappingIter_r
     * @return vector<Mismatch>
     */
    vector<Mismatch>
    matchPatternITCheckStrides(const Pattern &patternIT,
                               PtrMap<Iterator, Iterator> mappingIter_r);
    RangeOp getRangeOp() const;
};
// Constrains the strides of two iterators within one pattern tensor.
struct StrideConstraint {
    int tensorID;
    Var v0, v1;
    enum class Constraint { SAME, PROPOTIONAL } type; // [sic] PROPORTIONAL
};
/**
 * @brief An IteratorTable pre-built for a known kernel, plus the recipe
 * (buildExpr) to re-emit the matched expression as that kernel's routine.
 */
class Pattern : public IteratorTable {
    vector<StrideConstraint> strideConstraints;
  public:
    // Rebuild the matched expression as a routine invocation named
    // `outputName`, using the matched real tensors and iterator ranges.
    virtual Expr
    buildExpr(const Expr &expr, const vector<Tensor> &tensors,
              [[maybe_unused]] const PatternIterRangeMap &varRanges,
              string outputName,
              [[maybe_unused]] const IteratorTable &exprIT) const = 0;
    /**
     * @brief Check whether every index of tensor `tensorID` is a bare
     * iterator (no compound index expressions).
     *
     * @param tensorID
     */
    bool isAllUniqueAccess(int tensorID) const;
    const auto &getStrideConstraints() const { return strideConstraints; };
    int calcPadding(const Tensor &tensor, int dim, Range rangeH, Range rangeR,
                    int offset) const;
};
// Pattern for the batched-matmul routine.
class MatmulPattern : public Pattern {
  public:
    static const Pattern &getMatmulPattern();
    // Build the reference expression and its two input tensors for the given
    // sizes/transpose flags.
    static pair<Expr, pair<Tensor, Tensor>> getExpr(bool transA, bool transB,
                                                    int b, int m, int n, int k);
    Expr buildExpr(const Expr &expr, const vector<Tensor> &tensors,
                   [[maybe_unused]] const PatternIterRangeMap &varRanges,
                   string outputName,
                   [[maybe_unused]] const IteratorTable &exprIT) const override;
};
// Pattern for the convolution routine (iterators n/c/h/w/f/r/s).
class ConvPattern : public Pattern {
  private:
    static const Var n, c, h, w, f, r, s;
  public:
    static const Pattern &getPattern();
    static Expr getExpr(Tensor A, Tensor K, int n, int c, int h, int w, int f,
                        int r, int s);
    Expr buildExpr(const Expr &expr, const vector<Tensor> &tensors,
                   [[maybe_unused]] const PatternIterRangeMap &varRanges,
                   string outputName,
                   [[maybe_unused]] const IteratorTable &exprIT) const override;
};
// Transposed-convolution: only the reference-expression builder is provided;
// pattern matching is disabled (getPattern deleted, buildExpr halts).
class ConvTransPattern : public Pattern {
  private:
    static const Var n, c, h, w, f, r, s;
  public:
    static const Pattern &getPattern() = delete;
    static Expr getExpr(Tensor A, Tensor K, int N, int C, int H, int W, int F,
                        int R, int S);
    Expr
    buildExpr(const Expr &expr, const vector<Tensor> &tensors,
              [[maybe_unused]] const PatternIterRangeMap &varRanges,
              string outputName,
              [[maybe_unused]] const IteratorTable &exprIT) const override {
        nnet_unimplemented_halt();
        return nullptr;
    };
};
// Pattern for the G2BMM routine (iterators b/m/w/k).
class Sg2bmmPattern : public Pattern {
  private:
    static const Var b, m, w, k;
  public:
    static const Pattern &getPattern();
    static pair<Expr, pair<Tensor, Tensor>> getExpr(int Batch, int M, int K,
                                                    int W, int D);
    Expr buildExpr(const Expr &expr, const vector<Tensor> &tensors,
                   [[maybe_unused]] const PatternIterRangeMap &varRanges,
                   string outputName,
                   [[maybe_unused]] const IteratorTable &exprIT) const override;
};
// Pattern for the GBMM routine (iterators b/m/w/n).
class LongformerGBMMPattern : public Pattern {
  private:
    static const Var b, m, w, n;
  public:
    static const Pattern &getPattern();
    static pair<Expr, pair<Tensor, Tensor>> getExpr(int Batch, int M, int W,
                                                    int K, int dilation);
    Expr buildExpr(const Expr &expr, const vector<Tensor> &tensors,
                   [[maybe_unused]] const PatternIterRangeMap &varRanges,
                   string outputName,
                   [[maybe_unused]] const IteratorTable &exprIT) const override;
};
// Look up the canonical pattern / printable name for a matchable routine.
const Pattern &getPattern(RoutineType targetOp);
string getPatternName(RoutineType targetOp);
} // namespace nnet

57
include/nnet/nmutator.h Normal file
View File

@ -0,0 +1,57 @@
#pragma once
#include "core/mutator.h"
#include "nnet/expr.h"
#ifdef ABC
namespace infini {
// Mutator generating equivalent graph rewrites via the NNet expression
// derivation engine. NOTE: the whole class is currently compiled out by the
// surrounding `#ifdef ABC` guard.
class NMutator : public Mutator {
  private:
    // Suffix -N: NNet objects.
    // Suffix -T: tpm objects.
    // Map: NNet tensors -> tpm tensor.
    std::map<std::string, Tensor> inputsNameNToTensorT;
    enum class Mode { Normal, ToNaiveMembound, RuleBased } mode = Mode::Normal;
    // Assumed memory bandwidth (200 GiB/s) used by memboundTime estimates.
    const double bandwidth = double(200) * 1024 * 1024 * 1024;
    // If in RuleBased mode, use derivationRules in derivator
    const std::vector<int> derivationRules;
  public:
    NMutator();
    NMutator(const std::vector<int> &derivationRules);
    ~NMutator();
    vector<Graph> run(const Graph &in_graph) override;
    void setToNaiveMembound();
    void setMaxDepth(int _maxDepth) { maxDepth = _maxDepth; }
    // Search statistics, exposed for reporting.
    long long cntStates = 0;
    long long cntCandidates = 0;
  private:
    int maxDepth = 8; // derivation search depth limit
    nnet::Expr opToExpression(Operator op);
    void runSingleOp(Graph in_graph, std::vector<Graph> &out_graphs);
    /**
     * @brief Test helper. Converting a single OP to Membound Op for
     * correctness check.
     */
    void runSingleOpToNaiveMembound(Graph in_graph,
                                    std::vector<Graph> &out_graphs);
    void runMultipleOps(Graph in_graph, std::vector<Graph> &out_graphs);
    Graph expressionToGraph(nnet::Expr expr, Graph in_graph);
    Graph fuseHetConv(nnet::Expr expr, Graph in_graph);
    // Estimated time (ms) of a memory-bound op moving `cnt` elements.
    double memboundTime(ssize_t cnt);
    double memboundTime(const Shape &dims);
    // Specialized conv/conv-transpose rewrites.
    Graph transformTConv1x1(Operator op);
    Graph transformTConv3x3(Operator op);
    Graph transformDialtedConv(Operator op);
    Graph transformConv1x1(Operator op);
    Graph transformConv1xk(Operator op);
};
} // namespace infini
#endif

View File

@ -0,0 +1,38 @@
#pragma once
#include "common.h"
#include "expr.h"
#include <iostream>
namespace nnet {
// Enumerates candidate iterator mappings between matching groups of `from`
// and `to` iterators; next() advances, get() materializes the current
// mapping (declarations only; see implementation).
class PermutationGenerator {
    vector<vector<Iterator>> from, to;
    vector<vector<size_t>> mapping; // per-group permutation indices
  public:
    PermutationGenerator(vector<vector<Iterator>> _from,
                         vector<vector<Iterator>> _to);
    bool next();
    PtrMap<Iterator, Iterator> get() const;
};
// Enumerates subsets of a small element pool via a bitmap counter: bit i set
// means elements[i] belongs to the current subset.
template <typename T> class SubsetGenerator {
    std::vector<T> elements; // candidate pool
    int n, bitmap;           // pool size; current subset encoding
  public:
    // nonEmpty starts the enumeration at {elements[0]} instead of {}.
    // Note: the enumeration also stops before the full set, since next()
    // never lets bitmap reach (1 << n) - 1.
    SubsetGenerator(std::vector<T> elements, bool nonEmpty = 1)
        : elements(elements), n(elements.size()), bitmap((nonEmpty > 0)) {
        assert(n < 10); // bitmap enumeration is exponential; keep pools tiny
    };
    // Advance to the next subset; false once the enumeration is exhausted.
    bool next() { return ((++bitmap) < (1 << n) - 1); }
    // Materialize the subset currently encoded by `bitmap`.
    std::vector<T> get() const {
        std::vector<T> chosen;
        for (int pos = 0; pos < n; ++pos)
            if ((bitmap >> pos) & 1)
                chosen.emplace_back(elements[pos]);
        return chosen;
    }
};
} // namespace nnet

200
include/nnet/ref.h Normal file
View File

@ -0,0 +1,200 @@
#pragma once
#include "common.h"
#include <functional> // hash
#include <memory>
#include <type_traits>
namespace nnet {
template <typename T> struct is_ref;
/**
* Ref-counting pointer
*
* This class is thread-safe (For developers: concurrent accesses through
* different `std::shared_ptr`s to the same object is already thread-safe, while
* modifying the same `std::shared_ptr` is not. We never modify a `Ref`, so no
* locks are needed. See https://en.cppreference.com/w/cpp/memory/shared_ptr)
*/
template <class T> class Ref {
    static_assert(is_ref<T>::value == false, "Ref should not be nested");
    template <class U> friend class Ref; // for cross-type ptr_ access below
    std::shared_ptr<T> ptr_; // sole data member; copying is as cheap as
                             // copying a shared_ptr
  private:
  public:
    typedef T Object;
    Ref() = default;
    // Ref(std::nullptr_t) : Ref() {}
    constexpr Ref(nullptr_t) noexcept : Ref() {}
    Ref(const Ref &) = default;
    Ref(Ref &&) = default;
    Ref(std::shared_ptr<T> &&ptr) : ptr_(std::move(ptr)) {}
    // Ref(const std::shared_ptr<T> &ptr) : ptr_(ptr) {}
    // /// NO NOT USE THIS CONSTRUCTOR IN PUBLIC
    // /// It is public because Pybind11 needs it
    // Ref(T *ptr) : ptr_(ptr) {}
    /**
     * Shared with any compatible references (implicit upcast from a Ref to a
     * derived type; checked at compile time via is_base_of).
     */
    template <class U,
              typename std::enable_if_t<std::is_base_of_v<T, U>> * = nullptr>
    Ref(const Ref<U> &other) : ptr_(std::static_pointer_cast<T>(other.ptr_)) {}
    template <class U,
              typename std::enable_if_t<std::is_base_of_v<T, U>> * = nullptr>
    Ref &operator=(const Ref<U> &other) {
        ptr_ = std::static_pointer_cast<T>(other.ptr_);
        return *this;
    }
    Ref &operator=(const Ref &) = default;
    Ref &operator=(Ref &&) = default;
    // Checked downcast: yields an empty Ref when the dynamic type mismatches
    // (dynamic_pointer_cast).
    template <class U> Ref<U> as() const {
        Ref<U> ret;
        ret.ptr_ = std::dynamic_pointer_cast<U>(ptr_);
        return ret;
    }
    bool isValid() const { return ptr_ != nullptr; }
    // Accessors assert non-null instead of dereferencing a null pointer.
    T &operator*() const {
        nnet_assert(isValid(), "Empty pointer.");
        return *ptr_;
    }
    T *operator->() const {
        nnet_assert(isValid(), "Empty pointer.");
        return ptr_.get();
    }
    T *get() const {
        nnet_assert(isValid(), "Empty pointer.");
        return ptr_.get();
    }
    friend inline bool operator==(const Ref &lhs, nullptr_t) {
        return !lhs.isValid();
    }
    friend inline bool operator!=(const Ref &lhs, nullptr_t) {
        return !(lhs == nullptr);
    }
    explicit operator bool() const { return ptr_ != nullptr; }
    bool operator!() { return ptr_ == nullptr; }
    void swap(Ref &__b) noexcept { ptr_.swap(__b.ptr_); }
};
// Free-function downcast: as<Derived>(baseRef). The base/derived relation is
// checked at compile time; a failed cast yields an empty Ref (see Ref::as).
template <class T, class U,
          typename std::enable_if_t<std::is_base_of_v<U, T>> * = nullptr>
Ref<T> as(const Ref<U> &ref) {
    return ref.template as<T>();
}
// Analogue of std::make_shared that returns a Ref<T>.
template <typename T, typename... Params> Ref<T> make_ref(Params &&...params) {
    return Ref(make_shared<T>(std::forward<Params>(params)...));
}
// Comparator for Ref
// Type trait: true iff T is a Ref<...> specialization.
template <typename T> struct is_ref : std::false_type {};
template <typename T> struct is_ref<Ref<T>> : std::true_type {};
// Lexicographic tuple comparison where Ref elements are compared either by
// pointee value (->less/->neq) or by raw address, selected by the
// `address_based` flag. The three overloads dispatch per element via
// enable_if; recursion walks the tuple left to right.
//
// Overload 1: the element at `index` is NOT a Ref — use operator< directly.
template <class Tuple, std::size_t index = 0, bool address_based>
typename std::enable_if_t<not is_ref<std::tuple_element_t<index, Tuple>>::value,
                          bool>
__ref_less(const Tuple &lhs, const Tuple &rhs) {
    if constexpr (index >=
                  std::tuple_size<std::remove_reference_t<Tuple>>::value - 1)
        return std::get<index>(lhs) < std::get<index>(rhs);
    else {
        if (std::get<index>(lhs) != std::get<index>(rhs))
            return std::get<index>(lhs) < std::get<index>(rhs);
        else
            return __ref_less<Tuple, index + 1, address_based>(lhs, rhs);
    }
}
// Overload 2: Ref element, value-based — delegate to the pointee's
// less()/neq() (e.g. VarNode's name-based ordering).
template <class Tuple, std::size_t index = 0, bool address_based>
typename std::enable_if_t<is_ref<std::tuple_element_t<index, Tuple>>::value and
                              not address_based,
                          bool>
__ref_less(const Tuple &lhs, const Tuple &rhs) {
    if constexpr (index >=
                  std::tuple_size<std::remove_reference_t<Tuple>>::value - 1)
        return std::get<index>(lhs)->less(std::get<index>(rhs));
    else {
        if (std::get<index>(lhs)->neq(std::get<index>(rhs)))
            return std::get<index>(lhs)->less(std::get<index>(rhs));
        else
            return __ref_less<Tuple, index + 1, address_based>(lhs, rhs);
    }
}
// Overload 3: Ref element, address-based — compare the raw pointers.
template <class Tuple, std::size_t index = 0, bool address_based>
typename std::enable_if_t<
    is_ref<std::tuple_element_t<index, Tuple>>::value and address_based, bool>
__ref_less(const Tuple &lhs, const Tuple &rhs) {
    if constexpr (index >=
                  std::tuple_size<std::remove_reference_t<Tuple>>::value - 1)
        return std::get<index>(lhs).get() < std::get<index>(rhs).get();
    else {
        if (std::get<index>(lhs).get() != std::get<index>(rhs).get())
            return std::get<index>(lhs).get() < std::get<index>(rhs).get();
        else
            return __ref_less<Tuple, index + 1, address_based>(lhs, rhs);
    }
}
// Public entry points: order Ref elements by address / by pointee value.
template <class Tuple> bool ref_addr_less(const Tuple &lhs, const Tuple &rhs) {
    return __ref_less<Tuple, 0, true>(lhs, rhs);
}
template <class Tuple> bool ref_value_less(const Tuple &lhs, const Tuple &rhs) {
    return __ref_less<Tuple, 0, false>(lhs, rhs);
}
// Functor adapters usable as ordered-container comparators.
template <class Tuple> class RefAddrLess {
  public:
    bool operator()(const Tuple &a, const Tuple &b) const {
        return ref_addr_less(a, b);
    }
};
template <class Tuple> class RefValueLess {
  public:
    bool operator()(const Tuple &a, const Tuple &b) const {
        return ref_value_less(a, b);
    }
};
// make_ref_from_tuple
// Construct a Ref<T> by unpacking a tuple of constructor arguments
// (mirrors std::make_from_tuple).
template <typename _Tp, typename _Tuple, size_t... _Idx>
constexpr Ref<_Tp> make_ref_from_tuple_impl(_Tuple &&__t,
                                            std::index_sequence<_Idx...>) {
    return make_ref<_Tp>(std::get<_Idx>(std::forward<_Tuple>(__t))...);
}
template <typename _Tp, typename _Tuple>
constexpr Ref<_Tp> make_ref_from_tuple(_Tuple &&__t) {
    return make_ref_from_tuple_impl<_Tp>(
        std::forward<_Tuple>(__t),
        std::make_index_sequence<std::tuple_size_v<std::decay_t<_Tuple>>>{});
}
} // namespace nnet
// namespace std {
// template <class T> struct hash<ir::Ref<T>> {
// hash<T *> hash_;
// size_t operator()(const ir::Ref<T> &ref) const { return hash_(ref.get());
// }
// };
// } // namespace nnet

158
include/nnet/routine.h Normal file
View File

@ -0,0 +1,158 @@
#pragma once
#include "common.h"
#include "expr.h"
#include <iostream>
#include <sstream>
namespace nnet {
class RoutineNode;
class MatmulNode;
class ElementWiseNode;
using Routine = Ref<RoutineNode>;
using Matmul = Ref<MatmulNode>;
using ElementWise = Ref<ElementWiseNode>;
// Generates the RoutineType tag getter for a concrete routine class.
#define DEFINE_GETTYPE(CLASS)                                                  \
    RoutineType getType() const override { return RoutineType::CLASS##Type; }
// Base class of matched kernels: the defining expression plus input tensors.
class RoutineNode {
  protected:
    Expr expr;
    vector<Tensor> inputs;
  public:
    RoutineNode(Expr _expr, const vector<Tensor> &_inputs);
    virtual string toReadable() const = 0;
    const Expr &getExpr() const { return expr; }
    const vector<Tensor> &getInputs() const { return inputs; }
    virtual RoutineType getType() const = 0; // RTTI tag, see DEFINE_GETTYPE
};
using MatmulArgs = tuple<int,   // b
                         int,   // m
                         int,   // n
                         int,   // k
                         bool,  // transa
                         bool>; // transb
// Matched batched-matmul kernel: dims b/m/n/k plus transpose flags; inputs
// are the two operand tensors A and B.
class MatmulNode : public RoutineNode {
    int b, m, n, k;
    bool transa, transb;
  public:
    MatmulNode(Expr _source, Tensor A, Tensor B, int _b, int _m, int _n, int _k,
               bool _transa, bool _transb)
        : RoutineNode(_source, {A, B}), b(_b), m(_m), n(_n), k(_k),
          transa(_transa), transb(_transb) {}
    DEFINE_GETTYPE(MatmulNode);
    string toReadable() const override;
    friend bool operator==(const MatmulNode &lhs, const MatmulNode &rhs);
    MatmulArgs getArgs() { return tuple(b, m, n, k, transa, transb); }
};
using ConvArgs = tuple<int,  // ph
                       int,  // pw
                       int,  // sh
                       int,  // sw
                       int,  // dh
                       int>; // dw
// Matched convolution kernel: paddings (ph/pw), strides (sh/sw), dilations
// (dh/dw); inputs are the activation A and the kernel K.
class ConvNode : public RoutineNode {
    int ph, pw;
    int sh, sw;
    int dh, dw;
  public:
    ConvNode(Expr _source, Tensor A, Tensor K, int _ph, int _pw, int _sh = 1,
             int _sw = 1, int _dh = 1, int _dw = 1)
        : RoutineNode(_source, {A, K}), ph(_ph), pw(_pw), sh(_sh), sw(_sw),
          dh(_dh), dw(_dw) {}
    DEFINE_GETTYPE(ConvNode);
    string toReadable() const override;
    vector<int> getShape() const;
    friend bool operator==(const ConvNode &lhs, const ConvNode &rhs);
    ConvArgs getArgs() const;
};
// Fallback routine for expressions matched only as memory-bound element-wise
// kernels.
class ElementWiseNode : public RoutineNode {
    vector<int> outputShape;
  public:
    // _outputShape is redundent, but expr is still missing for DLT.
    ElementWiseNode(Expr _source, vector<Tensor> _inputs,
                    vector<int> _outputShape)
        : RoutineNode(_source, _inputs), outputShape(_outputShape) {}
    DEFINE_GETTYPE(ElementWiseNode);
    string toReadable() const override;
    /**
     * @brief Get the Estimated Time of mem bound OP.
     *
     * @return double Time in ms.
     */
    double getEstimatedTime() const;
    const vector<int> &getOutputShape() const { return outputShape; }
};
using G2bmmArgs = tuple<int,  // b
                        int,  // m
                        int,  // w
                        int,  // k
                        int>; // dilation
// Matched G2BMM kernel; only dilation d == 1 is supported (asserted).
class G2bmmNode : public RoutineNode {
    int b, m, w, k;
  public:
    G2bmmNode(Expr source, Tensor A, Tensor B, int b, int m, int w, int k,
              int d = 1)
        : RoutineNode(source, {A, B}), b(b), m(m), w(w), k(k) {
        assert(d == 1);
    }
    DEFINE_GETTYPE(G2bmmNode);
    vector<int> getShape() const;
    string toReadable() const override;
    G2bmmArgs getArgs() const;
};
using GbmmArgs = tuple<int,  // b
                       int,  // m
                       int,  // w
                       int,  // n
                       int>; // dilation
// Matched GBMM kernel; only dilation d == 1 is supported (asserted).
class GbmmNode : public RoutineNode {
    int b, m, w, n;
  public:
    GbmmNode(Expr source, Tensor A, Tensor B, int b, int m, int w, int n,
             int d = 1)
        : RoutineNode(source, {A, B}), b(b), m(m), w(w), n(n) {
        assert(d == 1);
    }
    DEFINE_GETTYPE(GbmmNode);
    vector<int> getShape() const;
    string toReadable() const override;
    GbmmArgs getArgs() const;
};
// Pretty debug output for shared_ptr-held routines; a null handle prints as
// "Null shared_ptr".
template <typename T, typename std::enable_if_t<
                          std::is_base_of_v<RoutineNode, T>> *_ = nullptr>
std::ostream &operator<<(std::ostream &os, const shared_ptr<T> &a) {
    if (!a)
        os << "Null shared_ptr";
    else
        os << a->toReadable();
    return os;
}
// Pretty debug output for Ref-held routines; a null handle prints as
// "Null shared_ptr".
template <typename T, typename std::enable_if_t<
                          std::is_base_of_v<RoutineNode, T>> *_ = nullptr>
std::ostream &operator<<(std::ostream &os, const Ref<T> &a) {
    if (!a)
        os << "Null shared_ptr";
    else
        os << a->toReadable();
    return os;
}
} // namespace nnet

28
include/nnet/test.h Normal file
View File

@ -0,0 +1,28 @@
#pragma once
#include "common.h"
#include "derivator.h"
// clang-format off
// Token-pasting helpers used to build NAME_<argc> dispatch macros.
#define CAT(A, B) A##B
#define SELECT(NAME, NUM) CAT(NAME##_, NUM)
// Count the number of variadic arguments (supports 1..10).
#define GET_COUNT( _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, ... ) COUNT
#define VA_SIZE( ... ) GET_COUNT( __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
// Expand to NAME_<argc>(args...) based on the actual argument count.
#define VA_SELECT( NAME, ... ) SELECT( NAME, VA_SIZE(__VA_ARGS__) )(__VA_ARGS__)
// _DEFVAR_N declares N nnet Vars, each named after its argument token.
#define _DEFVAR_1(name) auto name = make_ref<VarNode>(#name);
#define _DEFVAR_2(name, ...) _DEFVAR_1(name); _DEFVAR_1(__VA_ARGS__)
#define _DEFVAR_3(name, ...) _DEFVAR_1(name); _DEFVAR_2(__VA_ARGS__)
#define _DEFVAR_4(name, ...) _DEFVAR_1(name); _DEFVAR_3(__VA_ARGS__)
#define _DEFVAR_5(name, ...) _DEFVAR_1(name); _DEFVAR_4(__VA_ARGS__)
#define _DEFVAR_6(name, ...) _DEFVAR_1(name); _DEFVAR_5(__VA_ARGS__)
#define _DEFVAR_7(name, ...) _DEFVAR_1(name); _DEFVAR_6(__VA_ARGS__)
#define _DEFVAR_8(name, ...) _DEFVAR_1(name); _DEFVAR_7(__VA_ARGS__)
#define _DEFVAR_9(name, ...) _DEFVAR_1(name); _DEFVAR_8(__VA_ARGS__)
// DEFINE_VAR(i, j, k) declares Vars i, j and k (1..9 names supported).
#define DEFINE_VAR(...) VA_SELECT(_DEFVAR, __VA_ARGS__)
// clang-format on
namespace nnet {
// Compare the derivator's results against the serialized expressions in
// file fn; the int result is implementation-defined -- TODO confirm.
int matchExprResult(Derivator &derivator, string fn);
// Check the expression logs named "<fnPrefix><i>" for i in the given
// range describe equivalent expressions. NOTE(review): confirm whether
// [start, end) or [start, end] is intended from the implementation.
bool checkExprLogSame(string fnPrefix, int start, int end);
// Check that all expressions in exprs are equivalent to each other.
bool checkExprsEquvivalence(VecExpr exprs);
} // namespace nnet

128
include/nnet/visitor.h Normal file
View File

@ -0,0 +1,128 @@
#pragma once
#include "common.h"
#include "derivator.h"
#include "expr.h"
#include "routine.h"
#include <iostream>
#include <unordered_map>
namespace nnet {
// Generic visitor over nnet expressions. Functor<R(Args...)> dispatches on
// the dynamic NodeType of an Expr and calls the matching typed visit_
// overload; overloads default to visitDefault, which asserts.
template <typename FType> class Functor;
template <typename R, typename... Args> class Functor<R(Args...)> {
  protected:
    int verbose; // non-zero enables debug output in derived visitors
    // FIXME: scope should be protected
  public:
    Functor(int _verobse = 0) : verbose(_verobse) {}
    virtual ~Functor() = default;
// Map one NodeType enumerator to its typed visit_ overload.
#define DISPATCH(CLASS)                                                        \
    case NodeType::CLASS##Type:                                                \
        return this->visit_(as<CLASS>(c), std::forward<Args>(args)...);        \
        break
// Default body for visit_ overloads: defer to visitDefault.
#define FUNCTOR_DEFAULT                                                        \
    { return visitDefault(c, std::forward<Args>(args)...); }
    // Route c to the visit_ overload for its concrete node type.
    virtual R dispatch(const Expr &c, Args... args) {
        switch (c->getType()) {
            DISPATCH(ConstantNode);
            DISPATCH(BinaryOpNode);
            DISPATCH(RangeOpNode);
            DISPATCH(SubscriptNode);
            DISPATCH(TensorNode);
            DISPATCH(VarNode);
            DISPATCH(FuncNode);
        default:
            nnet_assert(0, "Unknown type");
            return R();
        }
    }
    virtual R visit_(const Constant &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const BinaryOp &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const RangeOp &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const Subscript &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const Var &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const Tensor &c, Args... args) FUNCTOR_DEFAULT;
    virtual R visit_(const Func &c, Args... args) FUNCTOR_DEFAULT;
    // Fallback for node kinds without an overridden visit_.
    virtual R visitDefault(const Expr &c, [[maybe_unused]] Args... args) {
        dbg(*c);
        nnet_assert(0, "Reach unimplemented visit function.");
        return R();
    };
    [[deprecated("Define explicit methods for public access.")]] R
    operator()(const Expr &e, Args... args) {
        return dispatch(e, std::forward<Args>(args)...);
    }
#undef FUNCTOR_DEFAULT
#undef DISPATCH
};
// Rebuilding visitor: each visit_ returns the (possibly new) expression
// that replaces the visited node, so a traversal produces a new tree.
class Mutator : public Functor<Expr()> {
  public:
    Mutator(int _verobse = 0) : Functor(_verobse) {}
    Expr visit_(const Constant &c) override;
    Expr visit_(const BinaryOp &c) override;
    Expr visit_(const RangeOp &c) override;
    Expr visit_(const Subscript &c) override;
    Expr visit_(const Var &c) override;
    Expr visit_(const Tensor &c) override;
    Expr visit_(const Func &c) override;
};
// template <typename... Args>
// class SingleStageVisitor : public Functor<void, Args...> {
// public:
// SingleStageVisitor(int _verobse = 0) : Functor<R, Args...>(_verobse) {}
// // R visit(const Constant &c) override ;
// R visit_(const BinaryOp &c) override {
// if (verbose)
// dbg(*c);
// this->dispatch(c->getLhs());
// this->dispatch(c->getRhs());
// }
// R visit_(const RangeOp &c) override {
// if (verbose)
// dbg(*c);
// this->dispatch(ret->getSummand());
// // NOT visit iterators and its ranges
// }
// R visit_(const Subscript &c) override {
// if (verbose)
// dbg(*c);
// this->dispatch(ret->getObject());
// for (size_t i = 0; i < ret->getDims(); ++i)
// this->dispatch(ret->getIndex(i));
// }
// // R visit(const Var &c) override;
// // R visit(const Tensor &c) override;
// };
// } // namespace nnet
// #include "nnet/Visitor/ReplaceVariable.h"
// #include "nnet/Visitor/StrideVisitor.h"
// namespace nnet {
// Visitor that walks an entire expression tree. Each in* flag controls
// whether traversal descends through the corresponding node kind.
class ExprTreeVisitor : public Functor<void(void)> {
  private:
    bool inBinary, inRange, inSub, inTensor;
  public:
    // Defaults visit everything; use `true`/`false` (bool literals) rather
    // than integer 1/0 for clarity.
    ExprTreeVisitor(bool _inBinary = true, bool _inRange = true,
                    bool _inSub = true, bool _inTensor = true,
                    int _verbose = 0)
        : Functor(_verbose), inBinary(_inBinary), inRange(_inRange),
          inSub(_inSub), inTensor(_inTensor) {}
    void visit_(const Constant &c) override;
    void visit_(const BinaryOp &c) override;
    void visit_(const RangeOp &c) override;
    void visit_(const Subscript &c) override;
    void visit_(const Var &c) override;
    void visit_(const Tensor &c) override;
    void visit_(const Func &c) override;
};
} // namespace nnet

View File

@ -0,0 +1,47 @@
#pragma once
#include "core/operator.h"
namespace infini {
// Batched matrix-multiplication operator with optional transposes, bias and
// fused activation.
class MatmulNode : public OperatorNode {
  private:
    // InfiniTensor assume a row-major tensor layout. transA=false means default
    // dims, true means A should be transposed before matmul. This is the
    // opposite of column-major BLAS.
    bool transA, transB;
    ActType act; // fused activation applied after the matmul
    // Auxiliary attributes
    int b, m, n, k; // batch and GEMM dimensions cached at construction
  public:
    MatmulNode(Tensor A, Tensor B, Tensor C, bool transA = false,
               bool transB = false, Tensor bias = nullptr,
               ActType act = ActType::None);
    std::string toString() const override;
    vector<Shape> computeShape() const override;
    // NOTE(review): getBias() reads inputs[2] yet numInputs() reports 2 --
    // confirm whether the bias is deliberately excluded from the count.
    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
    Tensor getBias() const { return inputs[2]; }
    ActType getAct() const { return act; }
    bool getTransA() const { return transA; }
    bool getTransB() const { return transB; }
    int getB() const { return b; }
    int getM() const { return m; }
    int getN() const { return n; }
    int getK() const { return k; }
    HashType hashWithShape() const override;
    OpPerfKey getOpPerfKey() const override;
  private:
    // Q: whether to check the output? Since we can build an Op first and then
    // assure output.
    // Fix 1: make shape inference a static method. But OpPerfKey are required.
    bool checkValid(const TensorVec &inputs) const;
};
} // namespace infini

3
include/test.h Normal file
View File

@ -0,0 +1,3 @@
#pragma once
#include "core/common.h"
#include "gtest/gtest.h"

14
src/core/common.cc Normal file
View File

@ -0,0 +1,14 @@
#include "core/common.h"
#include <chrono>
#include <functional>
namespace infini {
// Measure the wall-clock duration of one invocation of func.
// @param func callable to execute once.
// @return elapsed time in milliseconds.
double timeit(const std::function<void()> &func) {
    // steady_clock is monotonic, so the measurement cannot go negative or
    // jump when the system clock is adjusted (high_resolution_clock may
    // alias system_clock on some platforms).
    const auto start = std::chrono::steady_clock::now();
    func();
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(end - start).count();
}
} // namespace infini

20
src/core/graph.cc Normal file
View File

@ -0,0 +1,20 @@
#include "core/graph.h"
namespace infini {
// Rebuild operator/tensor connectivity; not implemented yet.
void GraphNode::updateConnection() { IT_TODO_HALT(); }
// Human-readable dump listing every operator in the graph.
string GraphNode::toString() const {
    std::ostringstream oss;
    oss << "GraphNode operators:\n";
    for (const auto &op : ops)
        oss << op << "\n";
    return oss.str();
}
// Allocate host buffers for every tensor in the graph.
void GraphNode::dataMalloc() {
    for (auto &tensor : tensors)
        tensor->dataMalloc();
}
} // namespace infini

32
src/core/operator.cc Normal file
View File

@ -0,0 +1,32 @@
#include "core/operator.h"
namespace infini {
// Category predicates. Linear and element-wise ops are identified by the
// numeric range of their OpType enumerator value; the others by exact type.
bool OperatorNode::isLinearOp() const {
    return enum_to_underlying(type) >= 100 && enum_to_underlying(type) < 200;
}
bool OperatorNode::isElementWiseOp() const {
    return enum_to_underlying(type) >= 200 && enum_to_underlying(type) < 300;
}
bool OperatorNode::isSplitOp() const { return type == OpType::Split; }
bool OperatorNode::isConcatOp() const { return type == OpType::Concat; }
// Compute-bound operators (matmul-like and convolutions).
bool OperatorNode::isComputeOp() const {
    return type == OpType::Conv || type == OpType::Matmul ||
           type == OpType::ConvTrans || type == OpType::G2BMM ||
           type == OpType::GBMML;
}
bool OperatorNode::isTransposeOp() const { return type == OpType::Transpose; }
bool OperatorNode::isReshapeOp() const { return type == OpType::Reshape; }
// Memory-bound operators (data movement / lightweight elementwise work).
bool OperatorNode::isMemBoundOp() const {
    return type == OpType::MemBound || type == OpType::Activation ||
           type == OpType::Transpose;
}
} // namespace infini

105
src/core/run_engine.cc Normal file
View File

@ -0,0 +1,105 @@
#include "core/run_enigne.h"
#include <chrono>
namespace infini {
// Execute every operator in the graph, optionally tuning each kernel and
// collecting per-op timing statistics.
// @param graph     graph whose operators are executed in order.
// @param tune      tune kernels that have no cached performance record.
// @param profiling print per-op and aggregate timings (requires tune).
void RunEngine::run(const Graph &graph, bool tune, bool profiling) const {
    if (!tune && profiling)
        IT_TODO_HALT();
    const auto &kernelRegistry = KernelRegistry::getInstance();
    // NOTE(review): plain `auto` copies the engine if getInstance() returns
    // by reference, so setPerfData would write to a local copy -- confirm
    // the return type of PerfEngine::getInstance().
    auto perfEngine = PerfEngine::getInstance();
    // Statistics
    double totalTime = 0;
    std::map<OpType, double> opTime;
    std::map<OpType, int> opCnt;
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Int32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
        // If no record and disable tuning, run with the default argument
        if (!perfData && !tune) {
            kernel->compute(op);
            continue;
        }
        // TODO: The copy of record should be eliminated
        PerfRecord record;
        // Tune the kernel if there is no record
        if (!perfData) {
            record = kernel->tune(op);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = *perfData;
        // Use `record` below: when the kernel was just tuned, perfData is
        // still empty and dereferencing it (*perfData) is undefined
        // behavior.
        if (!profiling) {
            kernel->compute(op, record);
            continue;
        } else {
            double t = timeit([&]() { kernel->compute(op, record); });
            op->print();
            printf(" op_time %lf\n", t);
            totalTime += t;
            opTime[op->getOpType()] += t;
            opCnt[op->getOpType()]++;
        }
    }
    if (profiling)
        printProfilingData(totalTime, opTime, opCnt);
}
// Sum the recorded (or freshly tuned) kernel times of all operators
// without executing the whole graph.
// @return estimated total time in ms.
double RunEngine::getPerfTime(const Graph &graph, bool profiling) const {
    const auto &kernelRegistry = KernelRegistry::getInstance();
    // NOTE(review): plain `auto` copies the engine if getInstance() returns
    // by reference, so tuned records may not persist -- confirm.
    auto perfEngine = PerfEngine::getInstance();
    // Statistics
    double totalTime = 0;
    std::map<OpType, double> opTime;
    std::map<OpType, int> opCnt;
    for (auto &op : graph->getOperators()) {
        // HACK: set correct data type
        auto kernelAttrs =
            KernelAttrs{device, op->getOpType(), DataType::Int32};
        Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
        auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
        std::optional<PerfRecord> perfData = perfEngine.getPerfData(perfKey);
        PerfRecord record;
        // Tune the kernel if there is no record
        if (!perfData) {
            record = kernel->tune(op);
            perfEngine.setPerfData(perfKey, record);
        } else
            record = *perfData;
        double t = record.time;
        totalTime += t;
        if (profiling) {
            op->print();
            printf(" op_time %lf\n", t);
            opTime[op->getOpType()] += t;
            opCnt[op->getOpType()]++;
        }
    }
    if (profiling)
        printProfilingData(totalTime, opTime, opCnt);
    return totalTime;
}
// Print a per-op-type profiling table: count, total time, percentage of
// the whole run, and mean time per invocation.
void RunEngine::printProfilingData(double totalTime,
                                   const std::map<OpType, double> &opTime,
                                   const std::map<OpType, int> &opCnt) const {
    printf("%11s %3s %7s %7s %7s\n", "Op", "Cnt", "T_tot", "Percent", "T_mean");
    for (const auto &[opType, time] : opTime) {
        const int cnt = opCnt.at(opType);
        printf("%11s %3d %7.3f %7.1f %7.3f\n",
               OpRegistry::getOpName(opType).data(), cnt, time,
               time / totalTime * 100, time / cnt);
    }
}
} // namespace infini

90
src/core/tensor.cc Normal file
View File

@ -0,0 +1,90 @@
#include <core/tensor.h>
namespace infini {
// A tensor with a concrete shape; the dimension count is forwarded to the
// base class.
TensorNode::TensorNode(const Shape &shape, DataType dtype)
    : TensorBaseNode(shape.size(), dtype), shape(shape) {}
// Allocate the backing buffer, zero-initialized via calloc.
void TensorNode::dataMalloc() {
    IT_ASSERT(data == nullptr);
    // initialized to zero
    // NOTE(review): calloc'd memory must be released with free(); confirm
    // that `data`'s deleter matches (a default `delete` would be UB).
    data.reset(reinterpret_cast<VType *>(calloc(size(), sizeof(VType))));
}
// Read the element at a multi-dimensional position.
VType TensorNode::getData(const Shape &pos) const {
    return getData(getOffset(pos));
}
string TensorNode::toString() const {
    return "TensorNode " + std::to_string(guid);
}
// Row-major linearization of a multi-dimensional position into a flat
// offset. An empty position (scalar tensor) maps to offset 0.
size_t TensorNode::getOffset(const Shape &pos) const {
    auto nDim = pos.size();
    IT_ASSERT(shape.size() == nDim);
    if (pos.empty())
        return 0;
    // Bounds check: every coordinate must lie inside its dimension.
    // (The previous condition was inverted -- it asserted the coordinate
    // was OUT of bounds, firing for every valid position.)
    for (size_t i = 0; i < nDim; ++i)
        IT_ASSERT(pos[i] >= 0 && pos[i] < shape[i]);
    size_t idx = pos[0];
    size_t dm = 0;
    while (++dm < nDim)
        idx = idx * shape[dm] + pos[dm];
    return idx;
}
// Total number of elements: the product of all dimension extents.
size_t TensorNode::size() const {
    size_t total = 1;
    for (size_t i = 0; i < shape.size(); ++i)
        total *= shape[i];
    return total;
}
// Copy size() elements from dptr into the (already allocated) buffer.
// The caller must guarantee dptr points at least size() readable elements.
void TensorNode::copyData(VType *dptr) {
    IT_ASSERT(data != nullptr);
    size_t sz = size();
#pragma omp parallel for
    for (size_t i = 0; i < sz; ++i) {
        data[i] = dptr[i];
    }
}
// Print the tensor contents as nested bracketed lists, one innermost row
// per output line. dimSzVec[j] holds the number of elements spanned by one
// step of dimension j, so `i % dimSzVec[j]` detects row boundaries.
void TensorNode::printData() const {
    IT_ASSERT(data != nullptr);
    std::cout << "Tensor: " << guid << std::endl;
    auto numDims = shape.size();
    auto dimSzVec = std::vector<int>(numDims, 1);
    dimSzVec[numDims - 1] = shape[numDims - 1];
    for (int i = numDims - 1; i != 0; --i)
        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
        // Open a bracket for every dimension that starts at element i.
        for (size_t j = 0; j < numDims; ++j) {
            if (i % dimSzVec[j] == 0) {
                std::cout << "[";
            }
        }
        std::cout << data[i];
        // Close a bracket for every dimension that ends at element i.
        for (size_t j = 0; j < numDims; ++j) {
            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
                std::cout << "]";
            }
        }
        if (i != size() - 1)
            std::cout << ", ";
        // Line break after each innermost row.
        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
            std::cout << std::endl;
    }
}
// Element-wise equality with another tensor: shapes must match and every
// element must compare equal. Both tensors must have allocated data.
bool TensorNode::equalData(const Tensor &rhs) const {
    IT_ASSERT(data != nullptr);
    IT_ASSERT(rhs->data != nullptr);
    if (shape != rhs->getDims())
        return false;
    size_t sz = size();
    for (size_t i = 0; i < sz; ++i)
        if (data[i] != rhs->data[i])
            return false;
    return true;
}
}; // namespace infini

9
src/core/tensor_base.cc Normal file
View File

@ -0,0 +1,9 @@
#include <core/tensor_base.h>
namespace infini {
// dim: number of dimensions; dtype: element-type tag.
TensorBaseNode::TensorBaseNode(int dim, DataType dtype)
    : dim(dim), dtype(dtype) {}
// Read the element at a flat offset; no bounds checking is performed.
VType TensorBaseNode::getData(size_t offset) const { return data[offset]; }
}; // namespace infini

38
src/kerels/cpu/matmul.cc Normal file
View File

@ -0,0 +1,38 @@
#include "operators/matmul.h"
#include "core/kernel.h"
namespace infini {
// Reference CPU matmul kernel: an O(M*N*K) triple loop with no blocking,
// vectorization or threading. Registered for int32 and float32 below.
template <typename T> class NaiveMatmul : public Kernel {
    // `record` is ignored: the naive kernel has no tunable parameters.
    void compute(const Operator &_op, const PerfRecord &record) const override {
        auto op = as<MatmulNode>(_op);
        T *A = reinterpret_cast<T *>(op->getInputs(0)->getDataPtr().get());
        T *B = reinterpret_cast<T *>(op->getInputs(1)->getDataPtr().get());
        T *C = reinterpret_cast<T *>(op->getOutput()->getDataPtr().get());
        // Only the plain row-major, non-transposed, single-batch case with
        // no fused activation is implemented.
        IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
        IT_ASSERT(op->getAct() == ActType::None);
        IT_ASSERT(op->getB() == 1);
        const int M = op->getM(), N = op->getN(), K = op->getK();
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                C[i * N + j] = 0;
                for (int k = 0; k < K; k++) {
                    C[i * N + j] += A[i * K + k] * B[k * N + j];
                }
            }
        }
    }
    void compute(const Operator &op) const override { compute(op, {}); }
    // Tuning is a single timed default run.
    // NOTE(review): `.time = ...` designated initializers are C++20;
    // confirm the project's language standard (or compiler extensions).
    PerfRecord tune(const Operator &op) const override {
        return PerfRecord{.time = timeit([this, &op]() { compute(op); })};
    }
};
REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Int32,
                NaiveMatmul<uint32_t>, "MatmulNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Matmul, DataType::Float32,
                NaiveMatmul<float>, "MatmulNaive_CPU_float32");
} // namespace infini

View File

@ -0,0 +1,25 @@
#include "nnet/Pass/MatchComputationKernel.h"
#include "nnet/Visitor/PatternMatcher.h"
namespace nnet {
// RE: is this duplicate with Rule6KenerlMatching?
void MatchComputationKernel::transform(Formula &origin, int depth, Expr &rCur) {
nnet_assert(derivator.getSearchState() == 2, __LINE__);
auto cur = as<RangeOpNode>(rCur);
// Build wrapper stages for enforce axis starts from 0
PatternMatcher patternMatcher(derivator, cur);
cur = patternMatcher.getOffsetCur();
auto matches = patternMatcher.matchWithPattern(
cur, getPattern(derivator.getTargetOp()));
matches = patternMatcher.applyWrapper(matches);
for (auto newCur : matches) {
derivator.setSearchState(3);
nextStep(origin, depth, rCur, newCur);
derivator.setSearchState(2);
}
}
} // namespace nnet

View File

@ -0,0 +1,23 @@
#include "nnet/Pass/MatchMemBoundKernel.h"
#include "nnet/Visitor/InputVisitor.h"
namespace nnet {
// Search-state-3 pass: wrap the entire remaining formula into a single
// element-wise (mem-bound) tensor so the derivation can terminate.
void MatchMemBoundKernel::transform(Formula &origin, int depth, Expr &rCur) {
    // FIXME: Whether the Formula is a Membound OP should be checked.
    nnet_assert(derivator.getSearchState() == 3, __LINE__);
    nnet_assert(origin.root.get() == rCur.get(),
                "Only match the entire formula as a Membound Op");
    auto rangeOp = as<RangeOpNode>(origin.root);
    const auto &inputs = InputVisitor().getInputs(rangeOp);
    // The whole formula becomes the source routine of a fresh tensor.
    auto source =
        make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
    auto tensor =
        makeTensor(newTensorName(), rangeOp->getOutputShape(), {}, source);
    // The original code directly appends candidate. But it seems should be done
    // by the search.
    // appendCanddiate(as<TensorNode>(tensor), depth);
    nextStep(origin, depth, rCur, tensor);
}
} // namespace nnet

58
src/nnet/Pass/Pass.cc Normal file
View File

@ -0,0 +1,58 @@
#include "nnet/Pass/Pass.h"
#include "nnet/Visitor/CloneMutator.h"
namespace nnet {
// A Pass owns no state beyond flags; logging defaults to the derivator's
// log mode and debug mode is off until explicitly enabled.
Pass::Pass(Derivator &derivator, const string &passName)
    : derivator(derivator), passName(passName),
      enableLogging(derivator.getLogMode() != Derivator::LogMode::NoLog),
      enableDebug(false) {}
Pass::~Pass() = default;
void Pass::setEnableLogging(bool value) { enableLogging = value; }
void Pass::setEnableDebug(bool value) { enableDebug = value; }
void Pass::run(Formula &origin, int dfsDepth, Expr &rCur) {
initialize(origin, rCur);
transform(origin, dfsDepth, rCur);
finalize();
}
void Pass::initialize(Formula &origin, const Expr &rCur) {}
void Pass::finalize() {}
// Fresh-variable and fresh-tensor-name generation delegate to the derivator
// so numbering stays globally consistent across passes.
Var Pass::getNewVar() { return derivator.getNewVar(); }
string Pass::newTensorName() { return derivator.newTensorName(); }
// Log the rule application, then either stash the transformed expression
// (debug mode) or recurse into the derivator's search.
void Pass::nextStep(Formula &origin, int depth, Expr &rCur, Expr newCur,
                    const string &ruleMsg) {
    // push rule action description
    if (enableLogging) {
        // Temporarily swap newCur into the formula so the logged
        // intermediate state shows the transformed expression; the second
        // swap restores the original rCur.
        rCur.swap(newCur);
        derivator.pushIntermediateState(origin.root);
        rCur.swap(newCur);
        derivator.pushRuleState(passName);
        derivator.pushRuleMsg(ruleMsg);
    }
    if (enableDebug) {
        // In debug mode, do not recur but save the transformed state
        transformations.emplace_back(CloneMutator().clone(newCur));
    } else
        derivator.nextStep(origin, depth, rCur, newCur);
    // pop rule action description
    if (enableLogging) {
        derivator.popIntermediateState();
        derivator.popRuleState();
        derivator.popRuleMsg();
    }
}
const VecExpr &Pass::getTransformations() { return transformations; }
} // namespace nnet

View File

@ -0,0 +1,134 @@
#include "nnet/Pass/Rule1VariableSplit.h"
#include "nnet/Visitor/ReplaceVariable.h"
namespace nnet {
// Rule 1: split one iteration variable i into (p1, p2) with i = k*p1 + p2
// and recurse on each formula produced by a candidate split.
void Rule1VariableSplit::transform(Formula &origin, int depth, Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    vector<Replace> replaces = getSplitableVar(cur);
    // for (const auto &replace : replaces)
    //     dbg(replace.oldIters, replace.newIters, replace.psis,
    //         replace.newVarRanges);
    for (const auto &replace : replaces) {
        auto replacedSummand = replaceIters(cur->getSummand(), replace);
        if (!replacedSummand) {
            // TODO: if a real getMergableExprs is implemented, this case should
            // be an error. Since the expr should appear in the AST.
            dbg("Warning: No replacment happens.");
            continue;
        }
        auto inner =
            ReplaceKit::replaceRangeOpIterator(cur, replace, replacedSummand);
        // build the outerRange{innerRange}[indexForInner] to do DLT
        Expr nextCur = nullptr;
        if (replace.iteratorType == IterationType::Loop) {
            // Loop-variable splits need an extra outer stage for the DLT.
            auto subscriptedInner =
                ReplaceKit::buildSubscirptForLoopVarReplace(inner, replace);
            nextCur = ReplaceKit::buildDLTOuterRangeOp(cur, subscriptedInner);
        } else
            nextCur = inner;
        string msg = "====== END rule1 VariableSplit: ";
        dbg(msg, replace.oldIters, replace.newIters, replace.phis,
            replace.psis);
        msg = replace.toReadable();
        nextStep(origin, depth, rCur, nextCur, msg);
    }
}
// Enumerate candidate variable splits i -> (p1, p2), i = k*p1 + p2, for
// both summation and loop iterators of rangeOp. The split factors depend
// on the derivator's pass mode.
vector<Replace> Rule1VariableSplit::getSplitableVar(const RangeOp &rangeOp) {
    vector<Replace> ret;
    // Split strategy
    vector<int> SumFactors, LoopFactors;
    if (derivator.getPassMode() == Derivator::PassMode::Debug) {
        SumFactors = {3};
        LoopFactors = {4};
    } else if (derivator.getPassMode() == Derivator::PassMode::Full) {
        SumFactors = {2, 3};
        // LoopFactors = {3, 4};
        LoopFactors = {4};
    } else
        nnet_unimplemented_halt();
    // Split Sum variable
    for (const int k : SumFactors) {
        for (const auto &[var, range] : rangeOp->getSumVarRanges()) {
            int len = range.second - range.first;
            // NOTE(review): fresh vars are allocated before the length
            // check, so skipped candidates still consume variable names.
            auto p1 = getNewVar(); // p1=i/k
            auto p2 = getNewVar(); // p2=i%k
            if (len > 10 || len <= k || len % k != 0)
                continue;
            Range range1, range2;
            if (range.first < 0) {
                nnet_unimplemented_halt();
                // FIXME: this must be ERROR
                range1.first = range.first / k;
                range1.second = range1.first + len / k;
                range2.first = -k / 2;
                range2.second = range2.first + k;
            } else if (range.first == 0) {
                range1.first = 0;
                range1.second = len / k;
                range2.first = 0;
                range2.second = k;
            } else {
                nnet_unimplemented_continue();
                continue;
            }
            // Sum iterators need no forward mapping (phis empty).
            Replace replace{.iteratorType = IterationType::Sum,
                            .oldIters = {var},
                            .newIters = {p1, p2},
                            .phis = {},
                            .psis = {make_ref<ConstantNode>(k) * p1 + p2},
                            .newVarRanges = {{p1, range1}, {p2, range2}}};
            ret.emplace_back(replace);
        }
    }
    for (const int k : LoopFactors) {
        // Split Loop variable
        for (const auto &[var, range] : rangeOp->getLoopVarRanges()) {
            const int len = range.second - range.first;
            // Debug HACK for dilated SG2BMM
            if (derivator.getPassMode() == Derivator::PassMode::Debug &&
                !(var->getName() == "m" && len % k == 0))
                continue;
            // Illeagel conditions
            if (range.second - range.first <= k ||
                (range.second - range.first) % k != 0)
                continue;
            // Unsupport conditions
            if (range.first != 0)
                continue;
            auto p1 = getNewVar(); // p1=i/k
            auto p2 = getNewVar(); // p2=i%k
            Range range1(0, len / k);
            Range range2(0, k);
            nnet_assert(range1.second > 0 && range2.second > 0,
                        "Empty loop dim");
            // Fix: the forward mapping must use the actual factor k. The
            // previous hard-coded {var / 4, var % 4} was only correct
            // because LoopFactors happens to be {4} today.
            Replace replace{.iteratorType = IterationType::Loop,
                            .oldIters = {var},
                            .newIters = {p1, p2},
                            .phis = {var / k, var % k},
                            .psis = {make_ref<ConstantNode>(k) * p1 + p2},
                            .newVarRanges = {{p1, range1}, {p2, range2}}};
            ret.emplace_back(replace);
        }
    }
    return ret;
}
// Substitute the (single) old iterator with its psi expression inside cur.
// Returns nullptr when the replacement is unsupported or did not apply.
Expr Rule1VariableSplit::replaceIters(Expr cur, const Replace &replace) {
    // TODO [feature]: support multiple replacements in one mutator
    if (replace.oldIters.size() != 1) {
        nnet_unimplemented_continue();
        return nullptr;
    }
    auto replaceMutator =
        ReplaceVariable(replace.oldIters.at(0), replace.psis.at(0));
    auto ret = replaceMutator(cur);
    return ret;
}
} // namespace nnet

View File

@ -0,0 +1,186 @@
#include "nnet/Pass/Rule2VariableMerging.h"
#include "nnet/Visitor/CheckOOBVisitor.h"
namespace nnet {
// Rule 2: merge or remap pairs of loop iterators that appear with
// compatible strides in the same tensor dimension, then recurse.
void Rule2VariableMerging::transform(Formula &origin, int depth, Expr &rCur) {
    // Extract r and s
    auto cur = as<RangeOpNode>(rCur);
    vector<Replace> replaces = getMergableReplaces(cur, depth);
    // dbg("Start rule2VariableMerging", depth, mergableExprs, *cur);
    for (const auto &replace : replaces) {
        if (replace.iteratorType != IterationType::Loop) {
            nnet_unimplemented_continue();
            continue;
        }
        // replace vars in summand
        auto replacedSummand = ReplaceKit::replaceMultipleExprs(
            cur->getSummand(), replace.oldIters, replace.psis, true);
        // replace var in rangeOp
        auto inner =
            ReplaceKit::replaceRangeOpIterator(cur, replace, replacedSummand);
        // If OOB happens, this transformation is skipped
        if (CheckOOBVisitor().checkRangeOp(inner))
            continue;
        // build the outerRange{innerRange}[indexForInner] to do DLT
        auto subscriptedInner =
            ReplaceKit::buildSubscirptForLoopVarReplace(inner, replace);
        auto outer = ReplaceKit::buildDLTOuterRangeOp(cur, subscriptedInner);
        // next searching step
        string msg = replace.toReadable();
        nextStep(origin, depth, rCur, outer, msg);
    }
}
// Enumerate candidate iterator replacements by scanning, per tensor and
// per dimension, every pair of loop iterators that index that dimension.
vector<Replace> Rule2VariableMerging::getMergableReplaces(RangeOp rangeOp,
                                                          int depth) {
    vector<Replace> ret;
    IteratorTable exprIT;
    if (!exprIT.analyzeExpr(rangeOp)) {
        nnet_unimplemented_continue();
        return ret;
    }
    exprIT.buildTableWithDefaultMap();
    const auto &strideInAllDim = exprIT.getStrideInDim();
    // Dedup set so each unordered iterator pair is merged at most once.
    set<pair<Iterator, Iterator>, RefValueLess<pair<Iterator, Iterator>>>
        checkedIterPairs{};
    // strideInAllDim: [tensorID][dimOfTensor][Iterator]=stride
    for (size_t tensorID = 0; tensorID < strideInAllDim.size(); ++tensorID) {
        const auto &strideInDimsOfATensor = strideInAllDim[tensorID];
        for (const PtrMap<Iterator, int> &strideInADim :
             strideInDimsOfATensor) {
            for (const auto &it1 : strideInADim) {
                for (const auto &it2 : strideInADim) {
                    // Backdoor for rule-based search
                    if (substituteRules.count(depth)) {
                        if (substituteRules[depth].at(0)->neq(it1.first))
                            continue;
                        if (substituteRules[depth].at(1)->neq(it2.first))
                            continue;
                    }
                    // Skip identical (iterator, stride) pairs; both
                    // iterators must be loop (not sum) variables.
                    if (!(it1.first->equal(it2.first) &&
                          it1.second == it2.second) &&
                        rangeOp->hasLoopVar(it1.first) &&
                        rangeOp->hasLoopVar(it2.first)) {
                        // 2 iters -> 2 iters
                        if (auto opt = getReplaceMappingTwoLoopIters(rangeOp,
                                                                     it1, it2))
                            ret.emplace_back(*opt);
                        // 2 iters -> 1 iter
                        const auto iterPair = pair(it1.first, it2.first);
                        if (!checkedIterPairs.count(iterPair)) {
                            checkedIterPairs.insert(iterPair);
                            if (auto opt = getReplaceMergingTwoLoopIters(
                                    rangeOp, it1, it2, exprIT, tensorID))
                                ret.emplace_back(*opt);
                        }
                    }
                }
            }
        }
    }
    return ret;
}
// Try to merge two loop iterators (x with stride a, y with stride b) into
// one new iterator z = a*x + b*y. Returns nullopt when the pattern is
// unsupported or the ranges do not line up.
optional<Replace> Rule2VariableMerging::getReplaceMergingTwoLoopIters(
    const RangeOp &rangeOp, pair<Iterator, int> pairA,
    pair<Iterator, int> pairB, const IteratorTable &exprIT, int tensorID) {
    // 1*A + sb*B -> C
    // A=C%sb, B=C/sb
    // ax+by->z, a=1 or -1
    // For a>0 and b>0 : x=z%b, y=z/b
    auto x = pairA.first, y = pairB.first;
    int a = pairA.second, b = pairB.second;
    if (abs(a) != 1 || abs(a) * abs(b) <= 0)
        return {};
    if (a < 0 && b > 0) { // The only unhandled case
        nnet_unimplemented_continue();
        return {};
    }
    // negative substitution happens only if can be totally merged. So if the
    // variable appears in another index, skip it.
    if (a < 0 || b < 0) {
        if (exprIT.getNumInputs() > 1) {
            if (exprIT.getStridesInTensor(x, 1 - tensorID) != 0)
                return {};
            if (exprIT.getStridesInTensor(y, 1 - tensorID) != 0)
                return {};
        }
    }
    Range rangeX = rangeOp->getVarRange(x).second,
          rangeY = rangeOp->getVarRange(y).second;
    if (rangeX.first != 0 || rangeY.first != 0)
        return {};
    int lenX = rangeX.second - rangeX.first;
    // The merge is bijective only when |b| equals the extent of x.
    if (abs(b) != lenX)
        return {};
    auto z = getNewVar();
    Range rangeExpr{0, 1}; // 1 is the open interval compensation
    // Accumulate the min/max of stride*range into rangeExpr.
    auto calcRangeExpr = [&rangeExpr](int stride, const Range &r) {
        if (stride > 0) {
            rangeExpr.first += stride * r.first;
            rangeExpr.second += stride * (r.second - 1);
        } else {
            rangeExpr.first += stride * (r.second - 1);
            rangeExpr.second += stride * r.first;
        }
    };
    calcRangeExpr(a, rangeX);
    calcRangeExpr(b, rangeY);
    // build the phi/psi for index transformation
    // phi: j_x=(i_x...), psi: i_x=(j_x...)
    auto ret = optional<Replace>();
    ret.emplace();
    ret->iteratorType = IterationType::Loop;
    ret->newIters = {z};
    ret->oldIters = {x, y};
    ret->phis = {a * x + b * y - rangeExpr.first};
    // For b < 0, the psis are not an equavalent replace. Since it must be
    // simplified (z/b and z%b will be merged), the only important thing is
    // their strides should be mergable. To merge the strides, an extra minus
    // are introduced if their stride is negative.
    ret->psis = {a * (z % b) + a * rangeExpr.first, (b > 0 ? 1 : -1) * (z / b)};
    ret->newVarRanges = {{z, {0, rangeExpr.second - rangeExpr.first}}};
    return ret;
}
// Try to remap two loop iterators (i1, i2) to (j1, j2) with
// j1 = sa*i1 + sb*i2 and j2 = i2; i1 is eliminated from indices while i2
// is kept. Returns nullopt when strides are unsupported.
optional<Replace>
Rule2VariableMerging::getReplaceMappingTwoLoopIters(const RangeOp &rangeOp,
                                                    pair<Iterator, int> pairA,
                                                    pair<Iterator, int> pairB) {
    // the first iterator is replaced, the second remains
    auto i1 = pairA.first, i2 = pairB.first;
    int sa = pairA.second, sb = pairB.second;
    // TODO: can be relaxed to sa | sb (divisibility) -- confirm intent; the
    // original note said "sb|sb".
    if (sa != 1 || sb == 0)
        return {};
    if (sb < 0) {
        nnet_unimplemented_continue();
        return {};
    }
    Range rangeA = rangeOp->getVarRange(i1).second;
    Range rangeB = rangeOp->getVarRange(i2).second;
    auto j1 = getNewVar(), j2 = getNewVar();
    Range rangeJ1, rangeJ2 = rangeB;
    assert(pairA.second == 1);
    // j1 spans the combined extent of i1 + sb*i2.
    rangeJ1.first = rangeA.first + rangeB.first * sb;
    rangeJ1.second = rangeA.second + (rangeB.second - 1) * sb;
    // build the phi/psi for index transformation
    // phi: j_x=(i_x...), psi: i_x=(j_x...)
    auto ret = optional<Replace>();
    ret.emplace();
    ret->iteratorType = IterationType::Loop;
    ret->newIters = {j1, j2};
    ret->oldIters = {i1, i2};
    ret->newVarRanges = {{j1, rangeJ1}, {j2, rangeJ2}};
    ret->phis = {sa * i1 + sb * i2, i2};
    ret->psis = {j1 - (sb / sa) * j2, j2};
    return ret;
}
} // namespace nnet

View File

@ -0,0 +1,82 @@
#include "nnet/Pass/Rule3StageSplit.h"
#include "nnet/permutation.h"
namespace nnet {
// Rule 3: split a stage by moving a subset of summation iterators out into
// an outer stage (inner stage loops over them, outer stage sums them).
void Rule3StageSplit::transform(Formula &origin, int depth, Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    vector<vector<Iterator>> splitSchemes = getSplitSummationIters(cur);
    for (const auto &varSplit : splitSchemes) {
        // Sanity: at least one chosen iterator must be a sum variable.
        bool isSplittable = false;
        for (const auto &splitted : varSplit)
            if (cur->hasSumVar(splitted))
                isSplittable = true;
        assert(isSplittable);
        const vector<VarRangePair> loopVars = cur->getLoopVarRanges(),
                                   sumVars = cur->getSumVarRanges();
        // move iterators from Sigma to Loop
        vector<VarRangePair> innerLoopVars, innerSumVars, outerSumVars;
        VecExpr indexForInner;
        for (const auto &kv : sumVars) {
            bool isSplitted = false;
            for (const auto &iter : varSplit)
                if (iter == kv.first->getName())
                    isSplitted = true;
            if (isSplitted) {
                // Split iterators become loops inside, sums outside.
                innerLoopVars.emplace_back(kv);
                outerSumVars.emplace_back(kv);
            } else
                innerSumVars.emplace_back(kv);
        }
        innerLoopVars.insert(innerLoopVars.end(), loopVars.begin(),
                             loopVars.end());
        for (const auto &[var, _] : innerLoopVars)
            indexForInner.emplace_back(var);
        // if no sum iterator, the stage is redundant
        assert(!innerSumVars.empty());
        auto inner =
            makeRangeOperator(innerLoopVars, innerSumVars, cur->getSummand());
        auto subscriptedInner = make_ref<SubscriptNode>(inner, indexForInner);
        auto outer = makeRangeOperator(cur->getLoopVarRanges(), outerSumVars,
                                       subscriptedInner);
        outer->setPaddings(cur->getPaddings());
        // next searching step
        string msg = "Separate sum iters: " + serializeVec(varSplit);
        nextStep(origin, depth, rCur, outer, msg);
    }
}
// Enumerate all non-empty proper subsets of the summation iterators as
// candidate split schemes; returns empty when there is at most one.
vector<vector<Iterator>>
Rule3StageSplit::getSplitSummationIters(RangeOp rangeOp) {
    // set<string> varSplit = {"r", "s", "i3", "i13"};
    vector<vector<Iterator>> ret;
    // Rule-based Hint
    // vector<vector<Iterator>> heuristics = {{"r", "s"}, {"i3", "i13"}};
    // for (const auto &iterSet : heuristics) {
    //     bool notExist = false;
    //     for (const auto &iter : iterSet)
    //         if (!rangeOp->hasSumVar(iter))
    //             notExist = true;
    //     if (!notExist)
    //         ret.emplace_back(iterSet);
    // }
    // if (!rulesOverall.empty())
    //     return ret;
    vector<Iterator> sumIters;
    for (const auto &[iter, range] : rangeOp->getSumVarRanges())
        sumIters.emplace_back(iter);
    if (sumIters.size() <= 1)
        return ret;
    SubsetGenerator gen(sumIters);
    do {
        ret.emplace_back(gen.get());
    } while (gen.next());
    return ret;
}
} // namespace nnet

View File

@ -0,0 +1,91 @@
#include "nnet/Pass/Rule4StageMerging.h"
#include "nnet/Visitor/ReplaceNodeMutator.h"
#include "nnet/Visitor/ReplaceVariable.h"
namespace nnet {
// Rule 4 entry point: try to merge two nested stages into one.
void Rule4StageMerging::transform(Formula &origin, int depth, Expr &rCur) {
    success = rule4StageMerging(origin, depth, rCur, mergeStageWithCalc);
}
// Whether the last transform() invocation actually merged stages.
bool Rule4StageMerging::isSuccessful() { return success; }
// Allow merging even when the inner stage performs a summation.
void Rule4StageMerging::setMergeStageWithCalc(bool value) {
    mergeStageWithCalc = value;
}
// Merge the pattern outer{inner{tensor}[...]}[...] into a single stage by
// substituting the inner stage's loop variables with the outer subscript's
// indices. Returns true when a merge was performed.
bool Rule4StageMerging::rule4StageMerging(Formula &origin, int depth,
                                          Expr &rCur, bool mergeStageWithCalc) {
    // Structural match: rCur must be RangeOp{Subscript{RangeOp{Subscript}}}.
    auto rangeOp0 = as<RangeOpNode>(rCur);
    const Subscript &sub0 = as<SubscriptNode>(rangeOp0->getSummand());
    if (!sub0)
        return false;
    const auto &rangeOp1 = as<RangeOpNode>(sub0->getObject());
    if (!rangeOp1)
        return false;
    const auto &sub1 = as<SubscriptNode>(rangeOp1->getSummand());
    if (!sub1)
        return false;
    // merge stage with calculation only when mergeStageWithCalc=true
    if (!mergeStageWithCalc && !rangeOp1->getSumVarRanges().empty())
        return false;
    // Only propagate paddings in perfect nested dimension
    if (rangeOp1->hasPaddings()) {
        auto oldTensor = as<TensorNode>(sub1->getObject());
        if (!oldTensor) {
            nnet_unimplemented_continue();
            return 0; // i.e. false: padding propagation needs a plain tensor
        }
    }
    // replace variables: iters of rangeOp1 replaced by indexes of sub0
    map<string, pair<Expr, Expr>> varMapping;
    assert(sub0->getDims() == rangeOp1->getLoopVarRanges().size());
    for (size_t i = 0; i < sub0->getDims(); ++i) {
        varMapping[rangeOp1->getLoopVar(i)->getName()] =
            pair(rangeOp1->getLoopVar(i), sub0->getIndex(i));
    }
    ReplaceVariable replaceVariable{varMapping};
    auto merged = make_ref<RangeOpNode>(*rangeOp0);
    merged->setSummand(replaceVariable(sub1));
    // a naive approach to propagate paddings
    if (rangeOp1->hasPaddings()) {
        auto oldTensor = as<TensorNode>(sub1->getObject());
        auto newTensor = make_ref<TensorNode>(*oldTensor);
        for (int i = 0; i < rangeOp1->getNumOutputDims(); ++i) {
            if (rangeOp1->getPaddings(i) == 0)
                continue;
            auto loopVar = rangeOp1->getLoopVar(i);
            // FIXME: in fact this var should not appear in other index as well,
            // which may result in OOB
            bool findSingleVarAsIndex = false;
            for (size_t subIndexID = 0; subIndexID < sub1->getDims();
                 ++subIndexID) {
                auto index = sub1->getIndex(subIndexID);
                if (auto indexVar = as<VarNode>(index);
                    indexVar && (indexVar->equal(loopVar))) {
                    // Padding of a loop dim moves onto the tensor dim that
                    // the loop variable indexes directly.
                    newTensor->setPadding(subIndexID,
                                          newTensor->getPadding(subIndexID) +
                                              rangeOp1->getPaddings(i));
                    findSingleVarAsIndex = true;
                }
            }
            if (!findSingleVarAsIndex) {
                nnet_unimplemented_continue();
                return false;
            }
        }
        merged = as<RangeOpNode>(
            ReplaceNodeMutator().replace(merged, oldTensor.get(), newTensor));
        assert(merged != nullptr);
    }
    // Merge inner stage sums
    if (!rangeOp1->getSumVarRanges().empty())
        merged->setSumIterator(rangeOp1->getSumVarRanges());
    // next searching step
    // if mergeStageWithCalc, depth counts for invocation in rule-based search
    nextStep(origin, (mergeStageWithCalc) ? depth : depth - 1, rCur, merged);
    return true;
}
} // namespace nnet

View File

@ -0,0 +1,72 @@
#include "nnet/Pass/Rule5RangeRelaxation.h"
#include "nnet/Visitor/RangeRelaxFunctor.h"
namespace nnet {
// One search step: the relaxed expression returned by rule5RangeRelaxation
// is not needed here, so it is intentionally discarded.
void Rule5RangeRelaxation::transform(Formula &origin, int depth, Expr &rCur) {
    (void)rule5RangeRelaxation(origin, depth, rCur);
}
/**
 * @brief Narrow loop ranges to the meaningful computation range, recording
 * the trimmed amount as paddings.
 *
 * Uses RangeRelaxFunctor to infer the valid range of each loop iterator,
 * intersects it with the declared range, and, when any range shrinks,
 * advances the search with the relaxed stage.
 *
 * @return The relaxed RangeOp, or nullptr when relaxation does not apply
 * (non-RangeOp input, existing paddings, or nothing to narrow).
 */
Expr Rule5RangeRelaxation::rule5RangeRelaxation(Formula &origin, int depth,
                                                Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    // Guard: rCur may not be a RangeOp during the search.
    if (!cur)
        return nullptr;
    // Existing paddings would be invalidated by narrowing the ranges.
    if (cur->hasPaddings())
        return nullptr;
    // Infer meaningful calculation range
    RangeRelaxFunctor rangeRelaxFunctor{cur};
    RangeMap rangeMap = rangeRelaxFunctor(cur);
    auto relaxedCur = make_ref<RangeOpNode>(*cur);
    bool isRelaxed = false;
    vector<int> paddings;
    // check whether narrow the calculation range
    for (size_t i = 0; i < cur->getLoopVarRanges().size(); ++i) {
        const auto &[iter, iterRange] =
            cur->getVarRange(IterationType::Loop, i);
        if (auto it = rangeMap.find(iter); it != rangeMap.end()) {
            // intersection of validRange and iterRange is necessary
            // computation
            // TODO: it is redundant with RangeRelaxFunctor::intersectRangeMaps.
            // An independent Range class might be necessary.
            const Range &validRange = it->second;
            Range relaxedRange{max(iterRange.first, validRange.first),
                               min(iterRange.second, validRange.second)};
            if (relaxedRange != iterRange) {
                isRelaxed = true;
                relaxedCur->setVarRange(IterationType::Loop, i,
                                        {iter, relaxedRange});
                // Padding covers the trimmed part, whichever side is larger.
                paddings.emplace_back(
                    max(relaxedRange.first - iterRange.first,
                        iterRange.second - relaxedRange.second));
            } else
                paddings.emplace_back(0);
        } else
            paddings.emplace_back(0);
    }
    relaxedCur->setPaddings(paddings);
    if (!isRelaxed)
        return nullptr;
    // next searching step: describe which ranges were narrowed
    string detailedMsg;
    for (size_t i = 0; i < cur->getLoopVarRanges().size(); ++i) {
        const auto &[v, a] = cur->getVarRange(IterationType::Loop, i);
        const auto &[_, b] = relaxedCur->getVarRange(IterationType::Loop, i);
        if (a != b) {
            detailedMsg += v->getName();
            detailedMsg +=
                " (" + to_string(a.first) + "," + to_string(a.second) + ") to";
            detailedMsg +=
                " (" + to_string(b.first) + "," + to_string(b.second) + "),";
        }
    }
    nextStep(origin, depth, rCur, relaxedCur, detailedMsg);
    return relaxedCur;
}
} // namespace nnet

View File

@ -0,0 +1,57 @@
#include "nnet/Pass/Rule6KenerlMatching.h"
#include "nnet/Visitor/InputVisitor.h"
#include "nnet/Visitor/PatternMatcher.h"
namespace nnet {
/// @brief Try to replace the current RangeOp with a matchable routine
/// (Matmul, Conv, ...) or an element-wise op, advancing the search for each
/// successful match.
void Rule6KenerlMatching::transform(Formula &origin, int depth, Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    // Guard: pattern matching is defined on RangeOps only.
    if (!cur)
        return;
    // Build wrapper stages for enforce axis starts from 0
    PatternMatcher patternMatcher(derivator, cur);
    cur = patternMatcher.getOffsetCur();
    // Match matchable routines
    for (int i = 0; i < MatchableRoutineTypeCnt; ++i) {
        auto targetOp = idToRoutineType(i);
        // During guided search, only check the target OP
        if (derivator.getTargetOp() != RoutineType::NoneType &&
            derivator.getTargetOp() != targetOp)
            continue;
        auto replaces =
            patternMatcher.matchWithPattern(cur, getPattern(targetOp));
        replaces = patternMatcher.applyWrapper(replaces);
        for (auto newCur : replaces)
            nextStep(origin, depth, rCur, newCur);
    }
    { // Match element-wise OP
        auto replaces = matchElementWise(cur);
        if (!replaces.empty())
            dbg(rCur);
        for (auto newCur : replaces)
            nextStep(origin, depth, rCur, newCur);
    }
}
// Convert a memory-bound stage into a single ElementWise-produced tensor.
// Returns the replacement tensor, or {} when the stage is compute-bound
// (flops per byte moved > 3) or some loop range does not start at zero.
VecExpr Rule6KenerlMatching::matchElementWise(const RangeOp &rangeOp) {
    // If the stage is compute bound, then do not convert it.
    const int64_t flops = rangeOp->getFlops();
    const int64_t outputSize = rangeOp->getOutputSize();
    const int64_t inputSize = rangeOp->getInputSize(rangeOp);
    if (double(flops) / (inputSize + outputSize) > 3)
        return {};
    vector<int> shape;
    for (const auto &[var, range] : rangeOp->getLoopVarRanges()) {
        if (range.first != 0) {
            nnet_unimplemented_continue();
            return {};
        }
        shape.emplace_back(range.second - range.first);
    }
    const auto inputs = InputVisitor().getInputs(rangeOp);
    const auto source =
        make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
    return {makeTensor(newTensorName(), shape, {}, source)};
}
} // namespace nnet

78
src/nnet/Pass/Rule7DLT.cc Normal file
View File

@ -0,0 +1,78 @@
#include "nnet/Pass/Rule7DLT.h"
#include "nnet/Visitor/ReplaceNodeMutator.h"
#include "nnet/dlt.h"
namespace nnet {
// Apply a data-layout transformation (DLT) to each subscripted tensor
// operand of a binary summand: split one dimension by a factor, swap the two
// resulting dimensions, then merge them back. Each successful DLT produces a
// new candidate expression and advances the search.
void Rule7DLT::transform(Formula &origin, int depth, Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    if (!cur)
        return;
    auto op = as<BinaryOpNode>(cur->getSummand());
    if (!op)
        return;
    // Try both operands of the binary op.
    auto subs = {op->getLhs(), op->getRhs()};
    for (auto subExpr : subs) {
        auto sub = as<SubscriptNode>(subExpr);
        if (!sub)
            continue;
        auto tensor = as<TensorNode>(sub->getObject());
        if (!tensor)
            continue;
        // // HACK for G2BMM
        // if (tensor->getDims() != 3)
        //     continue;
        for (const auto factor : getFactors()) {
            for (int targetDim = 0; targetDim < tensor->getDims();
                 ++targetDim) {
                // The dimension must divide evenly by the split factor.
                if (tensor->getShape(targetDim) % factor)
                    continue;
                // Debug hint for G2BMM
                if (derivator.getPassMode() == Derivator::PassMode::Debug) {
                    if (tensor->getShape(targetDim) != 10000)
                        continue;
                    assert(targetDim == 1);
                }
                // split -> reorder(adjacent swap) -> merge: permutes the
                // element layout within targetDim by blocks of `factor`.
                DLT dlt;
                dlt.split(targetDim, factor);
                vector<int> newOrder(tensor->getDims() + 1);
                for (int i = 0; i < tensor->getDims() + 1; ++i)
                    newOrder[i] = i;
                newOrder[targetDim]++;
                newOrder[targetDim + 1]--;
                dlt.reorder(newOrder);
                dlt.merge(targetDim, targetDim + 1);
                if (auto opt = dlt.apply(cur, sub, newTensorName())) {
                    // Substitute the transformed subscript into the summand.
                    Expr newSummand = ReplaceNodeMutator().replace(
                        cur->getSummand(), sub.get(), *opt);
                    auto newCur = buildDLTSingleRangeOp(cur, newSummand);
                    // next searching step
                    string msg = "====== END rule7DLT\n";
                    dbg(msg);
                    nextStep(origin, depth, rCur, newCur);
                }
            }
        }
    }
}
// Clone `original` and substitute its summand, keeping all iterator ranges
// and paddings intact.
Expr Rule7DLT::buildDLTSingleRangeOp(const RangeOp &original,
                                     const Expr &newSummand) {
    auto clone = make_ref<RangeOpNode>(*original);
    clone->setSummand(newSummand);
    return clone;
}
// Split factors to try, depending on the pass mode: a single factor in
// Debug mode, two in Full mode; any other mode is unimplemented.
vector<int> Rule7DLT::getFactors() {
    switch (derivator.getPassMode()) {
    case Derivator::PassMode::Debug:
        return {4};
    case Derivator::PassMode::Full:
        return {3, 4};
    default:
        nnet_unimplemented_halt();
        return {};
    }
}
} // namespace nnet

View File

@ -0,0 +1,317 @@
#include "nnet/Pass/Rule8GuidedDLT.h"
#include "nnet/Visitor/ReplaceNodeMutator.h"
namespace nnet {
// Number of set bits in n (Kernighan's method: each iteration clears the
// lowest set bit).
static int bitCount(unsigned int n) {
    int ones = 0;
    for (; n != 0; n &= n - 1)
        ++ones;
    return ones;
}
// Zero-based index (from the LSB) of the single set bit in n.
// Precondition: n is a power of two (exactly one bit set).
static int bitPosition(unsigned int n) {
    assert(bitCount(n) == 1);
    int pos = 0;
    while (n > 1) {
        n >>= 1;
        ++pos;
    }
    return pos;
}
// One search step: delegate to guidedDLT; its returned candidate list is
// only populated in debug mode, so it is intentionally discarded here.
void Rule8GuidedDLT::transform(Formula &origin, int depth, Expr &rCur) {
    (void)guidedDLT(origin, depth, rCur);
}
// Guided data-layout transformation: for a stage of the form T1[A]*T2[B],
// match its iterator table against each target routine's pattern and repair
// small mismatches (extra vars, data-layout mismatch) by rewriting the
// stage. When the derivator has no target op set, this function temporarily
// sets one per candidate routine and MUST unset it before moving on.
// Returns the produced candidates only when debug=true.
VecExpr Rule8GuidedDLT::guidedDLT(Formula &origin, int depth, Expr &rCur,
                                  bool debug) {
    string detailedMsg;
    VecExpr ret;
    auto cur = as<RangeOpNode>(rCur);
    // check cur satisfies T1[A]*T2[B]
    if (!statisfyGuidedDLT(cur))
        return ret;
    IteratorTable exprIT;
    if (!exprIT.analyzeExpr(cur))
        return ret;
    exprIT.buildTableWithDefaultMap();
    bool setTargetOpHere = false;
    for (int i = 0; i < MatchableRoutineTypeCnt; ++i) {
        // if not correctly unset this variable
        assert(setTargetOpHere == false);
        // If the guide direction is set, skip all other routines.
        if (derivator.getTargetOp() != RoutineType::NoneType &&
            idToRoutineType(i) != derivator.getTargetOp())
            continue;
        // Warning: no continue before unsetting the targetOp below.
        if (derivator.getTargetOp() == RoutineType::NoneType) {
            setTargetOpHere = true;
            derivator.setTargetOp(idToRoutineType(i));
        }
        const Pattern &pattern = getPattern(derivator.getTargetOp());
        auto mismatches = exprIT.matchPatternIT(pattern);
        // Pruning less possible results
        // std::cout << "mismatches= " << mismatches.size()
        //           << "; setTargetOpHere: " << setTargetOpHere << "; ";
        // std::cout << "TargetOp = " <<
        // static_cast<int>(derivator.getTargetOp())
        //           << "; mismatches : ";
        // for (const auto i : mismatches)
        //     std::cout << static_cast<int>(i.type) << " ";
        // std::cout << endl;
        // Perfect match: advance the search in "matched" state (2).
        if (mismatches.size() == 0) {
            derivator.setSearchState(2);
            nextStep(origin, depth, rCur, rCur);
            derivator.setSearchState(1);
        }
        // Repair at most two mismatches; more is considered hopeless.
        if (mismatches.size() > 0 && mismatches.size() <= 2) {
            for (const auto &mismatch : mismatches) {
                Expr newCur;
                if (mismatch.type == MismatchType::MoreVar) {
                    newCur = guidedDLTMoreVar2(cur, mismatch, exprIT, pattern);
                    detailedMsg += "guidedDLTMoreVar2 ";
                } else if (mismatch.type == MismatchType::DLMismatch ||
                           mismatch.type == MismatchType::OutputDLMismatch) {
                    if (mismatches.size() > 1) {
                        nnet_unimplemented_continue();
                        break;
                    }
                    newCur =
                        guidedDLTDLMismatch(cur, mismatch, exprIT, pattern);
                    detailedMsg += "guidedDLTDLMismatch ";
                }
                // std::cout << "newCur= "
                //           << ((newCur == nullptr) ? "Nullptr"
                //                                   : newCur->toReadable())
                //           << endl;
                if (!newCur)
                    continue;
                if (debug)
                    ret.emplace_back(newCur);
                // next searching step
                detailedMsg = "Toward " +
                              getPatternName(derivator.getTargetOp()) + ". " +
                              detailedMsg;
                nextStep(origin, depth, rCur, newCur, detailedMsg);
            }
        }
        // Unset targetOp (only if it was set in this iteration).
        if (setTargetOpHere) {
            derivator.setTargetOp(RoutineType::NoneType);
            setTargetOpHere = false;
        }
    }
    return ret;
}
// Repair a data-layout mismatch by reordering the stage's loop iterators to
// follow the pattern's loop order, then wrapping the reordered stage so its
// external interface is unchanged. Only output-side DLT is handled;
// input-side mismatches are unimplemented.
Expr Rule8GuidedDLT::guidedDLTDLMismatch(
    const RangeOp &cur, const Mismatch &mismatch,
    [[maybe_unused]] const IteratorTable &exprIT, const Pattern &pattern) {
    assert(mismatch.type == MismatchType::DLMismatch ||
           mismatch.type == MismatchType::OutputDLMismatch);
    // Currently only deal with output DLT
    // (the bitmap equal to the input count denotes the output row).
    if (mismatch.bitmap != pattern.getNumInputs()) {
        nnet_unimplemented_continue();
        return nullptr;
    }
    // Reorder this stage's loop vars to match the pattern's loop order,
    // translating pattern iterators back through the mismatch mapping.
    vector<VarRangePair> newVarRanges;
    for (const auto &[var, _] : pattern.getRangeOp()->getLoopVarRanges()) {
        const auto &iterInExpr = mismatch.mappingIter_r.at(var);
        newVarRanges.emplace_back(cur->getVarRange(iterInExpr));
    }
    auto inner = make_ref<RangeOpNode>(*cur);
    inner->setLoopIterator(newVarRanges);
    // Wrap so callers still see the original loop order.
    auto subscriptedInner =
        ReplaceKit::buildSubscirptForLoopVarReplace(inner, {});
    auto outer = ReplaceKit::buildDLTOuterRangeOp(cur, subscriptedInner);
    return outer;
}
// A stage is guided-DLT-matchable iff its summand is a multiplication whose
// two operands are both subscripted tensors: T1[A] * T2[B].
bool Rule8GuidedDLT::statisfyGuidedDLT(RangeOp cur) const {
    const auto product = as<BinaryOpNode>(cur->getSummand());
    if (!product || product->getOpType() != OpType::Mul)
        return false;
    const bool lhsSubscripted = bool(as<SubscriptNode>(product->getLhs()));
    const bool rhsSubscripted = bool(as<SubscriptNode>(product->getRhs()));
    return lhsSubscripted && rhsSubscripted;
}
// Repair a MoreVar mismatch: fuse the extra iterators that all access one
// input tensor into a single new iterator, rebuilding that tensor with a
// data-layout transformation so the fused iterator indexes it contiguously.
// Returns the rewritten stage, or nullptr when the fusion is not applicable.
Expr Rule8GuidedDLT::guidedDLTMoreVar2(const RangeOp &cur,
                                       const Mismatch &mismatch,
                                       const IteratorTable &exprIT,
                                       const Pattern &pattern) {
    int bitmap = mismatch.bitmap;
    const auto &mergedItersDefaultOrder = exprIT.getPosTable(bitmap);
    // Assure vars only appear in one input tensor
    int bitmapOfInputs = bitmap & ((1 << exprIT.getNumInputs()) - 1);
    if (bitCount(bitmapOfInputs) > 1)
        return nullptr;
    if (pattern.getPosTable(bitmap).size() != 1) {
        nnet_unimplemented_continue();
        return nullptr;
    }
    if (mergedItersDefaultOrder.size() < 1)
        return nullptr;
    int tensorID = bitPosition(bitmapOfInputs);
    if (!checkElementsHaveOnlyOneAccessIteratorSet(exprIT, tensorID))
        return nullptr;
    vector<Var> oldVars; // i_1, ...
    vector<Var> newVars; // j_1, ...
    VecExpr psis;        // i_1=\psi_1(j_1, ...)
    VecExpr phis;        // j_1=\phi_1(i_1, ...), not necessary for Sum iter
    vector<VarRangePair> newVarRanges;
    auto originalTensor = exprIT.getTensor(tensorID);
    auto originalSub = exprIT.getSubscript(tensorID);
    vector<bool> mergedDims(originalTensor->getDims());
    // Heuristic: merge iters according to their appearance positions
    std::multimap<int, Var> sortedMergedIters;
    for (const auto &iter : mergedItersDefaultOrder) {
        vector<int> dims = exprIT.getIterDimInTensor(tensorID, iter);
        assert(dims.size() == 1);
        sortedMergedIters.emplace(dims[0], iter);
    }
    vector<Var> mergedIters; // decides the order of fused dims
    for (const auto &[_, v] : sortedMergedIters)
        mergedIters.emplace_back(v);
    // Add the merged iterators: one new var whose range is the product of
    // the fused iterators' lengths.
    const auto newVar = getNewVar();
    newVars.emplace_back(newVar);
    int newRange = 1;
    for (const auto &iter : mergedIters) {
        oldVars.emplace_back(iter);
        auto range = cur->getRange(iter);
        newRange *= (range.second - range.first);
        // if (range.first == 0)
        //     nnet_unimplemented_halt();
    }
    newVarRanges.emplace_back(newVar, Range{0, newRange});
    // Add psis for each old iterator: decompose newVar back into the fused
    // iterators (div/mod chain), and build phi as the inverse mapping.
    int remainingRange = newRange;
    Expr phi = nullptr;
    for (const auto &iter : mergedIters) {
        auto oldVar = iter;
        auto range = cur->getRange(iter);
        int len = (range.second - range.first);
        remainingRange /= len;
        Expr psi = newVar;
        if (remainingRange > 1)
            psi = psi / remainingRange;
        if (newRange > remainingRange * len)
            psi = psi % len;
        int start = cur->getRange(iter).first;
        if (start != 0)
            psi = psi + start;
        psis.emplace_back(psi);
        phi = phi + remainingRange * (oldVar - start);
    }
    Replace replace{.iteratorType = IterationType::Loop,
                    .oldIters = oldVars,
                    .newIters = newVars,
                    .phis = VecExpr{phi},
                    .psis = psis,
                    .newVarRanges = newVarRanges};
    // HACK: decide the rebuild data shape order
    // TODO: get a partial iter mapping and permutate them?
    vector<Var> tensorDimAxes{newVars};
    vector<int> newShape;
    for (const auto &[var, range] : newVarRanges)
        newShape.emplace_back(range.second - range.first);
    for (int row = 0; row < exprIT.getNumRows(); ++row) {
        // Deal with other dimensions of the current tensor
        if (row == bitmap || ((row & (1 << tensorID)) == 0))
            continue;
        using StrideIter = tuple<int, int, Iterator>;
        vector<StrideIter> strideIters;
        for (size_t i = 0; i < exprIT.getPosTable(row).size(); ++i) {
            const auto &iter = exprIT.getPosTable(row)[i];
            const Range range = cur->getRange(iter);
            const int len = range.second - range.first;
            // HACK Sort according to original stride. (keep original order)
            strideIters.emplace_back(-exprIT.getStridesInTensor(iter, tensorID),
                                     len, iter);
            // // HACK for conv
            // if (iter == "n")
            //     strideIters.emplace_back(2, len, iter);
            // else if (iter == "c")
            //     strideIters.emplace_back(1, len, iter);
            // else
            //     strideIters.emplace_back(0, len, iter);
        }
        // HACK: Assure the order of iterators
        std::sort(strideIters.begin(), strideIters.end(),
                  ref_value_less<StrideIter>);
        for (const auto &[_, len, oldIter] : strideIters) {
            const auto &oldVar = oldIter;
            tensorDimAxes.emplace_back(oldVar);
            newShape.emplace_back(len);
        }
    }
    // build DLT source
    const auto sourceExpr =
        buildGuidedDLTSource(originalSub, replace, tensorDimAxes, newShape);
    const auto sourceRoutine = make_ref<ElementWiseNode>(
        sourceExpr, vector<Tensor>{originalTensor}, newShape);
    // build stage connections
    const auto newTensor =
        makeTensor(newTensorName(), newShape, {}, sourceRoutine);
    const auto &newSub = makeSubscript(
        newTensor, VecExpr(tensorDimAxes.begin(), tensorDimAxes.end()));
    // TODO [1124]: get variable mapping and reorder L according to it
    // dbg(cur, originalSub, newSub, newVarRanges, replace.toReadable(),
    //     tensorDimAxes, newShape);
    // Replace the entire subscript(A[xxxxx,xxx]) in the summand
    Expr newSummand = ReplaceNodeMutator().replace(cur->getSummand(),
                                                   originalSub.get(), newSub);
    auto inner = ReplaceKit::replaceRangeOpIterator(cur, replace, newSummand);
    auto subscriptedInner =
        ReplaceKit::buildSubscirptForLoopVarReplace(inner, replace);
    auto outer = ReplaceKit::buildDLTOuterRangeOp(cur, subscriptedInner);
    return outer;
}
/**
 * @brief Check that each dimension of the tensor is addressed consistently:
 * after sorting each dimension's (stride, length) pairs, every iterator's
 * reachable extent (stride*length) must not overrun the next stride, i.e.
 * no two distinct iterator sets can reach the same element.
 */
bool Rule8GuidedDLT::checkElementsHaveOnlyOneAccessIteratorSet(
    const IteratorTable &exprIT, int tensorID) {
    const auto &strideInDim = exprIT.getStrideInDim();
    for (const auto &strideForOneDim : strideInDim[tensorID]) {
        vector<pair<int, int>> strideLengthPairs;
        for (const auto &[iter, s] : strideForOneDim) {
            const auto &range = exprIT.getRangeOp()->getRange(iter);
            strideLengthPairs.emplace_back(s, range.second - range.first);
        }
        std::sort(strideLengthPairs.begin(), strideLengthPairs.end());
        // `i + 1 < size()` avoids unsigned underflow: the original
        // `i < size() - 1` wrapped around when a dim had no accessing
        // iterators (size() == 0).
        for (size_t i = 0; i + 1 < strideLengthPairs.size(); ++i) {
            const auto &[stride, length] = strideLengthPairs[i];
            if (stride * length > strideLengthPairs[i + 1].first)
                return false;
        }
    }
    return true;
}
// Build the expression that computes the DLT-ed source tensor: a RangeOp
// over the new axes whose body is the original subscript with the old
// iterators rewritten through replace.psis.
Expr Rule8GuidedDLT::buildGuidedDLTSource(const Subscript &originalSub,
                                          Replace replace,
                                          vector<Var> tensorDimAxes,
                                          vector<int> newShape) {
    Expr rewrittenSub = ReplaceKit::replaceMultipleExprs(
        originalSub, replace.oldIters, replace.psis, true);
    vector<VarRangePair> loopVarRangePairs;
    loopVarRangePairs.reserve(tensorDimAxes.size());
    for (size_t i = 0; i < tensorDimAxes.size(); ++i)
        loopVarRangePairs.emplace_back(tensorDimAxes[i], pair(0, newShape[i]));
    return makeRangeOperator(loopVarRangePairs, {}, rewrittenSub);
}
} // namespace nnet

View File

@ -0,0 +1,54 @@
#include "nnet/Pass/Rule90TwoStageElementWise.h"
#include "nnet/Visitor/InputVisitor.h"
namespace nnet {
/// @brief Try to fold two nested memory-bound stages into a single
/// element-wise tensor, advancing the search for each replacement found.
void Rule90TwoStageElementWise::transform(Formula &origin, int depth,
                                          Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    // Guard: matchTwoStageElementWise dereferences its argument, so a
    // non-RangeOp rCur must be rejected here.
    if (!cur)
        return;
    { // Match element-wise OP
        auto replaces = matchTwoStageElementWise(cur);
        for (auto newCur : replaces)
            nextStep(origin, depth, rCur, newCur);
    }
}
// Recognize Range(Sub(Range(Sub(Tensor)))) where the outer stage is
// memory-bound and every loop range starts at zero, and fold it into one
// ElementWise-produced tensor. Returns {} when the pattern does not apply.
VecExpr
Rule90TwoStageElementWise::matchTwoStageElementWise(const RangeOp &rangeOp) {
    // If the stage is compute bound, then do not convert it.
    const int64_t flops = rangeOp->getFlops();
    const int64_t outputSize = rangeOp->getOutputSize();
    const int64_t inputSize = rangeOp->getInputSize(rangeOp);
    if (double(flops) / (inputSize + outputSize) > 3)
        return {};
    // Structural match: Subscript -> RangeOp -> Subscript -> Tensor.
    const auto outerSub = as<SubscriptNode>(rangeOp->getSummand());
    if (!outerSub)
        return {};
    const auto innerRangeOp = as<RangeOpNode>(outerSub->getObject());
    if (!innerRangeOp)
        return {};
    const auto innerSub = as<SubscriptNode>(innerRangeOp->getSummand());
    if (!innerSub)
        return {};
    const auto innerTensor = as<TensorNode>(innerSub->getObject());
    if (!innerTensor)
        return {};
    vector<int> shape;
    for (const auto &[var, range] : rangeOp->getLoopVarRanges()) {
        if (range.first != 0) {
            nnet_unimplemented_continue();
            return {};
        }
        shape.emplace_back(range.second - range.first);
    }
    const auto inputs = InputVisitor().getInputs(rangeOp);
    const auto source =
        make_ref<ElementWiseNode>(rangeOp, inputs, rangeOp->getOutputShape());
    return {makeTensor(newTensorName(), shape, {}, source)};
}
} // namespace nnet

View File

@ -0,0 +1,11 @@
#include "nnet/Pass/Rule91MergeStagesWithSum.h"
#include "nnet/Pass/Rule4StageMerging.h"
namespace nnet {
// Merge two nested stages even when the inner one performs a summation:
// delegate to Rule4 with mergeStageWithCalc forced on.
void Rule91MergeStagesWithSum::transform(Formula &origin, int depth,
                                         Expr &rCur) {
    Rule4StageMerging merger(derivator);
    merger.rule4StageMerging(origin, depth, rCur,
                             /*mergeStageWithCalc=*/true);
}
} // namespace nnet

View File

@ -0,0 +1,45 @@
#include "nnet/Pass/Rule9RangeMagnify.h"
#include "nnet/Visitor/RangeMagnifyVisitor.h"
namespace nnet {
/// @brief Magnify summation ranges so they tile evenly (HACK: targets
/// conv5x5 by rounding a [0,5) range up to the next multiple of 3).
/// Advances the search with the magnified stage when magnification succeeds.
void Rule9RangeMagnify::transform(Formula &origin, int depth, Expr &rCur) {
    auto cur = as<RangeOpNode>(rCur);
    // Guard: rCur may be any expression during the search.
    if (!cur)
        return;
    // Existing paddings would be invalidated by changing iteration ranges.
    if (cur->hasPaddings()) {
        return;
    }
    // HACK for conv5x5: round a [0,5) summation range up so it can be tiled
    // as 2 x 3; all other ranges are kept as-is.
    vector<VarRangePair> newSumVarRanges;
    for (const auto &[var, range] : cur->getSumVarRanges()) {
        if (range.first == 0 && range.second == 5) {
            newSumVarRanges.emplace_back(
                var, Range{range.first, (range.second + 2) / 3 * 3});
        } else
            newSumVarRanges.emplace_back(var, range);
    }
    if (newSumVarRanges.empty())
        return;
    auto magnifiedCur = RangeMagnifyVisitor().magnify(cur, newSumVarRanges);
    if (!magnifiedCur)
        return;
    // next searching step: describe which ranges were enlarged
    string msg = "relax iterating ranges ";
    for (size_t i = 0; i < cur->getSumVarRanges().size(); ++i) {
        const auto &[v1, a] = cur->getVarRange(IterationType::Sum, i);
        const auto &[v2, b] = magnifiedCur->getVarRange(IterationType::Sum, i);
        assert(v1->getName() == v2->getName());
        if (a != b) {
            msg += v1->getName();
            msg +=
                " (" + to_string(a.first) + "," + to_string(a.second) + ") to";
            msg += " (" + to_string(b.first) + "," + to_string(b.second) + "),";
        }
    }
    nextStep(origin, depth, rCur, magnifiedCur, msg);
    return;
}
} // namespace nnet

View File

@ -0,0 +1,165 @@
#include "nnet/Visitor/AsTVMVisitor.h"
namespace nnet {
// Constants are emitted verbatim as Python integer literals.
std::string AsTVMVisitor::visit_(const Constant &c) {
    const auto value = c->getValue();
    return std::to_string(value);
}
// Lower a binary op to a parenthesized Python/TVM expression. Note that
// OpType::Div is emitted as Python's floor division `//`, matching integer
// index arithmetic in the generated TE lambda.
std::string AsTVMVisitor::visit_(const BinaryOp &c) {
    switch (c->getOpType()) {
    case OpType::Add:
        return "(" + dispatch(c->getLhs()) + " + " + dispatch(c->getRhs()) +
               ")";
    case OpType::Sub:
        return "(" + dispatch(c->getLhs()) + " - " + dispatch(c->getRhs()) +
               ")";
    case OpType::Mul:
        return "(" + dispatch(c->getLhs()) + " * " + dispatch(c->getRhs()) +
               ")";
    case OpType::Div:
        return "(" + dispatch(c->getLhs()) + " // " + dispatch(c->getRhs()) +
               ")";
    case OpType::Mod:
        return "(" + dispatch(c->getLhs()) + " % " + dispatch(c->getRhs()) +
               ")";
    default:
        assert(false);
        // Unreachable; the explicit return avoids undefined behavior from
        // flowing off the end of a value-returning function under NDEBUG.
        return "";
    }
}
// Lower a unary function node to its TVM topi/te equivalent.
std::string AsTVMVisitor::visit_(const Func &c) {
    switch (c->getFuncType()) {
    case FuncType::Relu:
        // TODO: Deduce the dtype
        return "te.max(" + dispatch(c->getObject()) +
               ", tvm.tir.const(0, 'float32'))";
    case FuncType::Tanh:
        return "te.tanh(" + dispatch(c->getObject()) + ")";
    default:
        assert(false);
        // Unreachable; the explicit return avoids undefined behavior from
        // flowing off the end of a value-returning function under NDEBUG.
        return "";
    }
}
// Emit one TVM `te.compute` stage for a RangeOp: declare its reduce axes,
// compute the (padding-expanded) output shape, build the lambda body from
// the summand, wrap reductions in te.sum and paddings in a
// tir.if_then_else guard, and record the stage for getStmts().
std::string AsTVMVisitor::visit_(const RangeOp &c) {
    // Stages may nest; save/restore the ambient stage index around this one.
    auto outerStage = curStage;
    curStage = nStage++;
    std::string stmt;
    std::string stageName = "s" + std::to_string(curStage);
    std::vector<std::string> reduceVars;
    for (auto &&[var, range] : c->getSumVarRanges()) {
        std::string varName = stageName + "_" + var->getName();
        stmt += varName + " = " + "te.reduce_axis((" +
                std::to_string(range.first) + ", " +
                std::to_string(range.second) + "), name=\"" + varName + "\")\n";
        reduceVars.emplace_back(varName);
        pythonVars.emplace_back(varName);
    }
    std::vector<int> shape;
    stmt += stageName + " = te.compute((";
    for (size_t i = 0, n = c->getLoopVarRanges().size(); i < n; i++) {
        auto &&[var, range] = c->getLoopVarRanges()[i];
        std::string varName = stageName + "_" + var->getName();
        // Record the shift applied to this axis so visit_(Var) can subtract
        // it: non-zero range start plus the leading padding.
        offset[varName] = -range.first + c->getPaddings(i);
        // Each axis is widened by paddings on both sides.
        auto len = range.second - range.first + 2 * c->getPaddings(i);
        stmt += std::to_string(len) + ", ";
        shape.emplace_back(len);
    }
    stmt += "), lambda ";
    bool first = true;
    for (auto &&[var, range] : c->getLoopVarRanges()) {
        std::string varName = stageName + "_" + var->getName();
        stmt += (first ? "" : ", ") + varName;
        first = false;
    }
    std::string summand = dispatch(c->getSummand());
    if (!reduceVars.empty()) {
        summand = "te.sum(" + summand + ", axis=(";
        for (auto &&var : reduceVars) {
            summand += var + ", ";
        }
        summand += "))";
    }
    if (c->hasPaddings()) {
        // Guard padded axes: inside the valid range compute the summand,
        // outside emit zero.
        std::string guard = "tir.if_then_else(tir.all(";
        bool first = true;
        for (size_t i = 0, n = c->getLoopVarRanges().size(); i < n; i++) {
            auto &&[var, range] = c->getLoopVarRanges()[i];
            std::string varName = stageName + "_" + var->getName();
            if (auto pad = c->getPaddings(i); pad > 0) {
                guard += (first ? "" : ", ") + varName +
                         " >= " + std::to_string(range.first) + ", " + varName +
                         " < " + std::to_string(range.second);
                first = false;
            }
        }
        // TODO: Deduce the dtype
        guard += "), " + summand + ", tvm.tir.const(0.0, \"float32\"))";
        summand = guard;
    }
    stmt += ": " + summand + ")";
    stmts += stmt + "\n";
    pythonVars.emplace_back(stageName);
    output = stageName;
    outputShape = std::move(shape);
    curStage = outerStage;
    return stageName;
}
// Emit an indexing expression `obj[i0, i1, ...]`. When the object is a
// RangeOp stage, each index is shifted by (range start - padding) to map
// logical coordinates onto the padded/zero-based te.compute buffer.
// NOTE(review): assumes the subscript has exactly one index per loop var of
// the RangeOp (getLoopVarRanges()[i] is indexed by the same i) -- confirm.
std::string AsTVMVisitor::visit_(const Subscript &c) {
    std::string str = dispatch(c->getObject()) + "[";
    for (size_t i = 0, n = c->getIndex().size(); i < n; i++) {
        const auto &idx = c->getIndex()[i];
        str += (i == 0 ? "" : ", ") + dispatch(idx);
        if (c->getObject()->getType() == NodeType::RangeOpNodeType) {
            auto rangeOp = as<RangeOpNode>(c->getObject());
            str += " - " +
                   std::to_string(rangeOp->getLoopVarRanges()[i].second.first -
                                  rangeOp->getPaddings(i));
        }
    }
    str += "]";
    return str;
}
// Render a variable of the current stage; if the axis has a recorded
// offset (non-zero range start and/or padding), subtract it inline.
std::string AsTVMVisitor::visit_(const Var &c) {
    const std::string varName =
        "s" + std::to_string(curStage) + "_" + c->getName();
    const auto it = offset.find(varName);
    if (it == offset.end())
        return varName;
    return "(" + varName + " - " + std::to_string(it->second) + ")";
}
std::string AsTVMVisitor::visit_(const Tensor &c) {
pythonVars.emplace_back(c->getName());
inputs.emplace_back(c->getName());
inputShapes.emplace_back(c->getShape());
std::string stmt = c->getName() + " = te.placeholder((";
for (auto &&dim : c->getShape()) {
stmt += std::to_string(dim) + ", ";
}
stmt += "), name='" + c->getName() + "')";
stmts += stmt + "\n";
return c->getName();
}
std::string AsTVMVisitor::getStmts() const {
std::string ret;
// Workaround because closure capturing does not work in an `exec`
// https://stackoverflow.com/questions/2749655/why-are-closures-broken-within-exec
ret += "global ";
bool first = true;
for (auto &&var : pythonVars) {
ret += (first ? "" : ", ") + var;
first = false;
}
ret += "\n";
ret += stmts;
ret += "ret = [" + output;
for (auto &&input : inputs) {
ret += ", " + input;
}
ret += "]\n";
return ret;
}
} // namespace nnet

View File

@ -0,0 +1,35 @@
#include "nnet/Visitor/CheckOOBVisitor.h"
#include "nnet/Visitor/SimplifyExprVisitor.h"
namespace nnet {
// For each dimension of the subscript, compute the reachable range of the
// index expression (under the current rangeOp's iterator ranges) and flag
// `detect` if it exceeds the object's padded extent. Dimensions whose range
// cannot be determined are skipped (treated as in-bounds).
void CheckOOBVisitor::visit_(const Subscript &c) {
    const auto &objectRanges = c->getObjectRangesWithPaddings();
    for (size_t dim = 0; dim < c->getDims(); ++dim) {
        SimplifyExprVisitor simplifier;
        auto optional = simplifier.getExprRange(c->getIndex(dim), rangeOp);
        if (!optional.has_value())
            continue;
        const Range &exprRange = *optional;
        if (exprRange.first < objectRanges[dim].first ||
            exprRange.second > objectRanges[dim].second) {
            // dbg("OOB detected!", c, dim, exprRange, objectRanges[dim]);
            // std::cout << "OOB detected! " << c->toReadable() << ", dim=" <<
            // dim
            //           << ", Range=(" << exprRange.first << ", "
            //           << exprRange.second << "), objRange=("
            //           << objectRanges[dim].first << ", "
            //           << objectRanges[dim].second << ")." << std::endl;
            detect = true;
        }
    }
}
// Entry point: reset state, walk the whole expression under _rangeOp, and
// report whether any subscript was flagged out-of-bounds by visit_.
bool CheckOOBVisitor::checkRangeOp(const RangeOp &_rangeOp) {
    detect = false;
    rangeOp = _rangeOp;
    dispatch(rangeOp);
    return detect;
}
} // namespace nnet

View File

@ -0,0 +1,9 @@
#include "nnet/Visitor/CloneMutator.h"
namespace nnet {
// Leaf nodes are treated as immutable and shared: "cloning" them returns
// the node itself rather than a deep copy.
Expr CloneMutator::visit_(const Constant &c) { return c; }
Expr CloneMutator::visit_(const Var &c) { return c; }
Expr CloneMutator::visit_(const Tensor &c) { return c; }
} // namespace nnet

View File

@ -0,0 +1,34 @@
#include "nnet/Visitor/CompareMultiFormulasVisitor.h"
namespace nnet {
bool CompareMultiFormulasVisitor::compare(const VecExpr &roots) {
if (roots.empty())
return false;
vector<RangeOp> rangeOps;
for (const auto &root : roots) {
if (auto rangeOp = as<RangeOpNode>(root))
rangeOps.emplace_back(rangeOp);
else
return false;
}
const auto pattern = rangeOps[0];
for (auto rangeOp : rangeOps) {
if (pattern->getNumOutputDims() != rangeOp->getNumOutputDims()) {
return false;
}
for (int i = 0; i < pattern->getNumOutputDims(); ++i)
if (pattern->getVarRange(0, i).second !=
rangeOp->getVarRange(0, i).second) {
return false;
}
for (size_t i = 0; i < pattern->getSumVarRanges().size(); ++i)
if (pattern->getVarRange(1, i).second !=
rangeOp->getVarRange(1, i).second) {
return false;
}
}
return true;
}
} // namespace nnet

View File

@ -0,0 +1,38 @@
#include "nnet/Visitor/CountRoutineVisitor.h"
namespace nnet {
// Tally the routine kind that produced this tensor (if any), then continue
// the tree traversal into children.
void CountRoutineVisitor::visit_(const Tensor &c) {
    auto routine = c->getSource();
    if (routine)
        cnts[routineTypeToId(routine->getType())]++;
    ExprTreeVisitor::visit_(c);
}
// Count routine occurrences reachable from root, indexed by routine id.
vector<int> CountRoutineVisitor::count(const Expr &root) {
    cnts.assign(RoutineTypeCnt, 0);
    dispatch(root);
    return cnts;
}
/**
 * @brief Check whether the expression contains exactly the expected number
 * of Matmul, Conv, ElementWise, and G2bmm routines.
 * NOTE(review): nLongformerGBMM is accepted but never compared against the
 * counts -- confirm whether that check is intentionally missing.
 */
bool CountRoutineVisitor::match(const Expr &root, int nMatmul, int nConv,
                                int nElement, int nSg2bmm,
                                int nLongformerGBMM) {
    (void)nLongformerGBMM; // not yet checked; kept for interface stability
    auto opCount = count(root);
    bool ret = true;
    if (opCount[routineTypeToId(RoutineType::MatmulNodeType)] != nMatmul)
        ret = false;
    if (opCount[routineTypeToId(RoutineType::ConvNodeType)] != nConv)
        ret = false;
    if (opCount[routineTypeToId(RoutineType::ElementWiseNodeType)] != nElement)
        ret = false;
    if (opCount.at(routineTypeToId(RoutineType::G2bmmNodeType)) != nSg2bmm)
        ret = false;
    // A dead local (`target`) that was constructed and discarded on the
    // failure path has been removed.
    return ret;
}
} // namespace nnet

View File

@ -0,0 +1,58 @@
#include "nnet/Visitor/FullPrinterVisitor.h"
namespace nnet {
// Enqueue the tensor for breadth-first processing by print()/traverse():
// (name, producing routine, tensor node).
void FullPrinterVisitor::visit_(const Tensor &c) {
    q.emplace_back(c->getName(), c->getSource(), c);
}
// Render the whole formula: the root expression, then every reachable
// tensor (breadth-first via q, which visit_ extends while we iterate) with
// its producing routine and that routine's expression or inputs.
string FullPrinterVisitor::print(const Expr &root) {
    q.clear();
    std::ostringstream oss;
    dispatch(root);
    oss << "==> ROOT\n" << root->toReadable() << "\n";
    // Note: q grows during iteration; index-based loop is intentional.
    for (size_t i = 0; i < q.size(); ++i) {
        const auto &[name, routine, tensor] = q[i];
        oss << "==> " << name << " : ";
        if (routine) {
            oss << routine->toReadable() << "\n";
            if (routine->getExpr()) {
                oss << routine->getExpr()->toReadable() << "\n";
            } else
                oss << "[INFO] Source is nullptr \n";
            // Prefer enqueueing declared inputs; otherwise walk the
            // routine's expression to discover tensors.
            if (!routine->getInputs().empty()) {
                for (const auto &tensor : routine->getInputs())
                    q.emplace_back(tensor->getName(), tensor->getSource(),
                                   tensor);
            } else if (routine->getExpr())
                dispatch(routine->getExpr());
        } else
            oss << "Input Tensor " << tensor->toOutputShape() << "\n";
    }
    return oss.str();
}
// Collect every reachable (name, routine, tensor) triple in BFS order.
// Returns a reference to the internal queue, valid until the next call.
const vector<tuple<string, Routine, Tensor>> &
FullPrinterVisitor::traverse(const Expr &root) {
    q.clear();
    dispatch(root);
    // Note: q grows during iteration; index-based loop is intentional.
    for (size_t i = 0; i < q.size(); ++i) {
        const auto &[name, routine, tensor] = q[i];
        if (routine) {
            // Matmul after DLT do not modify expression, so inputs has a higher
            // priority. Some OPs such as DLT have not implement source. Then
            // use inputs
            if (!routine->getInputs().empty()) {
                for (const auto &tensor : routine->getInputs())
                    dispatch(tensor);
            } else if (routine->getExpr()) {
                dispatch(routine->getExpr());
            } else {
                assert(false);
            }
        }
    }
    return q;
}
} // namespace nnet

View File

@ -0,0 +1,9 @@
#include "nnet/Visitor/GetTensorsVisitor.h"
namespace nnet {
// Record the tensor keyed by name; try_emplace keeps the first occurrence
// and silently ignores duplicates with the same name.
void GetTensorsVisitor::visit_(const Tensor &c) {
    tensors.try_emplace(c->getName(), c);
}
} // namespace nnet

Some files were not shown because too many files have changed in this diff Show More