forked from jiuyuan/InfiniTensor
Compare commits: master ... optimizati (34 commits)

Author | SHA1
---|---
YdrMaster | c077a61681
YdrMaster | 325b279468
YdrMaster | 7b23fdbbfe
YdrMaster | e3428d8fd8
YdrMaster | 3631b03e73
YdrMaster | f78ff0e8ee
YdrMaster | 4a5e66b36a
YdrMaster | 7d7d923e8d
YdrMaster | 2a147c235d
YdrMaster | 17033fad97
YdrMaster | 28287f3782
YdrMaster | cc62a3216d
YdrMaster | 9cfe223953
YdrMaster | 34f7d7e9ed
YdrMaster | 59a46f3ff9
YdrMaster | 72788e8e0a
YdrMaster | bd61cf4533
YdrMaster | f0f8915433
YdrMaster | bb5bfb0be8
YdrMaster | a56e86dfa9
YdrMaster | 5129d312d2
YdrMaster | 45f7e891f1
YdrMaster | b4b5157bd4
YdrMaster | 63cc93aadc
YdrMaster | ddaf6685b3
YdrMaster | 385586d57b
YdrMaster | 320468b627
YdrMaster | 8f38a41fb6
YdrMaster | a6a0141234
YdrMaster | 36b0c5855c
YdrMaster | 0ad0150b87
YdrMaster | 334e0cccbc
YdrMaster | cc6c18b00f
whjthu | d9da06eb67
@ -60,6 +60,10 @@ if(USE_PROTOBUF)
|
|||
endif()
|
||||
|
||||
include_directories(include)
|
||||
|
||||
add_subdirectory(optimization)
|
||||
include_directories(optimization/include)
|
||||
|
||||
# Pybind11
|
||||
add_subdirectory(3rd-party/pybind11)
|
||||
include_directories(3rd-party/pybind11/include)
|
||||
|
@ -98,7 +102,7 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG")
|
|||
|
||||
|
||||
# Source files
|
||||
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
|
||||
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/optimizations/*.cc src/utils/*.cc)
|
||||
|
||||
if(USE_CUDA)
|
||||
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
|
||||
|
@ -121,7 +125,7 @@ if(USE_PROTOBUF)
|
|||
target_link_libraries(InfiniTensor tensor_proto)
|
||||
endif()
|
||||
|
||||
target_link_libraries(InfiniTensor pybind11::embed)
|
||||
target_link_libraries(InfiniTensor optimization pybind11::embed)
|
||||
|
||||
# TVM backend
|
||||
if(BUILD_TEST_EINNET)
|
||||
|
@ -151,10 +155,9 @@ if(USE_INTELCPU)
|
|||
set(DNNL_CONFIGURATION "cpu_gomp")
|
||||
find_package(dnnl CONFIG REQUIRED)
|
||||
if(dnnl_FOUND)
|
||||
add_compile_definitions(USE_MKL=1)
|
||||
include_directories(BEFORE ${dnnl_DIR}/../../../cpu_gomp/include/)
|
||||
link_directories(${dnnl_DIR}/../../../cpu_gomp/lib)
|
||||
target_link_libraries(InfiniTensor dnnl)
|
||||
target_link_libraries(InfiniTensor dnnl)
|
||||
else()
|
||||
message(FATAL_ERROR "dnnl library not found")
|
||||
endif()
|
||||
|
|
|
@ -5,4 +5,4 @@
|
|||
|
||||
namespace infini {
|
||||
void start_interpreter();
|
||||
} // namespace infini
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#include "optimizations/partitions/partition.h"
|
||||
|
||||
namespace infini {
|
||||
class GlobalGraphPartition : public Partition {
|
||||
Graph run(const GraphObj &graph, const Transformation &tr,
|
||||
const Rating &rating) const override {
|
||||
return rankCandidates(graph, tr, rating).top().graph;
|
||||
}
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,33 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/graph.h"
|
||||
#include "optimizations/rate/rating.h"
|
||||
#include "optimizations/transformations/transformation.h"
|
||||
#include <queue>
|
||||
|
||||
namespace infini {
|
||||
class Partition {
|
||||
public:
|
||||
virtual Graph run(const GraphObj &, const Transformation &,
|
||||
const Rating &) const = 0;
|
||||
struct Candidate {
|
||||
Graph graph;
|
||||
Rating::Cost cost;
|
||||
|
||||
bool operator<(Candidate others) const { return cost < others.cost; }
|
||||
bool operator>(Candidate others) const { return cost > others.cost; }
|
||||
};
|
||||
|
||||
protected:
|
||||
using CandidateQueue = std::priority_queue<Candidate, vector<Candidate>,
|
||||
std::greater<Candidate>>;
|
||||
|
||||
/// @brief Rank the subgraph candidates.
|
||||
/// @param subgraph The subgraph to transform.
|
||||
/// @param tr Transformation object.
|
||||
/// @return Ranked candidates.
|
||||
CandidateQueue rankCandidates(const GraphObj &subgraph,
|
||||
const Transformation &tr,
|
||||
const Rating &rating) const;
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,8 @@
|
|||
#include "optimizations/partitions/partition.h"
|
||||
|
||||
namespace infini {
|
||||
class SingleOperatorPartition : public Partition {
|
||||
Graph run(const GraphObj &, const Transformation &,
|
||||
const Rating &) const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,21 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/graph.h"
|
||||
#include "optimizations/partitions/partition.h"
|
||||
|
||||
namespace infini {
|
||||
class Pass {
|
||||
std::unique_ptr<Partition> p;
|
||||
std::unique_ptr<Transformation> tr;
|
||||
std::unique_ptr<Rating> rating;
|
||||
|
||||
public:
|
||||
Pass(std::unique_ptr<Partition> p, std::unique_ptr<Transformation> tr,
|
||||
std::unique_ptr<Rating> rating)
|
||||
: p(std::move(p)), tr(std::move(tr)), rating(std::move(rating)) {}
|
||||
|
||||
Graph run(const GraphObj &graph) const {
|
||||
return p->run(graph, *tr, *rating);
|
||||
}
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,25 @@
|
|||
#include "core/graph.h"
|
||||
#include "pass.h"
|
||||
|
||||
namespace infini {
|
||||
class PassManager {
|
||||
public:
|
||||
PassManager() {}
|
||||
|
||||
Graph run(Graph graph) {
|
||||
for (auto pass : passes)
|
||||
graph = pass->run(*graph);
|
||||
return graph;
|
||||
}
|
||||
|
||||
bool addPass(std::unique_ptr<Partition> p,
|
||||
std::unique_ptr<Transformation> tr,
|
||||
std::unique_ptr<Rating> rating) {
|
||||
passes.emplace_back(
    make_ref<Pass>(std::move(p), std::move(tr), std::move(rating)));
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<Ref<Pass>> passes;
|
||||
};
|
||||
} // namespace infini
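
Taken together, `Partition`, `Transformation`, `Rating`, `Pass`, and `PassManager` form a small pipeline: a pass partitions the graph, asks the transformation for candidate substitutes, ranks them, and rebuilds the graph. A minimal usage sketch (the helper name `optimizeByMemory` and the include path are illustrative, not part of this change):

```cpp
#include "optimizations/pass_manager.h" // assumed header path for PassManager
#include <memory>

namespace infini {

// Sketch: one pass that partitions per operator, keeps the default
// (identity) Transformation, and ranks candidates by memory usage.
Graph optimizeByMemory(Graph graph) {
    PassManager pm;
    pm.addPass(std::make_unique<SingleOperatorPartition>(),
               std::make_unique<Transformation>(),
               std::make_unique<MemoryRating>());
    return pm.run(std::move(graph));
}

} // namespace infini
```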
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "optimizations/rate/rating.h"
|
||||
#include <numeric>
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* Rate a `Graph` by its memory usage.
|
||||
*/
|
||||
class MemoryRating : public Rating {
|
||||
public:
|
||||
/**
|
||||
* Run the `Rating` on the graph.
|
||||
*/
|
||||
float run(const GraphObj &graph) const override {
    auto tensors = graph.getTensors();
    // The accumulate functor takes (accumulator, element); the original
    // single-argument lambda would not compile.
    return static_cast<float>(
        std::accumulate(tensors.begin(), tensors.end(), (size_t)0,
                        [](auto acc, const auto &t) { return acc + t->size(); }));
}
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/graph.h"
|
||||
|
||||
namespace infini {
|
||||
/// @brief Rate a `Graph`.
|
||||
class Rating {
|
||||
public:
|
||||
/// @brief Cost of a substitute.
|
||||
using Cost = float;
|
||||
|
||||
/// @brief Run the `Rating` on the `graph`.
|
||||
/// @param graph The graph to rate.
|
||||
/// @return The cost of `graph`.
|
||||
virtual Cost run(const GraphObj &graph) const = 0;
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include "optimizations/rate/rating.h"
|
||||
|
||||
namespace infini {
|
||||
/**
 * Rate a `Graph` by its execution time.
 */
|
||||
class TimeRating : public Rating {
|
||||
public:
|
||||
/**
|
||||
* Run the `Rating` on the graph.
|
||||
*/
|
||||
float run(const GraphObj &graph) const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/common.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
class Transformation {
|
||||
public:
|
||||
virtual vector<Graph> run(const GraphObj &graph) const {
|
||||
return {make_ref<GraphObj>(graph)};
|
||||
};
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,14 @@
|
|||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
|
||||
project(optimization LANGUAGES CXX C)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "$ENV{CMAKE_CXX_FLAGS} -fPIC")
|
||||
|
||||
file(GLOB_RECURSE SRC src/*.h src/*.cc src/*.cpp)
|
||||
add_library(optimization ${SRC})
|
||||
|
||||
file(GLOB_RECURSE TEST test/*.cpp)
|
||||
add_executable(test_optimization ${TEST})
|
||||
target_link_libraries(test_optimization optimization)
|
|
@ -0,0 +1,14 @@
|
|||
.PHONY : build clean test
|
||||
|
||||
TYPE ?= release
|
||||
|
||||
build:
|
||||
mkdir -p build/$(TYPE)
|
||||
cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
|
||||
|
||||
test: build
|
||||
build/$(TYPE)/test_optimization
|
||||
@echo Done
|
||||
|
||||
clean:
|
||||
rm -rf build
|
|
@ -0,0 +1,24 @@
|
|||
# Design Overview

1. Principle: simplification
   Keep definitions as simple as possible. Anything not currently needed is removed, so that development stays both agile and reliable.
2. Goal: a mid-level interface
   This code is not developed for end users, so interface ergonomics and heavy defensive programming are not priorities. These APIs will be called by first-party, higher-level code.
3. Documentation
   Coverage should approach 100%.

## Code Walkthrough

The important files are [tensor.h](src/tensor.h), [graph.h](src/graph.h) and [mutation.h](src/mutation.h).

tensor.h defines the tensor used by this graph representation. A tensor consists of a shape, a data type and data, and it records, per graph, how it connects to operators. Ownership of a tensor is shared by every operator connected to it, so the only form in which a tensor exists is a `std::shared_ptr`; outside of shared ownership a tensor is meaningless. Passing a shape, a data type and data to the factory function constructs the tensor directly inside a smart pointer, and once every operator connected to the tensor has been released, the tensor is released as well.

The operator definition and the other important types live in graph.h. An operator stores exactly one thing, its operator type; every other piece of information is carried by the input tensors it owns. The operator type tells the operator how to interpret each of its input and output tensors. This gives operators full flexibility without losing expressive power: any information that determines how an operator works is necessarily already stored in its tensors' data, and the operator type explains what each tensor is.

An operator is owned by a graph, more precisely by an unpartitioned or indivisible monolithic graph, a `Unigraph`. Every operator is controlled by exactly one graph, and graphs never share any part of an operator. Since an operator is very lightweight (one enum plus two arrays of smart pointers), this costs little while greatly simplifying ownership management: tensors are the one and only kind of object shared between operators, and everything else is exclusively owned.

A `Unigraph` is also append-only. Operators can only be added, must be added in topological order, and can neither be removed nor reordered. An operator's index within its graph is therefore stable, and every graph holds a unique ID. A graph can thus be identified by its ID and an operator by its index (`OpRef`); a tensor can be identified by an operator index plus a slot index (`TensorPos`). A graph is destroyed as a whole, and when it is, every tensor connection controlled by its operators is destroyed with it, so no ownership relations that cannot stand on their own need to be maintained.

The three classes `Partition`, `Mutation` and `Rating` in mutation.h support rule-based graph optimization. They are essentially the same structure; defining them as separate types imposes a compile-time constraint on the optimization stages: one round of optimization must execute partition → mutation → rating, each exactly once and in that order.

Each of these classes holds a two-dimensional array of `Mutant`s. A `Mutant` is one mutated form of a subgraph and stores the subgraph structure together with its score. The inner array holds the variants of one subgraph; the outer array holds the subgraphs of one graph. `Partition` takes the complete graph as input and builds the structure of the outer array, `Mutation` fills the inner arrays, and `Rating` assigns a score to every mutant and sorts them from best to worst. A vector of indices can then drive the reconstruction of the graph.
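A compact sketch of the staged pipeline described above, using the single-operator pass and the `memory_usage` rating from this change (it mirrors the test under test/; the helper name `rewrite` is illustrative):

```cpp
// Partition -> Mutation -> Rating must be chained in exactly this order;
// each stage consumes the previous one by move, which enforces the ordering
// at compile time.
using namespace optimization;

Unigraph rewrite(Unigraph &&g) {
    auto p = Partition<pass::SingleOperator>(std::move(g), pass::partition);
    auto m = Mutation<pass::SingleOperator>(std::move(p), pass::mutate);
    auto r = Rating<pass::SingleOperator>(std::move(m), memory_usage);
    // Index 0 selects the best-ranked mutant of every subgraph.
    return r.build(Vec<size_t>(r.size().size(), 0));
}
```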
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "../../src/mutation.h"
|
||||
#include "../../src/pass/single_operator.h"
|
||||
#include <unordered_set>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
/// @brief Calculates the memory usage of a graph.
/// @param g The graph.
/// @return The reciprocal of the total memory usage of the graph in bytes.
inline float memory_usage(Unigraph const &g) {
    std::unordered_set<uintptr_t> mark;
    size_t memory = 0; // must be zero-initialized before accumulating
    for (const auto &op : g.operators)
        for (const auto &t : op.outputs)
            if (mark.insert(reinterpret_cast<uintptr_t>(t.get())).second)
                memory += t->size();
    return 1e6f / static_cast<float>(memory);
}
|
||||
|
||||
} // namespace optimization
|
|
@ -0,0 +1,78 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
/// @brief Stores tensor data.
class Data {
    /// @brief `cpu_data` lives in host (CPU) memory,
    /// so it could equally be managed through a `std::vector<uint8_t>`.
    uint8_t *cpu_data;
|
||||
|
||||
// #ifdef USE_CUDA
|
||||
// void *gpu_data;
|
||||
// #endif
|
||||
|
||||
// #ifdef USE_BANG
|
||||
// void *mlu_data;
|
||||
// #endif
|
||||
|
||||
Data(uint8_t *ptr) : cpu_data(ptr) {}
|
||||
|
||||
public:
|
||||
Data() : cpu_data(nullptr) {}
|
||||
Data(size_t size) : cpu_data(new uint8_t[size]) {}
|
||||
template <class t> Data(t begin, t end) : cpu_data(nullptr) {
    auto const count = static_cast<size_t>(end - begin);
    auto const bytes = sizeof(decltype(*begin)) * count;
    cpu_data = new uint8_t[bytes];
    // Copy the raw bytes; an element-wise std::copy into uint8_t* would
    // narrow every element to a single byte.
    memcpy(cpu_data, &*begin, bytes);
}
|
||||
Data(Data const &) = delete;
|
||||
Data(Data &&others) noexcept
|
||||
: cpu_data(std::exchange(others.cpu_data, nullptr)) {}
|
||||
~Data() noexcept { delete[] cpu_data; }
|
||||
|
||||
Data &operator=(Data const &) = delete;
|
||||
Data &operator=(Data &&others) noexcept {
|
||||
if (this != &others)
|
||||
delete[] std::exchange(cpu_data,
|
||||
std::exchange(others.cpu_data, nullptr));
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// @brief Builds `Data` from a `vector` of any type `t`.
|
||||
/// @tparam t Data type.
|
||||
/// @param data Data `vector`.
|
||||
/// @return `Data` object.
|
||||
template <class t> static Data cpu(std::vector<t> const &data) {
    auto const len = data.size();
    auto const size = sizeof(t) * len;
    // Allocate before copying; the default constructor leaves `cpu_data` null.
    Data ans(size);
    memcpy(ans.cpu_data, data.data(), size);
    return ans;
}
|
||||
|
||||
/// @brief Gets data ptr.
|
||||
/// @tparam t Data type.
|
||||
/// @return Data ptr.
|
||||
template <class t> t *as_ptr() const {
|
||||
return reinterpret_cast<t *>(cpu_data);
|
||||
}
|
||||
|
||||
/// @brief Copies data to a `Vec`.
|
||||
/// @tparam t Data type.
|
||||
/// @param len Count of data.
|
||||
/// @return The data `Vec`.
|
||||
template <class t> std::vector<t> to_vec(size_t len) const {
    std::vector<t> ans(len);
    // Copy from the tensor buffer into the vector, not the other way around.
    memcpy(ans.data(), cpu_data, sizeof(t) * len);
    return ans;
}
|
||||
};
|
||||
|
||||
} // namespace optimization
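
A small usage sketch for `Data`, showing the CPU round trip through the factory and `to_vec` (it relies on the corrected `cpu`/`to_vec` above; the function name is illustrative):

```cpp
#include <cassert>
#include <vector>

void data_roundtrip_example() {
    using optimization::Data;
    std::vector<float> raw{1.f, 2.f, 3.f};
    Data d = Data::cpu<float>(raw);          // copies the bytes into a fresh buffer
    auto back = d.to_vec<float>(raw.size()); // copies them back out
    assert(back == raw);
}
```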
|
|
@ -0,0 +1,34 @@
|
|||
#include "data_type.h"
|
||||
|
||||
using namespace optimization;
|
||||
|
||||
size_t DataType::size() const {
|
||||
switch (id) {
|
||||
case DataTypeId::FLOAT:
|
||||
return sizeof(float);
|
||||
case DataTypeId::UINT8:
|
||||
return sizeof(uint8_t);
|
||||
case DataTypeId::INT8:
|
||||
return sizeof(int8_t);
|
||||
case DataTypeId::UINT16:
|
||||
return sizeof(uint16_t);
|
||||
case DataTypeId::INT16:
|
||||
return sizeof(int16_t);
|
||||
case DataTypeId::INT32:
|
||||
return sizeof(int32_t);
|
||||
case DataTypeId::INT64:
|
||||
return sizeof(int64_t);
|
||||
case DataTypeId::BOOL:
|
||||
return sizeof(bool);
|
||||
case DataTypeId::FLOAT16:
|
||||
return 2;
|
||||
case DataTypeId::DOUBLE:
|
||||
return sizeof(double);
|
||||
case DataTypeId::UINT32:
|
||||
return sizeof(uint32_t);
|
||||
case DataTypeId::UINT64:
|
||||
return sizeof(uint64_t);
|
||||
default:
|
||||
throw "unsupported data type.";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
enum class DataTypeId : uint8_t {
|
||||
UNDEFINED,
|
||||
FLOAT,
|
||||
UINT8,
|
||||
INT8,
|
||||
UINT16,
|
||||
INT16,
|
||||
INT32,
|
||||
INT64,
|
||||
STRING,
|
||||
BOOL,
|
||||
FLOAT16,
|
||||
DOUBLE,
|
||||
UINT32,
|
||||
UINT64,
|
||||
// COMPLEX64,
|
||||
// COMPLEX128,
|
||||
// BFLOAT16,
|
||||
};
|
||||
|
||||
struct DataType {
|
||||
DataTypeId id;
|
||||
|
||||
size_t size() const;
|
||||
};
|
||||
|
||||
template <class t> DataType ty();
|
||||
template <> inline DataType ty<float>() { return {DataTypeId::FLOAT}; }
|
||||
template <> inline DataType ty<uint8_t>() { return {DataTypeId::UINT8}; }
|
||||
template <> inline DataType ty<int8_t>() { return {DataTypeId::INT8}; }
|
||||
template <> inline DataType ty<uint16_t>() { return {DataTypeId::UINT16}; }
|
||||
template <> inline DataType ty<int16_t>() { return {DataTypeId::INT16}; }
|
||||
template <> inline DataType ty<int32_t>() { return {DataTypeId::INT32}; }
|
||||
template <> inline DataType ty<int64_t>() { return {DataTypeId::INT64}; }
|
||||
template <> inline DataType ty<bool>() { return {DataTypeId::BOOL}; }
|
||||
template <> inline DataType ty<double>() { return {DataTypeId::DOUBLE}; }
|
||||
template <> inline DataType ty<uint32_t>() { return {DataTypeId::UINT32}; }
|
||||
template <> inline DataType ty<uint64_t>() { return {DataTypeId::UINT64}; }
|
||||
|
||||
} // namespace optimization
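
A brief sketch of the `ty<T>()` helpers and `DataType::size()` defined above (the function name is illustrative):

```cpp
using namespace optimization;

inline size_t float_width_example() {
    DataType dt = ty<float>(); // tag for float
    return dt.size();          // sizeof(float), per the switch in data_type.cc
}
```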
|
|
@ -0,0 +1,68 @@
|
|||
#include "graph.h"
|
||||
|
||||
using namespace optimization;
|
||||
|
||||
static size_t GRAPH_ID = 1;
|
||||
|
||||
Unigraph::Unigraph() : id(GRAPH_ID++) {}
|
||||
|
||||
Unigraph::Unigraph(Unigraph &&others)
|
||||
: id(std::exchange(others.id, 0)), operators(std::move(others.operators)) {}
|
||||
|
||||
Unigraph::~Unigraph() {
|
||||
for (auto &op : operators) {
|
||||
for (auto &i : op.inputs)
|
||||
i->target.erase(i->target.find(this->id));
|
||||
for (auto &o : op.outputs)
|
||||
o->source.erase(o->source.find(this->id));
|
||||
}
|
||||
}
|
||||
|
||||
Unigraph &Unigraph::operator=(Unigraph &&others) {
|
||||
if (this == &others)
|
||||
return *this;
|
||||
|
||||
for (auto &op : operators) {
|
||||
for (auto &i : op.inputs)
|
||||
i->target.erase(i->target.find(this->id));
|
||||
for (auto &o : op.outputs)
|
||||
o->source.erase(o->source.find(this->id));
|
||||
}
|
||||
|
||||
this->id = std::exchange(others.id, 0);
|
||||
this->operators = std::move(others.operators);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
OpRef Unigraph::push_operator( // fmt: new line
|
||||
OpType op_type, //
|
||||
Vec<Arc<Tensor>> inputs, //
|
||||
Vec<Arc<Tensor>> outputs //
|
||||
) {
|
||||
auto ans = OpRef{this->id, operators.size()};
|
||||
|
||||
size_t i = 0;
|
||||
for (auto &input : inputs) {
|
||||
auto it = input->target.find(ans.graph);
|
||||
if (it == input->target.end())
|
||||
input->target[ans.graph] = {{ans.op, i++}};
|
||||
else
|
||||
it->second.push_back({ans.op, i++});
|
||||
}
|
||||
i = 0;
|
||||
for (auto &output : outputs) {
|
||||
auto it = output->source.find(ans.graph);
|
||||
if (it == output->source.end())
|
||||
output->source[ans.graph] = {ans.op, i++};
|
||||
else
|
||||
throw "tensor source exist";
|
||||
}
|
||||
|
||||
this->operators.push_back({
|
||||
op_type, // fmt: new line
|
||||
std::move(inputs), //
|
||||
std::move(outputs), //
|
||||
});
|
||||
return ans;
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
#pragma once
|
||||
|
||||
#include "op_type.h"
|
||||
#include "tensor.h"
|
||||
|
||||
namespace optimization {
|
||||
|
||||
/// @brief A struct to represent an operator in the computation graph.
|
||||
/// The ownership of an `Operator` belongs to one `Unigraph`.
|
||||
struct Operator {
|
||||
/// @brief Type of the operator.
|
||||
OpType op_type;
|
||||
|
||||
/// @brief Input and output tensors.
|
||||
/// Notice: ownership of the tensors is shared between
/// the operators that generate and the operators that consume the same tensor.
|
||||
Vec<Arc<Tensor>> inputs, outputs;
|
||||
};
|
||||
|
||||
/// @brief A reference of an `Operator` in a `Unigraph`.
|
||||
struct OpRef {
|
||||
/// @brief `graph` is the unique identifier of the `Unigraph`;
/// `op` is the index of the `Operator` within that `Unigraph`.
|
||||
size_t graph, op;
|
||||
};
|
||||
|
||||
/// @brief An unpartitioned graph, or a minimal graph that cannot be partitioned further.
|
||||
struct Unigraph {
|
||||
/// @brief Unique identifier.
|
||||
size_t id;
|
||||
/// @brief List of operators in the graph with topological order.
|
||||
Vec<Operator> operators;
|
||||
|
||||
Unigraph();
|
||||
Unigraph(Unigraph const &) = delete;
|
||||
Unigraph(Unigraph &&others);
|
||||
~Unigraph();
|
||||
|
||||
Unigraph &operator=(Unigraph const &) = delete;
|
||||
Unigraph &operator=(Unigraph &&);
|
||||
|
||||
/// @brief Pushes an `Operator` into the graph.
|
||||
/// Every `Operator` must be pushed in topological order.
|
||||
/// @param op_type Operator type.
|
||||
/// @param inputs Input tensors.
|
||||
/// @param outputs Output tensors.
|
||||
/// @return An `OpRef`.
|
||||
OpRef push_operator( // fmt: new line
|
||||
OpType op_type, //
|
||||
Vec<Arc<Tensor>> inputs, //
|
||||
Vec<Arc<Tensor>> outputs //
|
||||
);
|
||||
};
|
||||
|
||||
} // namespace optimization
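
A minimal sketch of assembling a `Unigraph` in topological order (`Tensor::share` and `Data::cpu` are defined in tensor.h / data.h in this change; the helper name is illustrative and mirrors the test at the end of the change):

```cpp
using namespace optimization;

Unigraph make_matmul_graph() {
    Unigraph g;
    auto a = Tensor::share({2, 3}, ty<float>(), Data::cpu<float>({1, 2, 3, 4, 5, 6}));
    auto b = Tensor::share({3, 1}, ty<float>(), Data::cpu<float>({1, 2, 3}));
    auto c = Tensor::share({2, 1}, ty<float>(), {});
    // Operators may only be appended, and only in topological order.
    g.push_operator(OpType::MatMul, {a, b}, {c});
    return g;
}
```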
|
|
@ -0,0 +1,164 @@
|
|||
#pragma once
|
||||
|
||||
#include "graph.h"
|
||||
#include <functional>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
/// @brief A candidate subgraph mutant.
|
||||
struct Mutant {
|
||||
/// @brief The mutated subgraph.
|
||||
Unigraph graph;
|
||||
|
||||
/// @brief A score representing the quality of the mutant.
|
||||
float score;
|
||||
|
||||
Mutant(Unigraph &&g) : graph(std::move(g)), score(1.0f) {}
|
||||
Mutant(Mutant const &) = delete;
|
||||
Mutant(Mutant &&others)
|
||||
: graph(std::move(others.graph)),
|
||||
score(std::exchange(others.score, 1.0f)) {}
|
||||
|
||||
Mutant &operator=(Mutant const &) = delete;
|
||||
Mutant &operator=(Mutant &&others) {
|
||||
if (this != &others) {
|
||||
this->graph = std::move(others.graph);
|
||||
this->score = std::exchange(others.score, 1.0f);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief A subgraph partition with `PartitionType`, will be mutated into
|
||||
/// multiple `Mutant`s.
|
||||
/// @tparam PartitionType To partition this subgraph.
|
||||
template <class PartitionType> struct SubGraph {
|
||||
Vec<Mutant> mutants;
|
||||
PartitionType type;
|
||||
};
|
||||
|
||||
template <class t> Vec<size_t> list_size(Vec<Vec<t>> const &);
|
||||
template <class PartitionType> class Mutation;
|
||||
template <class PartitionType> class Rating;
|
||||
|
||||
/// @brief Partitioned subgraphs.
|
||||
template <class PartitionType> struct Partition {
|
||||
/// @brief 2D vector of `Mutant` instances for each partitioned subgraph.
|
||||
Vec<SubGraph<PartitionType>> parts;
|
||||
|
||||
friend Mutation<PartitionType>;
|
||||
|
||||
public:
|
||||
/// @brief A functional object that takes an unpartitioned graph as input
|
||||
/// and returns a vector of partitioned subgraphs.
|
||||
using Func =
|
||||
std::function<Vec<std::pair<Unigraph, PartitionType>>(Unigraph &&)>;
|
||||
|
||||
/// @brief Constructs a partitioned graph from an unpartitioned graph
|
||||
/// using a partitioning function.
|
||||
/// @param g An unpartitioned graph.
|
||||
/// @param f A function that takes an unpartitioned graph as input
|
||||
/// and returns a vector of partitioned subgraphs.
|
||||
Partition(Unigraph &&g, Func const &f) {
|
||||
for (auto &[g_, t] : f(std::move(g))) {
|
||||
auto &sub = this->parts.emplace_back();
|
||||
sub.mutants.emplace_back(std::move(g_));
|
||||
sub.type = std::move(t);
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Returns mutant vector size.
|
||||
/// @return 2D vector size.
|
||||
Vec<size_t> size() const { return list_size(parts); }
|
||||
};
|
||||
|
||||
/// @brief Generates mutants for every subgraph.
|
||||
template <class PartitionType> class Mutation {
|
||||
/// @brief 2D vector of `Mutant` instances for each partitioned subgraph.
|
||||
Vec<SubGraph<PartitionType>> parts;
|
||||
|
||||
friend Rating<PartitionType>;
|
||||
|
||||
public:
|
||||
/// @brief A functional object that takes a subgraph as input
|
||||
/// and returns a vector of mutated graphs.
|
||||
using Func =
|
||||
std::function<Vec<Unigraph>(Unigraph const &, PartitionType const &)>;
|
||||
|
||||
/// @brief Mutates every subgraph in a partitioned graph.
|
||||
/// @param p The partitioned graph to be mutated.
|
||||
/// @param f A function that takes a subgraph as input
|
||||
/// and returns a vector of mutated graphs.
|
||||
Mutation(Partition<PartitionType> &&p, Func const &f)
|
||||
: parts(std::move(p.parts)) {
|
||||
for (auto &sub : parts)
|
||||
for (auto &m : f(sub.mutants.front().graph, sub.type))
|
||||
sub.mutants.emplace_back(std::move(m));
|
||||
}
|
||||
|
||||
/// @brief Returns mutant vector size.
|
||||
/// @return 2D vector size.
|
||||
Vec<size_t> size() const { return list_size(parts); }
|
||||
};
|
||||
|
||||
/// @brief Rates each subgraph mutant.
|
||||
template <class PartitionType> class Rating {
|
||||
/// @brief 2D vector of `Mutant` instances for each partitioned subgraph.
|
||||
Vec<SubGraph<PartitionType>> parts;
|
||||
|
||||
public:
|
||||
/// @brief A functional object that takes a mutated subgraph as input
|
||||
/// and returns its score.
|
||||
using Func = std::function<float(Unigraph const &)>;
|
||||
|
||||
/// @brief Rates every mutated subgraph with a `Rating::Func`.
|
||||
/// @param m The mutated subgraphs to be rated.
|
||||
/// @param f A function that takes a mutated subgraph as input
|
||||
/// and returns its score.
|
||||
Rating(Mutation<PartitionType> &&m, Func const &f)
|
||||
: parts(std::move(m.parts)) {
|
||||
|
||||
for (auto &sub : parts)
|
||||
if (sub.mutants.size() > 1) {
|
||||
auto sum = 0.0f;
|
||||
for (auto &c : sub.mutants)
|
||||
sum += (c.score = f(c.graph));
|
||||
sum = std::abs(sum);
|
||||
for (auto &c : sub.mutants)
|
||||
c.score /= sum;
|
||||
std::sort(sub.mutants.begin(), sub.mutants.end(),
|
||||
[](auto const &a, auto const &b) {
|
||||
return a.score > b.score;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Returns mutant vector size.
|
||||
/// @return 2D vector size.
|
||||
Vec<size_t> size() const { return list_size(parts); }
|
||||
|
||||
/// @brief Builds `Unigraph` from the subgraphs
|
||||
/// with specified indices.
|
||||
/// @param indices Subgraph indices.
|
||||
/// @return Merged `Unigraph`.
|
||||
Unigraph build(Vec<size_t> const &indices) const {
|
||||
const auto size = indices.size();
|
||||
if (size != parts.size())
|
||||
throw "indices size wrong";
|
||||
Unigraph ans;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
for (const auto &op :
|
||||
parts.at(i).mutants.at(indices[i]).graph.operators)
|
||||
ans.push_operator(op.op_type, op.inputs, op.outputs);
|
||||
return ans;
|
||||
}
|
||||
};
|
||||
|
||||
template <class t> Vec<size_t> list_size(Vec<SubGraph<t>> const &list) {
|
||||
Vec<size_t> ans(list.size());
|
||||
std::transform(list.begin(), list.end(), ans.begin(),
|
||||
[](const auto &e) { return e.mutants.size(); });
|
||||
return ans;
|
||||
}
|
||||
|
||||
} // namespace optimization
|
|
@ -0,0 +1,196 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
enum class OpType : uint16_t {
|
||||
Abs,
|
||||
Acos,
|
||||
Acosh,
|
||||
Add,
|
||||
And,
|
||||
ArgMax,
|
||||
Asin,
|
||||
Asinh,
|
||||
Atan,
|
||||
Atanh,
|
||||
AveragePool,
|
||||
BatchNormalization,
|
||||
Bernoulli,
|
||||
BitShift,
|
||||
BitwiseAnd,
|
||||
BitwiseNot,
|
||||
BitwiseOr,
|
||||
BitwiseXor,
|
||||
BlackmanWindow,
|
||||
Cast,
|
||||
CastLike,
|
||||
Ceil,
|
||||
Celu,
|
||||
CenterCropPad,
|
||||
Clip,
|
||||
Col2lm,
|
||||
Compress,
|
||||
Concat,
|
||||
ConcatFromSequence,
|
||||
// Constant, // -> Input
|
||||
ConstantOfShape,
|
||||
Conv,
|
||||
ConvInteger,
|
||||
ConvTranspose,
|
||||
Cos,
|
||||
Cosh,
|
||||
CumSum,
|
||||
DFT,
|
||||
DeformConv,
|
||||
DepthToSpace,
|
||||
DequantizeLinear,
|
||||
Det,
|
||||
Div,
|
||||
Dropout,
|
||||
DynamicQuantizeLinear,
|
||||
Einsum,
|
||||
Elu,
|
||||
Equal,
|
||||
Erf,
|
||||
Exp,
|
||||
Expand,
|
||||
EyeLike,
|
||||
Flatten,
|
||||
Floor,
|
||||
GRU,
|
||||
Gather,
|
||||
GatherElements,
|
||||
GatherND,
|
||||
Gemm,
|
||||
GlobalAveragePool,
|
||||
GlobalLpPool,
|
||||
GlobalMaxPool,
|
||||
Greater,
|
||||
GreaterOrEqual,
|
||||
GridSample,
|
||||
GroupNormalization,
|
||||
HammingWindow,
|
||||
HannWindow,
|
||||
HardSigmoid,
|
||||
HardSwish,
|
||||
Hardmax,
|
||||
Identity,
|
||||
If,
|
||||
InstanceNormalization,
|
||||
IsInf,
|
||||
IsNaN,
|
||||
LRN,
|
||||
LSTM,
|
||||
LayerNormalization,
|
||||
LeakyRelu,
|
||||
Less,
|
||||
LessOrEqual,
|
||||
Log,
|
||||
LogSoftmax,
|
||||
Loop,
|
||||
LpNormalization,
|
||||
LpPool,
|
||||
MatMul,
|
||||
MatMulInteger,
|
||||
Max,
|
||||
MaxPool,
|
||||
MaxRoiPool,
|
||||
MaxUnpool,
|
||||
Mean,
|
||||
MeanVarianceNormalization,
|
||||
MelWeightMatrix,
|
||||
Min,
|
||||
Mish,
|
||||
Mod,
|
||||
Mul,
|
||||
Multinomial,
|
||||
Neg,
|
||||
NegativeLogLikelihoodLoss,
|
||||
NonMaxSuppression,
|
||||
NonZero,
|
||||
Not,
|
||||
OneHot,
|
||||
Optional,
|
||||
OptionalGetElement,
|
||||
OptionalHasElement,
|
||||
Or,
|
||||
PRelu,
|
||||
Pad,
|
||||
Pow,
|
||||
QLinearConv,
|
||||
QLinearMatMul,
|
||||
QuantizeLinear,
|
||||
RNN,
|
||||
RandomNormal,
|
||||
RandomNormalLike,
|
||||
RandomUniform,
|
||||
RandomUniformLike,
|
||||
Range,
|
||||
Reciprocal,
|
||||
ReduceL1,
|
||||
ReduceL2,
|
||||
ReduceLogSum,
|
||||
ReduceLogSumExp,
|
||||
ReduceMax,
|
||||
ReduceMean,
|
||||
ReduceMin,
|
||||
ReduceProd,
|
||||
ReduceSum,
|
||||
ReduceSumSquare,
|
||||
Relu,
|
||||
Reshape,
|
||||
Resize,
|
||||
ReverseSequence,
|
||||
RoiAlign,
|
||||
Round,
|
||||
STFT,
|
||||
Scan,
|
||||
Scatter,
|
||||
ScatterElements,
|
||||
ScatterND,
|
||||
Selu,
|
||||
SequenceAt,
|
||||
SequenceConstruct,
|
||||
SequenceEmpty,
|
||||
SequenceErase,
|
||||
SequenceInsert,
|
||||
SequenceLength,
|
||||
SequenceMap,
|
||||
Shape,
|
||||
Shrink,
|
||||
Sigmoid,
|
||||
Sign,
|
||||
Sin,
|
||||
Sinh,
|
||||
Size,
|
||||
Slice,
|
||||
Softmax,
|
||||
SoftmaxCrossEntropyLoss,
|
||||
Softplus,
|
||||
Softsign,
|
||||
SpaceToDepth,
|
||||
Split,
|
||||
SplitToSequence,
|
||||
Sqrt,
|
||||
Squeeze,
|
||||
StringNormalizer,
|
||||
Sub,
|
||||
Sum,
|
||||
Tan,
|
||||
Tanh,
|
||||
TfIdfVectorizer,
|
||||
ThresholdedRelu,
|
||||
Tile,
|
||||
TopK,
|
||||
Transpose,
|
||||
Trilu,
|
||||
Unique,
|
||||
Unsqueeze,
|
||||
Upsample,
|
||||
Where,
|
||||
Xor,
|
||||
};
|
||||
|
||||
} // namespace optimization
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "../graph.h"
|
||||
|
||||
namespace optimization {
|
||||
|
||||
class Conv {
|
||||
Operator const &op;
|
||||
|
||||
public:
|
||||
explicit Conv(Operator &op) : op(op) {}
|
||||
explicit Conv(Operator const &op) : op(op) {}
|
||||
|
||||
Arc<Tensor> const &input() const { return op.inputs.at(0); }
|
||||
Arc<Tensor> const &kernel() const { return op.inputs.at(1); }
|
||||
Arc<Tensor> const &dilations() const { return op.inputs.at(2); }
|
||||
Arc<Tensor> const &pads() const { return op.inputs.at(3); }
|
||||
Arc<Tensor> const &strides() const { return op.inputs.at(4); }
|
||||
Arc<Tensor> const &output() const { return op.outputs.at(0); }
|
||||
};
|
||||
|
||||
} // namespace optimization
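
`Conv` is a thin read-only view over an `Operator` whose inputs follow the (input, kernel, dilations, pads, strides) convention used by the mutation pass. A short sketch (the helper is illustrative):

```cpp
using namespace optimization;

// Checks whether a convolution operator has a 1x1 kernel, reading the kernel
// shape ("fcrs" layout) through the Conv view.
inline bool is_1x1_kernel(Operator const &op) {
    Conv conv(op);
    auto const &k = conv.kernel()->shape;
    return k.size() == 4 && k[2] == 1 && k[3] == 1;
}
```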
|
|
@ -0,0 +1,197 @@
|
|||
#include "single_operator.h"
|
||||
#include "../operator/conv.h"
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
|
||||
using namespace optimization;
|
||||
using namespace pass;
|
||||
|
||||
Vec<std::pair<Unigraph, SingleOperator>>
optimization::pass::partition(Unigraph &&g) {
    Vec<std::pair<Unigraph, SingleOperator>> ans;
    for (auto &op : g.operators) {
        // Use a distinct name for the new subgraph to avoid shadowing `g`.
        auto &[sub, t] = ans.emplace_back();
        sub.push_operator(op.op_type, op.inputs, op.outputs);
    }
    return ans;
}
|
||||
|
||||
// 1st: new shape
|
||||
// 2nd: permutation
|
||||
static std::pair<Vec<size_t>, Vec<size_t>> // fmt: new line
|
||||
transpose( //
|
||||
Vec<size_t> const &shape, //
|
||||
char const *src, // source tensor layout
|
||||
char const *tgt // target tensor layout
|
||||
) {
|
||||
// assert( shape.size() == str_len(src) == str_len(tgt) )
|
||||
std::map<char, size_t> indices;
|
||||
|
||||
for (size_t i = 0; i < shape.size(); ++i)
|
||||
indices[src[i]] = i;
|
||||
|
||||
auto ans = std::make_pair( // fmt: new line
|
||||
Vec<size_t>(shape.size()), // shape
|
||||
Vec<size_t>(shape.size()) // permutation
|
||||
);
|
||||
|
||||
for (size_t i = 0; i < shape.size(); ++i)
|
||||
ans.first[i] = shape[ans.second[i] = indices[tgt[i]]];
|
||||
|
||||
return ans;
|
||||
}
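// Worked example: transpose({1, 3, 224, 224}, "nchw", "nhwc") returns the new
// shape {1, 224, 224, 3} and the permutation {0, 2, 3, 1}: for every target
// axis named in `tgt`, ans.first[i] = shape[ans.second[i]] reads the matching
// source axis from `src`.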
|
||||
|
||||
Vec<Unigraph> optimization::pass::mutate( // fmt: new line
|
||||
Unigraph const &g, //
|
||||
SingleOperator const & //
|
||||
) {
|
||||
Vec<Unigraph> ans;
|
||||
switch (g.operators.front().op_type) {
|
||||
case OpType::Conv: {
|
||||
auto const conv = Conv(g.operators.front());
|
||||
auto const &i_shape = conv.input()->shape;
|
||||
auto const &k_shape = conv.kernel()->shape;
|
||||
auto const &dilations = conv.dilations()->to_vec<int64_t>();
|
||||
auto const &strides = conv.strides()->to_vec<int64_t>();
|
||||
// assert(conv.input()->data_type == conv.kernel()->data_type);
|
||||
auto const dt = conv.input()->data_type;
|
||||
if (k_shape.rbegin()[0] == 1 // fmt: new line
|
||||
&& k_shape.rbegin()[1] == 1 //
|
||||
&& i_shape[1] == k_shape[1] // group = 1
|
||||
&& std::all_of(strides.begin(), strides.end(),
|
||||
[](auto x) { return x == 1; })) {
|
||||
// 1x1 conv
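// Rationale: a 1x1 convolution with stride 1 and group 1 is exactly the
// matrix product [N*H*W, C] x [C, F] -> [N*H*W, F]. The mutant below
// transposes/reshapes the input to [N*H*W, C] and the kernel to [C, F],
// multiplies them, and reshapes/transposes the result back to NCHW.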
|
||||
auto &mutant = ans.emplace_back();
|
||||
|
||||
// (input, "nchw"->"nhwc") -|transpose|-> transposed -|reshape|-> t0
|
||||
Arc<Tensor> t0;
|
||||
{
|
||||
auto [shape_, permute_] = transpose(i_shape, "nchw", "nhwc");
// Copy `shape_` here; it is still needed below to build the reshaped tensor.
auto transposed = Tensor::share(shape_, dt, {});
auto permutation = Tensor::share_vec(std::move(permute_));
mutant.push_operator(OpType::Transpose,
                     {conv.input(), std::move(permutation)},
                     {transposed});
mutant.push_operator(
    OpType::Reshape, {std::move(transposed)},
    {t0 = Tensor::share(
         {shape_[0] * shape_[1] * shape_[2], shape_[3]}, dt,
         {})});
|
||||
}
|
||||
|
||||
// (kernel,"fcrs"->"cfrs") -|transpose|-> transposed -|reshape|-> t1
|
||||
Arc<Tensor> t1;
|
||||
{
|
||||
auto [shape_, permute_] = transpose(k_shape, "fcrs", "cfrs");
// Copy `shape_` here as well; it is used again for the reshape target below.
auto transposed = Tensor::share(shape_, dt, {});
auto permutation = Tensor::share_vec(std::move(permute_));
mutant.push_operator(OpType::Transpose,
                     {conv.kernel(), std::move(permutation)},
                     {transposed});
mutant.push_operator(
    OpType::Reshape, {std::move(transposed)},
    {t1 = Tensor::share(
         {shape_[0], shape_[1] /* * shape_[2] * shape_[3] */},
         dt, {})});
|
||||
}
|
||||
|
||||
// (t0,t1) -|matmul|-> x -|reshape|-> t2
|
||||
auto x = Tensor::share({t0->shape[0], t1->shape[1]}, dt, {});
|
||||
mutant.push_operator(OpType::MatMul, {std::move(t0), std::move(t1)},
|
||||
{x});
|
||||
auto t2 = Tensor::share(
|
||||
{i_shape[0], i_shape[2], i_shape[3], k_shape[0]}, dt, {});
|
||||
mutant.push_operator(OpType::Reshape, {std::move(x)}, {t2});
|
||||
|
||||
// (t2,"nhwf"->"nfhw") -|transpose|-> output
|
||||
{
|
||||
auto [shape_, permute_] = transpose(t2->shape, "nhwf", "nfhw");
|
||||
// auto tranposed = Tensor::share(std::move(shape_), dt, {});
|
||||
auto permutation = Tensor::share_vec(std::move(permute_));
|
||||
mutant.push_operator(OpType::Transpose,
|
||||
{std::move(t2), std::move(permutation)},
|
||||
{conv.output()});
|
||||
}
|
||||
} else if (
|
||||
// group = 1
|
||||
i_shape[1] == k_shape[1]
|
||||
// stride[*] = 1
|
||||
&& std::all_of(strides.begin(), strides.end(),
|
||||
[](auto x) { return x == 1; })
|
||||
// dilation[*] > 1
|
||||
&& std::any_of(dilations.begin(), dilations.end(),
|
||||
[](auto x) { return x > 1; })) {
|
||||
// dilated conv
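// Rationale: with stride 1 and some dilation d > 1, the input is reshaped to
// (n, c, h/d0, d0, w/d1, d1) and transposed so the dilation factors d0, d1
// fold into the batch dimension; the convolution then runs densely
// (dilation 1) on the reduced spatial extent, and the result is
// transposed/reshaped back to the original output layout.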
|
||||
auto &mutant = ans.emplace_back();
|
||||
|
||||
auto t0 = Tensor::share(
|
||||
{
|
||||
i_shape[0],
|
||||
i_shape[1],
|
||||
i_shape[2] / dilations[0],
|
||||
static_cast<size_t>(dilations[0]),
|
||||
i_shape[3] / dilations[1],
|
||||
static_cast<size_t>(dilations[1]),
|
||||
},
|
||||
dt, {});
|
||||
mutant.push_operator(OpType::Reshape, {conv.input()}, {t0});
|
||||
|
||||
auto [shape_, permute_] = transpose(t0->shape, "nc1234", "n24c13");
|
||||
auto transposed = Tensor::share(shape_, dt, {});
|
||||
auto permutation = Tensor::share_vec(std::move(permute_));
|
||||
mutant.push_operator(OpType::Transpose,
|
||||
{std::move(t0), std::move(permutation)},
|
||||
{transposed});
|
||||
|
||||
auto t1 = Tensor::share(
|
||||
{
|
||||
shape_[0] * shape_[1] * shape_[2],
|
||||
shape_[3],
|
||||
shape_[4],
|
||||
shape_[5],
|
||||
},
|
||||
dt, {});
|
||||
mutant.push_operator(OpType::Reshape, {std::move(transposed)},
|
||||
{t1});
|
||||
|
||||
Vec<size_t> shape__{
|
||||
shape_[0] * shape_[1] * shape_[2],
|
||||
k_shape[1],
|
||||
conv.output()->shape[2] / shape_[1],
|
||||
conv.output()->shape[3] / shape_[2],
|
||||
};
|
||||
|
||||
auto t2 = Tensor::share(shape__, dt, {});
|
||||
mutant.push_operator(OpType::Conv,
|
||||
{
|
||||
std::move(t1),
|
||||
conv.kernel(),
|
||||
Tensor::share_vec<size_t>({1, 1}),
|
||||
conv.pads(),
|
||||
conv.strides(),
|
||||
},
|
||||
{t2});
|
||||
auto t3 = Tensor::share({shape_[0], shape_[1], shape_[2],
|
||||
shape__[1], shape__[2], shape__[3]},
|
||||
dt, {});
|
||||
mutant.push_operator(OpType::Reshape, {std::move(t2)}, {t3});
|
||||
|
||||
auto [shape___, permute__] =
|
||||
transpose(t3->shape, "n12chw", "nc1h2w");
|
||||
auto transposed_ = Tensor::share(shape___, dt, {});
|
||||
auto permutation_ = Tensor::share_vec(std::move(permute__));
|
||||
mutant.push_operator(OpType::Transpose,
|
||||
{std::move(t3), std::move(permutation_)},
|
||||
{transposed_});
|
||||
mutant.push_operator(OpType::Reshape, {std::move(transposed_)},
|
||||
{conv.output()});
|
||||
}
|
||||
} break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "../mutation.h"
|
||||
|
||||
namespace optimization::pass {
|
||||
|
||||
/// @brief A partition marker type: every operator becomes its own `Unigraph`.
|
||||
struct SingleOperator {};
|
||||
|
||||
/// @brief Splits a graph into subgraphs, where each subgraph contains
|
||||
/// only a single operator.
|
||||
/// @param arg0 An unpartitioned graph.
|
||||
/// @return A vector of individual subgraphs.
|
||||
Vec<std::pair<Unigraph, SingleOperator>> partition(Unigraph &&);
|
||||
|
||||
/// @brief Mutates the single operator graph.
|
||||
/// @param g The subgraph.
|
||||
/// @param arg1 Never used.
|
||||
/// @return Mutants.
|
||||
Vec<Unigraph> mutate(Unigraph const &g, SingleOperator const &);
|
||||
|
||||
} // namespace optimization::pass
|
|
@ -0,0 +1,28 @@
|
|||
#include "tensor.h"
|
||||
#include <numeric>
|
||||
|
||||
using namespace optimization;
|
||||
|
||||
Arc<Tensor> Tensor::share(Vec<size_t> shape, DataType data_type, Data data) {
|
||||
return Arc<Tensor>(
|
||||
new Tensor(std::move(shape), std::move(data_type), std::move(data)));
|
||||
}
|
||||
|
||||
size_t Tensor::count() const {
|
||||
return shape.empty() // fmt: new line
|
||||
? 0
|
||||
: std::accumulate(shape.begin(), shape.end(), size_t{1},
|
||||
[](auto acc, auto it) { return acc * it; });
|
||||
}
|
||||
|
||||
size_t Tensor::size() const {
|
||||
return shape.empty() // fmt: new line
|
||||
? 0
|
||||
: std::accumulate(shape.begin(), shape.end(), data_type.size(),
|
||||
[](auto acc, auto it) { return acc * it; });
|
||||
}
|
||||
|
||||
Tensor::Tensor(Vec<size_t> &&shape, DataType &&data_type, Data &&data)
|
||||
: shape(std::move(shape)), // fmt: new line
|
||||
data_type(std::move(data_type)), //
|
||||
data(std::move(data)) {}
|
|
@ -0,0 +1,87 @@
|
|||
#pragma once
|
||||
|
||||
#include "data.h"
|
||||
#include "data_type.h"
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace optimization {
|
||||
|
||||
/// @brief Defines a template alias for `std::vector`.
|
||||
template <class t> using Vec = std::vector<t>;
|
||||
|
||||
/// @brief Defines a template alias for std::shared_ptr
|
||||
template <class t> using Arc = std::shared_ptr<t>;
|
||||
|
||||
/// @brief A tensor represented by its position in `Unigraph`.
|
||||
struct TensorPos {
|
||||
/// @brief `op` for `Operator` index in `Unigraph`.
|
||||
/// `idx` for index in `Operator` inputs or outputs.
|
||||
size_t op, idx;
|
||||
};
|
||||
|
||||
/// @brief A struct to represent a tensor in the computation graph.
|
||||
/// The ownership of a `Tensor` belongs to all the operators
|
||||
/// that generate it or it passed to.
|
||||
struct Tensor {
|
||||
/// @brief Tensor shape.
|
||||
Vec<size_t> shape;
|
||||
|
||||
/// @brief Element data type.
|
||||
DataType data_type;
|
||||
|
||||
/// @brief Data of tensor.
|
||||
Data data;
|
||||
|
||||
/// @brief Operators in different `Unigraph` that generate this tensor.
|
||||
std::unordered_map<size_t, TensorPos> source;
|
||||
|
||||
/// @brief Operators in different `Unigraph` that take this tensor as input.
|
||||
std::unordered_map<size_t, Vec<TensorPos>> target;
|
||||
|
||||
/// @brief A static factory method to create a `shared_ptr<Tensor>`.
|
||||
/// @param shape Tensor shape.
|
||||
/// @param data_type Element data type.
|
||||
/// @param data Data.
|
||||
/// @return A `shared_ptr<Tensor>`.
|
||||
static Arc<Tensor> share(Vec<size_t> shape, DataType data_type, Data data);
|
||||
|
||||
/// @brief A static factory method to create a `shared_ptr<Tensor>` with
|
||||
/// single data.
|
||||
/// @tparam t Data type.
|
||||
/// @param val Data value.
|
||||
/// @return A `shared_ptr<Tensor>`.
|
||||
template <class t> static Arc<Tensor> share_single(t val) {
|
||||
return Tensor::share({1}, ty<t>(), Data::cpu<t>({val}));
|
||||
}
|
||||
|
||||
/// @brief A static factory method to create a `shared_ptr<Tensor>` with
|
||||
/// 1D data.
|
||||
/// @tparam t Data type.
|
||||
/// @param val Data value.
|
||||
/// @return A `shared_ptr<Tensor>`.
|
||||
template <class t> static Arc<Tensor> share_vec(Vec<t> val) {
|
||||
return Tensor::share({val.size()}, ty<t>(),
|
||||
Data::cpu<t>(std::move(val)));
|
||||
}
|
||||
|
||||
/// @brief Calculates count of data in this tensor.
|
||||
/// @return Data count.
|
||||
size_t count() const;
|
||||
|
||||
/// @brief Calculates the size of the tensor in bytes.
|
||||
/// @return Memory usage in bytes.
|
||||
size_t size() const;
|
||||
|
||||
/// @brief Copies tensor data to a `Vec`.
|
||||
/// @tparam t Data type.
|
||||
/// @return The data `Vec`.
|
||||
template <class t> Vec<t> to_vec() const { return data.to_vec<t>(count()); }
|
||||
|
||||
private:
|
||||
/// @brief Constructor is private and only accessible by the factory method.
|
||||
Tensor(Vec<size_t> &&, DataType &&, Data &&);
|
||||
};
|
||||
|
||||
} // namespace optimization
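
A short sketch of the tensor factory helpers above (the function name is illustrative):

```cpp
using namespace optimization;

inline void tensor_factory_example() {
    // A 1-D int64 tensor holding a permutation, built with share_vec.
    auto perm = Tensor::share_vec<int64_t>({0, 2, 3, 1});
    auto elements = perm->count(); // 4 elements
    auto bytes = perm->size();     // 4 * sizeof(int64_t) bytes
    (void)elements;
    (void)bytes;
}
```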
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
/// @brief Stores tensor data.
|
||||
struct Data {
|
||||
/// @brief `cpu_data` lives in host (CPU) memory,
/// which allows it to be managed with a `std::vector<uint8_t>`.
|
||||
std::vector<uint8_t> cpu_data;
|
||||
|
||||
// #ifdef USE_CUDA
|
||||
// void *gpu_data;
|
||||
// #endif
|
||||
// #ifdef USE_BANG
|
||||
// void *mlu_data;
|
||||
// #endif
|
||||
|
||||
/// @brief Builds `Data` from a `vector` of any type `t`.
|
||||
/// @tparam t Data type.
|
||||
/// @param data Data `vector`.
|
||||
/// @return `Data` object.
|
||||
template <class t> static Data cpu(std::vector<t> data) {
|
||||
Data ans{std::vector<uint8_t>(sizeof(t) * data.size())};
|
||||
memcpy(ans.cpu_data.data(), data.data(), ans.cpu_data.size());
|
||||
return ans;
|
||||
}
|
||||
};
|
|
@ -0,0 +1,32 @@
|
|||
#include "data_type.h"
|
||||
|
||||
size_t DataType::size() const {
|
||||
switch (id) {
|
||||
case DataTypeId::FLOAT:
|
||||
return sizeof(float);
|
||||
case DataTypeId::UINT8:
|
||||
return sizeof(uint8_t);
|
||||
case DataTypeId::INT8:
|
||||
return sizeof(int8_t);
|
||||
case DataTypeId::UINT16:
|
||||
return sizeof(uint16_t);
|
||||
case DataTypeId::INT16:
|
||||
return sizeof(int16_t);
|
||||
case DataTypeId::INT32:
|
||||
return sizeof(int32_t);
|
||||
case DataTypeId::INT64:
|
||||
return sizeof(int64_t);
|
||||
case DataTypeId::BOOL:
|
||||
return sizeof(bool);
|
||||
case DataTypeId::FLOAT16:
|
||||
return 2;
|
||||
case DataTypeId::DOUBLE:
|
||||
return sizeof(double);
|
||||
case DataTypeId::UINT32:
|
||||
return sizeof(uint32_t);
|
||||
case DataTypeId::UINT64:
|
||||
return sizeof(uint64_t);
|
||||
default:
|
||||
throw "unsupported data type.";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
enum class DataTypeId : uint8_t {
|
||||
UNDEFINED,
|
||||
FLOAT,
|
||||
UINT8,
|
||||
INT8,
|
||||
UINT16,
|
||||
INT16,
|
||||
INT32,
|
||||
INT64,
|
||||
STRING,
|
||||
BOOL,
|
||||
FLOAT16,
|
||||
DOUBLE,
|
||||
UINT32,
|
||||
UINT64,
|
||||
// COMPLEX64,
|
||||
// COMPLEX128,
|
||||
// BFLOAT16,
|
||||
};
|
||||
|
||||
struct DataType {
|
||||
DataTypeId id;
|
||||
|
||||
size_t size() const;
|
||||
};
|
||||
|
||||
template <class t> DataType ty();
|
||||
template <> inline DataType ty<float>() { return {DataTypeId::FLOAT}; }
|
||||
template <> inline DataType ty<uint8_t>() { return {DataTypeId::UINT8}; }
|
||||
template <> inline DataType ty<int8_t>() { return {DataTypeId::INT8}; }
|
||||
template <> inline DataType ty<uint16_t>() { return {DataTypeId::UINT16}; }
|
||||
template <> inline DataType ty<int16_t>() { return {DataTypeId::INT16}; }
|
||||
template <> inline DataType ty<int32_t>() { return {DataTypeId::INT32}; }
|
||||
template <> inline DataType ty<int64_t>() { return {DataTypeId::INT64}; }
|
||||
template <> inline DataType ty<bool>() { return {DataTypeId::BOOL}; }
|
||||
template <> inline DataType ty<double>() { return {DataTypeId::DOUBLE}; }
|
||||
template <> inline DataType ty<uint32_t>() { return {DataTypeId::UINT32}; }
|
||||
template <> inline DataType ty<uint64_t>() { return {DataTypeId::UINT64}; }
|
|
@ -0,0 +1,196 @@
|
|||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "data.h"
|
||||
#include "data_type.h"
|
||||
#include "op_type.h"
|
||||
|
||||
/// @brief A tensor represented by which `node` it is passed to
|
||||
/// and at which `slot` in inputs of that `node`.
|
||||
struct InletPos {
|
||||
size_t node, slot;
|
||||
};
|
||||
|
||||
/// @brief A tensor represented by which `node` it is generated from
|
||||
/// and at which `slot` in outputs of that `node`.
|
||||
struct OutletPos {
|
||||
size_t node, slot;
|
||||
};
|
||||
|
||||
/// @brief Calculates the hash of `OutletPos`.
|
||||
struct OutletPosHash {
|
||||
size_t operator()(OutletPos const &o) const {
|
||||
return o.node ^ (o.slot << 1);
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief The data structure represents a `Outlet` of a operator,
|
||||
/// which generates a tensor, and it is part of the `Node`.
|
||||
/// @tparam Tensor Tensor descriptor type.
|
||||
template <class Tensor> struct Outlet {
|
||||
Tensor info;
|
||||
std::vector<InletPos> targets;
|
||||
|
||||
explicit Outlet(Tensor info) : info(info), targets({}) {}
|
||||
};
|
||||
|
||||
/// @brief The specific tensor information excludes all unknowns.
|
||||
/// This struct can be used as a tensor descriptor type in templates.
|
||||
struct TensorInfo {
|
||||
std::vector<size_t> shape;
|
||||
DataType data_type;
|
||||
|
||||
/// @brief Tensor memory usage.
|
||||
/// @return Memory bytes.
|
||||
size_t size() const {
|
||||
return shape.empty() // fmt: new line
|
||||
? 0
|
||||
: std::accumulate(
|
||||
shape.begin(), shape.end(), data_type.size(),
|
||||
[](auto acc, auto it) { return acc * it; });
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief Operator `Node` of the dataflow `Graph`.
|
||||
/// @tparam Tensor Tensor descriptor type.
|
||||
template <class Tensor> struct Node {
|
||||
OpType op_type;
|
||||
std::vector<OutletPos> inputs;
|
||||
std::vector<Outlet<Tensor>> outputs;
|
||||
};
|
||||
|
||||
/// @brief A reference of an operator `Node` in a dataflow `Graph`.
|
||||
struct OpRef {
|
||||
/// @brief Index of operator `Node` in the corresponding `Graph`.
|
||||
size_t node_idx;
|
||||
|
||||
/// @brief Builds `OutletPos` from `OpRef`.
|
||||
/// @param slot Index of output for operator `Node`.
|
||||
/// @return An `OutletPos`.
|
||||
OutletPos operator[](size_t slot) const { return {node_idx, slot}; }
|
||||
};
|
||||
|
||||
/// @brief The dataflow `Graph`.
|
||||
/// @tparam Tensor Tensor descriptor type.
|
||||
///
|
||||
/// **NOTICE** Methods of a template class must be defined in the same file
|
||||
/// as the class.
|
||||
template <class Tensor> class Graph {
|
||||
/// @brief `operators` must be topo sorted.
|
||||
std::vector<Node<Tensor>> _operators;
|
||||
|
||||
/// @brief A map to store data, where the key is the index of input node,
|
||||
/// and the value is data.
|
||||
std::unordered_map<size_t, Data> _data;
|
||||
|
||||
/// @brief Maps the index of an `Input`/`Output` node to its IO id.
|
||||
std::unordered_map<size_t, size_t> _io_id;
|
||||
|
||||
static size_t IO_ID;
|
||||
|
||||
public:
|
||||
/// @brief Pushes a new operator `Node` into the `Graph`.
|
||||
/// @param op_type Operator type.
|
||||
/// @param inputs Tensors passed to operator.
|
||||
/// @param outputs Tensors generated by operator.
|
||||
/// @return A reference to the `Node` in `Graph`.
|
||||
OpRef push_operator( // fmt: new line
|
||||
OpType op_type, //
|
||||
std::vector<OutletPos> inputs, //
|
||||
std::vector<Outlet<Tensor>> outputs //
|
||||
) {
|
||||
if (op_type == OpType::Input)
|
||||
throw "use `push_input` instead";
|
||||
else if (op_type == OpType::Output)
|
||||
throw "use `push_output` instead";
|
||||
|
||||
auto index = _operators.size();
|
||||
|
||||
for (const auto &input : inputs)
|
||||
if (input.node >= index)
|
||||
throw "input node not exist";
|
||||
|
||||
size_t i = 0;
|
||||
for (const auto &input : inputs)
|
||||
_operators[input.node] // fmt: new line
|
||||
.outputs[input.slot] //
|
||||
.targets //
|
||||
.push_back({index, i++}); // slots are zero-based, matching the Unigraph convention
|
||||
|
||||
_operators.push_back({op_type, std::move(inputs), std::move(outputs)});
|
||||
return {index};
|
||||
}
|
||||
|
||||
/// @brief Pushes a new `Input` `Node` into the `Graph`.
|
||||
/// @param output Tensor from `Input`.
|
||||
/// @param id IO id of `Input`.
|
||||
/// @return A reference to the `Node` in `Graph`.
|
||||
OpRef push_input(Outlet<Tensor> output, std::optional<size_t> id) {
|
||||
auto index = _operators.size();
|
||||
_io_id[index] = id ? *id : IO_ID++;
|
||||
_operators.push_back({OpType::Input, {}, {output}});
|
||||
return {index};
|
||||
}
|
||||
|
||||
/// @brief Pushes a new `Output` `Node` into the `Graph`.
|
||||
/// @param input Tensor to `Output`.
|
||||
/// @param id IO id of `Output`.
|
||||
/// @return A reference to the `Node` in `Graph`.
|
||||
OpRef push_output(OutletPos input, std::optional<size_t> id) {
|
||||
auto index = _operators.size();
|
||||
_io_id[index] = id ? *id : IO_ID++;
|
||||
_operators.push_back({OpType::Output, {input}, {}});
|
||||
return {index};
|
||||
}
|
||||
|
||||
/// @brief Pushes data of an `Input` `Node` into the `Graph`.
|
||||
/// @param input A reference to the `Input` `Node`.
|
||||
/// @param data Data to store.
|
||||
void push_data(OpRef const &input, Data data) {
|
||||
if (input.node_idx >= _operators.size())
|
||||
throw "input node not exist";
|
||||
const auto &op = _operators.at(input.node_idx);
|
||||
if (op.op_type != OpType::Input)
|
||||
throw "only input node can have data";
|
||||
if (!data.cpu_data.empty() &&
|
||||
data.cpu_data.size() != op.outputs.front().info.size())
|
||||
throw "wrong data size";
|
||||
_data[input.node_idx] = std::move(data);
|
||||
}
|
||||
|
||||
/// @brief Gets operators in the `Graph`.
|
||||
/// @return Operators in the `Graph`.
|
||||
std::vector<Node<Tensor>> const &operators() const { return _operators; }
|
||||
|
||||
/// @brief `Graph` inputs.
|
||||
/// @return Indices of input `Node`s in `Graph`.
|
||||
std::vector<size_t> inputs() const {
|
||||
std::vector<size_t> ans;
|
||||
size_t i = 0;
|
||||
for (const auto &node : _operators) {
|
||||
if (node.op_type == OpType::Input && _data.find(i) != _data.end())
|
||||
ans.push_back(i);
|
||||
++i;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
/// @brief `Graph` outputs.
|
||||
/// @return Indices of output `Node`s in `Graph`.
|
||||
std::vector<size_t> outputs() const {
|
||||
std::vector<size_t> ans;
|
||||
size_t i = 0;
|
||||
for (const auto &node : _operators) {
|
||||
if (node.op_type == OpType::Output)
|
||||
ans.push_back(i);
|
||||
++i;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Tensor> size_t Graph<Tensor>::IO_ID = 0;
|
|
@ -0,0 +1,193 @@
|
|||
#pragma once
|
||||
|
||||
enum class OpType : uint16_t {
|
||||
Input,
|
||||
Output,
|
||||
|
||||
Abs,
|
||||
Acos,
|
||||
Acosh,
|
||||
Add,
|
||||
And,
|
||||
ArgMax,
|
||||
Asin,
|
||||
Asinh,
|
||||
Atan,
|
||||
Atanh,
|
||||
AveragePool,
|
||||
BatchNormalization,
|
||||
Bernoulli,
|
||||
BitShift,
|
||||
BitwiseAnd,
|
||||
BitwiseNot,
|
||||
BitwiseOr,
|
||||
BitwiseXor,
|
||||
BlackmanWindow,
|
||||
Cast,
|
||||
CastLike,
|
||||
Ceil,
|
||||
Celu,
|
||||
CenterCropPad,
|
||||
Clip,
|
||||
Col2lm,
|
||||
Compress,
|
||||
Concat,
|
||||
ConcatFromSequence,
|
||||
// Constant, // -> Input
|
||||
ConstantOfShape,
|
||||
Conv,
|
||||
ConvInteger,
|
||||
ConvTranspose,
|
||||
Cos,
|
||||
Cosh,
|
||||
CumSum,
|
||||
DFT,
|
||||
DeformConv,
|
||||
DepthToSpace,
|
||||
DequantizeLinear,
|
||||
Det,
|
||||
Div,
|
||||
Dropout,
|
||||
DynamicQuantizeLinear,
|
||||
Einsum,
|
||||
Elu,
|
||||
Equal,
|
||||
Erf,
|
||||
Exp,
|
||||
Expand,
|
||||
EyeLike,
|
||||
Flatten,
|
||||
Floor,
|
||||
GRU,
|
||||
Gather,
|
||||
GatherElements,
|
||||
GatherND,
|
||||
Gemm,
|
||||
GlobalAveragePool,
|
||||
GlobalLpPool,
|
||||
GlobalMaxPool,
|
||||
Greater,
|
||||
GreaterOrEqual,
|
||||
GridSample,
|
||||
GroupNormalization,
|
||||
HammingWindow,
|
||||
HannWindow,
|
||||
HardSigmoid,
|
||||
HardSwish,
|
||||
Hardmax,
|
||||
Identity,
|
||||
If,
|
||||
InstanceNormalization,
|
||||
IsInf,
|
||||
IsNaN,
|
||||
LRN,
|
||||
LSTM,
|
||||
LayerNormalization,
|
||||
LeakyRelu,
|
||||
Less,
|
||||
LessOrEqual,
|
||||
Log,
|
||||
LogSoftmax,
|
||||
Loop,
|
||||
LpNormalization,
|
||||
LpPool,
|
||||
MatMul,
|
||||
MatMulInteger,
|
||||
Max,
|
||||
MaxPool,
|
||||
MaxRoiPool,
|
||||
MaxUnpool,
|
||||
Mean,
|
||||
MeanVarianceNormalization,
|
||||
MelWeightMatrix,
|
||||
Min,
|
||||
Mish,
|
||||
Mod,
|
||||
Mul,
|
||||
Multinomial,
|
||||
Neg,
|
||||
NegativeLogLikelihoodLoss,
|
||||
NonMaxSuppression,
|
||||
NonZero,
|
||||
Not,
|
||||
OneHot,
|
||||
Optional,
|
||||
OptionalGetElement,
|
||||
OptionalHasElement,
|
||||
Or,
|
||||
PRelu,
|
||||
Pad,
|
||||
Pow,
|
||||
QLinearConv,
|
||||
QLinearMatMul,
|
||||
QuantizeLinear,
|
||||
RNN,
|
||||
RandomNormal,
|
||||
RandomNormalLike,
|
||||
RandomUniform,
|
||||
RandomUniformLike,
|
||||
Range,
|
||||
Reciprocal,
|
||||
ReduceL1,
|
||||
ReduceL2,
|
||||
ReduceLogSum,
|
||||
ReduceLogSumExp,
|
||||
ReduceMax,
|
||||
ReduceMean,
|
||||
ReduceMin,
|
||||
ReduceProd,
|
||||
ReduceSum,
|
||||
ReduceSumSquare,
|
||||
Relu,
|
||||
Reshape,
|
||||
Resize,
|
||||
ReverseSequence,
|
||||
RoiAlign,
|
||||
Round,
|
||||
STFT,
|
||||
Scan,
|
||||
Scatter,
|
||||
ScatterElements,
|
||||
ScatterND,
|
||||
Selu,
|
||||
SequenceAt,
|
||||
SequenceConstruct,
|
||||
SequenceEmpty,
|
||||
SequenceErase,
|
||||
SequenceInsert,
|
||||
SequenceLength,
|
||||
SequenceMap,
|
||||
Shape,
|
||||
Shrink,
|
||||
Sigmoid,
|
||||
Sign,
|
||||
Sin,
|
||||
Sinh,
|
||||
Size,
|
||||
Slice,
|
||||
Softmax,
|
||||
SoftmaxCrossEntropyLoss,
|
||||
Softplus,
|
||||
Softsign,
|
||||
SpaceToDepth,
|
||||
Split,
|
||||
SplitToSequence,
|
||||
Sqrt,
|
||||
Squeeze,
|
||||
StringNormalizer,
|
||||
Sub,
|
||||
Sum,
|
||||
Tan,
|
||||
Tanh,
|
||||
TfIdfVectorizer,
|
||||
ThresholdedRelu,
|
||||
Tile,
|
||||
TopK,
|
||||
Transpose,
|
||||
Trilu,
|
||||
Unique,
|
||||
Unsqueeze,
|
||||
Upsample,
|
||||
Where,
|
||||
Xor,
|
||||
};
|
|
@ -0,0 +1,47 @@
|
|||
#include "../src/graph.h"
|
||||
#include <iostream>
|
||||
|
||||
int main() {
|
||||
try {
|
||||
Graph<TensorInfo> g;
|
||||
auto a = g.push_input( // fmt: new line
|
||||
Outlet(TensorInfo{{1, 1, 2, 3}, ty<float>()}), // output
|
||||
std::nullopt // id
|
||||
);
|
||||
g.push_data(a, Data::cpu<float>({1, 2, 3, 4, 5, 6}));
|
||||
|
||||
auto b = g.push_input( // fmt: new line
|
||||
Outlet(TensorInfo{{1, 1, 3, 1}, ty<float>()}), // output
|
||||
std::nullopt // id
|
||||
);
|
||||
g.push_data(b, Data::cpu<float>({1, 2, 3}));
|
||||
|
||||
auto matmul = g.push_operator( // fmt: new line
|
||||
OpType::MatMul, // op_type
|
||||
{a[0], b[0]}, // inputs
|
||||
{Outlet(TensorInfo{{1, 1, 2, 1}, ty<float>()})} // outputs
|
||||
);
|
||||
|
||||
g.push_output( // fmt: new line
|
||||
matmul[0], // input
|
||||
std::nullopt // id
|
||||
);
|
||||
|
||||
std::cout << "inputs: ";
|
||||
for (auto it : g.inputs()) {
|
||||
std::cout << it << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "outputs: ";
|
||||
for (auto it : g.outputs()) {
|
||||
std::cout << it << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
} catch (const char *e) {
|
||||
std::cerr << "[ERROR] " << e << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
#include "../include/optimization/common.h"
|
||||
#include <iostream>
|
||||
#include <unordered_set>
|
||||
|
||||
using namespace optimization;
|
||||
|
||||
int main() {
|
||||
try {
|
||||
Unigraph g;
|
||||
auto a = Tensor::share( // fmt: new line
|
||||
{1, 1, 2, 3}, //
|
||||
ty<float>(), //
|
||||
Data::cpu<float>({1, 2, 3, 4, 5, 6}));
|
||||
|
||||
auto b = Tensor::share( // fmt: new line
|
||||
{1, 1, 3, 1}, //
|
||||
ty<float>(), //
|
||||
Data::cpu<float>({1, 2, 3}));
|
||||
|
||||
auto c = Tensor::share( // fmt: new line
|
||||
{1, 1, 2, 1}, //
|
||||
ty<float>(), //
|
||||
{});
|
||||
|
||||
auto matmul = g.push_operator( // fmt: new line
|
||||
OpType::MatMul, // op_type
|
||||
{a, b}, // inputs
|
||||
{c} // outputs
|
||||
);
|
||||
|
||||
auto p = Partition<pass::SingleOperator>(std::move(g), pass::partition);
|
||||
auto m = Mutation<pass::SingleOperator>(
|
||||
std::move(p),
|
||||
[](const auto &g, const auto &t) { return Vec<Unigraph>{}; });
|
||||
auto r = Rating<pass::SingleOperator>(std::move(m), memory_usage);
|
||||
auto ans = r.build(Vec<size_t>(r.size().size(), 0));
|
||||
|
||||
return 0;
|
||||
} catch (const char *e) {
|
||||
std::cerr << "[ERROR] " << e << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
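The mutation callback in this test is a stub that returns an empty Vec<Unigraph>, so no rewrites are ever proposed. Below is a hedged sketch of what a non-trivial single-operator mutation could look like; it only reuses names that appear elsewhere in this change (Unigraph, Vec, OpType, push_operator, and the operators/inputs/outputs fields), and the "rewrite" is a placeholder that simply re-emits the operator unchanged.

// Hypothetical sketch, not part of the original test: propose one candidate
// Unigraph per MatMul operator. A real pass would build an equivalent but
// differently structured subgraph here.
auto mutate = [](const auto &g, const auto &) {
    Vec<Unigraph> candidates;
    for (const auto &op : g.operators) {
        if (op.op_type != OpType::MatMul)
            continue;
        auto in = op.inputs; // copy the shared tensor handles
        auto out = op.outputs;
        Unigraph alt;
        alt.push_operator(op.op_type, std::move(in), std::move(out));
        candidates.emplace_back(std::move(alt));
    }
    return candidates;
};
// It would replace the empty lambda above:
//   auto m = Mutation<pass::SingleOperator>(std::move(p), mutate);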
|
|
@ -1,4 +1,13 @@
|
|||
#include "core/graph.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/reduce_mean.h"
|
||||
#include "operators/unary.h"
|
||||
#include "optimization/common.h"
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
|
||||
|
@ -114,13 +123,394 @@ bool GraphObj::topo_sort() {
|
|||
return this->sorted = true;
|
||||
}
|
||||
|
||||
void GraphObj::optimize() {
|
||||
for (auto &op : ops) {
|
||||
optimization::DataType cast(DataType ty) {
|
||||
#define IT(A, B) \
|
||||
if (ty == DataType::A) \
|
||||
return {optimization::DataTypeId::B};
|
||||
|
||||
IT(Float32, FLOAT) //
|
||||
else IT(UInt32, UINT32) //
|
||||
else IT(UInt8, UINT8) //
|
||||
else IT(Int8, INT8) //
|
||||
else IT(UInt16, UINT16) //
|
||||
else IT(Int16, INT16) //
|
||||
else IT(Int32, INT32) //
|
||||
else IT(Int64, INT64) //
|
||||
else IT_ASSERT(false, "unsupported data type");
|
||||
|
||||
#undef IT
|
||||
}
|
||||
|
||||
DataType cast(optimization::DataType ty) {
|
||||
#define IT(A, B) \
|
||||
if (optimization::DataTypeId::A == ty.id) \
|
||||
return {DataType::B};
|
||||
|
||||
IT(FLOAT, Float32) //
|
||||
else IT(UINT32, UInt32) //
|
||||
else IT(UINT8, UInt8) //
|
||||
else IT(INT8, Int8) //
|
||||
else IT(UINT16, UInt16) //
|
||||
else IT(INT16, Int16) //
|
||||
else IT(INT32, Int32) //
|
||||
else IT(INT64, Int64) //
|
||||
else IT_ASSERT(false, "unsupported data type");
|
||||
|
||||
#undef IT
|
||||
}
|
||||
|
||||
optimization::Unigraph cast(GraphObj &g) {
|
||||
namespace opt = optimization;
|
||||
|
||||
g.topo_sort();
|
||||
|
||||
#define I(PTR) reinterpret_cast<uintptr_t>((PTR).get())
|
||||
|
||||
unordered_map<uintptr_t, opt::Arc<opt::Tensor>> tensors;
|
||||
for (const auto &t : g.getTensors()) {
|
||||
const auto dims = t->getDims();
|
||||
opt::Vec<size_t> shape(dims.size());
|
||||
std::transform(dims.begin(), dims.end(), shape.begin(),
|
||||
[](auto x) { return static_cast<size_t>(x); });
|
||||
|
||||
opt::Data data;
|
||||
if (t->hasData()) {
|
||||
auto ptr = t->getDataBlob()->getPtr<uint8_t *>();
|
||||
data = opt::Data(ptr, ptr + t->getBytes());
|
||||
}
|
||||
tensors[I(t)] =
|
||||
opt::Tensor::share(shape, cast(t->getDType()), std::move(data));
|
||||
}
|
||||
|
||||
opt::Unigraph ans;
|
||||
|
||||
for (const auto &op : g.getOperators()) {
|
||||
const auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
opt::Vec<opt::Arc<opt::Tensor>> in(inputs.size()), out(outputs.size());
|
||||
std::transform(inputs.begin(), inputs.end(), in.begin(),
|
||||
[&](auto x) { return tensors[I(x)]; });
|
||||
std::transform(outputs.begin(), outputs.end(), out.begin(),
|
||||
[&](auto x) { return tensors[I(x)]; });
|
||||
switch (op->getOpType()) {
|
||||
case OpType::Abs:
|
||||
ans.push_operator(opt::OpType::Abs, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::ACos:
|
||||
ans.push_operator(opt::OpType::Acos, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::ACosH:
|
||||
ans.push_operator(opt::OpType::Acosh, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Add:
|
||||
ans.push_operator(opt::OpType::Add, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::And:
|
||||
ans.push_operator(opt::OpType::And, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::ASin:
|
||||
ans.push_operator(opt::OpType::Asin, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::ASinH:
|
||||
ans.push_operator(opt::OpType::Asinh, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::ATan:
|
||||
ans.push_operator(opt::OpType::Atan, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::ATanH:
|
||||
ans.push_operator(opt::OpType::Atanh, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::AvgPool: {
|
||||
auto obj = as<AvgPoolObj>(op);
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getDh(), obj->getDw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getKh(), obj->getKw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getPh(), obj->getPw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getSh(), obj->getSw()}));
|
||||
ans.push_operator(opt::OpType::AveragePool, std::move(in),
|
||||
std::move(out));
|
||||
} break;
|
||||
case OpType::BatchNorm:
|
||||
ans.push_operator(opt::OpType::BatchNormalization, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitLeftShift:
|
||||
in.push_back(opt::Tensor::share_single<uint8_t>(0));
|
||||
ans.push_operator(opt::OpType::BitShift, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitRightShift:
|
||||
in.push_back(opt::Tensor::share_single<uint8_t>(1));
|
||||
ans.push_operator(opt::OpType::BitShift, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitAnd:
|
||||
ans.push_operator(opt::OpType::BitwiseAnd, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitNot:
|
||||
ans.push_operator(opt::OpType::BitwiseNot, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitOr:
|
||||
ans.push_operator(opt::OpType::BitwiseOr, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::BitXor:
|
||||
ans.push_operator(opt::OpType::BitwiseXor, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Cast:
|
||||
ans.push_operator(opt::OpType::Cast, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Ceil:
|
||||
ans.push_operator(opt::OpType::Ceil, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Clip: {
|
||||
auto obj = as<ClipObj>(op);
|
||||
auto min = obj->getMin();
|
||||
auto max = obj->getMax();
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<float>(min ? *min : -INFINITY));
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<float>(max ? *max : INFINITY));
|
||||
ans.push_operator(opt::OpType::Clip, std::move(in), std::move(out));
|
||||
} break;
|
||||
case OpType::Concat:
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<int>(as<ConcatObj>(op)->getDim()));
|
||||
ans.push_operator(opt::OpType::Concat, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Conv: {
|
||||
auto obj = as<ConvObj>(op);
|
||||
in.push_back(opt::Tensor::share_vec<size_t>(
|
||||
{(size_t)obj->getDh(), (size_t)obj->getDw()}));
|
||||
in.push_back(opt::Tensor::share_vec<size_t>(
|
||||
{(size_t)obj->getPh(), (size_t)obj->getPw()}));
|
||||
in.push_back(opt::Tensor::share_vec<size_t>(
|
||||
{(size_t)obj->getSh(), (size_t)obj->getSw()}));
|
||||
ans.push_operator(opt::OpType::Conv, std::move(in), std::move(out));
|
||||
} break;
|
||||
case OpType::Cos:
|
||||
ans.push_operator(opt::OpType::Cos, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::CosH:
|
||||
ans.push_operator(opt::OpType::Cosh, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Div:
|
||||
ans.push_operator(opt::OpType::Div, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Dropout:
|
||||
ans.push_operator(opt::OpType::Dropout, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Exp:
|
||||
ans.push_operator(opt::OpType::Exp, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Flatten:
|
||||
ans.push_operator(opt::OpType::Flatten, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Floor:
|
||||
ans.push_operator(opt::OpType::Floor, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Gather:
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<int>(as<GatherObj>(op)->getAxis()));
|
||||
ans.push_operator(opt::OpType::Gather, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::GreaterThan:
|
||||
ans.push_operator(opt::OpType::Greater, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::GreaterEqual:
|
||||
ans.push_operator(opt::OpType::GreaterOrEqual, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Identity:
|
||||
ans.push_operator(opt::OpType::Identity, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Log:
|
||||
ans.push_operator(opt::OpType::Log, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Matmul: {
|
||||
auto obj = as<MatmulObj>(op);
|
||||
IT_ASSERT(obj->getAct() == ActType::None);
|
||||
in.push_back(opt::Tensor::share_single<float>(1.0f));
|
||||
in.push_back(opt::Tensor::share_single<float>(1.0f));
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<int>(obj->getTransA() ? 1 : 0));
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<int>(obj->getTransB() ? 1 : 0));
|
||||
ans.push_operator(opt::OpType::Gemm, std::move(in), std::move(out));
|
||||
} break;
|
||||
case OpType::Maximum:
|
||||
ans.push_operator(opt::OpType::Max, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::MaxPool: {
|
||||
auto obj = as<MaxPoolObj>(op);
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getDh(), obj->getDw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getKh(), obj->getKw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getPh(), obj->getPw()}));
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>({obj->getSh(), obj->getSw()}));
|
||||
ans.push_operator(opt::OpType::MaxPool, std::move(in),
|
||||
std::move(out));
|
||||
} break;
|
||||
case OpType::Minimum:
|
||||
ans.push_operator(opt::OpType::Min, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Mul:
|
||||
ans.push_operator(opt::OpType::Mul, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Neg:
|
||||
ans.push_operator(opt::OpType::Neg, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Not:
|
||||
ans.push_operator(opt::OpType::Not, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Or:
|
||||
ans.push_operator(opt::OpType::Or, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Pad:
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>(as<PadObj>(op)->getPads()));
|
||||
ans.push_operator(opt::OpType::Pad, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Reciprocal:
|
||||
ans.push_operator(opt::OpType::Reciprocal, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::ReduceMean: {
|
||||
const auto obj = as<ReduceMeanObj>(op);
|
||||
const auto axes = obj->getAxes();
|
||||
in.push_back(
|
||||
opt::Tensor::share_vec<int>(vector(axes.begin(), axes.end())));
|
||||
in.push_back(
|
||||
opt::Tensor::share_single<int>(obj->getKeepDims() ? 1 : 0));
|
||||
ans.push_operator(opt::OpType::ReduceMean, std::move(in),
|
||||
std::move(out));
|
||||
} break;
|
||||
case OpType::Relu:
|
||||
ans.push_operator(opt::OpType::Relu, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Reshape:
|
||||
ans.push_operator(opt::OpType::Reshape, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Resize:
|
||||
ans.push_operator(opt::OpType::Resize, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Round:
|
||||
ans.push_operator(opt::OpType::Round, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Shape:
|
||||
ans.push_operator(opt::OpType::Shape, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Sigmoid:
|
||||
ans.push_operator(opt::OpType::Sigmoid, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Sin:
|
||||
ans.push_operator(opt::OpType::Sin, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::SinH:
|
||||
ans.push_operator(opt::OpType::Sinh, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Slice:
|
||||
IT_TODO_HALT();
|
||||
ans.push_operator(opt::OpType::Slice, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Softmax:
|
||||
ans.push_operator(opt::OpType::Softmax, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Split:
|
||||
ans.push_operator(opt::OpType::Split, std::move(in),
|
||||
std::move(out));
|
||||
break;
|
||||
case OpType::Sqrt:
|
||||
ans.push_operator(opt::OpType::Sqrt, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Sub:
|
||||
ans.push_operator(opt::OpType::Sub, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Tan:
|
||||
ans.push_operator(opt::OpType::Tan, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::TanH:
|
||||
ans.push_operator(opt::OpType::Tanh, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Transpose:
|
||||
ans.push_operator(opt::OpType::Transpose, std::move(in), std::move(out));
|
||||
break;
|
||||
case OpType::Xor:
|
||||
ans.push_operator(opt::OpType::Xor, std::move(in), std::move(out));
|
||||
break;
|
||||
default:
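// operators not handled above are silently dropped from the optimization graph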
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#undef I
|
||||
return ans;
|
||||
}
|
||||
|
||||
Graph cast(optimization::Unigraph const &g, Runtime rt) {
|
||||
namespace opt = optimization;
|
||||
|
||||
unordered_map<uintptr_t, Tensor> tensors;
|
||||
|
||||
#define I(PTR) reinterpret_cast<uintptr_t>((PTR).get())
|
||||
|
||||
auto ans = make_ref<GraphObj>(std::move(rt));
|
||||
|
||||
for (auto const &op : g.operators) {
|
||||
for (auto const &t : op.inputs) {
|
||||
auto const &shape_ = t->shape;
|
||||
opt::Vec<int> shape(shape_.size());
|
||||
std::transform(shape_.begin(), shape_.end(), shape.begin(),
|
||||
[](auto x) { return static_cast<int>(x); });
|
||||
tensors[I(t)] =
|
||||
ans->addTensor(std::move(shape), cast(t->data_type));
|
||||
}
|
||||
for (auto const &t : op.outputs) {
|
||||
auto const &shape_ = t->shape;
|
||||
opt::Vec<int> shape(shape_.size());
|
||||
std::transform(shape_.begin(), shape_.end(), shape.begin(),
|
||||
[](auto x) { return static_cast<int>(x); });
|
||||
tensors[I(t)] =
|
||||
ans->addTensor(std::move(shape), cast(t->data_type));
|
||||
}
|
||||
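// Note: only tensors have been materialized so far; every op_type falls
// through to the default case below, so no operators are rebuilt on the
// InfiniTensor side yet.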
switch (op.op_type) {
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#undef I
|
||||
return ans;
|
||||
}
|
||||
|
||||
void GraphObj::optimize() {
|
||||
auto graph = cast(*this);
|
||||
auto ans = cast(graph, this->runtime);
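// Note: `ans`, the graph converted back from the optimization IR, is not yet
// applied to *this; the round trip currently only exercises the two cast()
// overloads.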
|
||||
}
|
||||
|
||||
void GraphObj::dataMalloc() {
|
||||
|
@ -191,7 +581,8 @@ void GraphObj::replaceConnection(Tensor oldTensor, Tensor newTensor,
|
|||
// tensor's "source" and "target" must be in "ops".
|
||||
// a tensor that has neither "source" nor "target" must not exist.
|
||||
// "inputs" or "outputs" of operators must be in "tensors"
|
||||
// "predecessors" and "successors" of an operator of "ops" must be in "ops".
|
||||
// "predecessors" and "successors" of an operator of "ops" must be in
|
||||
// "ops".
|
||||
bool GraphObj::checkValid() const {
|
||||
for (auto tensor : tensors) {
|
||||
IT_ASSERT(!(tensor->getTargets().size() == 0 &&
|
||||
|
|
|
@ -29,65 +29,97 @@ void SearchEngine::printMetaGraph(Ref<SearchEngine::MetaGraph> metaGraph) {
|
|||
}
|
||||
|
||||
Graph SearchEngine::run(const Graph graph) {
|
||||
IT_ASSERT(runtimeExec == graph->getRuntime());
|
||||
std::cout << "[INFO] original graph: " << std::endl;
|
||||
std::cout << graph->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
|
||||
<< std::endl;
|
||||
|
||||
std::vector<Graph> partitions = partitionGraph(graph);
|
||||
|
||||
std::cout << "[INFO] Partition num: " << partitions.size() << std::endl;
|
||||
std::vector<Graph> bestGraphs = {nullptr};
|
||||
for (size_t pid = 0; pid < partitions.size(); pid++) {
|
||||
auto &subGraph = partitions[pid];
|
||||
std::cout << "[INFO] Partition: " << pid << std::endl;
|
||||
std::vector<Graph> candidates = search(subGraph);
|
||||
std::cout << "[INFO] size: " << candidates.size() << std::endl;
|
||||
IT_ASSERT(candidates.size() > 0);
|
||||
std::cout << subGraph->toString() << std::endl;
|
||||
vector<Graph> bestGraphs{nullptr};
|
||||
for (auto &subGraph : partitionGraph(graph)) {
|
||||
std::vector<Graph> nextGraphs;
|
||||
for (auto lastGraph : bestGraphs) {
|
||||
for (auto thisGraph : candidates) {
|
||||
for (auto lastGraph : bestGraphs)
|
||||
for (auto thisGraph : search(subGraph)) {
|
||||
std::vector<Operator> ops;
|
||||
if (lastGraph != nullptr) {
|
||||
for (auto op : lastGraph->getOperators()) {
|
||||
if (lastGraph != nullptr)
|
||||
for (auto op : lastGraph->getOperators())
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
}
|
||||
if (thisGraph != nullptr) {
|
||||
for (auto op : thisGraph->getOperators()) {
|
||||
|
||||
if (thisGraph != nullptr)
|
||||
for (auto op : thisGraph->getOperators())
|
||||
ops.emplace_back(op);
|
||||
}
|
||||
}
|
||||
|
||||
auto tmp = make_ref<GraphObj>(runtimeExec, ops);
|
||||
tmp->dataMalloc();
|
||||
nextGraphs.emplace_back(tmp);
|
||||
}
|
||||
}
|
||||
std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
|
||||
return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
});
|
||||
if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
if (nextGraphs.size() > GRAPH_SIZE)
|
||||
nextGraphs.resize(GRAPH_SIZE);
|
||||
}
|
||||
bestGraphs.clear();
|
||||
for (size_t i = 0; i < nextGraphs.size(); i++) {
|
||||
bestGraphs.emplace_back(nextGraphs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "[INFO] unfused graph: " << std::endl;
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
std::cout << "bestGraph " << i << ":" << std::endl;
|
||||
std::cout << bestGraphs[i]->toString();
|
||||
std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
|
||||
<< std::endl;
|
||||
bestGraphs = nextGraphs;
|
||||
}
|
||||
|
||||
return bestGraphs[0];
|
||||
}
|
||||
|
||||
// Graph SearchEngine::run(const Graph graph) {
|
||||
// IT_ASSERT(runtimeExec == graph->getRuntime());
|
||||
// std::cout << "[INFO] original graph: " << std::endl;
|
||||
// std::cout << graph->toString();
|
||||
// std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
|
||||
// << std::endl;
|
||||
|
||||
// std::vector<Graph> partitions = partitionGraph(graph);
|
||||
|
||||
// std::cout << "[INFO] Partition num: " << partitions.size() << std::endl;
|
||||
// std::vector<Graph> bestGraphs = {nullptr};
|
||||
// for (size_t pid = 0; pid < partitions.size(); pid++) {
|
||||
// auto &subGraph = partitions[pid];
|
||||
// std::cout << "[INFO] Partition: " << pid << std::endl;
|
||||
// std::vector<Graph> candidates = search(subGraph);
|
||||
// std::cout << "[INFO] size: " << candidates.size() << std::endl;
|
||||
// IT_ASSERT(candidates.size() > 0);
|
||||
// std::cout << subGraph->toString() << std::endl;
|
||||
// std::vector<Graph> nextGraphs;
|
||||
// for (auto lastGraph : bestGraphs) {
|
||||
// for (auto thisGraph : candidates) {
|
||||
// std::vector<Operator> ops;
|
||||
// if (lastGraph != nullptr) {
|
||||
// for (auto op : lastGraph->getOperators()) {
|
||||
// ops.emplace_back(op);
|
||||
// }
|
||||
// }
|
||||
// if (thisGraph != nullptr) {
|
||||
// for (auto op : thisGraph->getOperators()) {
|
||||
// ops.emplace_back(op);
|
||||
// }
|
||||
// }
|
||||
// auto tmp = make_ref<GraphObj>(runtimeExec, ops);
|
||||
// tmp->dataMalloc();
|
||||
// nextGraphs.emplace_back(tmp);
|
||||
// }
|
||||
// }
|
||||
// std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y)
|
||||
// {
|
||||
// return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
|
||||
// });
|
||||
// if (nextGraphs.size() > GRAPH_SIZE) {
|
||||
// nextGraphs.resize(GRAPH_SIZE);
|
||||
// }
|
||||
// bestGraphs.clear();
|
||||
// for (size_t i = 0; i < nextGraphs.size(); i++) {
|
||||
// bestGraphs.emplace_back(nextGraphs[i]);
|
||||
// }
|
||||
// }
|
||||
|
||||
// std::cout << "[INFO] unfused graph: " << std::endl;
|
||||
// for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
// std::cout << "bestGraph " << i << ":" << std::endl;
|
||||
// std::cout << bestGraphs[i]->toString();
|
||||
// std::cout << "[INFO] perf: " <<
|
||||
// runtimeExec->getPerfTime(bestGraphs[i])
|
||||
// << std::endl;
|
||||
// }
|
||||
|
||||
// return bestGraphs[0];
|
||||
// }
|
||||
|
||||
std::vector<Graph> SearchEngine::search(const Graph &graph) {
|
||||
auto metaGraph = buildMetaGraphWithGraph(graph);
|
||||
auto mergedGraphs = searchMerge(metaGraph);
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
#include "optimizations/partitions/partition.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace infini {
|
||||
Partition::CandidateQueue
|
||||
Partition::rankCandidates(const GraphObj &subgraph, const Transformation &tr,
|
||||
const Rating &rating) const {
|
||||
auto substitutes = tr.run(subgraph);
|
||||
CandidateQueue ans;
|
||||
while (!substitutes.empty()) {
|
||||
auto g = std::move(substitutes.back());
|
||||
auto cost = rating.run(*g);
|
||||
ans.push({std::move(g), cost});
|
||||
substitutes.pop_back();
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
||||
namespace x {
|
||||
|
||||
struct Operator;
|
||||
|
||||
/// @brief The complete, unpartitioned graph, or a minimal subgraph that cannot be partitioned further.
|
||||
using UniGraph = std::vector<Operator>;
|
||||
struct Candidate {
|
||||
/// @brief A candidate subgraph.
|
||||
UniGraph graph;
|
||||
/// @brief Score of the subgraph.
|
||||
float score;
|
||||
};
|
||||
/// @brief A group of parallel graphs that connect to the same tensors.
|
||||
using Candidates = std::priority_queue<Candidate>;
|
||||
/// @brief A complete graph composed of multiple subgraphs connected through tensors.
|
||||
using Graph = std::vector<Candidates>;
|
||||
|
||||
}; // namespace x
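Candidates is declared as std::priority_queue<Candidate>, which needs an ordering over Candidate that the sketch above does not supply. A minimal, hypothetical ordering, assuming a higher score means a better candidate so that the best one surfaces at top():

namespace x {
// Hypothetical addition, not part of the snippet above: std::priority_queue
// is a max-heap over operator<, so comparing by score keeps the
// highest-scored candidate at the top of the queue.
inline bool operator<(const Candidate &a, const Candidate &b) {
    return a.score < b.score;
}
} // namespace x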
|
|
@ -0,0 +1,11 @@
|
|||
#include "optimizations/partitions/single_operator_partition.h"
|
||||
|
||||
namespace infini {
|
||||
Graph SingleOperatorPartition::run(const GraphObj &graph,
|
||||
const Transformation &tr,
|
||||
const Rating &rating) const {
|
||||
IT_TODO_HALT();
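// IT_TODO_HALT() marks this partition as unimplemented; the copy returned below is only a placeholder.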
|
||||
return make_ref<GraphObj>(graph);
|
||||
}
|
||||
|
||||
} // namespace infini
|