From a27391fcdcefcff3b355f3d42faf0ab7dbc40cf5 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Thu, 23 Feb 2023 11:48:28 +0800
Subject: [PATCH 01/29] fix: correct the batchNorm implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- onnx and pytorch take the 4 batchNorm parameters to have shape [c],
  while cuDNN may take them to be [1,c,1,...]. The optimization side has
  been changed to [c], but cuDNN inference has not been updated yet;

Signed-off-by: YdrMaster
---
 pyinfinitensor/tests/test_onnx.py | 8 ++++----
 src/operators/batch_norm.cc       | 9 +++------
 test/operators/test_batch_norm.cc | 8 ++++----
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py
index 1f839256..0db20fc9 100644
--- a/pyinfinitensor/tests/test_onnx.py
+++ b/pyinfinitensor/tests/test_onnx.py
@@ -66,10 +66,10 @@ class TestStringMethods(unittest.TestCase):
 
     def test_batch_norm(self):
         x = make_tensor_value_info("x", TensorProto.UINT32, [1, 3, 2, 2])
-        scale = make_tensor_value_info("scale", TensorProto.FLOAT, [1, 3, 1, 1])
-        b = make_tensor_value_info("b", TensorProto.FLOAT, [1, 3, 1, 1])
-        mean = make_tensor_value_info("mean", TensorProto.FLOAT, [1, 3, 1, 1])
-        var = make_tensor_value_info("var", TensorProto.FLOAT, [1, 3, 1, 1])
+        scale = make_tensor_value_info("scale", TensorProto.FLOAT, [3])
+        b = make_tensor_value_info("b", TensorProto.FLOAT, [3])
+        mean = make_tensor_value_info("mean", TensorProto.FLOAT, [3])
+        var = make_tensor_value_info("var", TensorProto.FLOAT, [3])
         y = make_tensor_value_info("y", TensorProto.UINT32, [1, 3, 2, 2])
         batch_norm = make_node(
             "BatchNormalization",
diff --git a/src/operators/batch_norm.cc b/src/operators/batch_norm.cc
index b9be9425..b744df5a 100644
--- a/src/operators/batch_norm.cc
+++ b/src/operators/batch_norm.cc
@@ -19,12 +19,9 @@ BatchNormObj::inferShape(const TensorVec &inputs) const {
     auto var = inputs[2];
     auto scale = inputs[3];
     auto bias = inputs[4];
-    if (input->getDims().size() < 2)
-        return {};
-    Shape dims(input->getDims().size(), 1);
-    dims[1] = input->getDims()[1]; //
-    if (mean->getDims() != dims || var->getDims() != dims ||
-        scale->getDims() != dims || bias->getDims() != dims)
+    auto c = std::vector<int>{input->getDims()[1]};
+    if (mean->getDims() != c || var->getDims() != c || scale->getDims() != c ||
+        bias->getDims() != c)
         return {};
     return {{input->getDims()}};
 }
diff --git a/test/operators/test_batch_norm.cc b/test/operators/test_batch_norm.cc
index d904edd1..438db6e6 100644
--- a/test/operators/test_batch_norm.cc
+++ b/test/operators/test_batch_norm.cc
@@ -9,10 +9,10 @@ TEST(BatchNorm, ShapeInference) {
     {
         Graph g = make_ref<GraphObj>(cpuRuntime);
         Tensor i = g->addTensor({1, 3, 2, 2}, DataType::UInt32);
-        Tensor mean = g->addTensor({1, 3, 1, 1}, DataType::Float32);
-        Tensor var = g->addTensor({1, 3, 1, 1}, DataType::Float32);
-        Tensor scaler = g->addTensor({1, 3, 1, 1}, DataType::Float32);
-        Tensor bias = g->addTensor({1, 3, 1, 1}, DataType::Float32);
+        Tensor mean = g->addTensor({3}, DataType::Float32);
+        Tensor var = g->addTensor({3}, DataType::Float32);
+        Tensor scaler = g->addTensor({3}, DataType::Float32);
+        Tensor bias = g->addTensor({3}, DataType::Float32);
         auto op = g->addOp<BatchNormObj>(i, nullptr, mean, var, scaler, bias,
                                          0.9, 1e-5);
         EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));

From 4ffaa44c1eebfe394e792c3e05238942d29ae0d3 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Thu, 23 Feb 2023 11:51:46 +0800
Subject: [PATCH 02/29] fix: Matmul supports inputs with 2 or more dimensions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

> resnet18 can now be imported

Signed-off-by: YdrMaster
---
 pyinfinitensor/tests/test_onnx.py |  5 ++--
 src/operators/matmul.cc           | 42 ++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py
index 0db20fc9..8be02f11 100644
--- a/pyinfinitensor/tests/test_onnx.py
+++ b/pyinfinitensor/tests/test_onnx.py
@@ -8,7 +8,7 @@ from onnx.helper import (
     make_tensor_value_info,
 )
 from onnx.checker import check_model
-from pyinfinitensor.onnx import from_onnx, parse_onnx, backend, runtime, to_onnx
+from pyinfinitensor.onnx import from_onnx, backend, runtime, to_onnx
 
 
 def make_and_import_model(graph: onnx.GraphProto):
@@ -28,7 +28,7 @@ class TestStringMethods(unittest.TestCase):
                 file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
             )
         )
-        parse_onnx(onnx.load(model_file))
+        from_onnx(onnx.load(model_file))
 
     def test_tensor(self):
         x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3])
@@ -290,7 +290,6 @@ class TestStringMethods(unittest.TestCase):
         model = make_model(graph)
         check_model(model)
         from_onnx(model)
-        parse_onnx(model)
 
     def test_frontend(self):
         handler = backend.GraphHandler(runtime)
diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc
index 8e82ccc7..07708d07 100644
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@@ -5,10 +5,26 @@ namespace infini {
 MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
                      bool transB, [[maybe_unused]] Tensor bias, ActType act)
     : OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
-      act(act), b(A->getDims()[0]),
-      m(transA ? A->getDims()[2] : A->getDims()[1]),
-      n(transB ? B->getDims()[1] : B->getDims()[2]),
-      k(transA ? A->getDims()[1] : A->getDims()[2]) {
+      act(act), b(1) {
+    auto shape_a = A->getDims();
+    auto shape_b = B->getDims();
+    IT_ASSERT(shape_a.size() == shape_b.size());
+    switch (shape_a.size()) {
+    case 0:
+    case 1:
+        IT_ASSERT(false);
+    case 2:
+        break;
+    default:
+        for (size_t i = 0; i < shape_a.size() - 2; ++i) {
+            IT_ASSERT(shape_a[i] == shape_b[i]);
+            b *= shape_a[i];
+        }
+        break;
+    }
+    m = *(transA ? shape_a.rbegin() : shape_a.rbegin() + 1);
+    n = *(transB ? shape_b.rbegin() + 1 : shape_b.rbegin());
+    k = *(transA ? shape_a.rbegin() + 1 : shape_a.rbegin());
     IT_ASSERT(checkValid(graph));
 }
 
@@ -22,19 +38,11 @@ string MatmulObj::toString() const {
 }
 
 optional<vector<Shape>> MatmulObj::inferShape(const TensorVec &inputs) const {
-    auto A = inputs[0], B = inputs[1];
-    // if (A->getType() == Tensor::Weight && B->getType() == Tensor::Weight)
-    //     return false;
-    if (!(A->getDims().size() == 3 && B->getDims().size() == 3))
-        return {};
-    if (!(A->getDims()[0] == B->getDims()[0]))
-        return {};
-    if (!((transA ? A->getDims()[1] : A->getDims()[2]) ==
-          (transB ? B->getDims()[2] : B->getDims()[1])))
-        return {};
-    int b(A->getDims()[0]), m(transA ? A->getDims()[2] : A->getDims()[1]),
-        n(transB ?
B->getDims()[1] : B->getDims()[2]); - return {{{b, m, n}}}; + auto shape_a = inputs[0]->getDims(); + auto it = shape_a.rbegin(); + *it++ = n; + *it++ = m; + return {{std::move(shape_a)}}; } vector MatmulObj::getWorkloadVector() const { From ed81861375bb7eeef9f73496c6a7c6220ea8dee1 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 23 Feb 2023 15:29:16 +0800 Subject: [PATCH 03/29] =?UTF-8?q?temp:=20=E5=AE=9E=E7=8E=B0=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=80=BC=E5=AF=BC=E5=85=A5=EF=BC=8C=E4=BD=86=20resnet?= =?UTF-8?q?=20=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/graph_handler.h | 20 ++++++++ pyinfinitensor/src/pyinfinitensor/onnx.py | 60 +++++++++-------------- src/ffi/ffi_infinitensor.cc | 3 ++ 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index dc221042..82695faa 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -2,6 +2,8 @@ #include "core/graph.h" #include "core/runtime.h" +#include +#include namespace infini { @@ -87,6 +89,24 @@ class GraphHandlerObj { inline void data_malloc() { g->dataMalloc(); } + inline void copy_int32(Tensor tensor, std::vector list) { + std::cout << "copy " << list.size() << " ints to (" << tensor->size() + << ")" << std::endl; + tensor->copyData(list); + } + + inline void copy_int64(Tensor tensor, std::vector list) { + std::cout << "copy " << list.size() << " ints to (" << tensor->size() + << ")" << std::endl; + tensor->copyData(list); + } + + inline void copy_float(Tensor tensor, std::vector list) { + std::cout << "copy " << list.size() << " floats to (" << tensor->size() + << ")" << std::endl; + tensor->copyData(list); + } + inline void run() { g->getRuntime()->run(g); } }; diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index ce315685..207280ef 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -22,7 +22,7 @@ from onnx.checker import ( check_tensor, ) from onnx.shape_inference import infer_shapes -from typing import Dict, List, Any, Tuple, Sequence +from typing import Dict, List, Any, Tuple, Sequence, Union from functools import reduce runtime = backend.cpu_runtime() @@ -324,6 +324,24 @@ def from_onnx(model: ModelProto) -> backend.GraphHandler: else: raise Exception('Unsupported operator "{}"'.format(node.op_type)) + handler.data_malloc() + + inputs = [] + for name, obj in tensors.items(): + tensor = data.get(name) + if tensor == None: + if any(input.name == name for input in model.graph.input): + inputs.append((name, tensor)) + else: + if tensor.data_type == TensorProto.INT32: + handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) + elif tensor.data_type == TensorProto.INT64: + handler.copy_int64(obj, [int(i) for i in tensor.int64_data]) + elif tensor.data_type == TensorProto.FLOAT: + handler.copy_float(obj, [float(i) for i in tensor.float_data]) + else: + assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) + def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: class Context: @@ -482,42 +500,6 @@ def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: return ctx.build(name) -def parse_onnx(model: ModelProto): - print() - - for field in [ - "doc_string", - "domain", - "functions", - "metadata_props", - "model_version", - "producer_name", - "producer_version", - "training_info", - ]: - print("{}: 
{}".format(field, getattr(model, field))) - - print("ir_version:", model.ir_version) - for opset in model.opset_import: - print("opset domain={} version={}".format(opset.domain, opset.version)) - - print("layout:") - for node in model.graph.node: - print( - ' {o} <- {op}"{name}"{a} <- {i}'.format( - name=node.name, - op=node.op_type, - i=node.input, - o=node.output, - a=[a.name for a in node.attribute], - ) - ) - - print("weight:") - for node in model.graph.initializer: - print(" {}".format(node.name)) - - def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]: for attr in node.attribute: if attr.name in attrs: @@ -536,11 +518,13 @@ def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[st return attrs -def _parse_data(tensor: TensorProto) -> List[int]: +def _parse_data(tensor: TensorProto) -> List[Union[int, float]]: if tensor.data_type == TensorProto.INT32: return [int(i) for i in tensor.int32_data] elif tensor.data_type == TensorProto.INT64: return [int(i) for i in tensor.int64_data] + elif tensor.data_type == TensorProto.FLOAT: + return [float(i) for i in tensor.float_data] else: assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index e545f43c..b791e66e 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -171,6 +171,9 @@ void init_graph_builder(py::module &m) { .def("topo_sort", &Handler::topo_sort, policy::automatic) .def("operators", &Handler::operators, policy::move) .def("data_malloc", &Handler::data_malloc, policy::automatic) + .def("copy_int32", &Handler::copy_int32, policy::automatic) + .def("copy_int64", &Handler::copy_int64, policy::automatic) + .def("copy_float", &Handler::copy_float, policy::automatic) .def("run", &Handler::run, policy::automatic); } From 60c5d6b5b8561093f83d29565358b64a623254c6 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 23 Feb 2023 16:22:03 +0800 Subject: [PATCH 04/29] =?UTF-8?q?fix:=20=E5=85=88=E4=B8=8D=E5=9C=A8=20cpu?= =?UTF-8?q?=20=E4=B8=8A=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7dae8509..879ae847 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -66,5 +66,5 @@ jobs: - name: Download test model run: wget ${{ env.resnet-download }} - - name: Test onnx frontend - run: make test-onnx + # - name: Test onnx frontend + # run: make test-onnx From 9ab78f13f73108239956db0e7f4c102fd1ae40a0 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 23 Feb 2023 16:29:20 +0800 Subject: [PATCH 05/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=20cuda=5Frunti?= =?UTF-8?q?me?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 4 ++-- src/ffi/ffi_infinitensor.cc | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 207280ef..ef7d4de2 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -25,10 +25,10 @@ from onnx.shape_inference import infer_shapes from typing import Dict, List, Any, Tuple, Sequence, Union from functools import reduce -runtime = 
backend.cpu_runtime() +cpu_runtime = backend.cpu_runtime() -def from_onnx(model: ModelProto) -> backend.GraphHandler: +def from_onnx(model: ModelProto, runtime) -> backend.GraphHandler: model = infer_shapes(model) handler = backend.GraphHandler(runtime) diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index b791e66e..5d6085ea 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -6,6 +6,7 @@ #include #ifdef USE_CUDA +#include "cuda/cuda_runtime.h" #include "cuda/operator_timer.h" #endif @@ -94,6 +95,10 @@ static int tensor_dtype(Tensor t) { IT_ASSERT(false, "Unsupported data type"); } +#ifdef USE_CUDA +static Ref cuda_runtime() { return make_ref(); } +#endif + static int concat_axis_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::Concat); return dynamic_cast(op.get())->getDim(); @@ -118,6 +123,9 @@ static Shape reshape_shape_of(Operator op) { void export_functions(py::module &m) { #define FUNCTION(NAME) def(#NAME, &NAME) m.def("cpu_runtime", &CpuRuntimeObj::getInstance) +#ifdef USE_CUDA + .FUNCTION(cuda_runtime) +#endif .FUNCTION(tensor_dtype) .FUNCTION(reshape_shape_of) .FUNCTION(concat_axis_of) @@ -132,6 +140,10 @@ void init_graph_builder(py::module &m) { py::class_>(m, "Runtime"); py::class_, RuntimeObj>( m, "CpuRuntime"); +#ifdef USE_CUDA + py::class_, RuntimeObj>( + m, "CudaRuntime"); +#endif py::class_>(m, "Tensor") .def("shape", &TensorObj::getDims, policy::move) .def("src", &TensorObj::getOutputOf, policy::move); From dc79b7265506c0d139aa31289657d7a9ddf40ac6 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 23 Feb 2023 17:20:05 +0800 Subject: [PATCH 06/29] =?UTF-8?q?fix:=20=E9=87=8D=E6=96=B0=E5=AF=BC?= =?UTF-8?q?=E5=87=BA=20cuda=5Fruntime()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index ef7d4de2..f815c037 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -28,6 +28,10 @@ from functools import reduce cpu_runtime = backend.cpu_runtime() +def cuda_runtime(): + return backend.cuda_runtime() + + def from_onnx(model: ModelProto, runtime) -> backend.GraphHandler: model = infer_shapes(model) handler = backend.GraphHandler(runtime) From 6dce129cb3fdc20578506528085fe10e0eee7e6b Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 09:22:03 +0800 Subject: [PATCH 07/29] fix: TensorObj::dataMalloc Signed-off-by: YdrMaster --- .github/workflows/build.yml | 4 ++-- pyinfinitensor/tests/test_onnx.py | 10 +++++----- src/core/tensor.cc | 11 ++--------- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 879ae847..7dae8509 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -66,5 +66,5 @@ jobs: - name: Download test model run: wget ${{ env.resnet-download }} - # - name: Test onnx frontend - # run: make test-onnx + - name: Test onnx frontend + run: make test-onnx diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py index 8be02f11..a529fb67 100644 --- a/pyinfinitensor/tests/test_onnx.py +++ b/pyinfinitensor/tests/test_onnx.py @@ -8,13 +8,13 @@ from onnx.helper import ( make_tensor_value_info, ) from onnx.checker import check_model -from pyinfinitensor.onnx import from_onnx, backend, runtime, to_onnx 
+from pyinfinitensor.onnx import from_onnx, backend, to_onnx, cpu_runtime def make_and_import_model(graph: onnx.GraphProto): model = make_model(graph) check_model(model) - from_onnx(model) + from_onnx(model, cpu_runtime) class TestStringMethods(unittest.TestCase): @@ -28,7 +28,7 @@ class TestStringMethods(unittest.TestCase): file=model_file, size=os.path.getsize(model_file) / 1024 / 1024 ) ) - from_onnx(onnx.load(model_file)) + from_onnx(onnx.load(model_file), cpu_runtime) def test_tensor(self): x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3]) @@ -289,10 +289,10 @@ class TestStringMethods(unittest.TestCase): graph = make_graph([matmul, add], "lr", [x, a, b], [y]) model = make_model(graph) check_model(model) - from_onnx(model) + from_onnx(model, cpu_runtime) def test_frontend(self): - handler = backend.GraphHandler(runtime) + handler = backend.GraphHandler(cpu_runtime) a = handler.tensor([1, 2, 3], 12) b = handler.tensor([1, 2, 3], 12) c = handler.tensor([1, 2, 3], 12) diff --git a/src/core/tensor.cc b/src/core/tensor.cc index 5050e53e..89294a8e 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -148,15 +148,8 @@ bool TensorObj::equalData(const Tensor &rhs) const { } void TensorObj::dataMalloc() { - if (data != nullptr) - return; - // IT_ASSERT(data == nullptr); - size_t bytesPerElement; - if (getDType() == DataType::Float32) - bytesPerElement = sizeof(float); - else if (getDType() == DataType::UInt32) - bytesPerElement = sizeof(uint32_t); - data = runtime->allocBlob(size() * bytesPerElement); + if (data == nullptr) + data = runtime->allocBlob(getBytes()); } void TensorObj::copyData(const TensorObj *src) { From f44a4daf70d734e61dcbfa47d579743620eeb35e Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 09:39:30 +0800 Subject: [PATCH 08/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=E6=9C=AA?= =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E7=9A=84=E5=BC=A0=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/graph_handler.h | 6 ------ pyinfinitensor/src/pyinfinitensor/onnx.py | 11 ++++++++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index 82695faa..ff702cf8 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -90,20 +90,14 @@ class GraphHandlerObj { inline void data_malloc() { g->dataMalloc(); } inline void copy_int32(Tensor tensor, std::vector list) { - std::cout << "copy " << list.size() << " ints to (" << tensor->size() - << ")" << std::endl; tensor->copyData(list); } inline void copy_int64(Tensor tensor, std::vector list) { - std::cout << "copy " << list.size() << " ints to (" << tensor->size() - << ")" << std::endl; tensor->copyData(list); } inline void copy_float(Tensor tensor, std::vector list) { - std::cout << "copy " << list.size() << " floats to (" << tensor->size() - << ")" << std::endl; tensor->copyData(list); } diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index f815c037..217d230d 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -32,7 +32,9 @@ def cuda_runtime(): return backend.cuda_runtime() -def from_onnx(model: ModelProto, runtime) -> backend.GraphHandler: +def from_onnx( + model: ModelProto, runtime +) -> Tuple[Dict[str, backend.Tensor], backend.GraphHandler]: model = infer_shapes(model) handler = backend.GraphHandler(runtime) @@ -330,12 +332,13 @@ def 
from_onnx(model: ModelProto, runtime) -> backend.GraphHandler: handler.data_malloc() - inputs = [] + inputs: Dict[str, backend.Tensor] = {} for name, obj in tensors.items(): + print("{}: {}".format(name, obj)) tensor = data.get(name) if tensor == None: if any(input.name == name for input in model.graph.input): - inputs.append((name, tensor)) + inputs[name] = obj else: if tensor.data_type == TensorProto.INT32: handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) @@ -346,6 +349,8 @@ def from_onnx(model: ModelProto, runtime) -> backend.GraphHandler: else: assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) + return inputs, handler + def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: class Context: From cf9bdb05629f15a6bfcd9514bbd8a1936728dc11 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 10:08:04 +0800 Subject: [PATCH 09/29] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E7=BB=93=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 8 ++++++-- src/ffi/ffi_infinitensor.cc | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 217d230d..6b3e394c 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -34,7 +34,7 @@ def cuda_runtime(): def from_onnx( model: ModelProto, runtime -) -> Tuple[Dict[str, backend.Tensor], backend.GraphHandler]: +) -> Tuple[Dict[str, backend.Tensor], Dict[str, backend.Tensor], backend.GraphHandler]: model = infer_shapes(model) handler = backend.GraphHandler(runtime) @@ -349,7 +349,11 @@ def from_onnx( else: assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) - return inputs, handler + outputs: Dict[str, backend.Tensor] = {} + for output in model.graph.output: + outputs[output.name] = tensors[output.name] + + return inputs, outputs, handler def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 5d6085ea..402d5306 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -146,6 +146,7 @@ void init_graph_builder(py::module &m) { #endif py::class_>(m, "Tensor") .def("shape", &TensorObj::getDims, policy::move) + .def("printData", &TensorObj::printData, policy::automatic) .def("src", &TensorObj::getOutputOf, policy::move); py::class_>(m, "Operator") .def("op_type", &OperatorObj::getOpType, policy::automatic) From 3d122aebfe6d091d3452bb763aaf522853ab7789 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 11:19:37 +0800 Subject: [PATCH 10/29] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E5=AF=BC?= =?UTF-8?q?=E5=87=BA=E6=B5=AE=E7=82=B9=E5=90=91=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/tensor.h | 24 +++++++++++++++++++++++- src/ffi/ffi_infinitensor.cc | 2 +- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/core/tensor.h b/include/core/tensor.h index 9e835a6b..422355e9 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -1,6 +1,11 @@ #pragma once #include "core/tensor_base.h" #include +#include + +#if USE_CUDA +#include "cuda/cuda_runtime.h" +#endif namespace infini { @@ -55,7 +60,6 @@ class TensorObj : public TensorBaseObj { obj->outputOf.reset(); return obj; } - // 
TODO: clarify whether clone copies data Tensor clone(Runtime runtime) const { auto obj = make_ref(*this); obj->runtime = runtime; @@ -68,6 +72,24 @@ class TensorObj : public TensorBaseObj { } return obj; } + inline std::vector cloneFloats() const { + IT_ASSERT(data != nullptr); + IT_ASSERT(getDType() == DataType::Float32); + std::vector ans(size()); + auto src = getRawDataPtr(); + auto dst = ans.data(); + auto bytes = getBytes(); + if (runtime->isCpu()) { + memcpy(dst, src, bytes); + } else { +#if USE_CUDA + cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); +#else + IT_TODO_HALT(); +#endif + } + return ans; + } void printData() const; bool equalData(const Tensor &rhs) const; diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 402d5306..2be4971b 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -146,7 +146,7 @@ void init_graph_builder(py::module &m) { #endif py::class_>(m, "Tensor") .def("shape", &TensorObj::getDims, policy::move) - .def("printData", &TensorObj::printData, policy::automatic) + .def("cloneFloats", &TensorObj::cloneFloats, policy::move) .def("src", &TensorObj::getOutputOf, policy::move); py::class_>(m, "Operator") .def("op_type", &OperatorObj::getOpType, policy::automatic) From 514666591e145586b8b30b7829cf69e75f217ba4 Mon Sep 17 00:00:00 2001 From: wanghailu Date: Fri, 24 Feb 2023 13:55:53 +0800 Subject: [PATCH 11/29] add batch_norm --- src/kernels/cuda/batch_norm.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/kernels/cuda/batch_norm.cc b/src/kernels/cuda/batch_norm.cc index 7ca75ba6..b150aaa5 100644 --- a/src/kernels/cuda/batch_norm.cc +++ b/src/kernels/cuda/batch_norm.cc @@ -28,9 +28,11 @@ class BatchNormCudnn : public CudaKernelWithoutConfig { for (size_t i = 0; i < dims.size(); ++i) { dimArray[i] = dims[i]; strideArray[i] = op->getInputs(0)->getStride()[i]; - dimPArray[i] = op->getInputs(1)->getDims()[i]; - stridePArray[i] = op->getInputs(1)->getStride()[i]; + dimPArray[i] = 1; + stridePArray[i] = 1; } + dimPArray[1] = op->getInputs(0)->getDims()[1]; + stridePArray[1] = op->getInputs(0)->getStride()[1]; // get inputs cudnnTensorDescriptor_t inDesc; checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); From fb3478bf3e395cd5b7d54d81937e3d95b1344250 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 14:14:52 +0800 Subject: [PATCH 12/29] build: update Makefile Signed-off-by: YdrMaster --- Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 71033bbb..3661da78 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,17 @@ .PHONY : build clean install-python test-cpp test-onnx TYPE ?= release +CUDA ?= off + +CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) + +ifeq ($(CUDA), on) + CMAKE_OPT += -DUSE_CUDA=ON +endif build: mkdir -p build/$(TYPE) - cd build/$(TYPE) && cmake -DCMAKE_BUILD_TYPE=$(TYPE) ../.. && make -j8 + cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. 
&& make -j8
 
 clean:
 	rm -rf build

From 59bf59c10b966457cb2f0225cc3f43683434b1e0 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Fri, 24 Feb 2023 14:28:34 +0800
Subject: [PATCH 13/29] docs: update README.md

Signed-off-by: YdrMaster
---
 Makefile  |  2 +-
 README.md | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 3661da78..4de85392 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CUDA ?= off
 
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 
-ifeq ($(CUDA), on)
+ifeq ($(CUDA), ON)
 	CMAKE_OPT += -DUSE_CUDA=ON
 endif
 
diff --git a/README.md b/README.md
index 8e109331..1f6c07fa 100644
--- a/README.md
+++ b/README.md
@@ -5,16 +5,27 @@
 ``` bash
 # Enter the root of InfiniTensor
 source test/script/env_lotus.sh
-mkdir build && cd build
-cmake -DUSE_CUDA=ON .. && make -j 12
+make CUDA=ON
 ```
 
+### Make Commands
+
+- `make`/`make build`: builds the project;
+- `make install-python`: builds the project, then installs the Python frontend;
+- `make test-cpp`: builds the project, then runs the C++ unit tests;
+- `make test-onnx`: runs the Python unit tests;
+
+---
+
+> Set the env var `CUDA=ON` to enable CUDA.
+
 ### CMake Options
 
 There are several configurable CMake options, see the [CMakeLists.txt file](/CMakeLists.txt#L5).
 
 - If `USE_BACKTRACE` is `ON`, `libdw-dev` have to be installed. See the README of [backward-cpp](https://github.com/bombela/backward-cpp) for details.
 - If `USE_PROTOBUF` is `ON`, `protobuf` have to be installed. See the README of [protobuf](https://github.com/protocolbuffers/protobuf) for details.
+- If `USE_CUDA` is `ON`, CUDA has to be installed.
 
 ## Contributor Guide

From 5b6698bac73adc641627da8122e671d07d3734f5 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Fri, 24 Feb 2023 15:02:52 +0800
Subject: [PATCH 14/29] feat: export the whole graph's output tensors to onnx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 include/core/tensor.h                     |  8 ++++----
 include/core/tensor_base.h                | 15 +++++++++------
 pyinfinitensor/src/pyinfinitensor/onnx.py |  7 ++++++-
 src/core/tensor.cc                        | 10 +++++-----
 src/ffi/ffi_infinitensor.cc               |  1 +
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/include/core/tensor.h b/include/core/tensor.h
index 422355e9..1edc950a 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -56,16 +56,16 @@ class TensorObj : public TensorBaseObj {
     Tensor clone() const {
         auto obj = make_ref<TensorObj>(*this);
         obj->freeData();
-        obj->inputOf.clear();
-        obj->outputOf.reset();
+        obj->targets.clear();
+        obj->source.reset();
         return obj;
     }
     Tensor clone(Runtime runtime) const {
         auto obj = make_ref<TensorObj>(*this);
         obj->runtime = runtime;
         obj->freeData();
-        obj->inputOf.clear();
-        obj->outputOf.reset();
+        obj->targets.clear();
+        obj->source.reset();
         if (hasData()) {
             obj->dataMalloc();
             obj->copyData(this);
diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h
index af2b97c3..09286a63 100644
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@@ -19,8 +19,8 @@ class TensorBaseObj : public Object {
     int dim;
 
     DataType dtype;
-    vector<WRef<OperatorObj>> inputOf;
-    WRef<OperatorObj> outputOf;
+    vector<WRef<OperatorObj>> targets;
+    WRef<OperatorObj> source;
     Blob data;
     Runtime runtime;
@@ -46,10 +46,13 @@ class TensorBaseObj : public Object {
     DataType getDType() const { return dtype; }
     Runtime getRuntime() const { return runtime; }
 
-    void addInputOf(const Operator &op) { inputOf.emplace_back(op); }
-    void setOutputOf(const Operator &op) {
outputOf = op; } - OpVec getInputOf() { return wrefs_to_refs(inputOf); } - Operator getOutputOf() { return outputOf.lock(); } + void addInputOf(const Operator &op) { targets.emplace_back(op); } + void setOutputOf(const Operator &op) { source = op; } + + bool hasTarget() const { return !targets.empty(); } + + OpVec getInputOf() const { return wrefs_to_refs(targets); } + Operator getOutputOf() const { return source.lock(); } // std::pair getOutputOfWithIndex(); // bool setScalar(VType val) { diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 6b3e394c..f8fd662c 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -334,7 +334,6 @@ def from_onnx( inputs: Dict[str, backend.Tensor] = {} for name, obj in tensors.items(): - print("{}: {}".format(name, obj)) tensor = data.get(name) if tensor == None: if any(input.name == name for input in model.graph.input): @@ -382,6 +381,12 @@ def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: def push_output(self, name: str, tensor: backend.Tensor) -> str: self.names[tensor] = name + if not tensor.has_target(): + shape = tensor.shape() + dtype = backend.tensor_dtype(tensor) + value_info = make_tensor_value_info(name, dtype, shape) + check_value_info(value_info) + self.outputs.append(value_info) return name def push_input(self, tensor: backend.Tensor) -> str: diff --git a/src/core/tensor.cc b/src/core/tensor.cc index 89294a8e..5f04e114 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -18,13 +18,13 @@ string TensorObj::toString() const { std::to_string(fuid) + ", shape " + vecToString(shape) + ", dtype " + dtype.toString(); vector inputOfGuid; - for (const auto &op : inputOf) + for (const auto &op : targets) inputOfGuid.emplace_back(op.lock()->getGuid()); - if (auto o = outputOf.lock()) - ret += ", outputOf " + std::to_string(o->getGuid()); + if (auto o = source.lock()) + ret += ", source " + std::to_string(o->getGuid()); else - ret += ", outputOf None"; - ret += ", inputOf " + vecToString(inputOfGuid); + ret += ", source None"; + ret += ", targets " + vecToString(inputOfGuid); return ret; } diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 2be4971b..53e1376c 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -147,6 +147,7 @@ void init_graph_builder(py::module &m) { py::class_>(m, "Tensor") .def("shape", &TensorObj::getDims, policy::move) .def("cloneFloats", &TensorObj::cloneFloats, policy::move) + .def("has_target", &TensorObj::hasTarget, policy::automatic) .def("src", &TensorObj::getOutputOf, policy::move); py::class_>(m, "Operator") .def("op_type", &OperatorObj::getOpType, policy::automatic) From 71ca4459d9e79d13c05e6ea0359da6d68d8c50a2 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 15:04:31 +0800 Subject: [PATCH 15/29] fmt Signed-off-by: YdrMaster --- src/kernels/cuda/batch_norm.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kernels/cuda/batch_norm.cc b/src/kernels/cuda/batch_norm.cc index b150aaa5..35cc78c0 100644 --- a/src/kernels/cuda/batch_norm.cc +++ b/src/kernels/cuda/batch_norm.cc @@ -2,6 +2,7 @@ #include "core/kernel.h" #include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_runtime.h" + namespace infini { class BatchNormCudnn : public CudaKernelWithoutConfig { void compute(const Operator &_op, @@ -29,7 +30,7 @@ class BatchNormCudnn : public CudaKernelWithoutConfig { dimArray[i] = dims[i]; strideArray[i] = 
op->getInputs(0)->getStride()[i]; dimPArray[i] = 1; - stridePArray[i] = 1; + stridePArray[i] = 1; } dimPArray[1] = op->getInputs(0)->getDims()[1]; stridePArray[1] = op->getInputs(0)->getStride()[1]; From a5e692baea00a43dd656c82c005a008c3d245200 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 15:42:39 +0800 Subject: [PATCH 16/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=20batchnorm=20?= =?UTF-8?q?=E5=88=B0=20onnx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/operators/batch_norm.h | 8 +- pyinfinitensor/src/pyinfinitensor/onnx.py | 972 +++++++++++----------- pyinfinitensor/tests/test_onnx.py | 4 +- src/ffi/ffi_infinitensor.cc | 11 +- 4 files changed, 515 insertions(+), 480 deletions(-) diff --git a/include/operators/batch_norm.h b/include/operators/batch_norm.h index 20842615..8e41a043 100644 --- a/include/operators/batch_norm.h +++ b/include/operators/batch_norm.h @@ -39,9 +39,11 @@ class BatchNormObj : public OperatorObj { std::string toString() const override; // output size will be 3 when training - int numInputs() const override { return 5; } - int numOutputs() const override { return outputs.size(); } - float getEps() const { return eps; } + inline int numInputs() const override { return 5; } + inline int numOutputs() const override { return outputs.size(); } + inline float getMomentum() const { return momentum; } + inline float getEps() const { return eps; } + inline bool getTraining() const { return training; } private: vector getWorkloadVector() const override; diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index f8fd662c..28b8f514 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -32,490 +32,516 @@ def cuda_runtime(): return backend.cuda_runtime() -def from_onnx( - model: ModelProto, runtime -) -> Tuple[Dict[str, backend.Tensor], Dict[str, backend.Tensor], backend.GraphHandler]: - model = infer_shapes(model) - handler = backend.GraphHandler(runtime) - - tensors: Dict[str, backend.Tensor] = dict() - data: Dict[str, TensorProto] = dict() - - for input in model.graph.input: - dims = _take_shape_dim(input.type.tensor_type.shape) - tensors[input.name] = handler.tensor(dims, input.type.tensor_type.elem_type) - - for output in model.graph.output: - dims = _take_shape_dim(output.type.tensor_type.shape) - tensors[output.name] = handler.tensor(dims, output.type.tensor_type.elem_type) - - for initializer in model.graph.initializer: - data[initializer.name] = initializer - - for node in model.graph.node: - if node.op_type == "Conv": - attributes = _parse_attribute( - node, - { - "dilations": [1, 1], - "pads": [0, 0], - "strides": [1, 1], - }, - ) - (d, p, s) = (attributes[name] for name in ["dilations", "pads", "strides"]) - tensors[node.output[0]] = handler.conv( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - p[0], - p[1], - s[0], - s[1], - d[0], - d[1], - ) - elif node.op_type == "MatMul": - tensors[node.output[0]] = handler.matmul( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - False, - False, - None, - backend.ActType.Linear, - ) - elif node.op_type == "Gemm": - attributes = _parse_attribute( - node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0} - ) - (alpha, beta, transA, transB) = ( - attributes[name] for name in ["alpha", "beta", "transA", "transB"] - ) - # FIXME 不支持 `alpha` `beta` - assert alpha == 1.0 - 
assert beta == 1.0 - tensors[node.output[0]] = handler.matmul( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - transA == 1, - transB == 1, - tensors[node.input[2]] if len(node.input) > 2 else None, - backend.ActType.Linear, - ) - elif node.op_type == "BatchNormalization": - (input, mean, var, scale, bias) = ( - tensors[node.input[i]] for i in [0, 3, 4, 1, 2] - ) - output = tensors.get(node.output[0]) - attributes = _parse_attribute( - node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0} - ) - (momentum, eps, training) = ( - attributes[name] for name in ["momentum", "epsilon", "training_mode"] - ) - tensors[node.output[0]] = handler.batchNorm( - input, output, mean, var, scale, bias, momentum, eps, training != 0 - ) - elif node.op_type == "MaxPool": - attributes = _parse_attribute( - node, - { - "kernel_shape": None, - "dilations": [1, 1], - "pads": [0, 0], - "strides": [1, 1], - }, - ) - (k, d, p, s) = ( - attributes[name] - for name in ["kernel_shape", "dilations", "pads", "strides"] - ) - tensors[node.output[0]] = handler.maxPool( - tensors[node.input[0]], - tensors.get(node.output[0]), - k[0], - k[1], - d[0], - d[1], - p[0], - p[1], - s[0], - s[1], - ) - elif node.op_type == "AveragePool": - attributes = _parse_attribute( - node, - { - "kernel_shape": None, - "pads": [0, 0], - "strides": [1, 1], - }, - ) - (k, p, s) = ( - attributes[name] for name in ["kernel_shape", "pads", "strides"] - ) - tensors[node.output[0]] = handler.avgPool( - tensors[node.input[0]], - tensors.get(node.output[0]), - k[0], - k[1], - 1, - 1, - p[0], - p[1], - s[0], - s[1], - ) - elif node.op_type == "GlobalAveragePool": - shape = next( - ( - value.type.tensor_type.shape - for value in model.graph.value_info - if value.name == node.input[0] - ), - None, - ) or next( - input.type.tensor_type.shape - for input in model.graph.input - if input.name == node.input[0] - ) - [_, _, h, w] = _take_shape_dim(shape) - tensors[node.output[0]] = handler.avgPool( - tensors[node.input[0]], - tensors.get(node.output[0]), - h, - w, - 1, - 1, - 0, - 0, - 1, - 1, - ) - elif node.op_type == "Add": - tensors[node.output[0]] = handler.add( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Sub": - tensors[node.output[0]] = handler.sub( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Mul": - tensors[node.output[0]] = handler.mul( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Div": - tensors[node.output[0]] = handler.div( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Pow": - tensors[node.output[0]] = handler.pow( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Relu": - tensors[node.output[0]] = handler.relu( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Sigmoid": - tensors[node.output[0]] = handler.sigmoid( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Tanh": - tensors[node.output[0]] = handler.tanh( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Softmax": - tensors[node.output[0]] = handler.softmax( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Abs": - tensors[node.output[0]] = handler.abs( - tensors[node.input[0]], - 
tensors.get(node.output[0]), - ) - elif node.op_type == "Identity": - tensors[node.output[0]] = handler.identity( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Flatten": - # FIXME 后端算子不支持沿任意轴展开 - axis = next( - (attr.i for attr in node.attribute if attr.name == "axis"), None - ) - assert axis == None or axis == 1 - tensors[node.output[0]] = handler.flatten( - tensors[node.input[0]], - tensors.get(node.output[0]), - ) - elif node.op_type == "Reshape": - input_shape = next( - ( - value.type.tensor_type.shape - for value in model.graph.value_info - if value.name == node.input[0] - ), - None, - ) or next( - input.type.tensor_type.shape - for input in model.graph.input - if input.name == node.input[0] - ) - dims = _take_shape_dim(input_shape) - size = reduce(lambda acc, x: acc * x, dims) - output_shape = [int(i) for i in data[node.input[1]].int64_data] - for i, x in enumerate(output_shape): - if x == 0: - output_shape[i] = dims[i] - temp = reduce(lambda acc, x: acc * x, output_shape) - if temp < 0: - output_shape[output_shape.index(-1)] = size // -temp - tensors[node.output[0]] = handler.reshape( - tensors[node.input[0]], - tensors.get(node.output[0]), - output_shape, - ) - elif node.op_type == "Concat": - tensors[node.output[0]] = handler.concat( - [tensors[name] for name in node.input], - tensors.get(node.output[0]), - next((attr.i for attr in node.attribute if attr.name == "axis")), - ) - elif node.op_type == "Gather": - tensors[node.output[0]] = handler.gather( - tensors[node.input[0]], - tensors[node.input[1]], - tensors.get(node.output[0]), - next((attr.i for attr in node.attribute if attr.name == "axis")), - ) - elif node.op_type == "ReduceMean": - tensors[node.output[0]] = handler.reduceMean( - tensors[node.input[0]], - tensors.get(node.output[0]), - tensors[node.input[1]] if len(node.input) > 1 else None, - next((attr.i for attr in node.attribute if attr.name == "keepdims")) - != 0, - ) - elif node.op_type == "Slice": - tensors[node.output[0]] = handler.slice( - tensors[node.input[0]], - tensors.get(node.output[0]), - _parse_data(data[node.input[1]]), - _parse_data(data[node.input[2]]), - _parse_data(data[node.input[3]]) if len(node.input) > 3 else None, - _parse_data(data[node.input[4]]) if len(node.input) > 4 else None, - ) - elif node.op_type == "Pad": - tensors[node.output[0]] = handler.pad( - tensors[node.input[0]], - tensors.get(node.output[0]), - _parse_data(data[node.input[1]]), - _parse_data(data[node.input[3]]) if len(node.input) > 3 else None, - ) - else: - raise Exception('Unsupported operator "{}"'.format(node.op_type)) - - handler.data_malloc() - +class OnnxStub: inputs: Dict[str, backend.Tensor] = {} - for name, obj in tensors.items(): - tensor = data.get(name) - if tensor == None: - if any(input.name == name for input in model.graph.input): - inputs[name] = obj - else: - if tensor.data_type == TensorProto.INT32: - handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) - elif tensor.data_type == TensorProto.INT64: - handler.copy_int64(obj, [int(i) for i in tensor.int64_data]) - elif tensor.data_type == TensorProto.FLOAT: - handler.copy_float(obj, [float(i) for i in tensor.float_data]) - else: - assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) - outputs: Dict[str, backend.Tensor] = {} - for output in model.graph.output: - outputs[output.name] = tensors[output.name] + handler: backend.GraphHandler - return inputs, outputs, handler + def __init__(self, model: ModelProto, runtime): + model = infer_shapes(model) 
+ self.handler = backend.GraphHandler(runtime) + tensors: Dict[str, backend.Tensor] = dict() + data: Dict[str, TensorProto] = dict() -def to_onnx(graph: backend.GraphHandler, name: str) -> ModelProto: - class Context: - # saves object names, including tensors and operators - names: Dict[Any, str] = dict() - # counts the occurrence times of each operator for naming - count_op: Dict[backend.OpType, int] = dict() - # counts input and output tensors for naming - count_in, count_out = 0, 0 - # saves nodes (operators) - nodes: List[NodeProto] = [] - # saves global input tensors - inputs: List[ValueInfoProto] = [] - # saves global output tensors - outputs: List[ValueInfoProto] = [] - # saves global input tensors - initializers: List[TensorProto] = [] - - def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]: - ty = op.op_type() - name = "{}{}".format(ty.name, self.count_op.setdefault(ty, 0) + 1) - self.names[op] = name - self.count_op[ty] += 1 - return ty, name - - def push_output(self, name: str, tensor: backend.Tensor) -> str: - self.names[tensor] = name - if not tensor.has_target(): - shape = tensor.shape() - dtype = backend.tensor_dtype(tensor) - value_info = make_tensor_value_info(name, dtype, shape) - check_value_info(value_info) - self.outputs.append(value_info) - return name - - def push_input(self, tensor: backend.Tensor) -> str: - name = self.names.get(tensor) - # means that this input is a global input - if name is None: - self.count_in += 1 - name = "input{}".format(self.count_in) - self.names[tensor] = name - shape = tensor.shape() - dtype = backend.tensor_dtype(tensor) - value_info = make_tensor_value_info(name, dtype, shape) - check_value_info(value_info) - self.inputs.append(value_info) - - return name - - def push_data_input( - self, - node_name: str, - attr_name: str, - elem_type: int, - shape: Sequence[int], - vals: Any, - ) -> str: - name = "{}_{}".format(node_name, attr_name) - value_info = make_tensor_value_info(name, elem_type, shape) - tensor = make_tensor(name, elem_type, shape, vals) - check_value_info(value_info) - check_tensor(tensor) - self.inputs.append(value_info) - self.initializers.append(tensor) - return name - - def push_node(self, node: NodeProto) -> None: - check_node(node) - self.nodes.append(node) - - def build(self, name: str) -> ModelProto: - print() - print(ctx.names) - print() - print(ctx.inputs) - print() - print(ctx.outputs) - print() - print(ctx.nodes) - - graph = make_graph( - self.nodes, name, self.inputs, self.outputs, self.initializers + for input in model.graph.input: + dims = _take_shape_dim(input.type.tensor_type.shape) + tensors[input.name] = self.handler.tensor( + dims, input.type.tensor_type.elem_type ) - check_graph(graph) - model = make_model(graph) - check_model(model) + for output in model.graph.output: + dims = _take_shape_dim(output.type.tensor_type.shape) + tensors[output.name] = self.handler.tensor( + dims, output.type.tensor_type.elem_type + ) - return model + for initializer in model.graph.initializer: + data[initializer.name] = initializer - # 拓扑排序 - if not graph.topo_sort(): - raise Exception("Sorting fails") - - ops = graph.operators() # 图中所有算子(节点) - - ctx = Context() - - for op in ops: - ty, name = ctx.name_op(op) - inputs = [ctx.push_input(it) for it in op.inputs()] - outputs = [ - ctx.push_output("{}_{}".format(name, i), it) - for (i, it) in enumerate(op.outputs()) - ] - if ty == backend.OpType.Matmul: - ctx.push_node(make_node("MatMul", inputs, outputs, name)) - elif ty == backend.OpType.BatchNorm: - raise 
Exception("TODO") - elif ty == backend.OpType.MaxPool: - raise Exception("TODO") - elif ty == backend.OpType.AvgPool: - raise Exception("TODO") - elif ty in [ - backend.OpType.Add, - backend.OpType.Sub, - backend.OpType.Mul, - backend.OpType.Div, - backend.OpType.Pow, - backend.OpType.Relu, - backend.OpType.Sigmoid, - backend.OpType.Tanh, - backend.OpType.Softmax, - backend.OpType.Abs, - backend.OpType.Identity, - ]: - ctx.push_node(make_node(ty.name, inputs, outputs, name)) - elif ty == backend.OpType.Flatten: - raise Exception("TODO") - elif ty == backend.OpType.Reshape: - shape = backend.reshape_shape_of(op) - inputs.append( - ctx.push_data_input( - name, - "shape", - TensorProto.INT32, - [len(shape)], - shape, + for node in model.graph.node: + if node.op_type == "Conv": + attributes = _parse_attribute( + node, + { + "dilations": [1, 1], + "pads": [0, 0], + "strides": [1, 1], + }, ) - ) - ctx.push_node(make_node(ty.name, inputs, outputs, name)) - elif ty == backend.OpType.Concat: - axis = backend.concat_axis_of(op) - ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis)) - elif ty == backend.OpType.Gather: - axis = backend.gather_axis_of(op) - ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis)) - elif ty == backend.OpType.ReduceMean: - axes = backend.reduce_mean_axes_of(op) - inputs.append( - ctx.push_data_input(name, "axes", TensorProto.INT32, [len(axes)], axes) - ) - ctx.push_node(make_node(ty.name, inputs, outputs, name, keepdims=1)) - elif ty == backend.OpType.Slice: - raise Exception("TODO") - elif ty == backend.OpType.Pad: - raise Exception("TODO") - else: - raise Exception("Unsupported OpType {}".format(ty.name)) + (d, p, s) = ( + attributes[name] for name in ["dilations", "pads", "strides"] + ) + tensors[node.output[0]] = self.handler.conv( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + p[0], + p[1], + s[0], + s[1], + d[0], + d[1], + ) + elif node.op_type == "MatMul": + tensors[node.output[0]] = self.handler.matmul( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + False, + False, + None, + backend.ActType.Linear, + ) + elif node.op_type == "Gemm": + attributes = _parse_attribute( + node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0} + ) + (alpha, beta, transA, transB) = ( + attributes[name] for name in ["alpha", "beta", "transA", "transB"] + ) + # TODO 不支持这些参数 + assert alpha == 1.0 + assert beta == 1.0 + tensors[node.output[0]] = self.handler.matmul( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + transA == 1, + transB == 1, + tensors[node.input[2]] if len(node.input) > 2 else None, + backend.ActType.Linear, + ) + elif node.op_type == "BatchNormalization": + (input, mean, var, scale, bias) = ( + tensors[node.input[i]] for i in [0, 3, 4, 1, 2] + ) + output = tensors.get(node.output[0]) + attributes = _parse_attribute( + node, {"momentum": 0.9, "epsilon": 1e-05, "training_mode": 0} + ) + (momentum, eps, training) = ( + attributes[name] + for name in ["momentum", "epsilon", "training_mode"] + ) + tensors[node.output[0]] = self.handler.batchNorm( + input, output, mean, var, scale, bias, momentum, eps, training != 0 + ) + elif node.op_type == "MaxPool": + attributes = _parse_attribute( + node, + { + "kernel_shape": None, + "dilations": [1, 1], + "pads": [0, 0], + "strides": [1, 1], + }, + ) + (k, d, p, s) = ( + attributes[name] + for name in ["kernel_shape", "dilations", "pads", "strides"] + ) + tensors[node.output[0]] = 
self.handler.maxPool( + tensors[node.input[0]], + tensors.get(node.output[0]), + k[0], + k[1], + d[0], + d[1], + p[0], + p[1], + s[0], + s[1], + ) + elif node.op_type == "AveragePool": + attributes = _parse_attribute( + node, + { + "kernel_shape": None, + "pads": [0, 0], + "strides": [1, 1], + }, + ) + (k, p, s) = ( + attributes[name] for name in ["kernel_shape", "pads", "strides"] + ) + tensors[node.output[0]] = self.handler.avgPool( + tensors[node.input[0]], + tensors.get(node.output[0]), + k[0], + k[1], + 1, + 1, + p[0], + p[1], + s[0], + s[1], + ) + elif node.op_type == "GlobalAveragePool": + shape = next( + ( + value.type.tensor_type.shape + for value in model.graph.value_info + if value.name == node.input[0] + ), + None, + ) or next( + input.type.tensor_type.shape + for input in model.graph.input + if input.name == node.input[0] + ) + [_, _, h, w] = _take_shape_dim(shape) + tensors[node.output[0]] = self.handler.avgPool( + tensors[node.input[0]], + tensors.get(node.output[0]), + h, + w, + 1, + 1, + 0, + 0, + 1, + 1, + ) + elif node.op_type == "Add": + tensors[node.output[0]] = self.handler.add( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Sub": + tensors[node.output[0]] = self.handler.sub( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Mul": + tensors[node.output[0]] = self.handler.mul( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Div": + tensors[node.output[0]] = self.handler.div( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Pow": + tensors[node.output[0]] = self.handler.pow( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Relu": + tensors[node.output[0]] = self.handler.relu( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Sigmoid": + tensors[node.output[0]] = self.handler.sigmoid( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Tanh": + tensors[node.output[0]] = self.handler.tanh( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Softmax": + tensors[node.output[0]] = self.handler.softmax( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Abs": + tensors[node.output[0]] = self.handler.abs( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Identity": + tensors[node.output[0]] = self.handler.identity( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Flatten": + # TODO 后端算子不支持沿任意轴展开 + axis = next( + (attr.i for attr in node.attribute if attr.name == "axis"), None + ) + assert axis == None or axis == 1 + tensors[node.output[0]] = self.handler.flatten( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) + elif node.op_type == "Reshape": + input_shape = next( + ( + value.type.tensor_type.shape + for value in model.graph.value_info + if value.name == node.input[0] + ), + None, + ) or next( + input.type.tensor_type.shape + for input in model.graph.input + if input.name == node.input[0] + ) + dims = _take_shape_dim(input_shape) + size = reduce(lambda acc, x: acc * x, dims) + output_shape = [int(i) for i in data[node.input[1]].int64_data] + for i, x in enumerate(output_shape): + if x == 0: + output_shape[i] = dims[i] + temp = reduce(lambda acc, x: acc * 
x, output_shape) + if temp < 0: + output_shape[output_shape.index(-1)] = size // -temp + tensors[node.output[0]] = self.handler.reshape( + tensors[node.input[0]], + tensors.get(node.output[0]), + output_shape, + ) + elif node.op_type == "Concat": + tensors[node.output[0]] = self.handler.concat( + [tensors[name] for name in node.input], + tensors.get(node.output[0]), + next((attr.i for attr in node.attribute if attr.name == "axis")), + ) + elif node.op_type == "Gather": + tensors[node.output[0]] = self.handler.gather( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), + next((attr.i for attr in node.attribute if attr.name == "axis")), + ) + elif node.op_type == "ReduceMean": + tensors[node.output[0]] = self.handler.reduceMean( + tensors[node.input[0]], + tensors.get(node.output[0]), + tensors[node.input[1]] if len(node.input) > 1 else None, + next((attr.i for attr in node.attribute if attr.name == "keepdims")) + != 0, + ) + elif node.op_type == "Slice": + tensors[node.output[0]] = self.handler.slice( + tensors[node.input[0]], + tensors.get(node.output[0]), + _parse_data(data[node.input[1]]), + _parse_data(data[node.input[2]]), + _parse_data(data[node.input[3]]) if len(node.input) > 3 else None, + _parse_data(data[node.input[4]]) if len(node.input) > 4 else None, + ) + elif node.op_type == "Pad": + tensors[node.output[0]] = self.handler.pad( + tensors[node.input[0]], + tensors.get(node.output[0]), + _parse_data(data[node.input[1]]), + _parse_data(data[node.input[3]]) if len(node.input) > 3 else None, + ) + else: + raise Exception('Unsupported operator "{}"'.format(node.op_type)) - return ctx.build(name) + self.handler.data_malloc() + + for name, obj in tensors.items(): + tensor = data.get(name) + if tensor == None: + if any(input.name == name for input in model.graph.input): + self.inputs[name] = obj + else: + if tensor.data_type == TensorProto.INT32: + self.handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) + elif tensor.data_type == TensorProto.INT64: + self.handler.copy_int64(obj, [int(i) for i in tensor.int64_data]) + elif tensor.data_type == TensorProto.FLOAT: + self.handler.copy_float(obj, [float(i) for i in tensor.float_data]) + else: + assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) + + for output in model.graph.output: + self.outputs[output.name] = tensors[output.name] + + def to_onnx(self, name: str) -> ModelProto: + class Context: + # saves object names, including tensors and operators + names: Dict[Any, str] = dict() + # counts the occurrence times of each operator for naming + count_op: Dict[backend.OpType, int] = dict() + # counts input and output tensors for naming + count_in, count_out = 0, 0 + # saves nodes (operators) + nodes: List[NodeProto] = [] + # saves global input tensors + inputs: List[ValueInfoProto] = [] + # saves global output tensors + outputs: List[ValueInfoProto] = [] + # saves global input tensors + initializers: List[TensorProto] = [] + + def name_op(self, op: backend.Operator) -> Tuple[backend.OpType, str]: + ty = op.op_type() + name = "{}{}".format(ty.name, self.count_op.setdefault(ty, 0) + 1) + self.names[op] = name + self.count_op[ty] += 1 + return ty, name + + def push_output(self, name: str, tensor: backend.Tensor) -> str: + self.names[tensor] = name + if not tensor.has_target(): + shape = tensor.shape() + dtype = backend.tensor_dtype(tensor) + value_info = make_tensor_value_info(name, dtype, shape) + check_value_info(value_info) + self.outputs.append(value_info) + return name + + def 
push_input(self, tensor: backend.Tensor) -> str:
+                name = self.names.get(tensor)
+                # means that this input is a global input
+                if name is None:
+                    self.count_in += 1
+                    name = "input{}".format(self.count_in)
+                    self.names[tensor] = name
+                    shape = tensor.shape()
+                    dtype = backend.tensor_dtype(tensor)
+                    value_info = make_tensor_value_info(name, dtype, shape)
+                    check_value_info(value_info)
+                    self.inputs.append(value_info)
+
+                return name
+
+            def push_data_input(
+                self,
+                node_name: str,
+                attr_name: str,
+                elem_type: int,
+                shape: Sequence[int],
+                vals: Any,
+            ) -> str:
+                name = "{}_{}".format(node_name, attr_name)
+                value_info = make_tensor_value_info(name, elem_type, shape)
+                tensor = make_tensor(name, elem_type, shape, vals)
+                check_value_info(value_info)
+                check_tensor(tensor)
+                self.inputs.append(value_info)
+                self.initializers.append(tensor)
+                return name
+
+            def push_node(self, node: NodeProto) -> None:
+                check_node(node)
+                self.nodes.append(node)
+
+            def build(self, name: str) -> ModelProto:
+                print()
+                print(ctx.names)
+                print()
+                print(ctx.inputs)
+                print()
+                print(ctx.outputs)
+                print()
+                print(ctx.nodes)
+
+                graph = make_graph(
+                    self.nodes, name, self.inputs, self.outputs, self.initializers
+                )
+                check_graph(graph)
+
+                model = make_model(graph)
+                check_model(model)
+
+                return model
+
+        # topological sort
+        if not self.handler.topo_sort():
+            raise Exception("Sorting fails")
+
+        ops = self.handler.operators()  # all operators (nodes) in the graph
+
+        ctx = Context()
+
+        for op in ops:
+            ty, name = ctx.name_op(op)
+            inputs = [ctx.push_input(it) for it in op.inputs()]
+            outputs = [
+                ctx.push_output("{}_{}".format(name, i), it)
+                for (i, it) in enumerate(op.outputs())
+            ]
+            if ty == backend.OpType.Conv:
+                raise Exception("TODO")
+            elif ty == backend.OpType.Matmul:
+                ctx.push_node(make_node("MatMul", inputs, outputs, name))
+            elif ty == backend.OpType.BatchNorm:
+                inputs = [inputs[i] for i in [0, 3, 4, 1, 2]]
+                momentum, eps, training = backend.batch_norm_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        "BatchNormalization",
+                        inputs,
+                        outputs,
+                        name,
+                        epsilon=eps,
+                        momentum=momentum,
+                        training_mode=training,
+                    )
+                )
+            elif ty == backend.OpType.MaxPool:
+                raise Exception("TODO")
+            elif ty == backend.OpType.AvgPool:
+                raise Exception("TODO")
+            elif ty in [
+                backend.OpType.Add,
+                backend.OpType.Sub,
+                backend.OpType.Mul,
+                backend.OpType.Div,
+                backend.OpType.Pow,
+                backend.OpType.Relu,
+                backend.OpType.Sigmoid,
+                backend.OpType.Tanh,
+                backend.OpType.Softmax,
+                backend.OpType.Abs,
+                backend.OpType.Identity,
+            ]:
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
+            elif ty == backend.OpType.Flatten:
+                raise Exception("TODO")
+            elif ty == backend.OpType.Reshape:
+                shape = backend.reshape_shape_of(op)
+                inputs.append(
+                    ctx.push_data_input(
+                        name,
+                        "shape",
+                        TensorProto.INT32,
+                        [len(shape)],
+                        shape,
+                    )
+                )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name))
+            elif ty == backend.OpType.Concat:
+                axis = backend.concat_axis_of(op)
+                ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
+            elif ty == backend.OpType.Gather:
+                axis = backend.gather_axis_of(op)
+                ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis))
+            elif ty == backend.OpType.ReduceMean:
+                axes = backend.reduce_mean_axes_of(op)
+                inputs.append(
+                    ctx.push_data_input(
+                        name, "axes", TensorProto.INT32, [len(axes)], axes
+                    )
+                )
+                ctx.push_node(make_node(ty.name, inputs, outputs, name, keepdims=1))
+            elif ty == backend.OpType.Slice:
+                raise Exception("TODO")
+            elif ty == backend.OpType.Pad:
+                raise Exception("TODO")
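+            # A hypothetical sketch of the missing Slice branch, in the same
+            # pattern as the Reshape and ReduceMean branches above (it assumes
+            # a backend accessor `slice_attrs_of`, which does not exist yet):
+            #
+            #   starts, ends, axes = backend.slice_attrs_of(op)
+            #   for attr, vals in (("starts", starts), ("ends", ends), ("axes", axes)):
+            #       inputs.append(ctx.push_data_input(
+            #           name, attr, TensorProto.INT64, [len(vals)], vals))
+            #   ctx.push_node(make_node(ty.name, inputs, outputs, name))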
else: + raise Exception("Unsupported OpType {}".format(ty.name)) + + return ctx.build(name) + + +def from_onnx(model: ModelProto, runtime): + stub = OnnxStub(model, runtime) + return stub.inputs, stub.outputs, stub.handler def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]: diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py index a529fb67..150a96e7 100644 --- a/pyinfinitensor/tests/test_onnx.py +++ b/pyinfinitensor/tests/test_onnx.py @@ -8,7 +8,7 @@ from onnx.helper import ( make_tensor_value_info, ) from onnx.checker import check_model -from pyinfinitensor.onnx import from_onnx, backend, to_onnx, cpu_runtime +from pyinfinitensor.onnx import from_onnx, backend, cpu_runtime def make_and_import_model(graph: onnx.GraphProto): @@ -305,8 +305,6 @@ class TestStringMethods(unittest.TestCase): y = handler.tensor([3, 2, 1], 12) handler.reshape(x, y, [3, 2, 1]) - to_onnx(handler, "test_frontend") - if __name__ == "__main__": unittest.main() diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 53e1376c..87599f28 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -1,4 +1,5 @@ #include "core/graph_handler.h" +#include "operators/batch_norm.h" #include "operators/concat.h" #include "operators/gather.h" #include "operators/reduce_mean.h" @@ -120,6 +121,13 @@ static Shape reshape_shape_of(Operator op) { return dynamic_cast(op.get())->getShape(); } +static std::tuple batch_norm_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::BatchNorm); + auto batchnorm = dynamic_cast(op.get()); + return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(), + batchnorm->getTraining()); +} + void export_functions(py::module &m) { #define FUNCTION(NAME) def(#NAME, &NAME) m.def("cpu_runtime", &CpuRuntimeObj::getInstance) @@ -130,7 +138,8 @@ void export_functions(py::module &m) { .FUNCTION(reshape_shape_of) .FUNCTION(concat_axis_of) .FUNCTION(gather_axis_of) - .FUNCTION(reduce_mean_axes_of); + .FUNCTION(reduce_mean_axes_of) + .FUNCTION(batch_norm_attrs_of); #undef FUNCTION } From 40fb8390b12d4d77b55345c85175017ee2775182 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 16:49:53 +0800 Subject: [PATCH 17/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=85=A5=E6=97=B6?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E6=9D=83=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/tensor.h | 2 +- pyinfinitensor/src/pyinfinitensor/onnx.py | 2 ++ src/ffi/ffi_infinitensor.cc | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/core/tensor.h b/include/core/tensor.h index 1edc950a..c5823be0 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -32,7 +32,7 @@ class TensorObj : public TensorBaseObj { using TensorBaseObj::getData; VType getData(const Shape &pos) const; void dataMalloc(); - UidBaseType getFuid() const { return fuid; } + inline UidBaseType getFuid() const { return fuid; } void load(std::string file_path); void save(std::string file_path); diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 28b8f514..d554883c 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -35,6 +35,7 @@ def cuda_runtime(): class OnnxStub: inputs: Dict[str, backend.Tensor] = {} outputs: Dict[str, backend.Tensor] = {} + initializer: Dict[int, TensorProto] = {} handler: backend.GraphHandler def __init__(self, 
model: ModelProto, runtime): @@ -348,6 +349,7 @@ class OnnxStub: if any(input.name == name for input in model.graph.input): self.inputs[name] = obj else: + self.initializer[obj.fuid()] = tensor if tensor.data_type == TensorProto.INT32: self.handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) elif tensor.data_type == TensorProto.INT64: diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 87599f28..d7230f42 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -154,6 +154,7 @@ void init_graph_builder(py::module &m) { m, "CudaRuntime"); #endif py::class_>(m, "Tensor") + .def("fuid", &TensorObj::getFuid, policy::automatic) .def("shape", &TensorObj::getDims, policy::move) .def("cloneFloats", &TensorObj::cloneFloats, policy::move) .def("has_target", &TensorObj::hasTarget, policy::automatic) From afed749b7423d5b9de3a7d65dd54a3b6923f94c5 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 24 Feb 2023 16:57:37 +0800 Subject: [PATCH 18/29] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E5=AF=BC?= =?UTF-8?q?=E5=87=BA=E6=9D=83=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index d554883c..d20b4ca7 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -22,7 +22,7 @@ from onnx.checker import ( check_tensor, ) from onnx.shape_inference import infer_shapes -from typing import Dict, List, Any, Tuple, Sequence, Union +from typing import Dict, List, Any, Tuple, Sequence, Union, Optional from functools import reduce cpu_runtime = backend.cpu_runtime() @@ -365,7 +365,7 @@ class OnnxStub: def to_onnx(self, name: str) -> ModelProto: class Context: # saves object names, including tensors and operators - names: Dict[Any, str] = dict() + names: Dict[Union[backend.Tensor, backend.Operator], str] = dict() # counts the occurrence times of each operator for naming count_op: Dict[backend.OpType, int] = dict() # counts input and output tensors for naming @@ -396,7 +396,9 @@ class OnnxStub: self.outputs.append(value_info) return name - def push_input(self, tensor: backend.Tensor) -> str: + def push_input( + self, tensor: backend.Tensor, init: Optional[TensorProto] + ) -> str: name = self.names.get(tensor) # means that this input is a global input if name is None: @@ -408,7 +410,9 @@ class OnnxStub: value_info = make_tensor_value_info(name, dtype, shape) check_value_info(value_info) self.inputs.append(value_info) - + if init != None: + init.name = name + self.initializers.append(init) return name def push_data_input( @@ -462,7 +466,10 @@ class OnnxStub: for op in ops: ty, name = ctx.name_op(op) - inputs = [ctx.push_input(it) for it in op.inputs()] + inputs = [ + ctx.push_input(it, self.initializer.get(it.fuid())) + for it in op.inputs() + ] outputs = [ ctx.push_output("{}_{}".format(name, i), it) for (i, it) in enumerate(op.outputs()) From 8a871c3773a5058bad715833e2d713fd6a56dde2 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Mon, 27 Feb 2023 09:15:45 +0800 Subject: [PATCH 19/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=20conv=20?= =?UTF-8?q?=E5=88=B0=20onnx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 13 ++++++++++- 
 src/ffi/ffi_infinitensor.cc               | 27 +++++++++++++++--------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index d20b4ca7..9af10f6e 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -475,7 +475,18 @@ class OnnxStub:
                 for (i, it) in enumerate(op.outputs())
             ]
             if ty == backend.OpType.Conv:
-                raise Exception("TODO")
+                ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op)
+                ctx.push_node(
+                    make_node(
+                        ty.name,
+                        inputs,
+                        outputs,
+                        name,
+                        pads=[ph, pw],
+                        strides=[sh, sw],
+                        dilations=[dh, dw],
+                    )
+                )
             elif ty == backend.OpType.Matmul:
                 ctx.push_node(make_node("MatMul", inputs, outputs, name))
             elif ty == backend.OpType.BatchNorm:
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index d7230f42..49164e07 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -1,6 +1,7 @@
 #include "core/graph_handler.h"
 #include "operators/batch_norm.h"
 #include "operators/concat.h"
+#include "operators/conv.h"
 #include "operators/gather.h"
 #include "operators/reduce_mean.h"
 #include "operators/reshape.h"
 #include <pybind11/stl.h>
@@ -100,6 +101,20 @@ static int tensor_dtype(Tensor t) {
 static Ref<CudaRuntimeObj> cuda_runtime() { return make_ref<CudaRuntimeObj>(); }
 #endif
 
+static std::tuple<int, int, int, int, int, int> conv_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::Conv);
+    auto conv = dynamic_cast<const ConvObj *>(op.get());
+    return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(),
+                           conv->getDw(), conv->getSh(), conv->getSw());
+}
+
+static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
+    IT_ASSERT(op->getOpType() == OpType::BatchNorm);
+    auto batchnorm = dynamic_cast<const BatchNormObj *>(op.get());
+    return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(),
+                           batchnorm->getTraining());
+}
+
 static int concat_axis_of(Operator op) {
     IT_ASSERT(op->getOpType() == OpType::Concat);
     return dynamic_cast<const ConcatObj *>(op.get())->getDim();
@@ -121,25 +136,19 @@ static Shape reshape_shape_of(Operator op) {
     return dynamic_cast<const ReshapeObj *>(op.get())->getShape();
 }
 
-static std::tuple<float, float, bool> batch_norm_attrs_of(Operator op) {
-    IT_ASSERT(op->getOpType() == OpType::BatchNorm);
-    auto batchnorm = dynamic_cast<const BatchNormObj *>(op.get());
-    return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(),
-                           batchnorm->getTraining());
-}
-
 void export_functions(py::module &m) {
 #define FUNCTION(NAME) def(#NAME, &NAME)
     m.def("cpu_runtime", &CpuRuntimeObj::getInstance)
 #ifdef USE_CUDA
         .FUNCTION(cuda_runtime)
 #endif
+        .FUNCTION(conv_attrs_of)
+        .FUNCTION(batch_norm_attrs_of)
         .FUNCTION(tensor_dtype)
         .FUNCTION(reshape_shape_of)
        .FUNCTION(concat_axis_of)
        .FUNCTION(gather_axis_of)
-        .FUNCTION(reduce_mean_axes_of)
-        .FUNCTION(batch_norm_attrs_of);
+        .FUNCTION(reduce_mean_axes_of);
 #undef FUNCTION
 }

From e294e4643608977ebdca7ce005e4069f9292686c Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Mon, 27 Feb 2023 09:32:08 +0800
Subject: [PATCH 20/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=20pool=20?=
 =?UTF-8?q?=E5=88=B0=20onnx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 include/operators/pooling.h               | 26 ++++++++++++----------
 pyinfinitensor/src/pyinfinitensor/onnx.py | 27 +++++++++++++++++++++--
 src/ffi/ffi_infinitensor.cc               | 12 ++++++++++
 3 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/include/operators/pooling.h b/include/operators/pooling.h
index c14bb8ad..1a1a6edf 100644
--- a/include/operators/pooling.h
+++ b/include/operators/pooling.h
@@ -39,20 +39,22 @@ class PoolingObj :
public OperatorObj { optional> inferShape(const TensorVec &inputs) const override; std::string toString() const override; - int numInputs() const override { return 1; } - int numOutputs() const override { return 1; } + inline int numInputs() const override { return 1; } + inline int numOutputs() const override { return 1; } - int getKh() const { return kh; } - int getKw() const { return kw; } - int getDh() const { return dh; } - int getDw() const { return dw; } - int getPh() const { return ph; } - int getPw() const { return pw; } - int getSh() const { return sh; } - int getSw() const { return sw; } + inline int getKh() const { return kh; } + inline int getKw() const { return kw; } + inline int getDh() const { return dh; } + inline int getDw() const { return dw; } + inline int getPh() const { return ph; } + inline int getPw() const { return pw; } + inline int getSh() const { return sh; } + inline int getSw() const { return sw; } - auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); } - auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); } + inline auto getPadStrideDilation() const { + return tuple(ph, pw, sh, sw, dh, dw); + } + inline auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); } private: vector getWorkloadVector() const override; diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 9af10f6e..0915b9bc 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -504,9 +504,32 @@ class OnnxStub: ) ) elif ty == backend.OpType.MaxPool: - raise Exception("TODO") + kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op) + ctx.push_node( + make_node( + ty.name, + inputs, + outputs, + name, + kernel_shape=[kh, kw], + pads=[ph, pw], + dilations=[dh, dw], + strides=[sh, sw], + ) + ) elif ty == backend.OpType.AvgPool: - raise Exception("TODO") + kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op) + ctx.push_node( + make_node( + "AveragePool", + inputs, + outputs, + name, + kernel_shape=[kh, kw], + pads=[ph, pw], + strides=[sh, sw], + ) + ) elif ty in [ backend.OpType.Add, backend.OpType.Sub, diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 49164e07..c9547f1c 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -3,6 +3,7 @@ #include "operators/concat.h" #include "operators/conv.h" #include "operators/gather.h" +#include "operators/pooling.h" #include "operators/reduce_mean.h" #include "operators/reshape.h" #include @@ -115,6 +116,16 @@ static std::tuple batch_norm_attrs_of(Operator op) { batchnorm->getTraining()); } +static std::tuple +pool_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::MaxPool || + op->getOpType() == OpType::AvgPool); + auto pool = dynamic_cast(op.get()); + return std::make_tuple(pool->getKh(), pool->getKw(), pool->getDh(), + pool->getDw(), pool->getPh(), pool->getPw(), + pool->getSh(), pool->getSw()); +} + static int concat_axis_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::Concat); return dynamic_cast(op.get())->getDim(); @@ -144,6 +155,7 @@ void export_functions(py::module &m) { #endif .FUNCTION(conv_attrs_of) .FUNCTION(batch_norm_attrs_of) + .FUNCTION(pool_attrs_of) .FUNCTION(tensor_dtype) .FUNCTION(reshape_shape_of) .FUNCTION(concat_axis_of) From 6e1af09dd014cfb9bc4b7e71bf937923fa90b43d Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Mon, 27 Feb 2023 09:35:17 +0800 Subject: [PATCH 21/29] fix: remove print Signed-off-by: YdrMaster --- 
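Note: with the debug prints gone, `build` just assembles and checks the
exported model, so an import/export round trip runs quietly. A minimal usage
sketch (the file name "model.onnx" and the model name "round_trip" are
placeholders):

    import onnx
    from pyinfinitensor.onnx import OnnxStub, cpu_runtime

    stub = OnnxStub(onnx.load("model.onnx"), cpu_runtime)
    exported = stub.to_onnx("round_trip")  # build runs check_model internally
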
pyinfinitensor/src/pyinfinitensor/onnx.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 0915b9bc..ffb61529 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -437,15 +437,6 @@ class OnnxStub: self.nodes.append(node) def build(self, name: str) -> ModelProto: - print() - print(ctx.names) - print() - print(ctx.inputs) - print() - print(ctx.outputs) - print() - print(ctx.nodes) - graph = make_graph( self.nodes, name, self.inputs, self.outputs, self.initializers ) From c18845a2fdfbf30b068c452962217393b9703886 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Mon, 27 Feb 2023 10:03:06 +0800 Subject: [PATCH 22/29] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index ffb61529..1f6afe19 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -572,6 +572,24 @@ class OnnxStub: return ctx.build(name) + def init(self) -> None: + self.handler.data_malloc() + + def run(self) -> None: + self.handler.run() + + def put_int32(self, name: str) -> None: + self.handler.copy_int32(self.inputs[name]) + + def put_int64(self, name: str) -> None: + self.handler.copy_int64(self.inputs[name]) + + def put_float(self, name: str) -> None: + self.handler.copy_float(self.inputs[name]) + + def take_float(self) -> List[float]: + return next(self.handler.outputs.values()).copyFloats() + def from_onnx(model: ModelProto, runtime): stub = OnnxStub(model, runtime) From e1c976568d408bd2dc071c6f49dad997fec23312 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Mon, 27 Feb 2023 10:13:55 +0800 Subject: [PATCH 23/29] =?UTF-8?q?fix:=20=E5=A2=9E=E5=8A=A0=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- pyinfinitensor/src/pyinfinitensor/onnx.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 1f6afe19..f85addb4 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -578,17 +578,17 @@ class OnnxStub: def run(self) -> None: self.handler.run() - def put_int32(self, name: str) -> None: - self.handler.copy_int32(self.inputs[name]) + def put_int32(self, name: str, data: List[int]) -> None: + self.handler.copy_int32(self.inputs[name], data) - def put_int64(self, name: str) -> None: - self.handler.copy_int64(self.inputs[name]) + def put_int64(self, name: str, data: List[int]) -> None: + self.handler.copy_int64(self.inputs[name], data) - def put_float(self, name: str) -> None: - self.handler.copy_float(self.inputs[name]) + def put_float(self, name: str, data: List[int]) -> None: + self.handler.copy_float(self.inputs[name], data) def take_float(self) -> List[float]: - return next(self.handler.outputs.values()).copyFloats() + return next(iter(self.handler.outputs.values())).copyFloats() def from_onnx(model: ModelProto, runtime): From 9db97eb212a08fcd19ce3db8a7dcaf411b286135 Mon Sep 17 00:00:00 2001 From: 
YdrMaster Date: Tue, 21 Mar 2023 14:00:04 +0800 Subject: [PATCH 24/29] =?UTF-8?q?refactor:=20=E6=95=B4=E5=90=88=E6=93=8D?= =?UTF-8?q?=E4=BD=9C=E5=BC=A0=E9=87=8F=E6=95=B0=E6=8D=AE=E7=9A=84=E6=96=B9?= =?UTF-8?q?=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/graph_handler.h | 6 +- include/core/tensor.h | 72 ++++++++-------- include/core/tensor_base.h | 1 - src/core/tensor.cc | 24 ++---- src/core/tensor_base.cc | 7 +- src/ffi/ffi_infinitensor.cc | 2 +- src/kernels/cpu/membound.cc | 4 +- src/utils/dataloader.cc | 4 +- test/core/test_graph.cc | 20 ++--- test/core/test_search.cc | 4 +- test/core/test_tensor_save.cc | 8 +- test/kernels/cuda/test_cuda_batch_norm.cc | 4 +- test/kernels/cuda/test_cuda_gather.cc | 8 +- test/kernels/cuda/test_cuda_reduce_mean.cc | 2 +- test/kernels/cuda/test_cuda_resize.cc | 96 +++++++++++----------- test/operators/test_conv.cc | 2 +- test/operators/test_resize.cc | 12 +-- 17 files changed, 133 insertions(+), 143 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index ff702cf8..72f7a6a3 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -90,15 +90,15 @@ class GraphHandlerObj { inline void data_malloc() { g->dataMalloc(); } inline void copy_int32(Tensor tensor, std::vector list) { - tensor->copyData(list); + tensor->copyin(list); } inline void copy_int64(Tensor tensor, std::vector list) { - tensor->copyData(list); + tensor->copyin(list); } inline void copy_float(Tensor tensor, std::vector list) { - tensor->copyData(list); + tensor->copyin(list); } inline void run() { g->getRuntime()->run(g); } diff --git a/include/core/tensor.h b/include/core/tensor.h index c5823be0..af3ed3f6 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -15,39 +15,61 @@ using Shape = vector; class TensorObj : public TensorBaseObj { private: Shape shape; - Fuid fuid; // Cloned tensors share the same id. Tensors constructed from - // scratch have a new id. + size_t _size; // Cache of Π(shape). + Fuid fuid; // Cloned tensors share the same id. Tensors constructed from + // scratch have a new id. + + inline void copyin(const void *ptr, size_t size) { + runtime->copyBlobFromCPU(getRawDataPtr(), ptr, size); + } + inline void copyout(void *ptr, size_t size) const { + runtime->copyBlobToCPU(ptr, getRawDataPtr(), size); + } public: - TensorObj(const Shape &shape, DataType dtype, Runtime runtime); + TensorObj(Shape shape, DataType dtype, Runtime runtime); virtual ~TensorObj() {} string toString() const override; - size_t size() const; - size_t getBytes() const; + inline size_t size() const { return _size; } + inline size_t getBytes() const { return _size * dtype.getSize(); } Shape getDims() const { return shape; } vector getStride() const; - size_t getOffset(const Shape &ds) const; - using TensorBaseObj::getData; - VType getData(const Shape &pos) const; + size_t getOffset(const vector &ds) const; void dataMalloc(); inline UidBaseType getFuid() const { return fuid; } void load(std::string file_path); void save(std::string file_path); - template void copyData(const T *dptr) { + // Copy elements from `data`. + template inline void copyin(const vector &data) { IT_ASSERT(DataType::get() == dtype); - IT_ASSERT(data != nullptr); - runtime->copyBlobFromCPU(getRawDataPtr(), dptr, getBytes()); + IT_ASSERT(data.size() >= _size); + copyin(data.data(), getBytes()); + } + // Copy all the elements to a vector. 
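+    // e.g. a host-side read of a Float32 tensor after dataMalloc() (sketch):
+    //   std::vector<float> host = tensor->copyout<float>();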
+ template inline auto copyout() const { + IT_ASSERT(DataType::get() == dtype); + std::vector ans(_size); + copyout(ans.data(), getBytes()); + return ans; + } + // Copy the element at `pos`. + template inline auto copyout(const vector &pos) const { + IT_ASSERT(DataType::get() == dtype); + auto offset = getOffset(pos); + auto bytes = dtype.getSize(); + T ans; + runtime->copyBlobToCPU(&ans, getRawDataPtr() + offset * bytes, + bytes); + return ans; } - template void copyData(vector dataVector) { - IT_ASSERT(DataType::get() == dtype); - IT_ASSERT(dataVector.size() >= size()); - copyData(dataVector.data()); - } + inline auto copyoutFloat() const { return copyout(); } + inline auto copyoutInt32() const { return copyout(); } + inline auto copyoutInt64() const { return copyout(); } void copyData(const TensorObj *src); void copyData(const Tensor &src) { copyData(src.get()); } @@ -72,24 +94,6 @@ class TensorObj : public TensorBaseObj { } return obj; } - inline std::vector cloneFloats() const { - IT_ASSERT(data != nullptr); - IT_ASSERT(getDType() == DataType::Float32); - std::vector ans(size()); - auto src = getRawDataPtr(); - auto dst = ans.data(); - auto bytes = getBytes(); - if (runtime->isCpu()) { - memcpy(dst, src, bytes); - } else { -#if USE_CUDA - cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); -#else - IT_TODO_HALT(); -#endif - } - return ans; - } void printData() const; bool equalData(const Tensor &rhs) const; diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h index 09286a63..4c73094a 100644 --- a/include/core/tensor_base.h +++ b/include/core/tensor_base.h @@ -41,7 +41,6 @@ class TensorBaseObj : public Object { IT_ASSERT(data != nullptr); return data->getPtr(); } - VType getData(size_t offset) const; DataType getDType() const { return dtype; } Runtime getRuntime() const { return runtime; } diff --git a/src/core/tensor.cc b/src/core/tensor.cc index 5f04e114..b8a41728 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -3,15 +3,16 @@ #include "core/operator.h" #include "core/runtime.h" #include "utils/dataloader.h" +#include namespace infini { -TensorObj::TensorObj(const Shape &shape, DataType dtype, Runtime runtime) - : TensorBaseObj(shape.size(), dtype, runtime), shape(shape) {} - -VType TensorObj::getData(const Shape &pos) const { - return getData(getOffset(pos)); -} +TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime) + : TensorBaseObj(shape.size(), dtype, runtime), shape(std::move(shape_)), + _size(shape.empty() + ? 
0 + : std::accumulate(shape.begin(), shape.end(), 1, + [](auto acc, auto x) { return acc * x; })) {} string TensorObj::toString() const { string ret = "Tensor " + std::to_string(guid) + ", Fuid " + @@ -28,7 +29,7 @@ string TensorObj::toString() const { return ret; } -size_t TensorObj::getOffset(const Shape &pos) const { +size_t TensorObj::getOffset(const vector &pos) const { auto nDim = pos.size(); IT_ASSERT(shape.size() == nDim); if (pos.empty()) @@ -53,15 +54,6 @@ vector TensorObj::getStride() const { return ret; } -size_t TensorObj::size() const { - size_t ret = 1; - for (const auto &d : shape) - ret *= d; - return ret; -} - -size_t TensorObj::getBytes() const { return size() * dtype.getSize(); } - void TensorObj::printData() const { IT_ASSERT(data != nullptr); if (!runtime->isCpu()) diff --git a/src/core/tensor_base.cc b/src/core/tensor_base.cc index 98409322..62612cc8 100644 --- a/src/core/tensor_base.cc +++ b/src/core/tensor_base.cc @@ -6,9 +6,4 @@ namespace infini { TensorBaseObj::TensorBaseObj(int dim, DataType dtype, Runtime runtime) : dim(dim), dtype(dtype), runtime(runtime) {} -VType TensorBaseObj::getData(size_t offset) const { - // TODO: check cuda array - return (data->getPtr())[offset]; -} - -}; // namespace infini \ No newline at end of file +}; // namespace infini diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index c9547f1c..641c2c3b 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -177,7 +177,7 @@ void init_graph_builder(py::module &m) { py::class_>(m, "Tensor") .def("fuid", &TensorObj::getFuid, policy::automatic) .def("shape", &TensorObj::getDims, policy::move) - .def("cloneFloats", &TensorObj::cloneFloats, policy::move) + .def("copyoutFloat", &TensorObj::copyoutFloat, policy::move) .def("has_target", &TensorObj::hasTarget, policy::automatic) .def("src", &TensorObj::getOutputOf, policy::move); py::class_>(m, "Operator") diff --git a/src/kernels/cpu/membound.cc b/src/kernels/cpu/membound.cc index 361ee5ac..31755389 100644 --- a/src/kernels/cpu/membound.cc +++ b/src/kernels/cpu/membound.cc @@ -64,7 +64,7 @@ class MemboundInterpreter : public Kernel { vector valsUint(vals.size()); for (size_t i = 0; i < vals.size(); ++i) valsUint[i] = (uint32_t)vals[i]; - output->copyData(valsUint); + output->copyin(valsUint); } void compute(const Operator &op, const RuntimeObj *context) const override { @@ -81,4 +81,4 @@ class MemboundInterpreter : public Kernel { REGISTER_KERNEL(Device::CPU, OpType::MemBound, DataType::UInt32, MemboundInterpreter, "MemboundInterpreter_CPU"); -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/src/utils/dataloader.cc b/src/utils/dataloader.cc index 1de0cd8d..73ce34fc 100644 --- a/src/utils/dataloader.cc +++ b/src/utils/dataloader.cc @@ -59,13 +59,13 @@ void loadTensorData(TensorObj *tensor, std::string file_path) { for (int i = 0; i < temp.data_float_size(); ++i) { data_temp.push_back(temp.data_float(i)); } - tensor->copyData(data_temp); + tensor->copyin(data_temp); } else if (tensor->getDType() == DataType::UInt32) { std::vector data_temp; for (int i = 0; i < temp.data_uint32_size(); ++i) { data_temp.push_back(temp.data_uint32(i)); } - tensor->copyData(data_temp); + tensor->copyin(data_temp); } else { IT_TODO_HALT(); } diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc index 65bcf68a..8140ea43 100644 --- a/test/core/test_graph.cc +++ b/test/core/test_graph.cc @@ -15,8 +15,8 @@ TEST(Graph, build_and_run) { Tensor w0 = g->addTensor({1, 3, 4}, 
DataType::UInt32); Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); g->dataMalloc(); - i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto matmul = g->addOpWithOutputs(i0, w0, o0); g->print(); // check inputOf and outputsOf for tensor @@ -33,7 +33,7 @@ TEST(Graph, build_and_run) { // check execution results auto ans = make_ref(Shape{1, 2, 4}, DataType::UInt32, runtime); ans->dataMalloc(); - ans->copyData(vector{38, 44, 50, 56, 83, 98, 113, 128}); + ans->copyin(vector{38, 44, 50, 56, 83, 98, 113, 128}); EXPECT_TRUE(o0->equalData(ans)); } @@ -84,8 +84,8 @@ TEST(Graph, perf_engine) { auto matmul = g->addOp(i0, w0, nullptr); g->dataMalloc(); - i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); runtime->run(g, true, true); double perfTime = runtime->getPerfTime(g); // The example matmul takes 0.0036ms with one core @@ -94,7 +94,7 @@ TEST(Graph, perf_engine) { // check answer auto ans = make_ref(Shape{1, 2, 4}, DataType::UInt32, runtime); ans->dataMalloc(); - ans->copyData(vector{38, 44, 50, 56, 83, 98, 113, 128}); + ans->copyin(vector{38, 44, 50, 56, 83, 98, 113, 128}); EXPECT_TRUE(matmul->getOutput()->equalData(ans)); } @@ -105,8 +105,8 @@ TEST(Graph, test_tensor_id) { Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); g->dataMalloc(); - i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto i1 = g->addTensor(i0->clone()); auto matmul = g->addOpWithOutputs(i0, w0, o0); g->print(); @@ -123,8 +123,8 @@ TEST(Graph, test_OpVec_ctor) { Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); g->dataMalloc(); - i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto o1 = g->addTensor(o0->clone()); auto matmul = g->addOpWithOutputs(i0, w0, o0); g->addOp(o1, nullptr); diff --git a/test/core/test_search.cc b/test/core/test_search.cc index 0e882182..5f531c21 100644 --- a/test/core/test_search.cc +++ b/test/core/test_search.cc @@ -19,8 +19,8 @@ namespace infini { // Tensor w0 = g->addTensor({1, 3, 4}, DataType::UInt32); // Tensor o0 = g->addTensor({1, 2, 4}, DataType::UInt32); // g->dataMalloc(); -// i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); -// w0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); +// i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); +// w0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); // auto matmul = g->addOpWithOutputs(i0, w0, o0); // g->print(); // // check inputOf and outputsOf for tensor diff --git a/test/core/test_tensor_save.cc b/test/core/test_tensor_save.cc index d310ab84..086e6455 100644 --- a/test/core/test_tensor_save.cc +++ b/test/core/test_tensor_save.cc @@ -14,10 +14,10 @@ TEST(Prtotbuf, save_and_load) { Tensor u0 = g->addTensor({1, 3, 4}, 
DataType::UInt32); Tensor u1 = g->addTensor({1, 3, 4}, DataType::UInt32); g->dataMalloc(); - i0->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - w0->copyData(vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - u0->copyData(vector{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 0, 0}); - u1->copyData(vector{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}); + i0->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + w0->copyin(vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + u0->copyin(vector{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 0, 0}); + u1->copyin(vector{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}); i0->save("i0.pb"); w0->printData(); w0->load("i0.pb"); diff --git a/test/kernels/cuda/test_cuda_batch_norm.cc b/test/kernels/cuda/test_cuda_batch_norm.cc index e3a5ef04..0f92710d 100644 --- a/test/kernels/cuda/test_cuda_batch_norm.cc +++ b/test/kernels/cuda/test_cuda_batch_norm.cc @@ -22,8 +22,8 @@ TEST(CUDA_BatchNorm, run) { // Build input data on CPU gCpu->dataMalloc(); iCpu->setData(IncrementalGenerator()); - meanCpu->copyData(vector{1, 6, 9}); - varCpu->copyData(vector{4, 1, 9}); + meanCpu->copyin(vector{1, 6, 9}); + varCpu->copyin(vector{4, 1, 9}); scaleCpu->setData(OneGenerator()); biasCpu->setData(ZeroGenerator()); diff --git a/test/kernels/cuda/test_cuda_gather.cc b/test/kernels/cuda/test_cuda_gather.cc index 9108d719..807ea0ff 100644 --- a/test/kernels/cuda/test_cuda_gather.cc +++ b/test/kernels/cuda/test_cuda_gather.cc @@ -181,8 +181,8 @@ TEST(Gather, Cuda) { auto input = gCpu->addTensor({3, 2}, DataType::Float32); auto index = gCpu->addTensor({2, 2}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4, 5, 6}); - index->copyData(vector{0, 1, 1, 2}); + input->copyin(vector{1, 2, 3, 4, 5, 6}); + index->copyin(vector{0, 1, 1, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -203,7 +203,7 @@ TEST(Gather, Cuda) { auto index = gCpu->addTensor({1, 2}, DataType::UInt32); gCpu->dataMalloc(); input->setData(IncrementalGenerator()); - index->copyData(vector{0, 2}); + index->copyin(vector{0, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -224,7 +224,7 @@ TEST(Gather, Cuda) { auto index = gCpu->addTensor({3, 1}, DataType::UInt32); gCpu->dataMalloc(); input->setData(IncrementalGenerator()); - index->copyData(vector{0, 3, 1}); + index->copyin(vector{0, 3, 1}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); diff --git a/test/kernels/cuda/test_cuda_reduce_mean.cc b/test/kernels/cuda/test_cuda_reduce_mean.cc index 90356994..ff309635 100644 --- a/test/kernels/cuda/test_cuda_reduce_mean.cc +++ b/test/kernels/cuda/test_cuda_reduce_mean.cc @@ -18,7 +18,7 @@ void test_reducemean(const Shape &shape, const vector &data, // Build input data on CPU Tensor icpu = make_ref(shape, DataType::Float32, cpuRuntime); icpu->dataMalloc(); - icpu->copyData(data); + icpu->copyin(data); // Build CUDA graph Graph g = make_ref(cudaRuntime); diff --git a/test/kernels/cuda/test_cuda_resize.cc b/test/kernels/cuda/test_cuda_resize.cc index 913f8acd..b462f516 100644 --- a/test/kernels/cuda/test_cuda_resize.cc +++ b/test/kernels/cuda/test_cuda_resize.cc @@ -13,8 +13,8 @@ TEST(Resize, Cuda_downsample_sizes_nearest) { auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8}); - sizes->copyData(vector{1, 1, 1, 3}); + input->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + sizes->copyin(vector{1, 1, 1, 3}); auto cudaRuntime = make_ref(); Graph gCuda = 
make_ref(cudaRuntime); @@ -38,8 +38,8 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notlarger) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto sizes = gCpu->addTensor({2}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - sizes->copyData(vector{7, 8}); + input->copyin(vector{1, 2, 3, 4}); + sizes->copyin(vector{7, 8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -68,8 +68,8 @@ TEST(Resize, Cuda_upsample_sizes_nearest_notsmaller) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto sizes = gCpu->addTensor({2}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - sizes->copyData(vector{7, 8}); + input->copyin(vector{1, 2, 3, 4}); + sizes->copyin(vector{7, 8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -98,9 +98,9 @@ TEST(Resize, Cuda_upsample_sizes_nearest_ceil_half_pixel) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 8, 8}); + sizes->copyin(vector{1, 1, 8, 8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -131,9 +131,9 @@ TEST(Resize, Cuda_upsample_sizes_nearest_floor_align_corners) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({2}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{8, 8}); + sizes->copyin(vector{8, 8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -164,9 +164,9 @@ TEST(Resize, Cuda_upsample_sizes_nearest_round_prefer_ceil_asymmetri) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 8, 8}); + sizes->copyin(vector{1, 1, 8, 8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -197,8 +197,8 @@ TEST(Resize, Cuda_downsample_scales_nearest) { auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8}); - scales->copyData(vector{1, 1, 0.6, 0.6}); + input->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scales->copyin(vector{1, 1, 0.6, 0.6}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -221,8 +221,8 @@ TEST(Resize, Cuda_upsample_scales_nearest) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - scales->copyData(vector{1, 1, 2, 3}); + input->copyin(vector{1, 2, 3, 4}); + scales->copyin(vector{1, 1, 2, 3}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -247,8 +247,8 @@ TEST(Resize, Cuda_upsample_scales_nearest_axes_3_2) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto scales = gCpu->addTensor({2}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - scales->copyData(vector{3, 2}); + input->copyin(vector{1, 2, 3, 4}); + scales->copyin(vector{3, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -273,8 
+273,8 @@ TEST(Resize, Cuda_downsample_scales_linear) { auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8}); - scales->copyData(vector{1, 1, 0.6, 0.6}); + input->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scales->copyin(vector{1, 1, 0.6, 0.6}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -297,8 +297,8 @@ TEST(Resize, Cuda_downsample_scales_linear_aligncorners) { auto input = gCpu->addTensor({1, 1, 2, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4, 5, 6, 7, 8}); - scales->copyData(vector{1, 1, 0.6, 0.6}); + input->copyin(vector{1, 2, 3, 4, 5, 6, 7, 8}); + scales->copyin(vector{1, 1, 0.6, 0.6}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -323,8 +323,8 @@ TEST(Resize, Cuda_upsample_scales_linear) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - scales->copyData(vector{1, 1, 2, 2}); + input->copyin(vector{1, 2, 3, 4}); + scales->copyin(vector{1, 1, 2, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -349,8 +349,8 @@ TEST(Resize, Cuda_upsample_scales_linear_align_corners) { auto input = gCpu->addTensor({1, 1, 2, 2}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData(vector{1, 2, 3, 4}); - scales->copyData(vector{1, 1, 2, 2}); + input->copyin(vector{1, 2, 3, 4}); + scales->copyin(vector{1, 1, 2, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -377,9 +377,9 @@ TEST(Resize, Cuda_downsample_sizes_linear_pytorchhalfpixel) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 3, 1}); + sizes->copyin(vector{1, 1, 3, 1}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -406,10 +406,10 @@ TEST(Resize, Cuda_tf_crop_and_resize) { auto sizes = gCpu->addTensor({4}, DataType::UInt32); auto roi = gCpu->addTensor({8}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 3, 3}); - roi->copyData(vector{0, 0, 0.4, 0.6, 1, 1, 0.6, 0.8}); + sizes->copyin(vector{1, 1, 3, 3}); + roi->copyin(vector{0, 0, 0.4, 0.6, 1, 1, 0.6, 0.8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -437,10 +437,10 @@ TEST(Resize, Cuda_tf_crop_and_resize_axes_3_2) { auto sizes = gCpu->addTensor({2}, DataType::UInt32); auto roi = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{3, 3}); - roi->copyData(vector{0.6, 0.4, 0.8, 0.6}); + sizes->copyin(vector{3, 3}); + roi->copyin(vector{0.6, 0.4, 0.8, 0.6}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -467,9 +467,9 @@ TEST(Resize, Cuda_downsample_scales_cubic) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - scales->copyData(vector{1.0, 1.0, 0.8, 0.8}); + scales->copyin(vector{1.0, 1.0, 0.8, 0.8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -494,9 +494,9 @@ TEST(Resize, Cuda_downsample_scales_cubic_align_corners) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - scales->copyData(vector{1.0, 1.0, 0.8, 0.8}); + scales->copyin(vector{1.0, 1.0, 0.8, 0.8}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -522,9 +522,9 @@ TEST(Resize, Cuda_upsample_scales_cubic) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - scales->copyData(vector{1.0, 1.0, 2, 2}); + scales->copyin(vector{1.0, 1.0, 2, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -559,9 +559,9 @@ TEST(Resize, Cuda_upsample_scales_cubic_align_corners) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - scales->copyData(vector{1.0, 1.0, 2, 2}); + scales->copyin(vector{1.0, 1.0, 2, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -598,9 +598,9 @@ TEST(Resize, Cuda_upsample_scales_cubic_asymmetric) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto scales = gCpu->addTensor({4}, DataType::Float32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - scales->copyData(vector{1.0, 1.0, 2, 2}); + scales->copyin(vector{1.0, 1.0, 2, 2}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -633,9 +633,9 @@ TEST(Resize, Cuda_downsample_sizes_cubic) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 3, 3}); + sizes->copyin(vector{1, 1, 3, 3}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); @@ -667,9 +667,9 @@ TEST(Resize, Cuda_upsample_sizes_cubic) { auto input = gCpu->addTensor({1, 1, 4, 4}, DataType::Float32); auto sizes = gCpu->addTensor({4}, DataType::UInt32); gCpu->dataMalloc(); - input->copyData( + input->copyin( vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - sizes->copyData(vector{1, 1, 9, 10}); + sizes->copyin(vector{1, 1, 9, 10}); auto cudaRuntime = make_ref(); Graph gCuda = make_ref(cudaRuntime); diff --git a/test/operators/test_conv.cc b/test/operators/test_conv.cc index 059fa9af..420b4ab8 100644 --- a/test/operators/test_conv.cc +++ b/test/operators/test_conv.cc @@ -61,7 +61,7 @@ TEST(Conv, NaiveCPU) { auto ans = make_ref(Shape{1, 2, 2, 2}, DataType::UInt32, runtime); ans->dataMalloc(); - ans->copyData( + ans->copyin( vector{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656}); EXPECT_TRUE(conv->getOutput()->equalData(ans)); } diff --git a/test/operators/test_resize.cc b/test/operators/test_resize.cc index 37e2fc7d..1c36da20 100644 --- a/test/operators/test_resize.cc 
+++ b/test/operators/test_resize.cc @@ -12,7 +12,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 1, 2, 4}, DataType::UInt32); Tensor sizes = g->addTensor({4}, DataType::UInt32); sizes->dataMalloc(); - sizes->copyData(vector{1, 1, 1, 3}); + sizes->copyin(vector{1, 1, 1, 3}); auto op = g->addOp( i, nullptr, std::nullopt, sizes, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch); @@ -24,7 +24,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 1, 2, 4}, DataType::UInt32); Tensor sizes = g->addTensor({2}, DataType::UInt32); sizes->dataMalloc(); - sizes->copyData(vector{1, 3}); + sizes->copyin(vector{1, 3}); auto op = g->addOp( i, nullptr, vector{2, 3}, sizes, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::stretch); @@ -36,7 +36,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 3, 2, 4}, DataType::UInt32); Tensor sizes = g->addTensor({2}, DataType::UInt32); sizes->dataMalloc(); - sizes->copyData(vector{7, 8}); + sizes->copyin(vector{7, 8}); auto op = g->addOp( i, nullptr, vector{2, 3}, sizes, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::notLarger); @@ -48,7 +48,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 3, 2, 4}, DataType::UInt32); Tensor sizes = g->addTensor({3}, DataType::UInt32); sizes->dataMalloc(); - sizes->copyData(vector{2, 6, 8}); + sizes->copyin(vector{2, 6, 8}); auto op = g->addOp( i, nullptr, vector{1, 2, 3}, sizes, nullptr, nullptr, ResizeObj::EKeepAspectRatioPolicy::notSmaller); @@ -60,7 +60,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 1, 4, 4}, DataType::UInt32); Tensor scales = g->addTensor({3}, DataType::Float32); scales->dataMalloc(); - scales->copyData(vector{1, 0.8, 0.8}); + scales->copyin(vector{1, 0.8, 0.8}); auto op = g->addOp(i, nullptr, vector{1, 2, 3}, nullptr, scales, nullptr); EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 3, 3})); @@ -71,7 +71,7 @@ TEST(Resize, ShapeInference) { Tensor i = g->addTensor({1, 1, 2, 2}, DataType::UInt32); Tensor scales = g->addTensor({4}, DataType::Float32); scales->dataMalloc(); - scales->copyData(vector{1, 1, 2, 2}); + scales->copyin(vector{1, 1, 2, 2}); auto op = g->addOp(i, nullptr, std::nullopt, nullptr, scales, nullptr); EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 4, 4})); From 73e895b8ce4bb8488ef4527c34498fdda95953c1 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Tue, 21 Mar 2023 14:40:13 +0800 Subject: [PATCH 25/29] =?UTF-8?q?feat:=20=E5=AF=BC=E5=87=BA=E6=8B=B7?= =?UTF-8?q?=E5=87=BA=E5=BC=A0=E9=87=8F=E5=80=BC=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- src/ffi/ffi_infinitensor.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 641c2c3b..390ecb2d 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -178,6 +178,8 @@ void init_graph_builder(py::module &m) { .def("fuid", &TensorObj::getFuid, policy::automatic) .def("shape", &TensorObj::getDims, policy::move) .def("copyoutFloat", &TensorObj::copyoutFloat, policy::move) + .def("copyoutInt32", &TensorObj::copyoutInt32, policy::move) + .def("copyoutInt64", &TensorObj::copyoutInt64, policy::move) .def("has_target", &TensorObj::hasTarget, policy::automatic) .def("src", &TensorObj::getOutputOf, policy::move); py::class_>(m, "Operator") From 5aeacedab3deedd6e2a5f327f29f66346f03d01e Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Wed, 22 Mar 2023 09:46:40 +0800 Subject: [PATCH 26/29] 
=?UTF-8?q?fix:=20=E4=BB=8E=E6=A8=A1=E6=9D=BF?= =?UTF-8?q?=E5=AF=BC=E5=87=BA=E6=AF=8F=E4=B8=AA=E7=B1=BB=E5=9E=8B=E7=9A=84?= =?UTF-8?q?=20python=20=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- include/core/graph_handler.h | 12 ------------ include/core/tensor.h | 10 +++------- pyinfinitensor/src/pyinfinitensor/onnx.py | 12 ++++++------ src/ffi/ffi_infinitensor.cc | 14 +++++++------- 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index 72f7a6a3..9936c637 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -89,18 +89,6 @@ class GraphHandlerObj { inline void data_malloc() { g->dataMalloc(); } - inline void copy_int32(Tensor tensor, std::vector list) { - tensor->copyin(list); - } - - inline void copy_int64(Tensor tensor, std::vector list) { - tensor->copyin(list); - } - - inline void copy_float(Tensor tensor, std::vector list) { - tensor->copyin(list); - } - inline void run() { g->getRuntime()->run(g); } }; diff --git a/include/core/tensor.h b/include/core/tensor.h index af3ed3f6..ed95f5f5 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -57,20 +57,16 @@ class TensorObj : public TensorBaseObj { return ans; } // Copy the element at `pos`. - template inline auto copyout(const vector &pos) const { + template inline auto copyOne(const vector &pos) const { IT_ASSERT(DataType::get() == dtype); auto offset = getOffset(pos); auto bytes = dtype.getSize(); T ans; - runtime->copyBlobToCPU(&ans, getRawDataPtr() + offset * bytes, - bytes); + runtime->copyBlobToCPU( + &ans, getRawDataPtr() + offset * bytes, bytes); return ans; } - inline auto copyoutFloat() const { return copyout(); } - inline auto copyoutInt32() const { return copyout(); } - inline auto copyoutInt64() const { return copyout(); } - void copyData(const TensorObj *src); void copyData(const Tensor &src) { copyData(src.get()); } void setData( diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index f85addb4..4ec4f365 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -101,7 +101,7 @@ class OnnxStub: (alpha, beta, transA, transB) = ( attributes[name] for name in ["alpha", "beta", "transA", "transB"] ) - # TODO 不支持这些参数 + # FIXME unsupport attributes: `alpha` `beta` assert alpha == 1.0 assert beta == 1.0 tensors[node.output[0]] = self.handler.matmul( @@ -265,7 +265,7 @@ class OnnxStub: tensors.get(node.output[0]), ) elif node.op_type == "Flatten": - # TODO 后端算子不支持沿任意轴展开 + # FIXME axis must be 1 axis = next( (attr.i for attr in node.attribute if attr.name == "axis"), None ) @@ -315,7 +315,7 @@ class OnnxStub: next((attr.i for attr in node.attribute if attr.name == "axis")), ) elif node.op_type == "ReduceMean": - tensors[node.output[0]] = self.handler.reduceMean( + tensors[node.output[0]] = self.handler.reduce_mean( tensors[node.input[0]], tensors.get(node.output[0]), tensors[node.input[1]] if len(node.input) > 1 else None, @@ -351,11 +351,11 @@ class OnnxStub: else: self.initializer[obj.fuid()] = tensor if tensor.data_type == TensorProto.INT32: - self.handler.copy_int32(obj, [int(i) for i in tensor.int32_data]) + obj.copyin_int32([int(i) for i in tensor.int32_data]) elif tensor.data_type == TensorProto.INT64: - self.handler.copy_int64(obj, [int(i) for i in tensor.int64_data]) + obj.copyin_int64([int(i) for i in tensor.int64_data]) elif 
tensor.data_type == TensorProto.FLOAT:
-                    self.handler.copy_float(obj, [float(i) for i in tensor.float_data])
+                    obj.copyin_float([float(i) for i in tensor.float_data])
                 else:
                     assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 390ecb2d..627be8bf 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -177,9 +177,12 @@ void init_graph_builder(py::module &m) {
     py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
         .def("fuid", &TensorObj::getFuid, policy::automatic)
         .def("shape", &TensorObj::getDims, policy::move)
-        .def("copyoutFloat", &TensorObj::copyoutFloat, policy::move)
-        .def("copyoutInt32", &TensorObj::copyoutInt32, policy::move)
-        .def("copyoutInt64", &TensorObj::copyoutInt64, policy::move)
+        .def("copyin_float", &TensorObj::copyin<float>, policy::move)
+        .def("copyin_int32", &TensorObj::copyin<int32_t>, policy::move)
+        .def("copyin_int64", &TensorObj::copyin<int64_t>, policy::move)
+        .def("copyout_float", &TensorObj::copyout<float>, policy::move)
+        .def("copyout_int32", &TensorObj::copyout<int32_t>, policy::move)
+        .def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
         .def("has_target", &TensorObj::hasTarget, policy::automatic)
         .def("src", &TensorObj::getOutputOf, policy::move);
     py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
@@ -212,15 +215,12 @@ void init_graph_builder(py::module &m) {
         .def("reshape", &Handler::reshape, policy::move)
         .def("concat", &Handler::concat, policy::move)
         .def("gather", &Handler::gather, policy::move)
-        .def("reduceMean", &Handler::reduceMean, policy::move)
+        .def("reduce_mean", &Handler::reduceMean, policy::move)
         .def("slice", &Handler::slice, policy::move)
         .def("pad", &Handler::pad, policy::move)
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
         .def("operators", &Handler::operators, policy::move)
         .def("data_malloc", &Handler::data_malloc, policy::automatic)
-        .def("copy_int32", &Handler::copy_int32, policy::automatic)
-        .def("copy_int64", &Handler::copy_int64, policy::automatic)
-        .def("copy_float", &Handler::copy_float, policy::automatic)
         .def("run", &Handler::run, policy::automatic);
 }

From 64a5de51f32834c1ae5395fc4d37a2df35681852 Mon Sep 17 00:00:00 2001
From: wanghailu
Date: Mon, 27 Feb 2023 10:22:03 +0800
Subject: [PATCH 27/29] fix

---
 pyinfinitensor/src/pyinfinitensor/onnx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index 4ec4f365..e9424ab3 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -588,7 +588,7 @@ class OnnxStub:
         self.handler.copy_float(self.inputs[name], data)
 
     def take_float(self) -> List[float]:
-        return next(iter(self.handler.outputs.values())).copyFloats()
+        return next(iter(self.outputs.values())).cloneFloats()
 
 
 def from_onnx(model: ModelProto, runtime):

From aff2b538cebb72b04d91c69a47849c4237cc99df Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Wed, 22 Mar 2023 10:13:06 +0800
Subject: [PATCH 28/29] =?UTF-8?q?fix:=20=E5=88=A0=E9=99=A4=E5=8D=95?=
 =?UTF-8?q?=E7=8B=AC=E7=9A=84=E6=8B=B7=E8=B4=9D=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 pyinfinitensor/src/pyinfinitensor/onnx.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index e9424ab3..9a6afd21 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++
From aff2b538cebb72b04d91c69a47849c4237cc99df Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Wed, 22 Mar 2023 10:13:06 +0800
Subject: [PATCH 28/29] fix: remove the standalone copy functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 pyinfinitensor/src/pyinfinitensor/onnx.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py
index e9424ab3..9a6afd21 100644
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@@ -578,18 +578,6 @@ class OnnxStub:
     def run(self) -> None:
         self.handler.run()

-    def put_int32(self, name: str, data: List[int]) -> None:
-        self.handler.copy_int32(self.inputs[name], data)
-
-    def put_int64(self, name: str, data: List[int]) -> None:
-        self.handler.copy_int64(self.inputs[name], data)
-
-    def put_float(self, name: str, data: List[int]) -> None:
-        self.handler.copy_float(self.inputs[name], data)
-
-    def take_float(self) -> List[float]:
-        return next(iter(self.outputs.values())).cloneFloats()
-

 def from_onnx(model: ModelProto, runtime):
     stub = OnnxStub(model, runtime)
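With the ad-hoc `put_*`/`take_float` helpers gone, feeding and reading tensors presumably goes through the per-type tensor bindings directly. A hedged sketch of the replacement workflow — `stub.inputs`/`stub.outputs` are the dictionaries the deleted helpers used; the model path, input name "x", and payload size are illustrative only:

    import onnx
    from pyinfinitensor.onnx import OnnxStub, runtime

    model = onnx.load("resnet18.onnx")      # illustrative path to any supported model
    stub = OnnxStub(model, runtime)
    data = [0.0] * 3 * 224 * 224            # must match the input tensor's element count
    stub.inputs["x"].copyin_float(data)     # was: stub.put_float("x", data)
    stub.run()
    result = next(iter(stub.outputs.values())).copyout_float()  # was: stub.take_float()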
From d9886e9de302f126f998ef23ebd3cbdd3087d393 Mon Sep 17 00:00:00 2001
From: whjthu
Date: Sat, 25 Mar 2023 12:04:24 +0800
Subject: [PATCH 29/29] fix: remove inline keyword in class; rename getter and
 setter for inputOf and outputOf

---
 include/core/graph.h                     |  4 ++--
 include/core/tensor.h                    | 16 ++++++++--------
 include/core/tensor_base.h               | 10 +++++-----
 include/operators/batch_norm.h           | 10 +++++-----
 include/operators/pooling.h              | 26 ++++++++++++--------------
 src/core/graph.cc                        | 10 +++++-----
 src/core/tensor.cc                       |  6 +++---
 src/ffi/ffi_infinitensor.cc              |  2 +-
 test/core/test_graph.cc                  | 18 +++++++++---------
 test/core/test_search.cc                 |  4 ++--
 test/kernels/cuda/test_cuda_inception.cc |  2 +-
 11 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/include/core/graph.h b/include/core/graph.h
index 8e317a8b..4e654caa 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -63,7 +63,7 @@ class GraphObj : public Object {
     inline TensorVec getInputs() const {
         TensorVec ret;
         for (const auto &t : tensors)
-            if (!t->getOutputOf())
+            if (!t->getSource())
                 ret.emplace_back(t);
         return ret;
     }
@@ -74,7 +74,7 @@ class GraphObj : public Object {
     inline TensorVec getOutputs() const {
         TensorVec ret;
         for (const auto &t : tensors)
-            if (t->getInputOf().empty())
+            if (t->getTargets().empty())
                 ret.emplace_back(t);
         return ret;
     }
diff --git a/include/core/tensor.h b/include/core/tensor.h
index ed95f5f5..72a3b007 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -19,10 +19,10 @@ class TensorObj : public TensorBaseObj {
     Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
                // scratch have a new id.

-    inline void copyin(const void *ptr, size_t size) {
+    void copyin(const void *ptr, size_t size) {
         runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
     }
-    inline void copyout(void *ptr, size_t size) const {
+    void copyout(void *ptr, size_t size) const {
         runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
     }

@@ -31,33 +31,33 @@ class TensorObj : public TensorBaseObj {
     virtual ~TensorObj() {}
     string toString() const override;

-    inline size_t size() const { return _size; }
-    inline size_t getBytes() const { return _size * dtype.getSize(); }
+    size_t size() const { return _size; }
+    size_t getBytes() const { return _size * dtype.getSize(); }

     Shape getDims() const { return shape; }
     vector<size_t> getStride() const;
     size_t getOffset(const vector<int> &ds) const;
     void dataMalloc();
-    inline UidBaseType getFuid() const { return fuid; }
+    UidBaseType getFuid() const { return fuid; }

     void load(std::string file_path);
     void save(std::string file_path);

     // Copy elements from `data`.
-    template <typename T> inline void copyin(const vector<T> &data) {
+    template <typename T> void copyin(const vector<T> &data) {
         IT_ASSERT(DataType::get<T>() == dtype);
         IT_ASSERT(data.size() >= _size);
         copyin(data.data(), getBytes());
     }
     // Copy all the elements to a vector.
-    template <typename T> inline auto copyout() const {
+    template <typename T> auto copyout() const {
         IT_ASSERT(DataType::get<T>() == dtype);
         std::vector<T> ans(_size);
         copyout(ans.data(), getBytes());
         return ans;
     }
     // Copy the element at `pos`.
-    template <typename T> inline auto copyOne(const vector<int> &pos) const {
+    template <typename T> auto copyOne(const vector<int> &pos) const {
         IT_ASSERT(DataType::get<T>() == dtype);
         auto offset = getOffset(pos);
         auto bytes = dtype.getSize();
diff --git a/include/core/tensor_base.h b/include/core/tensor_base.h
index 4c73094a..61b8d032 100644
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@@ -45,14 +45,14 @@ class TensorBaseObj : public Object {
     DataType getDType() const { return dtype; }
     Runtime getRuntime() const { return runtime; }

-    void addInputOf(const Operator &op) { targets.emplace_back(op); }
-    void setOutputOf(const Operator &op) { source = op; }
+    void addTarget(const Operator &op) { targets.emplace_back(op); }
+    void setSource(const Operator &op) { source = op; }

     bool hasTarget() const { return !targets.empty(); }

-    OpVec getInputOf() const { return wrefs_to_refs(targets); }
-    Operator getOutputOf() const { return source.lock(); }
-    // std::pair<Operator *, int> getOutputOfWithIndex();
+    OpVec getTargets() const { return wrefs_to_refs(targets); }
+    Operator getSource() const { return source.lock(); }
+    // std::pair<Operator *, int> getSourceWithIndex();

     // bool setScalar(VType val) {
     //     if (data == nullptr || !dims.empty())
diff --git a/include/operators/batch_norm.h b/include/operators/batch_norm.h
index 8e41a043..fbee21fd 100644
--- a/include/operators/batch_norm.h
+++ b/include/operators/batch_norm.h
@@ -39,11 +39,11 @@ class BatchNormObj : public OperatorObj {
     std::string toString() const override;

     // output size will be 3 when training
-    inline int numInputs() const override { return 5; }
-    inline int numOutputs() const override { return outputs.size(); }
-    inline float getMomentum() const { return momentum; }
-    inline float getEps() const { return eps; }
-    inline bool getTraining() const { return training; }
+    int numInputs() const override { return 5; }
+    int numOutputs() const override { return outputs.size(); }
+    float getMomentum() const { return momentum; }
+    float getEps() const { return eps; }
+    bool getTraining() const { return training; }

   private:
     vector<int> getWorkloadVector() const override;
diff --git a/include/operators/pooling.h b/include/operators/pooling.h
index 1a1a6edf..c14bb8ad 100644
--- a/include/operators/pooling.h
+++ b/include/operators/pooling.h
@@ -39,22 +39,20 @@ class PoolingObj : public OperatorObj {
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
     std::string toString() const override;

-    inline int numInputs() const override { return 1; }
-    inline int numOutputs() const override { return 1; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }

-    inline int getKh() const { return kh; }
-    inline int getKw() const { return kw; }
-    inline int getDh() const { return dh; }
-    inline int getDw() const { return dw; }
-    inline int getPh() const { return ph; }
-    inline int getPw() const { return pw; }
-    inline int getSh() const { return sh; }
-    inline int getSw() const { return sw; }
+    int getKh() const { return kh; }
+    int getKw() const { return kw; }
+    int getDh() const { return dh; }
+    int getDw() const { return dw; }
+    int getPh() const { return ph; }
+    int getPw() const { return pw; }
+    int getSh() const { return sh; }
+    int getSw() const { return sw; }

-    inline auto getPadStrideDilation() const {
-        return tuple(ph, pw, sh, sw, dh, dw);
-    }
-    inline auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); }
+    auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
+    auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); }

   private:
     vector<int> getWorkloadVector() const override;
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 04ce2581..800f4ca5 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -33,15 +33,15 @@ void GraphObj::addOperatorAndConnect(const Operator &op) {
     sorted = false;
     ops.push_back(op);
     for (auto &input : op->getInputs()) {
-        input->addInputOf(op);
-        if (auto pred = input->getOutputOf()) {
+        input->addTarget(op);
+        if (auto pred = input->getSource()) {
             pred->addSuccessors(op);
             op->addPredecessors(pred);
         }
     }
     for (auto &output : op->getOutputs()) {
-        output->setOutputOf(op);
-        for (auto &succ : output->getInputOf()) {
+        output->setSource(op);
+        for (auto &succ : output->getTargets()) {
             succ->addPredecessors(op);
             op->addSuccessors(succ);
         }
@@ -87,7 +87,7 @@ bool GraphObj::topo_sort() {
             // this node is a head node.
             const auto is_head = std::all_of(
                 this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
-                    auto src = input->getOutputOf();
+                    auto src = input->getSource();
                     return src // If the source node is in the waiting list,
                                // means that this node is not the head node.
                                ? waiting.find(src) == waiting.end()
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index b8a41728..362c3e76 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -18,14 +18,14 @@ string TensorObj::toString() const {
     string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
                  std::to_string(fuid) + ", shape " + vecToString(shape) +
                  ", dtype " + dtype.toString();
-    vector<UidBaseType> inputOfGuid;
+    vector<UidBaseType> targetGuids;
     for (const auto &op : targets)
-        inputOfGuid.emplace_back(op.lock()->getGuid());
+        targetGuids.emplace_back(op.lock()->getGuid());
     if (auto o = source.lock())
         ret += ", source " + std::to_string(o->getGuid());
     else
         ret += ", source None";
-    ret += ", targets " + vecToString(inputOfGuid);
+    ret += ", targets " + vecToString(targetGuids);
     return ret;
 }
diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc
index 627be8bf..7d080548 100644
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@@ -184,7 +184,7 @@ void init_graph_builder(py::module &m) {
         .def("copyout_int32", &TensorObj::copyout<int32_t>, policy::move)
         .def("copyout_int64", &TensorObj::copyout<int64_t>, policy::move)
         .def("has_target", &TensorObj::hasTarget, policy::automatic)
-        .def("src", &TensorObj::getOutputOf, policy::move);
+        .def("src", &TensorObj::getSource, policy::move);
     py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
         .def("op_type", &OperatorObj::getOpType, policy::automatic)
         .def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
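The rename is invisible from Python: `src` and `has_target` keep their names and now forward to `getSource` and `hasTarget`. A hedged sketch of the invariant the test diffs below exercise, using only bindings visible in this patch (`topo_sort`, `operators`, `inputs`, `has_target`, `src`); the `handler` is assumed to be built as in the earlier sketch:

    handler.topo_sort()
    for op in handler.operators():
        for t in op.inputs():
            assert t.has_target()  # t feeds at least this operator
            # t.src() yields the producing Operator, or None for a graph input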
diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc
index 8140ea43..85c012b9 100644
--- a/test/core/test_graph.cc
+++ b/test/core/test_graph.cc
@@ -19,13 +19,13 @@ TEST(Graph, build_and_run) {
     w0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
     auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
     g->print();
-    // check inputOf and outputsOf for tensor
-    EXPECT_EQ(i0->getInputOf().size(), 1u);
-    EXPECT_EQ(w0->getInputOf().size(), 1u);
-    EXPECT_EQ(o0->getInputOf().size(), 0u);
-    EXPECT_EQ(i0->getOutputOf(), nullptr);
-    EXPECT_EQ(w0->getOutputOf(), nullptr);
-    EXPECT_NE(o0->getOutputOf(), nullptr);
+    // check targets and source for tensor
+    EXPECT_EQ(i0->getTargets().size(), 1u);
+    EXPECT_EQ(w0->getTargets().size(), 1u);
+    EXPECT_EQ(o0->getTargets().size(), 0u);
+    EXPECT_EQ(i0->getSource(), nullptr);
+    EXPECT_EQ(w0->getSource(), nullptr);
+    EXPECT_NE(o0->getSource(), nullptr);
     EXPECT_EQ(matmul->getPredecessors().size(), 0u);
     EXPECT_EQ(matmul->getSuccessors().size(), 0u);
@@ -139,8 +139,8 @@ TEST(Graph, test_OpVec_ctor) {
     map<pair<size_t, bool>, int> inputOutput2Cnt = {
         {{1, 0}, 2}, {{1, 1}, 1}, {{0, 1}, 1}};
     for (auto t : g2->getTensors()) {
-        pair<size_t, bool> key = {t->getInputOf().size(),
-                                  t->getOutputOf() != nullptr};
+        pair<size_t, bool> key = {t->getTargets().size(),
+                                  t->getSource() != nullptr};
         EXPECT_GE(inputOutput2Cnt[key], 0);
         inputOutput2Cnt[key]--;
     }
diff --git a/test/core/test_search.cc b/test/core/test_search.cc
index 5f531c21..5f354fdb 100644
--- a/test/core/test_search.cc
+++ b/test/core/test_search.cc
@@ -23,7 +23,7 @@ namespace infini {
//     w0->copyin(vector<uint32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
//     auto matmul = g->addOpWithOutputs<MatmulObj>(i0, w0, o0);
//     g->print();
-//     // check inputOf and outputsOf for tensor
+//     // check targets and source for tensor
//     SearchEngine searchEngine(runtime, make_ref<DummyMutator>());
//     searchEngine.run(g);
//     // check execution results
@@ -46,7 +46,7 @@ TEST(Graph, search_withdm) {
     auto conv1 = g->addOpWithOutputs<ConvObj>(t3, w3, t4, 1, 1);
     auto add1 = g->addOpWithOutputs<AddObj>(t4, t5, t6);
     g->dataMalloc();
-    // check inputOf and outputsOf for tensor
+    // check targets and source for tensor
     SearchEngine searchEngine(runtime, make_ref<DummyMutator>(10));
     searchEngine.run(g);
     // check execution results
diff --git a/test/kernels/cuda/test_cuda_inception.cc b/test/kernels/cuda/test_cuda_inception.cc
index 1e691576..31a7b888 100644
--- a/test/kernels/cuda/test_cuda_inception.cc
+++ b/test/kernels/cuda/test_cuda_inception.cc
@@ -64,7 +64,7 @@ TEST(CUDA_Inception_v3_block, run) {

     // check connection
     EXPECT_EQ(maxpool->getSuccessors().size(), 4u);
-    EXPECT_EQ(chainInput->getInputOf().size(), 4u);
+    EXPECT_EQ(chainInput->getTargets().size(), 4u);
     for (const auto &chainOps : ops) {
         for (size_t i = 1; i < chainOps.size(); i++) {
             auto prev = chainOps[i - 1];