merge master

This commit is contained in:
panzezhong 2023-09-20 09:21:24 +08:00
commit 4c1e32cac8
16 changed files with 576 additions and 133 deletions

View File

@@ -54,6 +54,10 @@ test-onnx:
@echo
python3 pyinfinitensor/tests/test_onnx.py
test-api:
@echo
python3 pyinfinitensor/tests/test_api.py
docker-build:
docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .

View File

@@ -0,0 +1,245 @@
import argparse
import os
import time
import multiprocessing as mp
from pyinfinitensor.onnx import OnnxStub, backend
import onnx
from onnx.external_data_helper import convert_model_to_external_data
import numpy as np
from parallel_opt import parallel_model
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
def parse_args():
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
parser.add_argument(
"--nproc_per_node", type=int, default=1, help="number of processes per node"
)
parser.add_argument(
"--name", type=str, default="test", help="name of this instance."
)
parser.add_argument(
"--model1", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument(
"--model2", type=str, required=True, help="path to the ONNX model file."
)
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
parser.add_argument("--length", type=int, default=1, help="sequence length.")
parser.add_argument(
"--gen_std",
action="store_true",
help="whether to generate the standard results.",
)
args = parser.parse_args()
print("arg setting: ", args)
return (
args.num_nodes,
args.nproc_per_node,
args.name,
args.model1,
args.model2,
args.batch_size,
args.length,
args.gen_std,
)
def run_model(model1, model2, runtime1, runtime2, inputs1: np.ndarray, inputs2: np.ndarray, n=20):
####################################
# run the first graph without kvcache
####################################
stub1 = OnnxStub(model1, runtime1)
stub1.inputs['onnx::Reshape_0'].copyin_int32(inputs1.reshape(-1).tolist())
stub1.tune()
stub1.run()
kvcache_it1 = []
count = 0
for output in stub1.outputs.items().__iter__():
if count == 0:
logits_it1 = np.array(output[1].copyout_float(), dtype=np.float32)
else:
kvcache_it1.append(np.array(output[1].copyout_float(), dtype=np.float32))
count = count + 1
# bench for stub1
next(stub1.inputs.items().__iter__())[1].copyin_int32(inputs1.reshape(-1).tolist())
begin = time.time()
for _ in range(n):
stub1.run()
end = time.time()
avg_time = (end - begin) / n
print(f"stub1 average time: {avg_time}")
####################################
# run the second graph with kvcache
####################################
i = 0
batchsize = 1
stub2 = OnnxStub(model2, runtime2)
past_kvcache_length = (i+2)*np.ones((batchsize, 1), dtype=np.int32)
# copyin input
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
count = -1
for input in stub2.inputs.items().__iter__():
if count in range(24):
# print(count, input[0])
# print(np.dtype(kvcache_it1[count][0]), kvcache_it1[count].shape)
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
count = count + 1
stub2.tune()
stub2.run()
# copyout output
count = 0
kvcache_it2 = []
for output in stub2.outputs.items().__iter__():
if count == 0:
logits_it2 = np.array(output[1].copyout_float(), dtype=np.float32)
else:
kvcache_it2.append(np.array(output[1].copyout_float(), dtype=np.float32))
count = count + 1
# bench for stub2
# copyin input
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
count = -1
for input in stub2.inputs.items().__iter__():
if count in range(24):
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
count = count + 1
begin = time.time()
for _ in range(n):
stub2.run()
end = time.time()
avg_time = (end - begin) / n
print(f"stub2 average time: {avg_time}")
return logits_it2
def run_and_compare(name, model1, model2, runtime1, runtime2):
data1 = np.load(f"{name}_inputs1.npy")
data2 = np.load(f"{name}_inputs2.npy")
results = np.load(f"{name}_results.npy")
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2)
print("outputs sum:", outputs.sum())
print("max abs diff:", abs(outputs - results).max())
print("max rel diff:", abs((outputs - results) / results).max())
# assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)
def start_worker(
name: str, world_size: int, rank: int, local_rank: int, model1: onnx.ModelProto, model2: onnx.ModelProto
):
dist_name = name + "_dist"
####################################
# shard the first graph
####################################
model1 = parallel_model(model1, world_size, rank)
extern_path = f"./{dist_name}_stub1_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
model1,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(model1, f"./{dist_name}_stub1_rank{rank}.onnx")
runtime1 = backend.CudaRuntime(local_rank)
runtime1.init_comm(
dist_name,
world_size,
rank,
)
####################################
# shard the second graph
####################################
model2 = parallel_model(model2, world_size, rank)
extern_path = f"./{dist_name}_stub2_rank{rank}.pb"
if os.path.exists(extern_path):
os.remove(extern_path)
convert_model_to_external_data(
model2,
all_tensors_to_one_file=True,
location=extern_path,
size_threshold=1024,
convert_attribute=False,
)
onnx.save(model2, f"./{dist_name}_stub2_rank{rank}.onnx")
runtime2 = backend.CudaRuntime(local_rank)
# print("init comm")
runtime2.init_comm(
dist_name,
world_size,
rank,
)
# run the two graphs
run_and_compare(name, model1, model2, runtime1, runtime2)
def start_single(name, model1, model2):
runtime1 = backend.CudaRuntime(0)
runtime2 = backend.CudaRuntime(0)
run_and_compare(name, model1, model2, runtime1, runtime2)
def gen_standard(name, model1, model2, voc_size, bs, len):
# generate standard results
data1 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
data2 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
np.save(f"{name}_inputs1", data1)
np.save(f"{name}_inputs2", data2)
runtime1 = backend.CudaRuntime(0)
runtime2 = backend.CudaRuntime(0)
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2, 1)
np.save(f"{name}_results", outputs)
def main():
nnodes, nproc_per_node, name, model1_path, model2_path, bs, length, gen_std = parse_args()
model1 = onnx.load(model1_path)
model2 = onnx.load(model2_path)
# generate standard outputs
if gen_std:
print(f"generate standard data for {name}.")
# a small vocabulary size that fits all LLMs.
voc_size = 1000
gen_standard(name, model1, model2, voc_size, bs, length)
return
# run in a single process.
# use a standalone process to isolate CUDA.
p = mp.Process(target=start_single, args=(name, model1, model2))
p.start()
p.join()
# run the distributed parallel version.
world_size = nnodes * nproc_per_node
workers = [
mp.Process(
target=start_worker,
args=(name, world_size, rank, rank % nproc_per_node, model1, model2),
)
for rank in range(world_size)
]
for w in workers:
w.start()
for w in workers:
w.join()
if __name__ == "__main__":
main()
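
For orientation, a hedged usage sketch of the script above. The file name is not shown in this diff, so "launch_kvcache.py" is only a placeholder; the flags and the .npy naming come from parse_args(), gen_standard() and run_and_compare().

# Hedged usage sketch; "launch_kvcache.py" and the model paths are placeholders.
#
#   1) generate reference inputs and logits once:
#      python launch_kvcache.py --model1 m1.onnx --model2 m2.onnx --gen_std
#   2) compare single-process and tensor-parallel runs against them:
#      python launch_kvcache.py --model1 m1.onnx --model2 m2.onnx --nproc_per_node 2
#
# The two steps communicate through these .npy files, keyed by --name:
import numpy as np

name = "test"                               # default value of --name
inputs1 = np.load(f"{name}_inputs1.npy")    # int32 token ids for the no-kvcache graph
inputs2 = np.load(f"{name}_inputs2.npy")    # int32 token ids for the kvcache graph
results = np.load(f"{name}_results.npy")    # reference logits from gen_standard()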

View File

@@ -56,6 +56,16 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
ndim = len(vinfo[output].type.tensor_type.shape.dim)
out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
place[node.output[0]] = out_plc
def shard_concat(node: NodeProto):
# hack for kvcache
in_plc = place[node.input[1]]
if in_plc.is_sharded():
seq_len_dim = vinfo[node.input[0]].type.tensor_type.shape.dim.pop(1)
seq_len_dim.dim_value //= tp_world_size
vinfo[node.input[0]].type.tensor_type.shape.dim.insert(1, seq_len_dim)
place[node.input[0]] = in_plc
place[node.output[0]] = in_plc
def shard_binary(node: NodeProto, groups: int = 1):
# print("binary", node.name, node.input[0], place[node.input[0]])
@@ -143,6 +153,8 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
place[node.input[0]] == place[node.input[1]]
), f"{place[node.input[0]]} != {place[node.input[1]]}"
place[node.output[0]] = place[node.input[0]]
elif node.op_type == "Concat":
shard_concat(node)
def find_successor(op_type: str, idx: int, search_limit: int = 1):
for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
@@ -203,10 +215,14 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
continue
shard_node(node)
new_input = []
for info in model.graph.input:
new_input.append(vinfo[info.name])
graph = helper.make_graph(
nodes,
model.graph.name + f"_{tp_rank}",
model.graph.input,
new_input,
model.graph.output,
data.values(),
doc_string=model.graph.doc_string,

View File

@@ -67,6 +67,10 @@ class BangRuntimeObj : public RuntimeObj {
CNRT_MEM_TRANS_DIR_PEER2PEER));
}
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
private:
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
};

View File

@@ -125,6 +125,11 @@ class GraphObj : public Object {
* @brief Whether the nodes are sorted in topological order.
*/
bool sorted;
/**
* @brief Whether memory for the weight tensors has been allocated.
*/
bool weightAllocated = false;
};
} // namespace infini

View File

@@ -20,14 +20,23 @@ class LazyAllocator {
Runtime runtime;
size_t used;
size_t used = 0;
size_t peak;
size_t peak = 0;
size_t weightPeak = 0;
size_t alignment;
// pointer to the memory actually allocated
void *ptr;
void *ptr = nullptr;
// pointer to the weight memory space
void *weightPtr = nullptr;
// // a cache designed for a batch size that has already occurred
// std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
// batchsizeToTensorOffset;
struct freeBlockInfo {
size_t addr;
@@ -57,12 +66,16 @@ class LazyAllocator {
virtual ~LazyAllocator();
void init();
// function: simulate memory allocation
// arguments:
// size: size of memory block to be allocated
// return: head address offset of the allocated memory block
size_t alloc(size_t size);
size_t allocWeight(size_t size);
// function: simulate memory free
// arguments:
// addr: head address offset of memory block to be free
@@ -73,6 +86,12 @@
// return: pointer to the head address of the allocated memory
void *getPtr();
// void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
// std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
void *getWeightPtr();
void info();
private:

View File

@@ -1,5 +1,6 @@
#pragma once
#include "core/tensor_base.h"
#include "core/tensor_type.h"
#include "utils/data_convert.h"
#include <cmath>
#include <cstring>
@@ -19,6 +20,8 @@ class TensorObj : public TensorBaseObj {
size_t _size; // Cache of Π(shape).
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
// scratch have a new id.
TensorType tensorType = TensorType::others;
public:
TensorObj(Shape shape, DataType dtype, Runtime runtime);
virtual ~TensorObj() {}
@@ -33,6 +36,33 @@ class TensorObj : public TensorBaseObj {
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }
bool isWeight() const { return tensorType == TensorType::weight; }
bool isInput() const { return tensorType == TensorType::input; }
bool isOutput() const { return tensorType == TensorType::output; }
bool isOthers() const { return tensorType == TensorType::others; }
void setWeight() { tensorType = TensorType::weight; }
void setInput() { tensorType = TensorType::input; }
void setOutput() { tensorType = TensorType::output; }
string tensorTypeToString() const {
switch (tensorType) {
case TensorType::weight:
return "weight";
break;
case TensorType::input:
return "input";
break;
case TensorType::output:
return "output";
break;
case TensorType::others:
return "others";
break;
default:
return "unknown tensor type";
break;
}
}
void load(std::string file_path);
void save(std::string file_path);

View File

@@ -44,7 +44,7 @@ class TensorBaseObj : public Object {
}
DataType getDType() const { return dtype; }
int getDTypeCode() const { return dtype.getIndex(); }
int getDTypeIndex() const { return dtype.getIndex(); }
Runtime getRuntime() const { return runtime; }
// std::pair<Operator *, int> getOutputOfWithIndex();

View File

@@ -0,0 +1,7 @@
#pragma once
namespace infini {
enum class TensorType { weight, input, output, others };
} // namespace infini

View File

@@ -32,24 +32,27 @@ class OnnxStub:
The Onnx model imported into infinitensor.
It can be generated from an Onnx model object.
"""
inputs: Dict[str, backend.Tensor] = {}
outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
handler: backend.GraphHandler
def __init__(self, model: ModelProto, runtime):
self.inputs: Dict[str, backend.Tensor] = {}
self.outputs: Dict[str, backend.Tensor] = {}
self.initializer: Dict[int, TensorProto] = {}
model = infer_shapes(model)
self.handler = backend.GraphHandler(runtime)
tensors: Dict[str, backend.Tensor] = dict()
data: Dict[str, TensorProto] = dict()
for initializer in model.graph.initializer:
dims = [d for d in initializer.dims]
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
data[initializer.name] = initializer
for input in model.graph.input:
dims = _take_shape_dim(input.type.tensor_type.shape)
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
)
if input.name not in tensors.keys():
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
)
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
@@ -57,10 +60,6 @@ class OnnxStub:
dims, output.type.tensor_type.elem_type
)
for initializer in model.graph.initializer:
dims = [d for d in initializer.dims]
tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type)
data[initializer.name] = initializer
node_name = []
new_node_name = []
@@ -667,6 +666,19 @@ class OnnxStub:
# update the node_list
node_list = list(set(node_name) - set(new_node_name))
################################
# Set tensor type
################################
for initializer in model.graph.initializer:
tensors[initializer.name].set_weight()
for input in model.graph.input:
tensors[input.name].set_input()
for output in model.graph.output:
tensors[output.name].set_output()
################################
# Allocate memory space for data
################################

View File

@@ -0,0 +1,65 @@
import os, unittest
from onnx import TensorProto
from pyinfinitensor import backend
import numpy as np
class TestPythonAPI(unittest.TestCase):
def test_copyin_numpy(self):
dims = [2, 3, 5, 4]
np_array = np.random.random(dims).astype(np.float32)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT)
tensor2 = handler.tensor(dims, TensorProto.FLOAT)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
tensor2.copyin_float(np_array.flatten().tolist())
array1 = tensor1.copyout_float()
array2 = tensor2.copyout_float()
self.assertEqual(array1, array2)
self.assertTrue(np.array_equal(np.array(array1).reshape(dims), np_array))
np_array = np.random.random(dims).astype(np.int64)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.INT64)
tensor2 = handler.tensor(dims, TensorProto.INT64)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
tensor2.copyin_int64(np_array.flatten().tolist())
array1 = tensor1.copyout_int64()
array2 = tensor2.copyout_int64()
self.assertEqual(array1, array2)
self.assertTrue(np.array_equal(np.array(array1).reshape(dims), np_array))
def test_copyout_numpy(self):
dims = [2, 3, 5, 4]
np_array = np.random.random(dims).astype(np.float32)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT)
tensor2 = handler.tensor(dims, TensorProto.FLOAT)
handler.data_malloc()
tensor1.copyin_float(np_array.flatten().tolist())
tensor2.copyin_float(np_array.flatten().tolist())
array1 = np.array(tensor1.copyout_float()).reshape(dims)
array2 = tensor2.copyout_numpy()
self.assertTrue(np.array_equal(array2, np_array))
self.assertTrue(np.array_equal(array1, array2))
np_array = np.random.random(dims).astype(np.float16)
np_array[0, 0, 0, 0] = .1
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT16)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
array1 = tensor1.copyout_numpy()
# Copy should be the same as original array
self.assertTrue(np.array_equal(array1, np_array))
# Modify the value so that the TensorObj's value changes
np_array[0, 0, 0, 0] = 0.
tensor1.copyin_numpy(np_array)
# The copied-out array should not change
self.assertFalse(np.array_equal(array1, np_array))
if __name__ == "__main__":
unittest.main()

View File

@@ -458,54 +458,6 @@ class TestStringMethods(unittest.TestCase):
where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
def test_copyin(self):
dims = [2, 3, 5, 4]
np_array = np.random.random(dims).astype(np.float32)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT)
tensor2 = handler.tensor(dims, TensorProto.FLOAT)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
tensor2.copyin_float(np_array.flatten().tolist())
array1 = tensor1.copyout_float()
array2 = tensor2.copyout_float()
self.assertEqual(array1, array2)
self.assertTrue(np.array_equal(np.array(array1).reshape(dims), np_array))
np_array = np.random.random(dims).astype(np.int64)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.INT64)
tensor2 = handler.tensor(dims, TensorProto.INT64)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
tensor2.copyin_int64(np_array.flatten().tolist())
array1 = tensor1.copyout_int64()
array2 = tensor2.copyout_int64()
self.assertEqual(array1, array2)
self.assertTrue(np.array_equal(np.array(array1).reshape(dims), np_array))
def test_to_numpy(self):
dims = [2, 3, 5, 4]
np_array = np.random.random(dims).astype(np.float32)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT)
tensor2 = handler.tensor(dims, TensorProto.FLOAT)
handler.data_malloc()
tensor1.copyin_float(np_array.flatten().tolist())
tensor2.copyin_float(np_array.flatten().tolist())
array1 = np.array(tensor1.copyout_float()).reshape(dims)
array2 = np.array(tensor2)
self.assertTrue(np.array_equal(array2, np_array))
self.assertTrue(np.array_equal(array1, array2))
np_array = np.random.random(dims).astype(np.float16)
handler = backend.GraphHandler(backend.cpu_runtime())
tensor1 = handler.tensor(dims, TensorProto.FLOAT16)
handler.data_malloc()
tensor1.copyin_numpy(np_array)
array1 = np.array(tensor1, copy=False)
self.assertTrue(np.array_equal(array1, np_array))
if __name__ == "__main__":
unittest.main()

View File

@@ -131,30 +131,63 @@ void GraphObj::dataMalloc() {
// record the memory address offsets of all tensors to be allocated
std::unordered_map<TensorObj *, size_t> tensorToOffset;
// record all constant tensors, including weight tensors and input tensors
std::unordered_set<TensorObj *> constTensor;
// reinit allocator
allocator.init();
// record all weight tensors, including conventional weights and
// kvcache tensors
std::unordered_set<TensorObj *> weightTensors;
for (auto &tensor : tensors) {
if (tensor.get()->getSource() == nullptr) {
// allocate memory for all constant tensors first, and this memory
if (tensor->isWeight()) {
// allocate memory for all weight tensors first, and this memory
// will not be freed until the graph is destroyed
weightTensors.insert(tensor.get());
if (!this->weightAllocated) {
tensorToOffset[tensor.get()] =
allocator.allocWeight(tensor->getBytes());
}
} else if (tensor->isInput() || tensor->isOutput()) {
// allocate memory for all input and output tensors, and this memory
// will not be reused later
constTensor.insert(tensor.get());
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
} else {
tensorToRefCount[tensor.get()] = tensor->getTargets().size();
// allocate memory for all user-created tensors
if (tensor.get()->getSource() == nullptr) {
tensorToOffset[tensor.get()] =
allocator.alloc(tensor->getBytes());
}
}
}
// if memory has not yet been allocated for weight tensors,
// allocate memory now and do not allocate again in the future.
if (!this->weightAllocated) {
this->weightAllocated = true;
// only allocate once for weight tensors
for (auto &tensor : weightTensors) {
IT_ASSERT(tensorToOffset.find(tensor) != tensorToOffset.end());
tensor->setDataBlob(make_ref<BlobObj>(
tensor->runtime,
static_cast<uint8_t *>(allocator.getWeightPtr()) +
tensorToOffset[tensor]));
}
}
// traverse in topological order and simulate memory allocation
for (auto &op : ops) {
// memory should be allocated for the output first
// memory should be allocated for the op's output first
auto outputs = op->getOutputs();
for (auto &tensor : outputs) {
tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
if (tensor->isOthers()) {
tensorToOffset[tensor.get()] =
allocator.alloc(tensor->getBytes());
}
}
auto inputs = op->getInputs();
for (auto &tensor : inputs) {
if (constTensor.find(tensor.get()) == constTensor.end()) {
if (tensor->isOthers()) {
auto tensorIter = tensorToRefCount.find(tensor.get());
IT_ASSERT(tensorIter != tensorToRefCount.end());
IT_ASSERT(tensorToRefCount[tensor.get()] > 0);
tensorToRefCount[tensor.get()] -= 1;
if (tensorToRefCount[tensor.get()] == 0) {
// indicate that this tensor will no longer be used and
@@ -167,15 +200,20 @@ void GraphObj::dataMalloc() {
}
}
// perform actual memory allocation
// perform actual memory allocation for non-weight tensors
for (auto &tensor : tensors) {
IT_ASSERT(tensorToOffset.find(tensor.get()) != tensorToOffset.end());
tensor->setDataBlob(make_ref<BlobObj>(
tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
tensorToOffset[tensor.get()]));
if (!tensor->isWeight()) {
IT_ASSERT(tensorToOffset.find(tensor.get()) !=
tensorToOffset.end());
tensor->setDataBlob(make_ref<BlobObj>(
tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
tensorToOffset[tensor.get()]));
}
}
#ifdef DEBUG_MODE
allocator.info();
#endif
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
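
A minimal sketch of how the new allocation split surfaces through the Python bindings added in this commit: tensors tagged as weights go into a persistent pool that is allocated once and kept until the graph is destroyed, while other tensors are re-planned by data_malloc(). The standalone-tensor usage below is an assumption for illustration, not code from this commit.

# Hedged sketch: set_weight()/set_input() choose the pool used by data_malloc().
from onnx import TensorProto
from pyinfinitensor import backend

handler = backend.GraphHandler(backend.cpu_runtime())
w = handler.tensor([4, 4], TensorProto.FLOAT)
w.set_weight()                  # persistent weight pool, allocated only once
x = handler.tensor([4, 4], TensorProto.FLOAT)
x.set_input()                   # input/output memory is not reused by activations
handler.data_malloc()
w.copyin_float([0.0] * 16)      # weight data survives later data_malloc() calls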

View File

@@ -11,9 +11,6 @@ namespace infini {
constexpr size_t alignmentInBytesForCUDA = 256;
LazyAllocator::LazyAllocator(Runtime runtime) : runtime(runtime) {
used = 0;
peak = 0;
ptr = nullptr;
if (runtime->isCuda()) {
// TODO: the alignment on cuda might need further discussion
alignment = alignmentInBytesForCUDA;
@@ -30,10 +27,24 @@ LazyAllocator::~LazyAllocator() {
if (this->ptr != nullptr) {
runtime->dealloc(this->ptr);
}
if (this->weightPtr != nullptr) {
runtime->dealloc(this->weightPtr);
}
}
void LazyAllocator::init() {
used = 0;
peak = 0;
freeBlocks.clear();
headAddrToBlockSize.clear();
tailAddrToBlockSize.clear();
if (this->ptr != nullptr) {
runtime->dealloc(this->ptr);
}
this->ptr = nullptr;
}
size_t LazyAllocator::alloc(size_t size) {
IT_ASSERT(this->ptr == nullptr);
// pad the size to a multiple of the alignment
size = this->getAlignedSize(size);
auto it = this->freeBlocks.lower_bound(freeBlockInfo{(size_t)0, size});
@@ -83,6 +94,14 @@ size_t LazyAllocator::alloc(size_t size) {
return retAddr;
}
size_t LazyAllocator::allocWeight(size_t size) {
IT_ASSERT(this->weightPtr == nullptr);
size = this->getAlignedSize(size);
size_t retAddr = this->weightPeak;
this->weightPeak += size;
return retAddr;
}
void LazyAllocator::free(size_t addr, size_t size) {
IT_ASSERT(this->ptr == nullptr);
size = getAlignedSize(size);
@@ -126,18 +145,33 @@ void LazyAllocator::free(size_t addr, size_t size) {
void *LazyAllocator::getPtr() {
if (this->ptr == nullptr) {
this->ptr = runtime->alloc(this->peak);
printf("LazyAllocator really alloc: %p %lu bytes\n", this->ptr, peak);
#ifdef DEBUG_MODE
printf("LazyAllocator really alloc non-weight: %p %lu bytes\n",
this->ptr, peak);
#endif
}
return this->ptr;
}
void *LazyAllocator::getWeightPtr() {
if (this->weightPtr == nullptr) {
this->weightPtr = runtime->alloc(this->weightPeak);
#ifdef DEBUG_MODE
printf("LazyAllocator really alloc weight: %p %lu bytes\n",
this->weightPtr, weightPeak);
#endif
}
return this->weightPtr;
}
size_t LazyAllocator::getAlignedSize(size_t size) {
return ((size - 1) / this->alignment + 1) * this->alignment;
}
void LazyAllocator::info() {
std::cout << "Used memory: " << this->used
<< ", peak memory: " << this->peak << std::endl;
std::cout << "Used memory: " << this->used + this->weightPeak
<< ", peak memory: " << this->peak + this->weightPeak
<< std::endl;
}
} // namespace infini
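
The padding formula used by getAlignedSize rounds a size up to the next multiple of the alignment (256 bytes on CUDA). A quick check of the arithmetic in Python:

# ((size - 1) // alignment + 1) * alignment rounds up to a multiple of alignment.
def aligned(size, alignment=256):
    return ((size - 1) // alignment + 1) * alignment

assert aligned(1000) == 1024    # padded up to the next multiple of 256
assert aligned(256) == 256      # already-aligned sizes are unchanged
assert aligned(257) == 512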

View File

@@ -23,7 +23,7 @@ string TensorObj::toString() const {
string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
std::to_string(fuid) + ", shape " + vecToString(shape) +
", dtype " + dtype.toString() + ", " + runtime->toString() +
", " + ss.str() + "\n";
", " + ss.str() + ", " + tensorTypeToString() + "\n";
vector<UidBaseType> targetGuids;
for (const auto &op : targets)
targetGuids.emplace_back(op.lock()->getGuid());

View File

@@ -307,6 +307,44 @@ void export_functions(py::module &m) {
#undef FUNCTION
}
// A helper function that converts a DataType to a Python format string
static std::string getFormat(DataType type) {
std::string format;
if (type == DataType::Float32) {
format = py::format_descriptor<float>::format();
} else if (type == DataType::Double) {
format = py::format_descriptor<double>::format();
} else if (type == DataType::Int32) {
format = py::format_descriptor<int>::format();
} else if (type == DataType::UInt32) {
format = py::format_descriptor<uint32_t>::format();
} else if (type == DataType::Int64) {
format = py::format_descriptor<int64_t>::format();
} else if (type == DataType::UInt64) {
format = py::format_descriptor<uint64_t>::format();
} else if (type == DataType::Int16) {
format = py::format_descriptor<int16_t>::format();
} else if (type == DataType::UInt16) {
format = py::format_descriptor<uint16_t>::format();
} else if (type == DataType::Int8) {
format = py::format_descriptor<int8_t>::format();
} else if (type == DataType::UInt8) {
format = py::format_descriptor<uint8_t>::format();
} else if (type == DataType::Bool) {
format = py::format_descriptor<bool>::format();
} else if (type == DataType::Float16 || type == DataType::BFloat16) {
// Python uses "e" for half precision float type code.
// Check the following link for more information.
// https://docs.python.org/3/library/struct.html#format-characters
format = "e";
} else {
throw std::runtime_error("Error converting TensorObj to "
"Numpy: unsupported datatype.\n");
}
return format;
}
void init_graph_builder(py::module &m) {
using Handler = GraphHandlerObj;
@@ -327,7 +365,10 @@ void init_graph_builder(py::module &m) {
py::buffer_protocol())
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("dtype", &TensorObj::getDTypeCode, policy::automatic)
.def("set_weight", &TensorObj::setWeight, policy::move)
.def("set_input", &TensorObj::setInput, policy::move)
.def("set_output", &TensorObj::setOutput, policy::move)
.def("dtype", &TensorObj::getDTypeIndex, policy::automatic)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
.def("copyin_int32", &TensorObj::copyin<int32_t>, policy::move)
.def("copyin_int64", &TensorObj::copyin<int64_t>, policy::move)
@@ -354,53 +395,24 @@ void init_graph_builder(py::module &m) {
}
self.copyin(data_np, self.getBytes());
})
// A buffer can be used to convert a TensorObj directly to Numpy array
// without copy
.def_buffer([](TensorObj &self) -> py::buffer_info {
vector<size_t> stride_byte;
for (int s : self.getStride()) {
stride_byte.push_back(s * self.getDType().getSize());
}
// Return a Numpy array which copies the values of this tensor
.def("copyout_numpy",
[](TensorObj &self) -> py::array {
vector<size_t> stride_byte;
for (int s : self.getStride()) {
stride_byte.push_back(s * self.getDType().getSize());
}
std::string format = getFormat(self.getDType());
std::string format;
if (self.getDType() == DataType::Float32) {
format = py::format_descriptor<float>::format();
} else if (self.getDType() == DataType::Double) {
format = py::format_descriptor<double>::format();
} else if (self.getDType() == DataType::Int32) {
format = py::format_descriptor<int>::format();
} else if (self.getDType() == DataType::UInt32) {
format = py::format_descriptor<uint32_t>::format();
} else if (self.getDType() == DataType::Int64) {
format = py::format_descriptor<int64_t>::format();
} else if (self.getDType() == DataType::UInt64) {
format = py::format_descriptor<uint64_t>::format();
} else if (self.getDType() == DataType::Int16) {
format = py::format_descriptor<int16_t>::format();
} else if (self.getDType() == DataType::UInt16) {
format = py::format_descriptor<uint16_t>::format();
} else if (self.getDType() == DataType::Int8) {
format = py::format_descriptor<int8_t>::format();
} else if (self.getDType() == DataType::UInt8) {
format = py::format_descriptor<uint8_t>::format();
} else if (self.getDType() == DataType::Bool) {
format = py::format_descriptor<bool>::format();
} else if (self.getDType() == DataType::Float16 ||
self.getDType() == DataType::BFloat16) {
// Python uses "e" for half precision float type code.
// Check the following link for more information.
// https://docs.python.org/3/library/struct.html#format-characters
format = "e";
} else {
throw std::runtime_error("Error converting TensorObj to "
"Numpy: unsupported datatype.\n");
}
py::array numpy_array(py::dtype(format), self.getDims(),
nullptr);
return py::buffer_info(self.getRawDataPtr<void *>(),
self.getDType().getSize(), format,
self.getRank(), self.getDims(), stride_byte,
true); // Read-only = true
})
// Copy data to the numpy array
auto ptr = numpy_array.mutable_data();
self.copyout(ptr, self.getBytes());
return numpy_array;
})
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);