modify allocator

kilinchange 2023-09-04 19:20:35 +08:00
parent f000e211f7
commit e18900128d
7 changed files with 122 additions and 24 deletions


@@ -123,6 +123,11 @@ class GraphObj : public Object {
      * @brief If the nodes are sorted in topological order.
      */
     bool sorted;
+
+    /**
+     * @brief If the persistent tensors are allocated.
+     */
+    bool persistentAllocated = false;
 };

 } // namespace infini
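
Context for the new flag: it acts as a once-only guard, so the first dataMalloc() call places persistent tensors (weights, kv-cache) and every later call skips that step while still re-planning transient memory. A minimal standalone sketch of the guard pattern (the Graph type and messages below are illustrative, not repo code):

#include <cstdio>

struct Graph {
    bool persistentAllocated = false; // mirrors the new GraphObj flag

    void dataMalloc() {
        if (!persistentAllocated) {
            persistentAllocated = true;
            std::puts("placing persistent tensors (first call only)");
        }
        std::puts("planning transient tensors (every call)");
    }
};

int main() {
    Graph g;
    g.dataMalloc(); // places both pools
    g.dataMalloc(); // persistent placement is skipped
    return 0;
}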


@@ -20,14 +20,23 @@ class LazyAllocator {
     Runtime runtime;

-    size_t used;
+    size_t used = 0;

-    size_t peak;
+    size_t peak = 0;
+
+    size_t persistentPeak = 0;

     size_t alignment;

     // pointer to the memory actually allocated
-    void *ptr;
+    void *ptr = nullptr;
+
+    // pointer to the persistent memory space
+    void *persistentPtr = nullptr;
+
+    // // a cache designed for a batch size that has already occurred
+    // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
+    //     batchsizeToTensorOffset;

     struct freeBlockInfo {
         size_t addr;
@@ -57,12 +66,16 @@ class LazyAllocator {
     virtual ~LazyAllocator();

+    void init();
+
     // function: simulate memory allocation
     // arguments:
     //     size: size of memory block to be allocated
     // return: head address offset of the allocated memory block
     size_t alloc(size_t size);
+
+    size_t allocPersistent(size_t size);
+
     // function: simulate memory free
     // arguments:
     //     addr: head address offset of the memory block to be freed
@@ -73,6 +86,12 @@ class LazyAllocator {
     // return: pointer to the head address of the allocated memory
     void *getPtr();

+    // void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
+
+    // std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
+
+    void *getPersistentPtr();
+
     void info();

   private:
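
The comments in this header describe a two-phase contract: alloc() and free() only simulate planning and trade in offsets, and the single real buffer of peak size is created lazily by getPtr(). A self-contained sketch of that contract, reduced to a bump pointer (the first-fit reuse via freeBlocks is omitted, and all names are hypothetical):

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Simulate-then-commit: offsets are handed out before any real memory exists.
class MiniLazyAllocator {
    size_t used = 0, peak = 0;
    void *ptr = nullptr;

  public:
    // simulate allocation: returns a head-address offset, no real memory yet
    size_t alloc(size_t size) {
        assert(ptr == nullptr); // must happen before the real allocation
        size_t addr = used;
        used += size;
        if (used > peak)
            peak = used;
        return addr;
    }
    // commit: one real allocation, sized to the simulated peak
    void *getPtr() {
        if (ptr == nullptr)
            ptr = std::malloc(peak);
        return ptr;
    }
    ~MiniLazyAllocator() { std::free(ptr); }
};

int main() {
    MiniLazyAllocator a;
    size_t off1 = a.alloc(1024);
    size_t off2 = a.alloc(2048);
    char *base = static_cast<char *>(a.getPtr());
    std::printf("tensor1 at %p, tensor2 at %p\n",
                static_cast<void *>(base + off1),
                static_cast<void *>(base + off2));
    return 0;
}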


@@ -19,6 +19,8 @@ class TensorObj : public TensorBaseObj {
     size_t _size; // Cache of Π(shape).
     Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                   // scratch have a new id.
+    bool persistent = false;
+
   public:
     TensorObj(Shape shape, DataType dtype, Runtime runtime);
     virtual ~TensorObj() {}
@@ -35,6 +37,8 @@ class TensorObj : public TensorBaseObj {
     size_t getOffset(const vector<int> &ds) const;
     void dataMalloc();
     UidBaseType getFuid() const { return fuid; }
+    bool isPersistent() const { return persistent; }
+    void setPersistent() { persistent = true; }
     void load(std::string file_path);
     void save(std::string file_path);

@@ -608,6 +608,13 @@ class OnnxStub:
         # update the node_list
         node_list = list(set(node_name) - set(new_node_name))

+        ################################
+        # Set weight tensors as persistent
+        ################################
+        for name, obj in tensors.items():
+            if data.get(name) is not None:
+                obj.set_persistent()
+
         ################################
         # Allocate memory space for data
         ################################
@@ -950,8 +957,7 @@ class OnnxStub:
                 oldTensor = self.inputs[oldInput]
                 self.handler.change_shape(newInput, oldTensor.fuid())
         self.handler.shape_infer()
-        # self.handler.data_malloc()
+        self.handler.data_malloc()

     def getShape(self, name: str) -> List[int]:
         if name in self.inputs:


@@ -165,16 +165,42 @@ void GraphObj::dataMalloc() {
     // record the memory address offsets of all tensors to be allocated
     std::unordered_map<TensorObj *, size_t> tensorToOffset;
-    // record all constant tensors, including weight tensors and input tensors
-    std::unordered_set<TensorObj *> constTensor;
+
+    // reinit allocator
+    allocator.init();
+
+    // record all persistent tensors, including weight tensors and kvcache
+    // tensors
+    std::unordered_set<TensorObj *> persistentTensors;
     for (auto &tensor : tensors) {
-        if (tensor.get()->getSource() == nullptr) {
-            // allocate memory for all constant tensors first, and this memory
+        if (tensor->isPersistent()) {
+            // allocate memory for all persistent tensors first, and this memory
             // will not be reused later
-            constTensor.insert(tensor.get());
-            tensorToOffset[tensor.get()] = allocator.alloc(tensor->getBytes());
+            persistentTensors.insert(tensor.get());
+            if (!this->persistentAllocated) {
+                tensorToOffset[tensor.get()] =
+                    allocator.allocPersistent(tensor->getBytes());
+            }
         } else {
             tensorToRefCount[tensor.get()] = tensor->getTargets().size();
+            if (tensor.get()->getSource() == nullptr) {
+                // allocate memory for input tensors, because they are not
+                // the output of any op
+                tensorToOffset[tensor.get()] =
+                    allocator.alloc(tensor->getBytes());
+            }
         }
     }
+    // if memory has not yet been allocated for persistent tensors,
+    // allocate memory now and do not allocate again in the future.
+    if (!this->persistentAllocated) {
+        this->persistentAllocated = true;
+        // only allocate once for persistent tensors
+        for (auto &tensor : persistentTensors) {
+            IT_ASSERT(tensorToOffset.find(tensor) != tensorToOffset.end());
+            tensor->setDataBlob(make_ref<BlobObj>(
+                tensor->runtime,
+                static_cast<uint8_t *>(allocator.getPersistentPtr()) +
+                    tensorToOffset[tensor]));
+        }
+    }
     // traverse in topological order and simulate memory allocation
@@ -186,7 +212,8 @@ void GraphObj::dataMalloc() {
         }
         auto inputs = op->getInputs();
         for (auto &tensor : inputs) {
-            if (constTensor.find(tensor.get()) == constTensor.end()) {
+            if (persistentTensors.find(tensor.get()) ==
+                persistentTensors.end()) {
                 auto tensorIter = tensorToRefCount.find(tensor.get());
                 IT_ASSERT(tensorIter != tensorToRefCount.end());
                 tensorToRefCount[tensor.get()] -= 1;
@@ -201,15 +228,20 @@ void GraphObj::dataMalloc() {
             }
         }
     }
-    // perform actual memory allocation
+    // perform actual memory allocation for non-persistent tensors
     for (auto &tensor : tensors) {
-        IT_ASSERT(tensorToOffset.find(tensor.get()) != tensorToOffset.end());
-        tensor->setDataBlob(make_ref<BlobObj>(
-            tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
-                                 tensorToOffset[tensor.get()]));
+        if (!tensor->isPersistent()) {
+            IT_ASSERT(tensorToOffset.find(tensor.get()) !=
+                      tensorToOffset.end());
+            tensor->setDataBlob(make_ref<BlobObj>(
+                tensor->runtime, static_cast<uint8_t *>(allocator.getPtr()) +
+                                     tensorToOffset[tensor.get()]));
+        }
     }

+#ifdef DEBUG_MODE
     allocator.info();
+#endif
 }

 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
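
Taken together, dataMalloc() now plans two disjoint pools: persistent tensors receive allocPersistent() offsets exactly once, guarded by persistentAllocated, while inputs and intermediates are re-planned from scratch on every call after allocator.init(). A self-contained simulation of that control flow, with bump counters standing in for the allocator (all names and sizes are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

struct MiniTensor {
    const char *name;
    size_t bytes;
    bool persistent;
};

int main() {
    std::vector<MiniTensor> tensors = {{"weight", 4096, true},
                                       {"input", 1024, false},
                                       {"hidden", 2048, false}};
    bool persistentAllocated = false;
    size_t persistentPeak = 0; // never reset across calls

    for (int call = 0; call < 2; ++call) { // two dataMalloc() invocations
        size_t peak = 0;                   // reset by allocator.init()
        for (const auto &t : tensors) {
            if (t.persistent) {
                if (!persistentAllocated) { // placed once, kept forever
                    std::printf("call %d: persistent %s at offset %zu\n",
                                call, t.name, persistentPeak);
                    persistentPeak += t.bytes;
                }
            } else { // transient: a fresh offset on every call
                std::printf("call %d: transient %s at offset %zu\n",
                            call, t.name, peak);
                peak += t.bytes;
            }
        }
        persistentAllocated = true;
    }
    return 0;
}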


@@ -11,9 +11,6 @@ namespace infini {
 constexpr size_t alignmentInBytesForCUDA = 256;

 LazyAllocator::LazyAllocator(Runtime runtime) : runtime(runtime) {
-    used = 0;
-    peak = 0;
-    ptr = nullptr;
     if (runtime->isCuda()) {
         // TODO: the alignment on cuda might need further discussion
         alignment = alignmentInBytesForCUDA;
@@ -30,10 +27,21 @@ LazyAllocator::~LazyAllocator() {
     if (this->ptr != nullptr) {
         runtime->dealloc(this->ptr);
     }
+    if (this->persistentPtr != nullptr) {
+        runtime->dealloc(this->persistentPtr);
+    }
 }

+void LazyAllocator::init() {
+    used = 0;
+    peak = 0;
+    if (this->ptr != nullptr) {
+        runtime->dealloc(this->ptr);
+    }
+    this->ptr = nullptr;
+}
+
 size_t LazyAllocator::alloc(size_t size) {
     IT_ASSERT(this->ptr == nullptr);
     // pad the size to the multiple of alignment
     size = this->getAlignedSize(size);
     auto it = this->freeBlocks.lower_bound(freeBlockInfo{(size_t)0, size});
@@ -83,6 +91,14 @@ size_t LazyAllocator::alloc(size_t size) {
     return retAddr;
 }

+size_t LazyAllocator::allocPersistent(size_t size) {
+    IT_ASSERT(this->persistentPtr == nullptr);
+    size = this->getAlignedSize(size);
+    size_t retAddr = this->persistentPeak;
+    this->persistentPeak += size;
+    return retAddr;
+}
+
 void LazyAllocator::free(size_t addr, size_t size) {
     IT_ASSERT(this->ptr == nullptr);
     size = getAlignedSize(size);
@@ -126,18 +142,33 @@ void LazyAllocator::free(size_t addr, size_t size) {
 void *LazyAllocator::getPtr() {
     if (this->ptr == nullptr) {
         this->ptr = runtime->alloc(this->peak);
-        printf("LazyAllocator really alloc: %p %lu bytes\n", this->ptr, peak);
+#ifdef DEBUG_MODE
+        printf("LazyAllocator really alloc non-persistent: %p %lu bytes\n",
+               this->ptr, peak);
+#endif
     }
     return this->ptr;
 }

+void *LazyAllocator::getPersistentPtr() {
+    if (this->persistentPtr == nullptr) {
+        this->persistentPtr = runtime->alloc(this->persistentPeak);
+#ifdef DEBUG_MODE
+        printf("LazyAllocator really alloc persistent: %p %lu bytes\n",
+               this->persistentPtr, persistentPeak);
+#endif
+    }
+    return this->persistentPtr;
+}
+
 size_t LazyAllocator::getAlignedSize(size_t size) {
     return ((size - 1) / this->alignment + 1) * this->alignment;
 }

 void LazyAllocator::info() {
-    std::cout << "Used memory: " << this->used
-              << ", peak memory: " << this->peak << std::endl;
+    std::cout << "Used memory: " << this->used + this->persistentPeak
+              << ", peak memory: " << this->peak + this->persistentPeak
+              << std::endl;
 }

 } // namespace infini
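
One detail worth spelling out: allocPersistent() is a plain bump allocator over aligned sizes, so every offset it returns is a multiple of the alignment. A standalone check of the rounding formula and the resulting offsets, using the 256-byte CUDA alignment defined in this file (request sizes are made up):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Same rounding as LazyAllocator::getAlignedSize.
size_t getAlignedSize(size_t size, size_t alignment) {
    return ((size - 1) / alignment + 1) * alignment;
}

int main() {
    const size_t alignment = 256; // alignmentInBytesForCUDA
    // 1000 -> ((1000 - 1) / 256 + 1) * 256 = (3 + 1) * 256 = 1024
    assert(getAlignedSize(1000, alignment) == 1024);

    // Bump allocation mirrors allocPersistent(): offset = previous peak.
    size_t persistentPeak = 0;
    size_t off1 = persistentPeak; // 0
    persistentPeak += getAlignedSize(1000, alignment);
    size_t off2 = persistentPeak; // 1024
    persistentPeak += getAlignedSize(300, alignment); // 512 more
    std::printf("off1=%zu off2=%zu peak=%zu\n", off1, off2, persistentPeak);
    return 0;
}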


@@ -321,6 +321,7 @@ void init_graph_builder(py::module &m) {
              py::buffer_protocol())
         .def("fuid", &TensorObj::getFuid, policy::automatic)
         .def("shape", &TensorObj::getDims, policy::move)
+        .def("set_persistent", &TensorObj::setPersistent, policy::move)
         .def("copyin_float", &TensorObj::copyin<float>, policy::move)
         .def("copyin_int32", &TensorObj::copyin<int32_t>, policy::move)
         .def("copyin_int64", &TensorObj::copyin<int64_t>, policy::move)