Compare commits

...

1 Commits

Author SHA1 Message Date
kilinchange 9272d709da add a simple mempory pool for allocator 2023-10-19 12:36:01 +08:00
7 changed files with 128 additions and 19 deletions

View File

@@ -64,7 +64,11 @@ class GraphObj : public Object {
void optimize();
void dataMalloc(bool useNaiveAllocator = false);
void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
Tensor cloneKV(Tensor &tensor);
void freeHeap();
/**
* @brief Add an operator and create its outputs. Output tensor arguments

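Taken together, the new graph-level entry points suggest a flow where a pool is reserved at dataMalloc time, cloneKV places tensor copies in the heap region of that pool, and freeHeap releases those copies in one step. The sketch below illustrates that sequence; it is an assumption about intended usage, and the include path, the graph and tensor variables, and the 256 MiB figure are made up for the example.

#include "core/graph.h" // assumed include path

void runWithMemPool(Graph g, Tensor kvCache) {
    // Reserve a single pool for weights, activations and the heap region;
    // 256 MiB is an arbitrary size chosen for the example.
    g->dataMalloc(/*useNaiveAllocator=*/false, /*memPoolSize=*/256 * 1024 * 1024);

    // With a pool active, the clone's storage comes from heapAlloc, i.e. the
    // top of the same pool, instead of a separate runtime allocation.
    Tensor backup = g->cloneKV(kvCache);

    // ... run the graph, consume backup ...

    g->freeHeap(); // reclaim the whole heap region at once
}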
View File

@@ -95,7 +95,14 @@ class GraphHandlerObj {
//------ runtime
inline void data_malloc() { g->dataMalloc(); }
inline void data_malloc(bool useNaiveAllocator = false,
size_t memPoolSize = 0) {
g->dataMalloc(useNaiveAllocator, memPoolSize);
}
inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
inline void free_heap() { g->freeHeap(); }
inline void tune() { g->getRuntime()->run(g, true); }

View File

@@ -26,14 +26,23 @@ class LazyAllocator {
size_t weightPeak = 0;
size_t heapPeak = 0;
size_t alignment;
bool hasMemPool = false;
size_t memPoolSize = 0;
// pointer to the memory actually allocated
void *ptr = nullptr;
// pointer to the weight memory space
void *weightPtr = nullptr;
// memory pool ptr
void *memPoolPtr = nullptr;
// // a cache designed for a batch size that has already occurred
// std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
// batchsizeToTensorOffset;
@@ -68,6 +77,10 @@ class LazyAllocator {
void init();
void setMemPool(size_t memPoolSize);
bool getMemPoolStatus();
// function: simulate memory allocation
// arguments
// size: size of memory block to be allocated
@@ -76,6 +89,10 @@ class LazyAllocator {
size_t allocWeight(size_t size);
size_t heapAlloc(size_t size);
void freeHeap();
// function: simulate memory free
// arguments:
// addr: head address offset of memory block to be free
@@ -92,6 +109,8 @@ class LazyAllocator {
void *getWeightPtr();
void *getHeapPtr();
void info();
private:

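Read alongside the .cc changes further down, these members describe one flat pool: getWeightPtr returns the pool base, getPtr returns the base plus weightPeak, and heapAlloc/getHeapPtr serve a heap whose offsets are counted back from the end of the pool. A minimal allocator-level sketch follows; the include path, the constructor signature, and the byte counts are assumptions made for illustration.

#include "core/lazy_allocator.h" // assumed include path

void memPoolLayoutSketch(Runtime runtime) {
    LazyAllocator allocator(runtime);      // constructor signature assumed
    allocator.setMemPool(1024);            // reserve one 1024-byte pool up front

    size_t w = allocator.allocWeight(256); // weight offsets counted from the pool base
    size_t a = allocator.alloc(256);       // non-weight offsets simulated after the weights
    size_t h = allocator.heapAlloc(128);   // heap offset: 1024 - 128, counted from the pool end

    void *weightBase = allocator.getWeightPtr(); // == memPoolPtr
    void *tensorBase = allocator.getPtr();       // == memPoolPtr + weightPeak
    void *poolBase   = allocator.getHeapPtr();   // also memPoolPtr; add heap offsets to it
    allocator.freeHeap();                        // heapPeak drops to 0; the pool itself stays allocated
    (void)w; (void)a; (void)h; (void)weightBase; (void)tensorBase; (void)poolBase;
}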
View File

@@ -1074,6 +1074,12 @@ class OnnxStub:
def optimize(self) -> None:
self.handler.optimize()
def clone_KV(self, tensor: backend.Tensor) -> backend.Tensor:
return self.handler.clone_KV(tensor)
def free_heap(self) -> None:
self.handler.free_heap()
def tune(self) -> None:
self.handler.tune()

View File

@@ -123,10 +123,12 @@ void GraphObj::optimize() {
}
}
void GraphObj::dataMalloc(bool useNaiveAllocator) {
void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
// topological sorting first
IT_ASSERT(topo_sort() == true);
if (useNaiveAllocator) {
// cannot set a memory pool when using the naive allocator
IT_ASSERT(memPoolSize == 0);
// used for debugging memory out-of-bounds access, tensors will not be
// released correctly
// note: behavior may not match running in non-naive mode, and it may
@@ -136,6 +138,9 @@ void GraphObj::dataMalloc(bool useNaiveAllocator) {
}
return;
}
if (memPoolSize > 0) {
allocator.setMemPool(memPoolSize);
}
// count the number of times all tensors are used
std::unordered_map<TensorObj *, size_t> tensorToRefCount;
// record the memory address offsets of all tensors to be allocated
@@ -222,6 +227,27 @@ void GraphObj::dataMalloc(bool useNaiveAllocator) {
}
}
Tensor GraphObj::cloneKV(Tensor &tensor) {
auto obj = tensor->clone();
if (allocator.getMemPoolStatus()) {
if (tensor->hasData()) {
obj->setDataBlob(make_ref<BlobObj>(
tensor->runtime,
static_cast<uint8_t *>(allocator.getHeapPtr()) +
allocator.heapAlloc(tensor->getBytes())));
obj->copyData(tensor);
}
} else {
if (tensor->hasData()) {
obj->dataMalloc();
obj->copyData(tensor);
}
}
return obj;
}
void GraphObj::freeHeap() { this->allocator.freeHeap(); }
Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
}

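The guard at the top of dataMalloc makes the two modes mutually exclusive: a pool can only be requested together with the regular allocator, and combining it with the naive allocator trips the assertion. A few illustrative calls on a built graph g (the variable and the 64 MiB size are assumptions; pick one call per graph):

g->dataMalloc();                          // defaults: regular allocator, no pool
g->dataMalloc(false, 64 * 1024 * 1024);   // regular allocator backed by a 64 MiB pool
g->dataMalloc(true);                      // naive per-tensor allocation, for debugging
// g->dataMalloc(true, 64 * 1024 * 1024); // rejected: IT_ASSERT(memPoolSize == 0) fires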
View File

@@ -30,6 +30,9 @@ LazyAllocator::~LazyAllocator() {
if (this->weightPtr != nullptr) {
runtime->dealloc(this->weightPtr);
}
if (this->memPoolPtr != nullptr) {
runtime->dealloc(this->memPoolPtr);
}
}
void LazyAllocator::init() {
@@ -44,6 +47,17 @@ void LazyAllocator::init() {
this->ptr = nullptr;
}
void LazyAllocator::setMemPool(size_t memPoolSize) {
IT_ASSERT(memPoolSize > 0);
if (!this->hasMemPool) {
this->hasMemPool = true;
this->memPoolSize = memPoolSize;
this->memPoolPtr = runtime->alloc(memPoolSize);
}
}
bool LazyAllocator::getMemPoolStatus() { return this->hasMemPool; }
size_t LazyAllocator::alloc(size_t size) {
// pad the size to the multiple of alignment
size = this->getAlignedSize(size);
@@ -102,6 +116,17 @@ size_t LazyAllocator::allocWeight(size_t size) {
return retAddr;
}
size_t LazyAllocator::heapAlloc(size_t size) {
size = this->getAlignedSize(size);
this->heapPeak += size;
IT_ASSERT(this->memPoolSize >=
this->weightPeak + this->peak + this->heapPeak);
size_t retAddr = this->memPoolSize - this->heapPeak;
return retAddr;
}
void LazyAllocator::freeHeap() { this->heapPeak = 0; }
void LazyAllocator::free(size_t addr, size_t size) {
IT_ASSERT(this->ptr == nullptr);
size = getAlignedSize(size);
@@ -143,25 +168,40 @@ void LazyAllocator::free(size_t addr, size_t size) {
}
void *LazyAllocator::getPtr() {
if (this->ptr == nullptr) {
this->ptr = runtime->alloc(this->peak);
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc non-weight: %p %lu
// bytes\n", this->ptr, peak);
// #endif
if (!hasMemPool) {
if (this->ptr == nullptr) {
this->ptr = runtime->alloc(this->peak);
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc non-weight: %p %lu
// bytes\n", this->ptr, peak);
// #endif
}
return this->ptr;
} else {
IT_ASSERT(this->memPoolSize >= this->weightPeak + this->peak);
return static_cast<uint8_t *>(this->memPoolPtr) + weightPeak;
}
return this->ptr;
}
void *LazyAllocator::getWeightPtr() {
if (this->weightPtr == nullptr) {
this->weightPtr = runtime->alloc(this->weightPeak);
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc weight: %p %lu bytes\n",
// this->weightPtr, weightPeak);
// #endif
if (!hasMemPool) {
if (this->weightPtr == nullptr) {
this->weightPtr = runtime->alloc(this->weightPeak);
// #ifdef DEBUG_MODE
// printf("LazyAllocator really alloc weight: %p %lu
// bytes\n",
// this->weightPtr, weightPeak);
// #endif
}
return this->weightPtr;
} else {
return this->memPoolPtr;
}
return this->weightPtr;
}
void *LazyAllocator::getHeapPtr() {
IT_ASSERT(hasMemPool);
return this->memPoolPtr;
}
size_t LazyAllocator::getAlignedSize(size_t size) {

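heapAlloc hands out offsets from the top of the pool downward: each call pads the request to the alignment, bumps heapPeak, checks that the weights, the non-weight peak and the heap still fit together, and returns memPoolSize - heapPeak. The standalone snippet below restates that arithmetic with assumed numbers (sizes already aligned, weightPeak and peak fixed at 256 each); it mirrors the logic but does not use LazyAllocator itself.

#include <cassert>
#include <cstddef>

int main() {
    const std::size_t memPoolSize = 1024, weightPeak = 256, peak = 256;
    std::size_t heapPeak = 0;

    auto heapAlloc = [&](std::size_t size) {
        heapPeak += size;                                    // request assumed pre-aligned
        assert(memPoolSize >= weightPeak + peak + heapPeak); // everything must fit in the pool
        return memPoolSize - heapPeak;                       // offset measured from the pool base
    };

    assert(heapAlloc(128) == 896); // first block ends exactly at the top of the pool
    assert(heapAlloc(64) == 832);  // next block stacks just below it
    heapPeak = 0;                  // freeHeap(): the whole heap region is reclaimed at once
    assert(heapAlloc(128) == 896);
    return 0;
}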
View File

@@ -437,7 +437,10 @@ void init_graph_builder(py::module &m) {
})
.def("has_target", &TensorObj::hasTarget, policy::automatic)
.def("src", &TensorObj::getSource, policy::move)
.def("printData", &TensorObj::printData, policy::automatic);
.def("printData", &TensorObj::printData, policy::automatic)
.def("copy_data",
py::overload_cast<const Tensor &>(&TensorObj::copyData),
policy::move);
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
.def("op_type", &OperatorObj::getOpType, policy::automatic)
.def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
@@ -499,7 +502,11 @@ void init_graph_builder(py::module &m) {
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)
.def("data_malloc", &Handler::data_malloc, policy::automatic)
.def("data_malloc", &Handler::data_malloc,
py::arg("useNaiveAllocator") = false, py::arg("memPoolSize") = 0,
policy::automatic)
.def("clone_KV", &Handler::clone_KV, policy::move)
.def("free_heap", &Handler::free_heap, policy::move)
.def("get_perf_time", &Handler::get_perf_time, policy::automatic)
.def("tune", &Handler::tune, policy::automatic)
.def("run", &Handler::run, policy::automatic)