forked from jiuyuan/InfiniTensor
Compare commits: master...allocator_ (1 commit)

Commit 9272d709da
@@ -64,7 +64,11 @@ class GraphObj : public Object {

     void optimize();

-    void dataMalloc(bool useNaiveAllocator = false);
+    void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
+
+    Tensor cloneKV(Tensor &tensor);
+
+    void freeHeap();

     /**
      * @brief Add an operator and create its outputs. Output tensor arguments
@@ -95,7 +95,14 @@ class GraphHandlerObj {

     //------ runtime

-    inline void data_malloc() { g->dataMalloc(); }
+    inline void data_malloc(bool useNaiveAllocator = false,
+                            size_t memPoolSize = 0) {
+        g->dataMalloc(useNaiveAllocator, memPoolSize);
+    }
+
+    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
+
+    inline void free_heap() { g->freeHeap(); }

     inline void tune() { g->getRuntime()->run(g, true); }
@@ -26,14 +26,23 @@ class LazyAllocator {

     size_t weightPeak = 0;

+    size_t heapPeak = 0;
+
     size_t alignment;

+    bool hasMemPool = false;
+
+    size_t memPoolSize = 0;
+
     // pointer to the memory actually allocated
     void *ptr = nullptr;
     // pointer to the weight memory space
     void *weightPtr = nullptr;
+    // memory pool ptr
+    void *memPoolPtr = nullptr;

     // // a cache designed for a batch size that has already occurred
     // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
     //     batchsizeToTensorOffset;
@@ -68,6 +77,10 @@ class LazyAllocator {

     void init();

+    void setMemPool(size_t memPoolSize);
+
+    bool getMemPoolStatus();
+
     // function: simulate memory allocation
     // arguments:
     //     size: size of memory block to be allocated
@@ -76,6 +89,10 @@ class LazyAllocator {

     size_t allocWeight(size_t size);

+    size_t heapAlloc(size_t size);
+
+    void freeHeap();
+
     // function: simulate memory free
     // arguments:
     //     addr: head address offset of memory block to be free
@@ -92,6 +109,8 @@ class LazyAllocator {

     void *getWeightPtr();

+    void *getHeapPtr();
+
     void info();

   private:
@@ -1075,6 +1075,12 @@ class OnnxStub:
     def optimize(self) -> None:
         self.handler.optimize()

+    def clone_KV(self, tensor: backend.Tensor) -> backend.Tensor:
+        return self.handler.clone_KV(tensor)
+
+    def free_heap(self) -> None:
+        self.handler.free_heap()
+
     def tune(self) -> None:
         self.handler.tune()
@@ -123,10 +123,12 @@ void GraphObj::optimize() {
     }
 }

-void GraphObj::dataMalloc(bool useNaiveAllocator) {
+void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
     // topological sorting first
     IT_ASSERT(topo_sort() == true);
     if (useNaiveAllocator) {
+        // can not set memory pool when use naive allocator
+        IT_ASSERT(memPoolSize == 0);
         // used for debugging memory out-of-bounds access, tensors will not be
         // released correctly
         // note: behavior may not match running in non-naive mode, and it may
@@ -136,6 +138,9 @@ void GraphObj::dataMalloc(bool useNaiveAllocator) {
         }
         return;
     }
+    if (memPoolSize > 0) {
+        allocator.setMemPool(memPoolSize);
+    }
     // count the number of times all tensors are used
     std::unordered_map<TensorObj *, size_t> tensorToRefCount;
     // record the memory address offsets of all tensors to be allocated
@@ -222,6 +227,27 @@ void GraphObj::dataMalloc(bool useNaiveAllocator) {
     }
 }

+Tensor GraphObj::cloneKV(Tensor &tensor) {
+    auto obj = tensor->clone();
+    if (allocator.getMemPoolStatus()) {
+        if (tensor->hasData()) {
+            obj->setDataBlob(make_ref<BlobObj>(
+                tensor->runtime,
+                static_cast<uint8_t *>(allocator.getHeapPtr()) +
+                    allocator.heapAlloc(tensor->getBytes())));
+            obj->copyData(tensor);
+        }
+    } else {
+        if (tensor->hasData()) {
+            obj->dataMalloc();
+            obj->copyData(tensor);
+        }
+    }
+    return obj;
+}
+
+void GraphObj::freeHeap() { this->allocator.freeHeap(); }
+
 Tensor GraphObj::addTensor(Shape dim, DataType dtype) {
     return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
 }
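Taken together, the pieces above add a graph-level flow for KV-cache style reuse: allocate everything out of one pool, deep-copy KV tensors into the heap region at the top of that pool, and rewind the heap between requests. The following is only a sketch of how a caller might drive it, assuming the usual InfiniTensor `infini::Graph`/`infini::Tensor` handles, an assumed header path, and an illustrative 256 MiB pool; none of the names or sizes below come from this commit.

    #include "core/graph.h" // assumed InfiniTensor header path

    // Hypothetical helper: take a snapshot of one KV tensor for the next step.
    infini::Tensor snapshotKV(infini::Graph g, infini::Tensor kvTensor) {
        // Reserve a single pool covering weights, activations, and the KV heap.
        g->dataMalloc(/*useNaiveAllocator=*/false,
                      /*memPoolSize=*/256 * 1024 * 1024);
        g->getRuntime()->run(g);            // produce the KV tensor
        auto kvCopy = g->cloneKV(kvTensor); // deep copy lands in the heap region
        g->freeHeap();                      // reset heapPeak for the next request
        return kvCopy;
    }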
@@ -30,6 +30,9 @@ LazyAllocator::~LazyAllocator() {
     if (this->weightPtr != nullptr) {
         runtime->dealloc(this->weightPtr);
     }
+    if (this->memPoolPtr != nullptr) {
+        runtime->dealloc(this->memPoolPtr);
+    }
 }

 void LazyAllocator::init() {
|
@ -44,6 +47,17 @@ void LazyAllocator::init() {
|
||||||
this->ptr = nullptr;
|
this->ptr = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void LazyAllocator::setMemPool(size_t memPoolSize) {
|
||||||
|
IT_ASSERT(memPoolSize > 0);
|
||||||
|
if (!this->hasMemPool) {
|
||||||
|
this->hasMemPool = true;
|
||||||
|
this->memPoolSize = memPoolSize;
|
||||||
|
this->memPoolPtr = runtime->alloc(memPoolSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LazyAllocator::getMemPoolStatus() { return this->hasMemPool; }
|
||||||
|
|
||||||
size_t LazyAllocator::alloc(size_t size) {
|
size_t LazyAllocator::alloc(size_t size) {
|
||||||
// pad the size to the multiple of alignment
|
// pad the size to the multiple of alignment
|
||||||
size = this->getAlignedSize(size);
|
size = this->getAlignedSize(size);
|
||||||
|
@@ -102,6 +116,17 @@ size_t LazyAllocator::allocWeight(size_t size) {
     return retAddr;
 }

+size_t LazyAllocator::heapAlloc(size_t size) {
+    size = this->getAlignedSize(size);
+    this->heapPeak += size;
+    IT_ASSERT(this->memPoolSize >=
+              this->weightPeak + this->peak + this->heapPeak);
+    size_t retAddr = this->memPoolSize - this->heapPeak;
+    return retAddr;
+}
+
+void LazyAllocator::freeHeap() { this->heapPeak = 0; }
+
 void LazyAllocator::free(size_t addr, size_t size) {
     IT_ASSERT(this->ptr == nullptr);
     size = getAlignedSize(size);
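heapAlloc hands out offsets measured from the pool base, carving heap blocks downward from the top of the pool while weights and activations fill it from the bottom; the assertion guards against the two regions colliding, and freeHeap simply rewinds heapPeak. A stand-alone mirror of that bookkeeping, with made-up sizes that are not values from the commit, shows the offsets it would return:

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    int main() {
        // Illustrative sizes only.
        const std::size_t memPoolSize = 1024; // total pool
        const std::size_t weightPeak = 256;   // bytes reserved for weights
        const std::size_t peak = 512;         // bytes reserved for activations
        std::size_t heapPeak = 0;

        auto heapAlloc = [&](std::size_t size) {
            heapPeak += size; // sizes assumed already aligned
            assert(memPoolSize >= weightPeak + peak + heapPeak);
            return memPoolSize - heapPeak; // offset from the pool base
        };

        std::printf("%zu\n", heapAlloc(64)); // 960: first block sits at the top
        std::printf("%zu\n", heapAlloc(32)); // 928: next block stacked below it
        heapPeak = 0; // freeHeap(): the whole heap region becomes reusable
        return 0;
    }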
@@ -143,6 +168,7 @@ void LazyAllocator::free(size_t addr, size_t size) {
 }

 void *LazyAllocator::getPtr() {
+    if (!hasMemPool) {
         if (this->ptr == nullptr) {
             this->ptr = runtime->alloc(this->peak);
             // #ifdef DEBUG_MODE
@@ -151,17 +177,31 @@ void *LazyAllocator::getPtr() {
             // #endif
         }
         return this->ptr;
+    } else {
+        IT_ASSERT(this->memPoolSize >= this->weightPeak + this->peak);
+        return static_cast<uint8_t *>(this->memPoolPtr) + weightPeak;
+    }
 }

 void *LazyAllocator::getWeightPtr() {
+    if (!hasMemPool) {
         if (this->weightPtr == nullptr) {
             this->weightPtr = runtime->alloc(this->weightPeak);
             // #ifdef DEBUG_MODE
-            //     printf("LazyAllocator really alloc weight: %p %lu bytes\n",
+            //     printf("LazyAllocator really alloc weight: %p %lu
+            //     bytes\n",
             //            this->weightPtr, weightPeak);
             // #endif
         }
         return this->weightPtr;
+    } else {
+        return this->memPoolPtr;
+    }
+}
+
+void *LazyAllocator::getHeapPtr() {
+    IT_ASSERT(hasMemPool);
+    return this->memPoolPtr;
 }

 size_t LazyAllocator::getAlignedSize(size_t size) {
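With a pool configured, the three getters above all resolve into the single pool allocation: getWeightPtr() and getHeapPtr() return the pool base, getPtr() returns the base advanced past the weight region, and cloneKV builds a blob pointer as the heap base plus the offset returned by heapAlloc. A tiny sketch of that pointer arithmetic, with illustrative numbers standing in for memPoolPtr, weightPeak, and a heapAlloc result:

    #include <cstddef>
    #include <cstdint>

    int main() {
        std::uint8_t pool[1024];                  // stand-in for memPoolPtr
        const std::size_t weightPeak = 256;       // illustrative weight region
        const std::size_t heapOffset = 1024 - 64; // what heapAlloc(64) would return

        std::uint8_t *weightPtr = pool;                  // getWeightPtr()
        std::uint8_t *activationPtr = pool + weightPeak; // getPtr()
        std::uint8_t *kvBlobPtr = pool + heapOffset;     // getHeapPtr() + heapAlloc(bytes)

        (void)weightPtr; (void)activationPtr; (void)kvBlobPtr;
        return 0;
    }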
@@ -437,7 +437,10 @@ void init_graph_builder(py::module &m) {
         })
         .def("has_target", &TensorObj::hasTarget, policy::automatic)
         .def("src", &TensorObj::getSource, policy::move)
-        .def("printData", &TensorObj::printData, policy::automatic);
+        .def("printData", &TensorObj::printData, policy::automatic)
+        .def("copy_data",
+             py::overload_cast<const Tensor &>(&TensorObj::copyData),
+             policy::move);
     py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "Operator")
         .def("op_type", &OperatorObj::getOpType, policy::automatic)
         .def("inputs", py::overload_cast<>(&OperatorObj::getInputs, py::const_),
@@ -499,7 +502,11 @@ void init_graph_builder(py::module &m) {
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
         .def("optimize", &Handler::optimize, policy::automatic)
         .def("operators", &Handler::operators, policy::move)
-        .def("data_malloc", &Handler::data_malloc, policy::automatic)
+        .def("data_malloc", &Handler::data_malloc,
+             py::arg("useNaiveAllocator") = false, py::arg("memPoolSize") = 0,
+             policy::automatic)
+        .def("clone_KV", &Handler::clone_KV, policy::move)
+        .def("free_heap", &Handler::free_heap, policy::move)
         .def("get_perf_time", &Handler::get_perf_time, policy::automatic)
         .def("tune", &Handler::tune, policy::automatic)
         .def("run", &Handler::run, policy::automatic)