forked from jiuyuan/InfiniTensor

Merge branch 'dev-memory' of github.com:InfiniTensor/InfiniTensor into dev-memory

commit e3da455aba
Makefile
@@ -1,4 +1,4 @@
-.PHONY : build clean install-python test-cpp test-onnx
+.PHONY : build clean format install-python test-cpp test-onnx
 
 TYPE ?= release
 CUDA ?= OFF
@@ -6,6 +6,7 @@ BANG ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
+FORMAT_ORIGIN ?=
 
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 CMAKE_OPT += -DUSE_CUDA=$(CUDA)
@@ -24,6 +25,9 @@ build:
 clean:
	rm -rf build
 
+format:
+	@python3 scripts/format.py $(FORMAT_ORIGIN)
+
 install-python: build
	cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
	pip install pyinfinitensor/
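The new target ties in with FORMAT_ORIGIN: a bare "make format" runs scripts/format.py with no argument, which formats the files currently staged in git, while setting the variable, for example "make format FORMAT_ORIGIN=upstream/master" (ref name illustrative), formats every file changed relative to that ref. The script itself is added below.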

@@ -4,14 +4,14 @@
 #include "operators/unary.h"
 
 namespace infini {
-// void unary_kernel(cnnlHandle_t handle,
-// const float *input,
-// float *output,
-// const uint32_t num,
-// const uint32_t op_num,
-// int* list);
+// void unary_kernel(cnnlHandle_t handle,
+//                   const float *input,
+//                   float *output,
+//                   const uint32_t num,
+//                   const uint32_t op_num,
+//                   int* list);
 
-void bang_unary_kernel(const RuntimeObj* obj, const Operator &_op) {
+void bang_unary_kernel(const RuntimeObj *obj, const Operator &_op) {
     auto op = as<UnaryKernelObj>(_op);
     float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
     float *const cData = (op->getOutput()->getRawDataPtr<float *>());
@@ -20,7 +20,7 @@ namespace infini {
     auto context = dynamic_cast<const BangRuntimeObj *>(obj);
     auto list = op->getOpList();
     int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w, list.size(), list.data());
-
-}
+    unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w,
+                      list.size(), list.data());
+}
 }; // namespace infini

@@ -38,7 +38,8 @@ class UnaryKernelObj : public OperatorObj {
      * @param input The input tensor.
      * @param output The output tensor.
      */
-    UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list);
+    UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
+                   std::vector<int> op_list);
     OP_CLONE(UnaryKernelObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
 

@@ -0,0 +1,50 @@
+import sys
+from pathlib import Path
+from subprocess import run
+
+c_style_file = [".h", ".hh", ".hpp", ".c", ".cc", ".cpp", ".cxx", ".cu", ".mlu"]
+py_file = ".py"
+proj_path = Path(sys.path[0]).parent
+
+
+# Formats one file under project path.
+def format_file(file):
+    file = Path(proj_path.joinpath(file))
+    if file.suffix in c_style_file:
+        run(f"clang-format-14 -i {file}", cwd=proj_path, shell=True)
+        run(f"git add {file}", cwd=proj_path, shell=True)
+    elif file.suffix == py_file:
+        run(f"black {file}", cwd=proj_path, shell=True)
+        run(f"git add {file}", cwd=proj_path, shell=True)
+
+
+if len(sys.argv) == 1:
+    # Last commit.
+    print("Formats git added files.")
+    for line in (
+        run("git status", cwd=proj_path, capture_output=True, shell=True)
+        .stdout.decode()
+        .splitlines()
+    ):
+        line = line.strip()
+        # Only formats git added files.
+        for pre in ["new file:", "modified:"]:
+            if line.startswith(pre):
+                format_file(line[len(pre) :].strip())
+                break
+else:
+    # Origin commit.
+    origin = sys.argv[1]
+    print(f'Formats changed files from "{origin}".')
+    for line in (
+        run(f"git diff {origin}", cwd=proj_path, capture_output=True, shell=True)
+        .stdout.decode()
+        .splitlines()
+    ):
+        diff = "diff --git "
+        if line.startswith(diff):
+            files = line[len(diff) :].split(" ")
+            assert len(files) == 2
+            assert files[0][:2] == "a/"
+            assert files[1][:2] == "b/"
+            format_file(files[1][2:])
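The script dispatches on file suffix: C-style sources (including .cu and .mlu) go through clang-format-14, Python files through black, and each reformatted file is staged again with git add. With no argument it scans git status output for lines starting with "new file:" or "modified:", so only staged files are touched; given a ref, it scans git diff <ref> output for "diff --git a/... b/..." headers, asserts the a/ and b/ prefixes, and formats the b/ side of each pair.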

@@ -7,11 +7,11 @@ namespace infini {
 class UnaryKernel : public BangKernelWithoutConfig {
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
-      bang_unary_kernel(_context, _op);
+        bang_unary_kernel(_context, _op);
     }
 };
 
-REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32, UnaryKernel,
-                "Unary_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32,
+                UnaryKernel, "Unary_BANG_Float32");
 
 }; // namespace infini

@@ -10,7 +10,6 @@ typedef enum {
 } UnaryOpType;
 
 void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
-                       const uint32_t num, const uint32_t op_num,
-                       int* list);
+                       const uint32_t num, const uint32_t op_num, int *list);
 
 }; // namespace infini

@@ -5,8 +5,9 @@
 __nram__ char left[NRAM_USE_SIZE];
 __nram__ char right[NRAM_USE_SIZE];
 
-template<typename T>
-__mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_list) {
+template <typename T>
+__mlu_device__ void UnaryFunction(T *output, T *input, size_t num,
+                                  size_t op_list) {
     int use_nram_size = NRAM_USE_SIZE;
     int deal_align = use_nram_size / sizeof(T);
     int num_per_core = num / taskDim;
@@ -14,61 +15,61 @@ __mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_lis
     int easy = num_per_core;
     int hard = num_per_core + (num_rem != 0 ? 1 : 0);
     int my = taskId < num_rem ? hard : easy;
-    int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
-    char* input_start = (char*)input + start * sizeof(T);
-    char* output_start = (char*)output + start * sizeof(T);
+    int start = (taskId < num_rem)
+                    ? (hard * taskId)
+                    : (hard * num_rem + (taskId - num_rem) * easy);
+    char *input_start = (char *)input + start * sizeof(T);
+    char *output_start = (char *)output + start * sizeof(T);
 
     int my_repeat = my / deal_align;
     int my_rem = my % deal_align;
-    for(int i = 0; i < my_repeat; ++i) {
+    for (int i = 0; i < my_repeat; ++i) {
         __memcpy(left, input_start, use_nram_size, GDRAM2NRAM);
-        while(op_list){
-          int op = op_list % 10;
-          switch(op){
+        while (op_list) {
+            int op = op_list % 10;
+            switch (op) {
            case 1:
-              __bang_active_abs((T*)left, (T*)left, num);
-              break;
+                __bang_active_abs((T *)left, (T *)left, num);
+                break;
            case 2:
-              __bang_active_relu((T*)left, (T*)left, num);
-              break;
+                __bang_active_relu((T *)left, (T *)left, num);
+                break;
            case 3:
-              __bang_active_sigmoid((T*)left, (T*)left, num);
-              break;
+                __bang_active_sigmoid((T *)left, (T *)left, num);
+                break;
            default:
-              break;
-          }
-          op_list /= 10;
+                break;
+            }
+            op_list /= 10;
         }
         __memcpy(output_start, left, use_nram_size, NRAM2GDRAM);
         input_start += use_nram_size;
         output_start += use_nram_size;
     }
-    if(my_rem) {
+    if (my_rem) {
         __memcpy(left, input_start, my_rem * sizeof(T), GDRAM2NRAM);
-        while(op_list){
-          int op = op_list % 10;
-          switch(op){
+        while (op_list) {
+            int op = op_list % 10;
+            switch (op) {
            case 1:
-              __bang_active_abs((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_abs((T *)left, (T *)left, my_rem);
+                break;
            case 2:
-              __bang_active_relu((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_relu((T *)left, (T *)left, my_rem);
+                break;
            case 3:
-              __bang_active_sigmoid((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_sigmoid((T *)left, (T *)left, my_rem);
+                break;
            default:
-              break;
-          }
-          op_list /= 10;
+                break;
+            }
+            op_list /= 10;
         }
         __memcpy(output_start, left, my_rem * sizeof(T), NRAM2GDRAM);
     }
 }
 
-__mlu_global__ void MLUUnaryKernelUnion1(float *output,
-                                         float *input,
-                                         uint32_t num,
-                                         uint32_t op_list) {
-    UnaryFunction((float*)output, (float*)input, num, op_list);
+__mlu_global__ void MLUUnaryKernelUnion1(float *output, float *input,
+                                         uint32_t num, uint32_t op_list) {
+    UnaryFunction((float *)output, (float *)input, num, op_list);
 }
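The arithmetic above splits num elements over taskDim cores, giving each of the first num_rem cores one extra element, and each core then streams its range through NRAM in use_nram_size tiles. A minimal host-side C++ sketch of the same split (not part of the commit; the num and taskDim values are made up) shows the ranges it produces:

#include <cstdio>

int main() {
    const int num = 10, taskDim = 4; // illustrative sizes
    const int num_per_core = num / taskDim;
    const int num_rem = num % taskDim; // leftover elements
    for (int taskId = 0; taskId < taskDim; ++taskId) {
        int easy = num_per_core;
        int hard = num_per_core + (num_rem != 0 ? 1 : 0);
        // The first num_rem cores each take one extra ("hard") element.
        int my = taskId < num_rem ? hard : easy;
        int start = (taskId < num_rem)
                        ? (hard * taskId)
                        : (hard * num_rem + (taskId - num_rem) * easy);
        std::printf("core %d handles [%d, %d)\n", taskId, start, start + my);
    }
    return 0;
}

With num = 10 and taskDim = 4 this prints [0, 3), [3, 6), [6, 8), [8, 10): contiguous, non-overlapping, and covering every element. Note that in the kernel as written, op_list is consumed by the first tile's decode loop, so later tiles on the same core see op_list == 0.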

@@ -2,12 +2,8 @@
 #include "unarylist.h"
 namespace infini {
 
-void unary_kernel_list(cnnlHandle_t handle,
-                       const float *input,
-                       float *output,
-                       const uint32_t num,
-                       const uint32_t op_num,
-                       int* list) {
+void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
+                       const uint32_t num, const uint32_t op_num, int *list) {
     // Task type and scheduling method
     cnrtDim3_t k_dim;
     cnrtFunctionType_t k_type;
@@ -18,15 +14,13 @@ void unary_kernel_list(cnnlHandle_t handle,
     k_dim.z = 1;
     k_type = CNRT_FUNC_TYPE_UNION1;
     uint32_t op_list = 0;
-    for(int i = op_num-1; i >= 0; --i) {
-      op_list *= 10;
-      op_list += list[i];
+    for (int i = op_num - 1; i >= 0; --i) {
+        op_list *= 10;
+        op_list += list[i];
     }
     // Launch the task
-    MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
-                                                   (float*)input,
-                                                   num,
-                                                   op_list);
+    MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>(
+        (float *)output, (float *)input, num, op_list);
 }
 
-};
+}; // namespace infini
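The launch wrapper packs the operator sequence into a single base-10 integer: iterating from the last op down to the first leaves list[0] in the least significant digit, so the device's % 10 / /= 10 loop pops the ops back in their original order. A standalone round-trip sketch (not from the repo) using the op codes the kernel's switch understands (1 = abs, 2 = relu, 3 = sigmoid):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> list = {3, 2, 3}; // sigmoid, relu, sigmoid, as in the test
    // Host-side encode (mirrors unary_kernel_list): iterate from the last op,
    // so list[0] lands in the least significant digit.
    std::uint32_t op_list = 0;
    for (int i = (int)list.size() - 1; i >= 0; --i)
        op_list = op_list * 10 + (std::uint32_t)list[i];
    std::printf("encoded op_list = %u\n", (unsigned)op_list); // prints 323
    // Device-side decode (mirrors UnaryFunction): pop digits until zero,
    // recovering the ops in their original order.
    while (op_list) {
        std::printf("apply op %u\n", (unsigned)(op_list % 10));
        op_list /= 10;
    }
    return 0;
}

Decoding stops at the first zero digit (the kernel's while (op_list) condition), so op code 0 is effectively reserved as a terminator, and a uint32_t can safely carry at most nine ops this way.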

@@ -32,12 +32,14 @@ vector<int> UnaryObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
-UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list)
+UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
+                               std::vector<int> op_list)
     : OperatorObj(OpType::UnaryKernel, {input}, {output}), opList(op_list) {
     IT_ASSERT(checkValid(graph));
 }
 
-optional<vector<Shape>> UnaryKernelObj::inferShape(const TensorVec &inputs) const {
+optional<vector<Shape>>
+UnaryKernelObj::inferShape(const TensorVec &inputs) const {
     const auto A = inputs[0];
     return {{A->getDims()}};
 }
@@ -63,7 +65,6 @@ vector<int> UnaryKernelObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
-
 ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output,
                  std::optional<float> min, std::optional<float> max)
     : OperatorObj(OpType::Clip, {input}, {output}), minValue(min),

@@ -0,0 +1,98 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
+                     const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Notifier and queue
+    cnrtNotifier_t st, et;
+    CNRT_CHECK(cnrtNotifierCreate(&st));
+    CNRT_CHECK(cnrtNotifierCreate(&et));
+    auto handle = bangRuntime->cnnlHandle();
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    std::vector<int> op_list = {3,2,3};
+
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);
+
+    bangGraph->dataMalloc();
+    CNRT_CHECK(cnrtPlaceNotifier(st, queue));
+    bangRuntime->run(bangGraph);
+    CNRT_CHECK(cnrtPlaceNotifier(et, queue));
+    CNRT_CHECK(cnrtQueueSync(queue));
+    float latency;
+    CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    printf("Fused unary kernel hardware time: %.3f us\n", latency);
+    EXPECT_TRUE(1);
+}
+
+void testUnaryNofusion(const std::function<void(void *, size_t, DataType)> &generator,
+                       const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Notifier and queue
+    cnrtNotifier_t st, et;
+    CNRT_CHECK(cnrtNotifierCreate(&st));
+    CNRT_CHECK(cnrtNotifierCreate(&et));
+    auto handle = bangRuntime->cnnlHandle();
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+
+    auto gpuOp = bangGraph->addOp<SigmoidObj>(inputGpu, nullptr);
+    auto outputGpu = gpuOp->getOutput();
+    auto gpuOp2 = bangGraph->addOp<ReluObj>(outputGpu, nullptr);
+    auto outputGpu2 = gpuOp2->getOutput();
+    auto gpuOp3 = bangGraph->addOp<SigmoidObj>(outputGpu2, nullptr);
+
+    bangGraph->dataMalloc();
+    CNRT_CHECK(cnrtPlaceNotifier(st, queue));
+    bangRuntime->run(bangGraph);
+    CNRT_CHECK(cnrtPlaceNotifier(et, queue));
+    CNRT_CHECK(cnrtQueueSync(queue));
+    float latency;
+    CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
+    auto outputGpu3 = gpuOp3->getOutput();
+    auto outputGpu2Cpu = outputGpu3->clone(cpuRuntime);
+    printf("Unfused unary kernels hardware time: %.3f us\n", latency);
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_unary_kernel, run) {
+    testUnaryKernel<UnaryKernelObj>(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
+    testUnaryNofusion(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
+}
+
+} // namespace infini
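The new test measures the same sigmoid -> relu -> sigmoid chain both ways: testUnaryKernel fuses it into one UnaryKernelObj via op_list {3, 2, 3}, while testUnaryNofusion issues SigmoidObj, ReluObj and SigmoidObj as three separate BANG operators. Each variant prints the hardware latency measured between the two CNRT notifiers, so the fused and unfused costs can be compared directly; EXPECT_TRUE(1) asserts nothing, the test exists for the timing printout.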

@@ -9,8 +9,9 @@
 namespace infini {
 
 template <class T>
-void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
-                     const Shape &shape) {
+void testUnaryKernel(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
     // Runtime
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto bangRuntime = make_ref<BangRuntimeObj>();
@@ -23,7 +24,7 @@ void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &genera
     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    std::vector<int> op_list = {1,2,3};
+    std::vector<int> op_list = {1, 2, 3};
 
     auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);
 

@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &>/dev/null && pwd 2>/dev/null)"
-PET_HOME="$(readlink -f ${script_dir}/../..)"
-find ${PET_HOME}/src ${PET_HOME}/include ${PET_HOME}/test -iname *.h -o -iname *.cc | xargs clang-format -i