Merge branch 'dev-memory' of github.com:InfiniTensor/InfiniTensor into dev-memory

This commit is contained in:
wanghailu 2023-08-03 15:36:17 +08:00
commit e3da455aba
12 changed files with 222 additions and 77 deletions

View File

@ -1,4 +1,4 @@
.PHONY : build clean install-python test-cpp test-onnx
.PHONY : build clean format install-python test-cpp test-onnx
TYPE ?= release
CUDA ?= OFF
@ -6,6 +6,7 @@ BANG ?= OFF
INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
FORMAT_ORIGIN ?=
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
@ -24,6 +25,9 @@ build:
clean:
rm -rf build
format:
@python3 scripts/format.py $(FORMAT_ORIGIN)
install-python: build
cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
pip install pyinfinitensor/

View File

@ -4,14 +4,14 @@
#include "operators/unary.h"
namespace infini {
// void unary_kernel(cnnlHandle_t handle,
// const float *input,
// float *output,
// const uint32_t num,
// const uint32_t op_num,
// int* list);
// void unary_kernel(cnnlHandle_t handle,
// const float *input,
// float *output,
// const uint32_t num,
// const uint32_t op_num,
// int* list);
void bang_unary_kernel(const RuntimeObj* obj, const Operator &_op) {
void bang_unary_kernel(const RuntimeObj *obj, const Operator &_op) {
auto op = as<UnaryKernelObj>(_op);
float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
@ -20,7 +20,7 @@ namespace infini {
auto context = dynamic_cast<const BangRuntimeObj *>(obj);
auto list = op->getOpList();
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w, list.size(), list.data());
}
unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w,
list.size(), list.data());
}
}; // namespace infini

View File

@ -38,7 +38,8 @@ class UnaryKernelObj : public OperatorObj {
* @param input The input tensor.
* @param output The output tensor.
*/
UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list);
UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
std::vector<int> op_list);
OP_CLONE(UnaryKernelObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

50
scripts/format.py Normal file
View File

@ -0,0 +1,50 @@
import sys
from pathlib import Path
from subprocess import run
c_style_file = [".h", ".hh", ".hpp", ".c", ".cc", ".cpp", ".cxx", ".cu", ".mlu"]
py_file = ".py"
proj_path = Path(sys.path[0]).parent
# Formats one file under project path.
def format_file(file):
file = Path(proj_path.joinpath(file))
if file.suffix in c_style_file:
run(f"clang-format-14 -i {file}", cwd=proj_path, shell=True)
run(f"git add {file}", cwd=proj_path, shell=True)
elif file.suffix == py_file:
run(f"black {file}", cwd=proj_path, shell=True)
run(f"git add {file}", cwd=proj_path, shell=True)
if len(sys.argv) == 1:
# Last commit.
print("Formats git added files.")
for line in (
run("git status", cwd=proj_path, capture_output=True, shell=True)
.stdout.decode()
.splitlines()
):
line = line.strip()
# Only formats git added files.
for pre in ["new file:", "modified:"]:
if line.startswith(pre):
format_file(line[len(pre) :].strip())
break
else:
# Origin commit.
origin = sys.argv[1]
print(f'Formats changed files from "{origin}".')
for line in (
run(f"git diff {origin}", cwd=proj_path, capture_output=True, shell=True)
.stdout.decode()
.splitlines()
):
diff = "diff --git "
if line.startswith(diff):
files = line[len(diff) :].split(" ")
assert len(files) == 2
assert files[0][:2] == "a/"
assert files[1][:2] == "b/"
format_file(files[1][2:])

View File

@ -7,11 +7,11 @@ namespace infini {
class UnaryKernel : public BangKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
bang_unary_kernel(_context, _op);
bang_unary_kernel(_context, _op);
}
};
REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32, UnaryKernel,
"Unary_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32,
UnaryKernel, "Unary_BANG_Float32");
}; // namespace infini

View File

@ -10,7 +10,6 @@ typedef enum {
} UnaryOpType;
void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
const uint32_t num, const uint32_t op_num,
int* list);
const uint32_t num, const uint32_t op_num, int *list);
}; // namespace infini

View File

@ -5,8 +5,9 @@
__nram__ char left[NRAM_USE_SIZE];
__nram__ char right[NRAM_USE_SIZE];
template<typename T>
__mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_list) {
template <typename T>
__mlu_device__ void UnaryFunction(T *output, T *input, size_t num,
size_t op_list) {
int use_nram_size = NRAM_USE_SIZE;
int deal_align = use_nram_size / sizeof(T);
int num_per_core = num / taskDim;
@ -14,61 +15,61 @@ __mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_lis
int easy = num_per_core;
int hard = num_per_core + (num_rem != 0 ? 1 : 0);
int my = taskId < num_rem ? hard : easy;
int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
char* input_start = (char*)input + start * sizeof(T);
char* output_start = (char*)output + start * sizeof(T);
int start = (taskId < num_rem)
? (hard * taskId)
: (hard * num_rem + (taskId - num_rem) * easy);
char *input_start = (char *)input + start * sizeof(T);
char *output_start = (char *)output + start * sizeof(T);
int my_repeat = my / deal_align;
int my_rem = my % deal_align;
for(int i = 0; i < my_repeat; ++i) {
for (int i = 0; i < my_repeat; ++i) {
__memcpy(left, input_start, use_nram_size, GDRAM2NRAM);
while(op_list){
int op = op_list % 10;
switch(op){
while (op_list) {
int op = op_list % 10;
switch (op) {
case 1:
__bang_active_abs((T*)left, (T*)left, num);
break;
__bang_active_abs((T *)left, (T *)left, num);
break;
case 2:
__bang_active_relu((T*)left, (T*)left, num);
break;
__bang_active_relu((T *)left, (T *)left, num);
break;
case 3:
__bang_active_sigmoid((T*)left, (T*)left, num);
break;
__bang_active_sigmoid((T *)left, (T *)left, num);
break;
default:
break;
}
op_list /= 10;
break;
}
op_list /= 10;
}
__memcpy(output_start, left, use_nram_size, NRAM2GDRAM);
input_start += use_nram_size;
output_start += use_nram_size;
}
if(my_rem) {
if (my_rem) {
__memcpy(left, input_start, my_rem * sizeof(T), GDRAM2NRAM);
while(op_list){
int op = op_list % 10;
switch(op){
while (op_list) {
int op = op_list % 10;
switch (op) {
case 1:
__bang_active_abs((T*)left, (T*)left, my_rem);
break;
__bang_active_abs((T *)left, (T *)left, my_rem);
break;
case 2:
__bang_active_relu((T*)left, (T*)left, my_rem);
break;
__bang_active_relu((T *)left, (T *)left, my_rem);
break;
case 3:
__bang_active_sigmoid((T*)left, (T*)left, my_rem);
break;
__bang_active_sigmoid((T *)left, (T *)left, my_rem);
break;
default:
break;
}
op_list /= 10;
break;
}
op_list /= 10;
}
__memcpy(output_start, left, my_rem * sizeof(T), NRAM2GDRAM);
}
}
__mlu_global__ void MLUUnaryKernelUnion1(float *output,
float *input,
uint32_t num,
uint32_t op_list) {
UnaryFunction((float*)output, (float*)input, num, op_list);
__mlu_global__ void MLUUnaryKernelUnion1(float *output, float *input,
uint32_t num, uint32_t op_list) {
UnaryFunction((float *)output, (float *)input, num, op_list);
}

View File

@ -2,12 +2,8 @@
#include "unarylist.h"
namespace infini {
void unary_kernel_list(cnnlHandle_t handle,
const float *input,
float *output,
const uint32_t num,
const uint32_t op_num,
int* list) {
void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
const uint32_t num, const uint32_t op_num, int *list) {
// 任务类型和调度方法
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
@ -18,15 +14,13 @@ void unary_kernel_list(cnnlHandle_t handle,
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
uint32_t op_list = 0;
for(int i = op_num-1; i >= 0; --i) {
op_list *= 10;
op_list += list[i];
for (int i = op_num - 1; i >= 0; --i) {
op_list *= 10;
op_list += list[i];
}
// launch 任务
MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
(float*)input,
num,
op_list);
MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>(
(float *)output, (float *)input, num, op_list);
}
};
}; // namespace infini

View File

@ -32,12 +32,14 @@ vector<int> UnaryObj::getOpAttrVector() const {
return {enum_to_underlying(type)};
}
UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list)
UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
std::vector<int> op_list)
: OperatorObj(OpType::UnaryKernel, {input}, {output}), opList(op_list) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> UnaryKernelObj::inferShape(const TensorVec &inputs) const {
optional<vector<Shape>>
UnaryKernelObj::inferShape(const TensorVec &inputs) const {
const auto A = inputs[0];
return {{A->getDims()}};
}
@ -63,7 +65,6 @@ vector<int> UnaryKernelObj::getOpAttrVector() const {
return {enum_to_underlying(type)};
}
ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output,
std::optional<float> min, std::optional<float> max)
: OperatorObj(OpType::Clip, {input}, {output}), minValue(min),

View File

@ -0,0 +1,98 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/unary.h"
#include "test.h"
namespace infini {
template <class T>
void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Notifier and queue
cnrtNotifier_t st, et;
CNRT_CHECK(cnrtNotifierCreate(&st));
CNRT_CHECK(cnrtNotifierCreate(&et));
auto handle = bangRuntime->cnnlHandle();
cnrtQueue_t queue;
cnnlGetQueue(handle, &queue);
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
std::vector<int> op_list = {3,2,3};
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);
bangGraph->dataMalloc();
CNRT_CHECK(cnrtPlaceNotifier(st, queue));
bangRuntime->run(bangGraph);
CNRT_CHECK(cnrtPlaceNotifier(et, queue));
CNRT_CHECK(cnrtQueueSync(queue));
float latency;
CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
printf("单目融合 Kernel Hardware Time%.3f us\n", latency);
EXPECT_TRUE(1);
}
void testUnaryNofusion(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Notifier and queue
cnrtNotifier_t st, et;
CNRT_CHECK(cnrtNotifierCreate(&st));
CNRT_CHECK(cnrtNotifierCreate(&et));
auto handle = bangRuntime->cnnlHandle();
cnrtQueue_t queue;
cnnlGetQueue(handle, &queue);
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SigmoidObj>(inputGpu, nullptr);
auto outputGpu = gpuOp->getOutput();
auto gpuOp2 = bangGraph->addOp<ReluObj>(outputGpu, nullptr);
auto outputGpu2 = gpuOp2->getOutput();
auto gpuOp3 = bangGraph->addOp<SigmoidObj>(outputGpu2, nullptr);
bangGraph->dataMalloc();
CNRT_CHECK(cnrtPlaceNotifier(st, queue));
bangRuntime->run(bangGraph);
CNRT_CHECK(cnrtPlaceNotifier(et, queue));
CNRT_CHECK(cnrtQueueSync(queue));
float latency;
CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
auto outputGpu3 = gpuOp3->getOutput();
auto outputGpu2Cpu = outputGpu3->clone(cpuRuntime);
printf("单目不融合 Kernel Hardware Time%.3f us\n", latency);
EXPECT_TRUE(1);
}
TEST(cnnl_unary_kernel, run) {
testUnaryKernel<UnaryKernelObj>(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
testUnaryNofusion(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
}
} // namespace infini

View File

@ -9,8 +9,9 @@
namespace infini {
template <class T>
void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
void testUnaryKernel(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
@ -23,7 +24,7 @@ void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &genera
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
std::vector<int> op_list = {1,2,3};
std::vector<int> op_list = {1, 2, 3};
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);

View File

@ -1,4 +0,0 @@
#!/usr/bin/env bash
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &>/dev/null && pwd 2>/dev/null)"
PET_HOME="$(readlink -f ${script_dir}/../..)"
find ${PET_HOME}/src ${PET_HOME}/include ${PET_HOME}/test -iname *.h -o -iname *.cc | xargs clang-format -i