forked from jiuyuan/InfiniTensor

Merge branch 'dev-memory' of github.com:InfiniTensor/InfiniTensor into dev-memory

commit e3da455aba
Makefile
@@ -1,4 +1,4 @@
-.PHONY : build clean install-python test-cpp test-onnx
+.PHONY : build clean format install-python test-cpp test-onnx
 
 TYPE ?= release
 CUDA ?= OFF
@@ -6,6 +6,7 @@ BANG ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
+FORMAT_ORIGIN ?=
 
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 CMAKE_OPT += -DUSE_CUDA=$(CUDA)
@@ -24,6 +25,9 @@ build:
 clean:
	rm -rf build
 
+format:
+	@python3 scripts/format.py $(FORMAT_ORIGIN)
+
 install-python: build
	cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
	pip install pyinfinitensor/
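The new target ties in with FORMAT_ORIGIN: a bare "make format" runs scripts/format.py with no argument, which formats the files currently staged in git, while setting the variable, for example "make format FORMAT_ORIGIN=upstream/master" (ref name illustrative), formats every file changed relative to that ref. The script itself is added below.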

@@ -4,14 +4,14 @@
 #include "operators/unary.h"
 
 namespace infini {
-// void unary_kernel(cnnlHandle_t handle,
-// const float *input,
-// float *output,
-// const uint32_t num,
-// const uint32_t op_num,
-// int* list);
+// void unary_kernel(cnnlHandle_t handle,
+//                   const float *input,
+//                   float *output,
+//                   const uint32_t num,
+//                   const uint32_t op_num,
+//                   int* list);
 
-void bang_unary_kernel(const RuntimeObj* obj, const Operator &_op) {
+void bang_unary_kernel(const RuntimeObj *obj, const Operator &_op) {
     auto op = as<UnaryKernelObj>(_op);
     float *const aData = (op->getInputs(0)->getRawDataPtr<float *>());
     float *const cData = (op->getOutput()->getRawDataPtr<float *>());
@@ -20,7 +20,7 @@ namespace infini {
     auto context = dynamic_cast<const BangRuntimeObj *>(obj);
     auto list = op->getOpList();
     int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
-    unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w, list.size(), list.data());
-
-}
+    unary_kernel_list(context->cnnlHandle(), aData, cData, n * c * h * w,
+                      list.size(), list.data());
+}
 }; // namespace infini

@@ -38,7 +38,8 @@ class UnaryKernelObj : public OperatorObj {
      * @param input The input tensor.
      * @param output The output tensor.
      */
-    UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list);
+    UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
+                   std::vector<int> op_list);
     OP_CLONE(UnaryKernelObj);
     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
 

@@ -0,0 +1,50 @@
+import sys
+from pathlib import Path
+from subprocess import run
+
+c_style_file = [".h", ".hh", ".hpp", ".c", ".cc", ".cpp", ".cxx", ".cu", ".mlu"]
+py_file = ".py"
+proj_path = Path(sys.path[0]).parent
+
+
+# Formats one file under project path.
+def format_file(file):
+    file = Path(proj_path.joinpath(file))
+    if file.suffix in c_style_file:
+        run(f"clang-format-14 -i {file}", cwd=proj_path, shell=True)
+        run(f"git add {file}", cwd=proj_path, shell=True)
+    elif file.suffix == py_file:
+        run(f"black {file}", cwd=proj_path, shell=True)
+        run(f"git add {file}", cwd=proj_path, shell=True)
+
+
+if len(sys.argv) == 1:
+    # Last commit.
+    print("Formats git added files.")
+    for line in (
+        run("git status", cwd=proj_path, capture_output=True, shell=True)
+        .stdout.decode()
+        .splitlines()
+    ):
+        line = line.strip()
+        # Only formats git added files.
+        for pre in ["new file:", "modified:"]:
+            if line.startswith(pre):
+                format_file(line[len(pre) :].strip())
+                break
+else:
+    # Origin commit.
+    origin = sys.argv[1]
+    print(f'Formats changed files from "{origin}".')
+    for line in (
+        run(f"git diff {origin}", cwd=proj_path, capture_output=True, shell=True)
+        .stdout.decode()
+        .splitlines()
+    ):
+        diff = "diff --git "
+        if line.startswith(diff):
+            files = line[len(diff) :].split(" ")
+            assert len(files) == 2
+            assert files[0][:2] == "a/"
+            assert files[1][:2] == "b/"
+            format_file(files[1][2:])
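The script dispatches on file suffix: C-style sources (including .cu and .mlu) go through clang-format-14, Python files through black, and each reformatted file is staged again with git add. With no argument it scans git status output for lines starting with "new file:" or "modified:", so only staged files are touched; given a ref, it scans git diff <ref> output for "diff --git a/... b/..." headers, asserts the a/ and b/ prefixes, and formats the b/ side of each pair.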

@@ -7,11 +7,11 @@ namespace infini {
 class UnaryKernel : public BangKernelWithoutConfig {
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
-      bang_unary_kernel(_context, _op);
+        bang_unary_kernel(_context, _op);
     }
 };
 
-REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32, UnaryKernel,
-                "Unary_BANG_Float32");
+REGISTER_KERNEL(Device::BANG, OpType::UnaryKernel, DataType::Float32,
+                UnaryKernel, "Unary_BANG_Float32");
 
 }; // namespace infini

@@ -10,7 +10,6 @@ typedef enum {
 } UnaryOpType;
 
 void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
-                       const uint32_t num, const uint32_t op_num,
-                       int* list);
+                       const uint32_t num, const uint32_t op_num, int *list);
 
 }; // namespace infini

@@ -5,8 +5,9 @@
 __nram__ char left[NRAM_USE_SIZE];
 __nram__ char right[NRAM_USE_SIZE];
 
-template<typename T>
-__mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_list) {
+template <typename T>
+__mlu_device__ void UnaryFunction(T *output, T *input, size_t num,
+                                  size_t op_list) {
     int use_nram_size = NRAM_USE_SIZE;
     int deal_align = use_nram_size / sizeof(T);
     int num_per_core = num / taskDim;
@@ -14,61 +15,61 @@ __mlu_device__ void UnaryFunction(T* output, T* input, size_t num, size_t op_lis
     int easy = num_per_core;
     int hard = num_per_core + (num_rem != 0 ? 1 : 0);
     int my = taskId < num_rem ? hard : easy;
-    int start = (taskId < num_rem) ? (hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy);
-    char* input_start = (char*)input + start * sizeof(T);
-    char* output_start = (char*)output + start * sizeof(T);
+    int start = (taskId < num_rem)
+                    ? (hard * taskId)
+                    : (hard * num_rem + (taskId - num_rem) * easy);
+    char *input_start = (char *)input + start * sizeof(T);
+    char *output_start = (char *)output + start * sizeof(T);
 
     int my_repeat = my / deal_align;
     int my_rem = my % deal_align;
-    for(int i = 0; i < my_repeat; ++i) {
+    for (int i = 0; i < my_repeat; ++i) {
         __memcpy(left, input_start, use_nram_size, GDRAM2NRAM);
-        while(op_list){
-          int op = op_list % 10;
-          switch(op){
+        while (op_list) {
+            int op = op_list % 10;
+            switch (op) {
            case 1:
-              __bang_active_abs((T*)left, (T*)left, num);
-              break;
+                __bang_active_abs((T *)left, (T *)left, num);
+                break;
            case 2:
-              __bang_active_relu((T*)left, (T*)left, num);
-              break;
+                __bang_active_relu((T *)left, (T *)left, num);
+                break;
            case 3:
-              __bang_active_sigmoid((T*)left, (T*)left, num);
-              break;
+                __bang_active_sigmoid((T *)left, (T *)left, num);
+                break;
            default:
-              break;
-          }
-          op_list /= 10;
+                break;
+            }
+            op_list /= 10;
         }
         __memcpy(output_start, left, use_nram_size, NRAM2GDRAM);
         input_start += use_nram_size;
         output_start += use_nram_size;
     }
-    if(my_rem) {
+    if (my_rem) {
         __memcpy(left, input_start, my_rem * sizeof(T), GDRAM2NRAM);
-        while(op_list){
-          int op = op_list % 10;
-          switch(op){
+        while (op_list) {
+            int op = op_list % 10;
+            switch (op) {
            case 1:
-              __bang_active_abs((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_abs((T *)left, (T *)left, my_rem);
+                break;
            case 2:
-              __bang_active_relu((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_relu((T *)left, (T *)left, my_rem);
+                break;
            case 3:
-              __bang_active_sigmoid((T*)left, (T*)left, my_rem);
-              break;
+                __bang_active_sigmoid((T *)left, (T *)left, my_rem);
+                break;
            default:
-              break;
-          }
-          op_list /= 10;
+                break;
+            }
+            op_list /= 10;
         }
         __memcpy(output_start, left, my_rem * sizeof(T), NRAM2GDRAM);
     }
 }
 
-__mlu_global__ void MLUUnaryKernelUnion1(float *output,
-                                         float *input,
-                                         uint32_t num,
-                                         uint32_t op_list) {
-    UnaryFunction((float*)output, (float*)input, num, op_list);
+__mlu_global__ void MLUUnaryKernelUnion1(float *output, float *input,
+                                         uint32_t num, uint32_t op_list) {
+    UnaryFunction((float *)output, (float *)input, num, op_list);
 }
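The arithmetic above splits num elements over taskDim cores, giving each of the first num_rem cores one extra element, and each core then streams its range through NRAM in use_nram_size tiles. A minimal host-side C++ sketch of the same split (not part of the commit; the num and taskDim values are made up) shows the ranges it produces:

#include <cstdio>

int main() {
    const int num = 10, taskDim = 4; // illustrative sizes
    const int num_per_core = num / taskDim;
    const int num_rem = num % taskDim; // leftover elements
    for (int taskId = 0; taskId < taskDim; ++taskId) {
        int easy = num_per_core;
        int hard = num_per_core + (num_rem != 0 ? 1 : 0);
        // The first num_rem cores each take one extra ("hard") element.
        int my = taskId < num_rem ? hard : easy;
        int start = (taskId < num_rem)
                        ? (hard * taskId)
                        : (hard * num_rem + (taskId - num_rem) * easy);
        std::printf("core %d handles [%d, %d)\n", taskId, start, start + my);
    }
    return 0;
}

With num = 10 and taskDim = 4 this prints [0, 3), [3, 6), [6, 8), [8, 10): contiguous, non-overlapping, and covering every element. Note that in the kernel as written, op_list is consumed by the first tile's decode loop, so later tiles on the same core see op_list == 0.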

@@ -2,12 +2,8 @@
 #include "unarylist.h"
 namespace infini {
 
-void unary_kernel_list(cnnlHandle_t handle,
-                       const float *input,
-                       float *output,
-                       const uint32_t num,
-                       const uint32_t op_num,
-                       int* list) {
+void unary_kernel_list(cnnlHandle_t handle, const float *input, float *output,
+                       const uint32_t num, const uint32_t op_num, int *list) {
     // Task type and scheduling method
     cnrtDim3_t k_dim;
     cnrtFunctionType_t k_type;
@@ -18,15 +14,13 @@ void unary_kernel_list(cnnlHandle_t handle,
     k_dim.z = 1;
     k_type = CNRT_FUNC_TYPE_UNION1;
     uint32_t op_list = 0;
-    for(int i = op_num-1; i >= 0; --i) {
-      op_list *= 10;
-      op_list += list[i];
+    for (int i = op_num - 1; i >= 0; --i) {
+        op_list *= 10;
+        op_list += list[i];
     }
     // Launch the task
-    MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>((float*)output,
-                                                   (float*)input,
-                                                   num,
-                                                   op_list);
+    MLUUnaryKernelUnion1<<<k_dim, k_type, queue>>>(
+        (float *)output, (float *)input, num, op_list);
 }
 
-};
+}; // namespace infini
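The launch wrapper packs the operator sequence into a single base-10 integer: iterating from the last op down to the first leaves list[0] in the least significant digit, so the device's % 10 / /= 10 loop pops the ops back in their original order. A standalone round-trip sketch (not from the repo) using the op codes the kernel's switch understands (1 = abs, 2 = relu, 3 = sigmoid):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> list = {3, 2, 3}; // sigmoid, relu, sigmoid, as in the test
    // Host-side encode (mirrors unary_kernel_list): iterate from the last op,
    // so list[0] lands in the least significant digit.
    std::uint32_t op_list = 0;
    for (int i = (int)list.size() - 1; i >= 0; --i)
        op_list = op_list * 10 + (std::uint32_t)list[i];
    std::printf("encoded op_list = %u\n", (unsigned)op_list); // prints 323
    // Device-side decode (mirrors UnaryFunction): pop digits until zero,
    // recovering the ops in their original order.
    while (op_list) {
        std::printf("apply op %u\n", (unsigned)(op_list % 10));
        op_list /= 10;
    }
    return 0;
}

Decoding stops at the first zero digit (the kernel's while (op_list) condition), so op code 0 is effectively reserved as a terminator, and a uint32_t can safely carry at most nine ops this way.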

@@ -32,12 +32,14 @@ vector<int> UnaryObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
-UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output, std::vector<int> op_list)
+UnaryKernelObj::UnaryKernelObj(GraphObj *graph, Tensor input, Tensor output,
+                               std::vector<int> op_list)
     : OperatorObj(OpType::UnaryKernel, {input}, {output}), opList(op_list) {
     IT_ASSERT(checkValid(graph));
 }
 
-optional<vector<Shape>> UnaryKernelObj::inferShape(const TensorVec &inputs) const {
+optional<vector<Shape>>
+UnaryKernelObj::inferShape(const TensorVec &inputs) const {
     const auto A = inputs[0];
     return {{A->getDims()}};
 }
@@ -63,7 +65,6 @@ vector<int> UnaryKernelObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
-
 ClipObj::ClipObj(GraphObj *graph, Tensor input, Tensor output,
                  std::optional<float> min, std::optional<float> max)
     : OperatorObj(OpType::Clip, {input}, {output}), minValue(min),

@@ -0,0 +1,98 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
+                     const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Notifier and queue
+    cnrtNotifier_t st, et;
+    CNRT_CHECK(cnrtNotifierCreate(&st));
+    CNRT_CHECK(cnrtNotifierCreate(&et));
+    auto handle = bangRuntime->cnnlHandle();
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    std::vector<int> op_list = {3,2,3};
+
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);
+
+    bangGraph->dataMalloc();
+    CNRT_CHECK(cnrtPlaceNotifier(st, queue));
+    bangRuntime->run(bangGraph);
+    CNRT_CHECK(cnrtPlaceNotifier(et, queue));
+    CNRT_CHECK(cnrtQueueSync(queue));
+    float latency;
+    CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    printf("Fused unary kernel hardware time: %.3f us\n", latency);
+    EXPECT_TRUE(1);
+}
+
+void testUnaryNofusion(const std::function<void(void *, size_t, DataType)> &generator,
+                       const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Notifier and queue
+    cnrtNotifier_t st, et;
+    CNRT_CHECK(cnrtNotifierCreate(&st));
+    CNRT_CHECK(cnrtNotifierCreate(&et));
+    auto handle = bangRuntime->cnnlHandle();
+    cnrtQueue_t queue;
+    cnnlGetQueue(handle, &queue);
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+
+    auto gpuOp = bangGraph->addOp<SigmoidObj>(inputGpu, nullptr);
+    auto outputGpu = gpuOp->getOutput();
+    auto gpuOp2 = bangGraph->addOp<ReluObj>(outputGpu, nullptr);
+    auto outputGpu2 = gpuOp2->getOutput();
+    auto gpuOp3 = bangGraph->addOp<SigmoidObj>(outputGpu2, nullptr);
+
+    bangGraph->dataMalloc();
+    CNRT_CHECK(cnrtPlaceNotifier(st, queue));
+    bangRuntime->run(bangGraph);
+    CNRT_CHECK(cnrtPlaceNotifier(et, queue));
+    CNRT_CHECK(cnrtQueueSync(queue));
+    float latency;
+    CNRT_CHECK(cnrtNotifierDuration(st, et, &latency));
+    auto outputGpu3 = gpuOp3->getOutput();
+    auto outputGpu2Cpu = outputGpu3->clone(cpuRuntime);
+    printf("Unfused unary kernels hardware time: %.3f us\n", latency);
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_unary_kernel, run) {
+    testUnaryKernel<UnaryKernelObj>(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
+    testUnaryNofusion(IncrementalGenerator(), Shape{1024, 1024, 1, 1});
+}
+
+} // namespace infini
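The new test measures the same sigmoid -> relu -> sigmoid chain both ways: testUnaryKernel fuses it into one UnaryKernelObj via op_list {3, 2, 3}, while testUnaryNofusion issues SigmoidObj, ReluObj and SigmoidObj as three separate BANG operators. Each variant prints the hardware latency measured between the two CNRT notifiers, so the fused and unfused costs can be compared directly; EXPECT_TRUE(1) asserts nothing, the test exists for the timing printout.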

@@ -9,8 +9,9 @@
 namespace infini {
 
 template <class T>
-void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &generator,
-                     const Shape &shape) {
+void testUnaryKernel(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
     // Runtime
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto bangRuntime = make_ref<BangRuntimeObj>();
@@ -23,7 +24,7 @@ void testUnaryKernel(const std::function<void(void *, size_t, DataType)> &genera
     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    std::vector<int> op_list = {1,2,3};
+    std::vector<int> op_list = {1, 2, 3};
 
     auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, op_list);
 

@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &>/dev/null && pwd 2>/dev/null)"
-PET_HOME="$(readlink -f ${script_dir}/../..)"
-find ${PET_HOME}/src ${PET_HOME}/include ${PET_HOME}/test -iname *.h -o -iname *.cc | xargs clang-format -i