forked from jiuyuan/InfiniTensor
add layernorm
This commit is contained in:
parent
a765cd2a3d
commit
6ba1a0648a
|
@ -0,0 +1,105 @@
|
||||||
|
#include "operators/layer_norm.h"
|
||||||
|
#include "aclnnop/level2/aclnn_layer_norm.h"
|
||||||
|
#include "ascend/ascend_kernel_without_config.h"
|
||||||
|
#include "ascend/ascend_runtime.h"
|
||||||
|
#include "operators/gather.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class LayerNormAclnn : public ASCENDKernelWithoutConfig {
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<LayerNormObj>(_op);
|
||||||
|
auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const weightData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
|
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
auto inputD = op->getInputs(0)->getDims();
|
||||||
|
auto inputS = op->getInputs(0)->getStride();
|
||||||
|
auto weightD = op->getInputs(1)->getDims();
|
||||||
|
auto weightS = op->getInputs(1)->getStride();
|
||||||
|
auto outD = op->getOutput()->getDims();
|
||||||
|
auto outS = op->getOutput()->getStride();
|
||||||
|
|
||||||
|
double eps = static_cast<double>(op->getEps());
|
||||||
|
|
||||||
|
std::vector<int64_t> inputDim = castTo64(inputD);
|
||||||
|
std::vector<int64_t> inputStride = castTo64(inputS);
|
||||||
|
std::vector<int64_t> weightDim = castTo64(weightD);
|
||||||
|
std::vector<int64_t> weightStride = castTo64(weightS);
|
||||||
|
std::vector<int64_t> outputDim = castTo64(outD);
|
||||||
|
std::vector<int64_t> outputStride = castTo64(outS);
|
||||||
|
|
||||||
|
auto axis = op->getAxis();
|
||||||
|
auto rank = static_cast<int>(inputDim.size());
|
||||||
|
std::vector<int64_t> normalizedShape(rank - axis, 0);
|
||||||
|
for (auto i = rank; i > axis; --i) {
|
||||||
|
normalizedShape[i - 1 - axis] = inputDim[i - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
auto inputTensor =
|
||||||
|
aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
|
||||||
|
inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
|
||||||
|
inputDim.data(), inputDim.size(), inputData);
|
||||||
|
auto weightTensor =
|
||||||
|
aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
|
||||||
|
weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
|
||||||
|
weightDim.data(), weightDim.size(), weightData);
|
||||||
|
auto outputTensor =
|
||||||
|
aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
|
||||||
|
outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
|
||||||
|
outputDim.data(), outputDim.size(), outputData);
|
||||||
|
|
||||||
|
auto *normArray =
|
||||||
|
aclCreateIntArray(normalizedShape.data(), normalizedShape.size());
|
||||||
|
|
||||||
|
aclTensor *biasTensor = NULL;
|
||||||
|
if (op->numInputs() == 3) {
|
||||||
|
void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
auto biasD = op->getInputs(2)->getDims();
|
||||||
|
auto biasS = op->getInputs(2)->getStride();
|
||||||
|
std::vector<int64_t> biasDim = castTo64(biasD);
|
||||||
|
std::vector<int64_t> biasStride = castTo64(biasS);
|
||||||
|
|
||||||
|
biasTensor = aclCreateTensor(
|
||||||
|
biasDim.data(), biasDim.size(), ACL_FLOAT, biasStride.data(), 0,
|
||||||
|
aclFormat::ACL_FORMAT_NCHW, biasDim.data(), biasDim.size(),
|
||||||
|
biasData);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t workspaceSize = 0;
|
||||||
|
aclOpExecutor *executor;
|
||||||
|
|
||||||
|
auto ret = aclnnLayerNormGetWorkspaceSize(
|
||||||
|
inputTensor, normArray, weightTensor, biasTensor, eps, outputTensor,
|
||||||
|
NULL, NULL, &workspaceSize, &executor);
|
||||||
|
|
||||||
|
CHECK_RET(
|
||||||
|
ret == ACL_SUCCESS,
|
||||||
|
LOG_PRINT("aclnnLayerNormGetWorkspaceSize failed. ERROR: %d\n",
|
||||||
|
ret));
|
||||||
|
void *workspaceAddr = nullptr;
|
||||||
|
if (workspaceSize > 0) {
|
||||||
|
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = aclnnLayerNorm(workspaceAddr, workspaceSize, executor,
|
||||||
|
context->ASCENDHandle());
|
||||||
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
|
LOG_PRINT("aclnnLayerNorm failed. ERROR: %d\n", ret));
|
||||||
|
|
||||||
|
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||||
|
CHECK_RET(ret == ACL_SUCCESS,
|
||||||
|
LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret));
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::ASCEND, OpType::LayerNormalization, LayerNormAclnn,
|
||||||
|
"LayerNorm_ASCEND");
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -92,52 +92,53 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
|
||||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>()); \
|
||||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>()); \
|
||||||
\
|
\
|
||||||
auto a = op->getInputs(0)->getDims(); \
|
auto a = op->getInputs(0) -> getDims();
|
||||||
std::vector<int64_t> aDim(a.size(), 1); \
|
|
||||||
for (size_t i = 0; i < a.size(); ++i) { \
|
std::vector<int64_t> aDim(a.size(), 1);
|
||||||
aDim[i] = int64_t(a[i]); \
|
for (size_t i = 0; i < a.size(); ++i) {
|
||||||
} \
|
aDim[i] = int64_t(a[i]);
|
||||||
auto aS = op->getInputs(0)->getStride(); \
|
}
|
||||||
std::vector<int64_t> aStride(aS.size(), 1); \
|
auto aS = op->getInputs(0)->getStride();
|
||||||
for (size_t i = 0; i < aS.size(); ++i) { \
|
std::vector<int64_t> aStride(aS.size(), 1);
|
||||||
aStride[i] = int64_t(aS[i]); \
|
for (size_t i = 0; i < aS.size(); ++i) {
|
||||||
} \
|
aStride[i] = int64_t(aS[i]);
|
||||||
auto c = op->getInputs(0)->getDims(); \
|
}
|
||||||
std::vector<int64_t> cDim(c.size(), 1); \
|
auto c = op->getInputs(0)->getDims();
|
||||||
for (size_t i = 0; i < c.size(); ++i) { \
|
std::vector<int64_t> cDim(c.size(), 1);
|
||||||
cDim[i] = int64_t(c[i]); \
|
for (size_t i = 0; i < c.size(); ++i) {
|
||||||
} \
|
cDim[i] = int64_t(c[i]);
|
||||||
auto cS = op->getInputs(0)->getStride(); \
|
}
|
||||||
std::vector<int64_t> cStride(cS.size(), 1); \
|
auto cS = op->getInputs(0)->getStride();
|
||||||
for (size_t i = 0; i < cS.size(); ++i) { \
|
std::vector<int64_t> cStride(cS.size(), 1);
|
||||||
cStride[i] = int64_t(cS[i]); \
|
for (size_t i = 0; i < cS.size(); ++i) {
|
||||||
} \
|
cStride[i] = int64_t(cS[i]);
|
||||||
\
|
}
|
||||||
auto input = aclCreateTensor( \
|
|
||||||
aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0, \
|
auto input =
|
||||||
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData); \
|
aclCreateTensor(aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
|
||||||
auto output = aclCreateTensor( \
|
aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
|
||||||
cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0, \
|
auto output =
|
||||||
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData); \
|
aclCreateTensor(cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
|
||||||
\
|
aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
|
||||||
uint64_t workspaceSize = 0; \
|
|
||||||
aclOpExecutor *executor; \
|
uint64_t workspaceSize = 0;
|
||||||
\
|
aclOpExecutor *executor;
|
||||||
auto ret = aclnn##prefix##GetWorkspaceSize( \
|
|
||||||
input, output, &workspaceSize, &executor); \
|
auto ret =
|
||||||
void *workspaceAddr = nullptr; \
|
aclnn##prefix##GetWorkspaceSize(input, output, &workspaceSize, &executor);
|
||||||
if (workspaceSize > 0) { \
|
void *workspaceAddr = nullptr;
|
||||||
workspaceAddr = context->getWorkspace(workspaceSize); \
|
if (workspaceSize > 0) {
|
||||||
} \
|
workspaceAddr = context->getWorkspace(workspaceSize);
|
||||||
assert(ret == ACL_SUCCESS); \
|
}
|
||||||
ret = aclnn##prefix(workspaceAddr, workspaceSize, executor, \
|
assert(ret == ACL_SUCCESS);
|
||||||
context->ASCENDHandle()); \
|
ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,
|
||||||
assert(ret == ACL_SUCCESS); \
|
context->ASCENDHandle());
|
||||||
ret = aclrtSynchronizeStream(context->ASCENDHandle()); \
|
assert(ret == ACL_SUCCESS);
|
||||||
assert(ret == ACL_SUCCESS); \
|
ret = aclrtSynchronizeStream(context->ASCENDHandle());
|
||||||
\
|
assert(ret == ACL_SUCCESS);
|
||||||
return; \
|
|
||||||
} \
|
return;
|
||||||
|
} // namespace infini \
|
||||||
};
|
};
|
||||||
|
|
||||||
DEFINE_UNARY_Aclnn(Abs);
|
DEFINE_UNARY_Aclnn(Abs);
|
||||||
|
@ -184,4 +185,5 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, SqrtAclnn, "sqrt_ASCEND_float");
|
||||||
REGISTER_KERNEL(Device::ASCEND, OpType::Round, RoundAclnn,
|
REGISTER_KERNEL(Device::ASCEND, OpType::Round, RoundAclnn,
|
||||||
"round_ASCEND_float");
|
"round_ASCEND_float");
|
||||||
REGISTER_KERNEL(Device::ASCEND, OpType::Erf, ErfAclnn, "erf_ASCEND_float");
|
REGISTER_KERNEL(Device::ASCEND, OpType::Erf, ErfAclnn, "erf_ASCEND_float");
|
||||||
}; // namespace infini
|
}
|
||||||
|
; // namespace infini
|
||||||
|
|
|
@ -0,0 +1,152 @@
|
||||||
|
#include "ascend/ascend_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/layer_norm.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void test_layernormFp32(
|
||||||
|
const Shape &inputShape, const vector<float> &inputData,
|
||||||
|
const Shape &scaleShape, const vector<float> &scaleData, float eps,
|
||||||
|
int axis, int stash_type, const vector<float> &ExpectData,
|
||||||
|
const std::optional<Shape> &bShape = std::nullopt,
|
||||||
|
const std::optional<std::vector<float>> &biasData = std::nullopt) {
|
||||||
|
|
||||||
|
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||||
|
|
||||||
|
if (bShape.has_value() && biasData.has_value()) {
|
||||||
|
Shape biasShape = *bShape;
|
||||||
|
|
||||||
|
auto bias = gCpu->addTensor(biasShape, DataType::Float32);
|
||||||
|
auto input = gCpu->addTensor(inputShape, DataType::Float32);
|
||||||
|
auto scale = gCpu->addTensor(scaleShape, DataType::Float32);
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
bias->copyin(*biasData); //
|
||||||
|
// bias->printData();
|
||||||
|
input->copyin(inputData);
|
||||||
|
scale->copyin(scaleData); //
|
||||||
|
auto ascendRuntime = make_ref<ASCENDRuntimeObj>();
|
||||||
|
Graph gAscend = make_ref<GraphObj>(ascendRuntime);
|
||||||
|
auto biasNpu = gAscend->cloneTensor(bias);
|
||||||
|
auto inputNpu = gAscend->cloneTensor(input);
|
||||||
|
auto scaleNpu = gAscend->cloneTensor(scale);
|
||||||
|
// gCpu->cloneTensor(biasNpu)->printData();
|
||||||
|
auto op =
|
||||||
|
gAscend->addOp<LayerNormObj>(inputNpu, scaleNpu, nullptr, biasNpu,
|
||||||
|
eps, axis, stash_type); // LayernormObj
|
||||||
|
gAscend->dataMalloc();
|
||||||
|
biasNpu->copyin(*biasData);
|
||||||
|
// gCpu->cloneTensor(biasNpu)->printData();
|
||||||
|
inputNpu->copyin(inputData);
|
||||||
|
scaleNpu->copyin(scaleData);
|
||||||
|
ascendRuntime->run(gAscend);
|
||||||
|
|
||||||
|
auto oCpu =
|
||||||
|
gCpu->cloneTensor(op->getOutput()); // move Data from npu to cpu
|
||||||
|
oCpu->printData(); //->printData
|
||||||
|
EXPECT_TRUE(oCpu->equalData(ExpectData));
|
||||||
|
} else {
|
||||||
|
|
||||||
|
auto input = gCpu->addTensor(inputShape, DataType::Float32);
|
||||||
|
auto scale = gCpu->addTensor(scaleShape, DataType::Float32);
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
|
||||||
|
input->copyin(inputData);
|
||||||
|
scale->copyin(scaleData); //
|
||||||
|
auto ascendRuntime = make_ref<ASCENDRuntimeObj>();
|
||||||
|
Graph gAscend = make_ref<GraphObj>(ascendRuntime);
|
||||||
|
|
||||||
|
auto inputNpu = gAscend->cloneTensor(input);
|
||||||
|
auto scaleNpu = gAscend->cloneTensor(scale);
|
||||||
|
auto op =
|
||||||
|
gAscend->addOp<LayerNormObj>(inputNpu, scaleNpu, nullptr, nullptr,
|
||||||
|
eps, axis, stash_type); // LayernormObj
|
||||||
|
gAscend->dataMalloc();
|
||||||
|
|
||||||
|
inputNpu->copyin(inputData);
|
||||||
|
scaleNpu->copyin(scaleData);
|
||||||
|
ascendRuntime->run(gAscend);
|
||||||
|
|
||||||
|
auto oCpu =
|
||||||
|
gCpu->cloneTensor(op->getOutput()); // move Data from npu to cpu
|
||||||
|
oCpu->printData(); //->printData
|
||||||
|
EXPECT_TRUE(oCpu->equalData(ExpectData));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LayerNorm float32 test on the ASCEND backend.
// NOTE(review): the suite is named CUDA_LayernormFp32 although it drives the
// ASCEND runtime; the name is kept so existing test filters keep working.
TEST(CUDA_LayernormFp32, run) {
    aclInit(nullptr);

    // Shared fixture: a 2x3x2x3 tensor holding 0..35, normalized over the
    // last axis (axis = 3), i.e. over rows of three values.
    const Shape dataShape{2, 3, 2, 3};
    vector<float> data(36);
    for (size_t idx = 0; idx < data.size(); ++idx) {
        data[idx] = static_cast<float>(idx);
    }

    // Every row of three normalizes to the same values, so the expected
    // output is one triple repeated for all 12 rows.
    const vector<float> expectedRow{-0.3674207, 0.0000000, 0.6123678};
    vector<float> expected;
    for (int row = 0; row < 12; ++row) {
        expected.insert(expected.end(), expectedRow.begin(),
                        expectedRow.end());
    }

    // Active case: scale {0.3, 0.2, 0.5} with an all-zero bias.
    test_layernormFp32(dataShape, data, Shape{3},
                       vector<float>{0.3, 0.2, 0.5}, 1e-5, 3, 1, expected,
                       Shape{3}, vector<float>{0, 0, 0});

    // Disabled cases, kept for reference. All use the same input tensor,
    // eps = 1e-5, axis = 3, stash_type = 1; each expected output is the
    // listed triple repeated 12 times.
    //
    // 1) scale Shape{3} {0.3, 0.2, 0.5}, bias Shape{3} {0.3, 0.2, 0.5}:
    //    expected triple {-0.0674207, 0.2000000, 1.1123679}
    //
    // 2) scale Shape{1} {0.3}, bias Shape{3} {0.3, 0.2, 0.5}:
    //    expected triple {-0.0674207, 0.2000000, 0.8674207}
    //
    // 3) scale Shape{3} {0.3, 0.2, 0.5}, no bias:
    //    expected triple {-0.3674207, 0.0000000, 0.6123678}

    aclFinalize();
}
|
||||||
|
|
||||||
|
} // namespace infini
|
Loading…
Reference in New Issue