add layernorm

2024-04-26 15:25:41 +08:00 · 2024-04-26 15:25:41 +08:00 · 6ba1a0648a
parent a765cd2a3d
commit 6ba1a0648a
3 changed files with 306 additions and 47 deletions
--- a/src/kernels/ascend/layer_norm.cc
+++ b/src/kernels/ascend/layer_norm.cc
@ -0,0 +1,105 @@
+#include "operators/layer_norm.h"
+#include "aclnnop/level2/aclnn_layer_norm.h"
+#include "ascend/ascend_kernel_without_config.h"
+#include "ascend/ascend_runtime.h"
+#include "operators/gather.h"
+
+namespace infini {
+
+/**
+ * Ascend kernel for OpType::LayerNormalization built on aclnnLayerNorm.
+ *
+ * compute() wraps the operator's input (0), weight (1), optional bias (2)
+ * and output buffers in ACL_FLOAT tensor descriptors, asks aclnn for the
+ * workspace size, launches aclnnLayerNorm on the handle returned by
+ * context->ASCENDHandle() and then synchronizes that stream.
+ */
+class LayerNormAclnn : public ASCENDKernelWithoutConfig {
+    /**
+     * @param _op      Operator to execute; converted with as<LayerNormObj>.
+     * @param _context Runtime; cast to ASCENDRuntimeObj for workspace and
+     *                 stream access.
+     */
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<LayerNormObj>(_op);
+        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);
+
+        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const weightData = (op->getInputs(1)->getRawDataPtr<void *>());
+        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
+
+        // Shapes and strides are widened to int64_t because the ACL
+        // descriptor API below consumes int64_t arrays.
+        auto inputD = op->getInputs(0)->getDims();
+        auto inputS = op->getInputs(0)->getStride();
+        auto weightD = op->getInputs(1)->getDims();
+        auto weightS = op->getInputs(1)->getStride();
+        auto outD = op->getOutput()->getDims();
+        auto outS = op->getOutput()->getStride();
+
+        std::vector<int64_t> inputDim = castTo64(inputD);
+        std::vector<int64_t> inputStride = castTo64(inputS);
+        std::vector<int64_t> weightDim = castTo64(weightD);
+        std::vector<int64_t> weightStride = castTo64(weightS);
+        std::vector<int64_t> outputDim = castTo64(outD);
+        std::vector<int64_t> outputStride = castTo64(outS);
+
+        double eps = static_cast<double>(op->getEps());
+
+        // normalizedShape is the slice of the input shape from `axis` up to
+        // the last dimension.
+        // NOTE(review): assumes 0 <= axis <= rank, as the original loop did.
+        auto axis = op->getAxis();
+        std::vector<int64_t> normalizedShape(inputDim.begin() + axis,
+                                             inputDim.end());
+
+        auto inputTensor =
+            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
+                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            inputDim.data(), inputDim.size(), inputData);
+        auto weightTensor =
+            aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
+                            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            weightDim.data(), weightDim.size(), weightData);
+        auto outputTensor =
+            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
+                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
+                            outputDim.data(), outputDim.size(), outputData);
+
+        auto *normArray =
+            aclCreateIntArray(normalizedShape.data(), normalizedShape.size());
+
+        // Bias is optional: it is only present when the operator has a third
+        // input. The shape vectors live at function scope, like those of the
+        // other tensors, so they outlive the aclnn calls below in case the
+        // descriptor still refers to them.
+        std::vector<int64_t> biasDim;
+        std::vector<int64_t> biasStride;
+        aclTensor *biasTensor = nullptr;
+        if (op->numInputs() == 3) {
+            void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
+
+            auto biasD = op->getInputs(2)->getDims();
+            auto biasS = op->getInputs(2)->getStride();
+            biasDim = castTo64(biasD);
+            biasStride = castTo64(biasS);
+
+            biasTensor = aclCreateTensor(
+                biasDim.data(), biasDim.size(), ACL_FLOAT, biasStride.data(), 0,
+                aclFormat::ACL_FORMAT_NCHW, biasDim.data(), biasDim.size(),
+                biasData);
+        }
+
+        uint64_t workspaceSize = 0;
+        // Initialized so that a failed GetWorkspaceSize call never hands an
+        // indeterminate pointer to aclnnLayerNorm (the checks below are only
+        // given a logging action).
+        aclOpExecutor *executor = nullptr;
+
+        // The two null arguments after the output tensor are the optional
+        // extra outputs of the aclnn call (mean / rstd per the aclnn API
+        // reference — TODO confirm); they are not requested here.
+        auto ret = aclnnLayerNormGetWorkspaceSize(
+            inputTensor, normArray, weightTensor, biasTensor, eps, outputTensor,
+            nullptr, nullptr, &workspaceSize, &executor);
+
+        CHECK_RET(
+            ret == ACL_SUCCESS,
+            LOG_PRINT("aclnnLayerNormGetWorkspaceSize failed. ERROR: %d\n",
+                      ret));
+
+        // Only request a workspace buffer when aclnn asks for one.
+        void *workspaceAddr = nullptr;
+        if (workspaceSize > 0) {
+            workspaceAddr = context->getWorkspace(workspaceSize);
+        }
+
+        ret = aclnnLayerNorm(workspaceAddr, workspaceSize, executor,
+                             context->ASCENDHandle());
+        CHECK_RET(ret == ACL_SUCCESS,
+                  LOG_PRINT("aclnnLayerNorm failed. ERROR: %d\n", ret));
+
+        // Block until the launched work on this stream has finished.
+        ret = aclrtSynchronizeStream(context->ASCENDHandle());
+        CHECK_RET(ret == ACL_SUCCESS,
+                  LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret));
+
+        return;
+    }
+};
+
+// Register LayerNormAclnn as the kernel for OpType::LayerNormalization on
+// Device::ASCEND, under the name "LayerNorm_ASCEND".
+REGISTER_KERNEL(Device::ASCEND, OpType::LayerNormalization, LayerNormAclnn,
+                "LayerNorm_ASCEND");
+
+}; // namespace infini
--- a/src/kernels/ascend/unary.cc
+++ b/src/kernels/ascend/unary.cc
@ -92,52 +92,53 @@ class ReluAclnn : public ASCENDKernelWithoutConfig {
            void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());   \
            void *const cData = (op->getOutput()->getRawDataPtr<void *>());    \
                                                                               \
-            auto a = op->getInputs(0)->getDims();                              \
-            std::vector<int64_t> aDim(a.size(), 1);                            \
-            for (size_t i = 0; i < a.size(); ++i) {                            \
-                aDim[i] = int64_t(a[i]);                                       \
-            }                                                                  \
-            auto aS = op->getInputs(0)->getStride();                           \
-            std::vector<int64_t> aStride(aS.size(), 1);                        \
-            for (size_t i = 0; i < aS.size(); ++i) {                           \
-                aStride[i] = int64_t(aS[i]);                                   \
-            }                                                                  \
-            auto c = op->getInputs(0)->getDims();                              \
-            std::vector<int64_t> cDim(c.size(), 1);                            \
-            for (size_t i = 0; i < c.size(); ++i) {                            \
-                cDim[i] = int64_t(c[i]);                                       \
-            }                                                                  \
-            auto cS = op->getInputs(0)->getStride();                           \
-            std::vector<int64_t> cStride(cS.size(), 1);                        \
-            for (size_t i = 0; i < cS.size(); ++i) {                           \
-                cStride[i] = int64_t(cS[i]);                                   \
-            }                                                                  \
-                                                                               \
-            auto input = aclCreateTensor(                                      \
-                aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,        \
-                aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);    \
-            auto output = aclCreateTensor(                                     \
-                cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,        \
-                aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);    \
-                                                                               \
-            uint64_t workspaceSize = 0;                                        \
-            aclOpExecutor *executor;                                           \
-                                                                               \
-            auto ret = aclnn##prefix##GetWorkspaceSize(                        \
-                input, output, &workspaceSize, &executor);                     \
-            void *workspaceAddr = nullptr;                                     \
-            if (workspaceSize > 0) {                                           \
-                workspaceAddr = context->getWorkspace(workspaceSize);          \
-            }                                                                  \
-            assert(ret == ACL_SUCCESS);                                        \
-            ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,        \
-                                context->ASCENDHandle());                      \
-            assert(ret == ACL_SUCCESS);                                        \
-            ret = aclrtSynchronizeStream(context->ASCENDHandle());             \
-            assert(ret == ACL_SUCCESS);                                        \
-                                                                               \
-            return;                                                            \
-        }                                                                      \
+            auto a = op->getInputs(0) -> getDims();
+
+std::vector<int64_t> aDim(a.size(), 1);
+for (size_t i = 0; i < a.size(); ++i) {
+    aDim[i] = int64_t(a[i]);
+}
+auto aS = op->getInputs(0)->getStride();
+std::vector<int64_t> aStride(aS.size(), 1);
+for (size_t i = 0; i < aS.size(); ++i) {
+    aStride[i] = int64_t(aS[i]);
+}
+auto c = op->getInputs(0)->getDims();
+std::vector<int64_t> cDim(c.size(), 1);
+for (size_t i = 0; i < c.size(); ++i) {
+    cDim[i] = int64_t(c[i]);
+}
+auto cS = op->getInputs(0)->getStride();
+std::vector<int64_t> cStride(cS.size(), 1);
+for (size_t i = 0; i < cS.size(); ++i) {
+    cStride[i] = int64_t(cS[i]);
+}
+
+auto input =
+    aclCreateTensor(aDim.data(), aDim.size(), ACL_FLOAT, aStride.data(), 0,
+                    aclFormat::ACL_FORMAT_ND, aDim.data(), aDim.size(), aData);
+auto output =
+    aclCreateTensor(cDim.data(), cDim.size(), ACL_FLOAT, cStride.data(), 0,
+                    aclFormat::ACL_FORMAT_ND, cDim.data(), cDim.size(), cData);
+
+uint64_t workspaceSize = 0;
+aclOpExecutor *executor;
+
+auto ret =
+    aclnn##prefix##GetWorkspaceSize(input, output, &workspaceSize, &executor);
+void *workspaceAddr = nullptr;
+if (workspaceSize > 0) {
+    workspaceAddr = context->getWorkspace(workspaceSize);
+}
+assert(ret == ACL_SUCCESS);
+ret = aclnn##prefix(workspaceAddr, workspaceSize, executor,
+                    context->ASCENDHandle());
+assert(ret == ACL_SUCCESS);
+ret = aclrtSynchronizeStream(context->ASCENDHandle());
+assert(ret == ACL_SUCCESS);
+
+return;
+} // namespace infini                                                                      \
    };

 DEFINE_UNARY_Aclnn(Abs);
@ -184,4 +185,5 @@ REGISTER_KERNEL(Device::ASCEND, OpType::Sqrt, SqrtAclnn, "sqrt_ASCEND_float");
 REGISTER_KERNEL(Device::ASCEND, OpType::Round, RoundAclnn,
                "round_ASCEND_float");
 REGISTER_KERNEL(Device::ASCEND, OpType::Erf, ErfAclnn, "erf_ASCEND_float");
-}; // namespace infini
+}
+; // namespace infini
--- a/test/kernels/ascend/test_ascend_layernorm.cc
+++ b/test/kernels/ascend/test_ascend_layernorm.cc
@ -0,0 +1,152 @@
+#include "ascend/ascend_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/layer_norm.h"
+
+#include "test.h"
+
+namespace infini {
+
+/**
+ * Runs a float32 LayerNorm on the Ascend runtime and checks the result.
+ *
+ * Host tensors are created on the native CPU runtime, cloned into an Ascend
+ * graph, a LayerNormObj is added and executed, and the output is cloned back
+ * to the CPU graph, printed and compared with ExpectData.
+ *
+ * @param inputShape  Shape of the input tensor.
+ * @param inputData   Input values.
+ * @param scaleShape  Shape of the scale tensor.
+ * @param scaleData   Scale values.
+ * @param eps         Epsilon forwarded to LayerNormObj.
+ * @param axis        Axis forwarded to LayerNormObj.
+ * @param stash_type  Stash type forwarded to LayerNormObj.
+ * @param ExpectData  Expected output values.
+ * @param bShape      Optional bias shape.
+ * @param biasData    Optional bias values. The bias is used only when both
+ *                    bShape and biasData are provided.
+ */
+void test_layernormFp32(
+    const Shape &inputShape, const vector<float> &inputData,
+    const Shape &scaleShape, const vector<float> &scaleData, float eps,
+    int axis, int stash_type, const vector<float> &ExpectData,
+    const std::optional<Shape> &bShape = std::nullopt,
+    const std::optional<std::vector<float>> &biasData = std::nullopt) {
+
+    const bool hasBias = bShape.has_value() && biasData.has_value();
+
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
+
+    // Host-side tensors. When present, the bias is added first so the tensor
+    // creation order matches the previous two-branch implementation.
+    auto bias =
+        hasBias ? gCpu->addTensor(*bShape, DataType::Float32) : nullptr;
+    auto input = gCpu->addTensor(inputShape, DataType::Float32);
+    auto scale = gCpu->addTensor(scaleShape, DataType::Float32);
+    gCpu->dataMalloc();
+    if (hasBias) {
+        bias->copyin(*biasData);
+    }
+    input->copyin(inputData);
+    scale->copyin(scaleData);
+
+    // Device-side graph: clone the host tensors and add the operator. The
+    // third argument (nullptr) is left for the graph to fill in; a null bias
+    // selects the no-bias form.
+    auto ascendRuntime = make_ref<ASCENDRuntimeObj>();
+    Graph gAscend = make_ref<GraphObj>(ascendRuntime);
+    auto biasNpu = hasBias ? gAscend->cloneTensor(bias) : nullptr;
+    auto inputNpu = gAscend->cloneTensor(input);
+    auto scaleNpu = gAscend->cloneTensor(scale);
+    auto op = gAscend->addOp<LayerNormObj>(inputNpu, scaleNpu, nullptr,
+                                           biasNpu, eps, axis, stash_type);
+
+    // Data is copied in again after the device graph has allocated memory.
+    gAscend->dataMalloc();
+    if (hasBias) {
+        biasNpu->copyin(*biasData);
+    }
+    inputNpu->copyin(inputData);
+    scaleNpu->copyin(scaleData);
+    ascendRuntime->run(gAscend);
+
+    // Bring the result back to the host graph for printing and comparison.
+    auto oCpu = gCpu->cloneTensor(op->getOutput());
+    oCpu->printData();
+    EXPECT_TRUE(oCpu->equalData(ExpectData));
+}
+
+// LayerNorm float32 test on the Ascend runtime. The suite was previously
+// named CUDA_LayernormFp32 although it only exercises the Ascend backend
+// (aclInit / ASCENDRuntimeObj); the name now matches the device under test.
+//
+// Active case: input 2x3x2x3 holding 0..35, scale {0.3, 0.2, 0.5}, zero bias,
+// eps 1e-5, axis 3. The expected values carry the original note
+// "python output" (presumably generated by a Python reference — TODO confirm).
+TEST(ASCEND_LayernormFp32, run) {
+    aclInit(nullptr);
+    test_layernormFp32(
+        Shape{2, 3, 2, 3},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+                      18., 19., 20., 21., 22., 23., 24., 25., 26.,
+                      27., 28., 29., 30., 31., 32., 33., 34., 35.},
+        Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1e-5, 3, 1,
+        vector<float>{
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
+            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678},
+        Shape{3}, vector<float>{0, 0, 0});
+
+    // Disabled cases, kept for reference: non-zero bias, a Shape{1} scale,
+    // and a run without bias.
+    // test_layernormFp32(
+    //     Shape{2, 3, 2, 3},
+    //     vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+    //                   9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+    //                   18., 19., 20., 21., 22., 23., 24., 25., 26.,
+    //                   27., 28., 29., 30., 31., 32., 33., 34., 35.},
+    //     Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1e-5, 3, 1,
+    //     vector<float>{
+    //         -0.0674207, 0.2000000, 1.1123679, -0.0674207,
+    //         0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679,
+    //         -0.0674207, 0.2000000, 1.1123679, -0.0674207,
+    //         0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679,
+    //         -0.0674207, 0.2000000, 1.1123679, -0.0674207,
+    //         0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679,
+    //         -0.0674207, 0.2000000, 1.1123679, -0.0674207,
+    //         0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679},
+    //     Shape{3}, vector<float>{0.3, 0.2, 0.5});
+    // test_layernormFp32(
+    //     Shape{2, 3, 2, 3},
+    //     vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+    //                   9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+    //                   18., 19., 20., 21., 22., 23., 24., 25., 26.,
+    //                   27., 28., 29., 30., 31., 32., 33., 34., 35.},
+    //     Shape{1}, vector<float>{0.3}, 1e-5, 3, 1,
+    //     vector<float>{
+    //         -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000,
+    //         0.8674207, -0.0674207, 0.2000000, 0.8674207, -0.0674207,
+    //         0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207,
+    //         -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000,
+    //         0.8674207, -0.0674207, 0.2000000, 0.8674207, -0.0674207,
+    //         0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207,
+    //         -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000,
+    //         0.8674207},
+    //     Shape{3}, vector<float>{0.3, 0.2, 0.5});
+    // test_layernormFp32(
+    //     Shape{2, 3, 2, 3},
+    //     vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
+    //                   9.,  10., 11., 12., 13., 14., 15., 16., 17.,
+    //                   18., 19., 20., 21., 22., 23., 24., 25., 26.,
+    //                   27., 28., 29., 30., 31., 32., 33., 34., 35.},
+    //     Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1e-5, 3, 1,
+    //     vector<float>{-0.3674207, 0.0000000,  0.6123678,  -0.3674207,
+    //                   0.0000000,  0.6123678,  -0.3674207, 0.0000000,
+    //                   0.6123678,  -0.3674207, 0.0000000,  0.6123678,
+    //                   -0.3674207, 0.0000000,  0.6123678,  -0.3674207,
+    //                   0.0000000,  0.6123678,  -0.3674207, 0.0000000,
+    //                   0.6123678,  -0.3674207, 0.0000000,  0.6123678,
+    //                   -0.3674207, 0.0000000,  0.6123678,  -0.3674207,
+    //                   0.0000000,  0.6123678,  -0.3674207, 0.0000000,
+    //                   0.6123678,  -0.3674207, 0.0000000,  0.6123678});
+
+    aclFinalize();
+}
+
+} // namespace infini