From 7a9fcd93b243e94beceaca93a44eafbf750199a4 Mon Sep 17 00:00:00 2001 From: Haojie Wang Date: Mon, 9 Oct 2023 20:51:39 +0800 Subject: [PATCH 1/3] Pooling ceil mode (#155) * add ceil mode for pooling * do not print debug info for allocator by default * fix test bugs after introducing pooling ceil mode * fix onnx import bug --- include/core/graph_handler.h | 4 ++-- include/operators/pooling.h | 15 +++++++++----- pyinfinitensor/src/pyinfinitensor/onnx.py | 21 ++++++++++++++------ src/core/graph.cc | 4 ---- src/core/graph_handler.cc | 16 +++++++-------- src/core/lazy_allocator.cc | 16 +++++++-------- src/ffi/ffi_infinitensor.cc | 4 ++-- src/kernels/bang/pooling.cc | 1 + src/kernels/cpu/pooling.cc | 1 + src/kernels/cuda/pooling.cc | 22 +++++++++++++++------ src/operators/pooling.cc | 24 ++++++++++++++--------- test/core/test_graph_replace.cc | 22 ++++++++++----------- test/kernels/bang/test_bang_pooling.cc | 6 ++++-- test/kernels/cuda/test_cuda_inception.cc | 4 ++-- test/kernels/cuda/test_cuda_pooling.cc | 5 +++-- test/operators/test_pooling.cc | 12 ++++++------ 16 files changed, 103 insertions(+), 74 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index 7f514ebd..c93a355f 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -32,9 +32,9 @@ class GraphHandlerObj { float momentum, float eps, bool training); Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw, - int ph, int pw, int sh, int sw); + int ph, int pw, int sh, int sw, int ceilMode); Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw, - int ph, int pw, int sh, int sw); + int ph, int pw, int sh, int sw, int ceilMode); Tensor add(Tensor a, Tensor b, Tensor c); Tensor sub(Tensor a, Tensor b, Tensor c); diff --git a/include/operators/pooling.h b/include/operators/pooling.h index a163ab0f..7f28224d 100644 --- a/include/operators/pooling.h +++ b/include/operators/pooling.h @@ -12,6 +12,7 @@ class PoolingObj : public OperatorObj { int dh, dw; int ph, pw; int sh, sw; + int ceilMode; int n, c, h, w; public: @@ -32,9 +33,12 @@ class PoolingObj : public OperatorObj { * @param pw Padding at the width dimension. * @param sh Stride at the height dimension. * @param sw Stride at the width dimension. + * @param ceilMode Whether to use ceil(1) or floor(0) to compute the output + * shape. 
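+     * For example, with input height 6, kernel 3, stride 2, dilation 1 and
+     * no padding, (6 - 2 - 1) / 2 + 1 = 2.5, so floor mode gives 2 output
+     * rows while ceil mode gives 3.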
*/ PoolingObj(GraphObj *graph, OpType optype, Tensor input, Tensor output, - int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw); + int kh, int kw, int dh, int dw, int ph, int pw, int sh, int sw, + int ceilMode); OP_CLONE(PoolingObj); optional> inferShape(const TensorVec &inputs) const override; @@ -50,6 +54,7 @@ class PoolingObj : public OperatorObj { int getPw() const { return pw; } int getSh() const { return sh; } int getSw() const { return sw; } + int getCeilMode() const { return ceilMode; } auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); } auto getNCHWRS() const { return tuple(n, c, h, w, kh, kw); } @@ -62,15 +67,15 @@ class PoolingObj : public OperatorObj { class MaxPoolObj : public PoolingObj { public: MaxPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw, - int dh, int dw, int ph, int pw, int sh, int sw) + int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode) : PoolingObj(graph, OpType::MaxPool, input, output, kh, kw, dh, dw, ph, - pw, sh, sw) {} + pw, sh, sw, ceilMode) {} }; class AvgPoolObj : public PoolingObj { public: AvgPoolObj(GraphObj *graph, Tensor input, Tensor output, int kh, int kw, - int dh, int dw, int ph, int pw, int sh, int sw) + int dh, int dw, int ph, int pw, int sh, int sw, int ceilMode) : PoolingObj(graph, OpType::AveragePool, input, output, kh, kw, dh, dw, - ph, pw, sh, sw) {} + ph, pw, sh, sw, ceilMode) {} }; }; // namespace infini diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index f8e53b1c..149c284f 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -228,11 +228,12 @@ class OnnxStub: "dilations": [1, 1], "pads": [0, 0, 0, 0], "strides": [1, 1], + "ceil_mode": 0, }, ) - (k, d, p, s) = ( + (k, d, p, s, ceil_mode) = ( attributes[name] - for name in ["kernel_shape", "dilations", "pads", "strides"] + for name in ["kernel_shape", "dilations", "pads", "strides", "ceil_mode"] ) if p[0] != p[2] or p[1] != p[3]: adapt = "{}-adapt".format(node.output[0]) @@ -250,6 +251,7 @@ class OnnxStub: 0, s[0], s[1], + ceil_mode, ) else: tensors[node.output[0]] = self.handler.maxPool( @@ -263,6 +265,7 @@ class OnnxStub: p[1], s[0], s[1], + ceil_mode, ) elif node.op_type == "AveragePool": attributes = _parse_attribute( @@ -271,10 +274,11 @@ class OnnxStub: "kernel_shape": None, "pads": [0, 0, 0, 0], "strides": [1, 1], + "ceil_mode": 0, }, ) - (k, p, s) = ( - attributes[name] for name in ["kernel_shape", "pads", "strides"] + (k, p, s, ceil_mode) = ( + attributes[name] for name in ["kernel_shape", "pads", "strides", "ceil_mode"] ) if p[0] != p[2] or p[1] != p[3]: adapt = "{}-adapt".format(node.output[0]) @@ -292,6 +296,7 @@ class OnnxStub: 0, s[0], s[1], + ceil_mode, ) else: tensors[node.output[0]] = self.handler.avgPool( @@ -305,6 +310,7 @@ class OnnxStub: p[1], s[0], s[1], + ceil_mode, ) elif node.op_type == "GlobalAveragePool": [_, _, h, w] = _search_shape(model, node.input[0]) @@ -319,6 +325,7 @@ class OnnxStub: 0, 1, 1, + 0, ) elif node.op_type == "Add": tensors[node.output[0]] = self.handler.add( @@ -866,7 +873,7 @@ class OnnxStub: ) ) elif ty == backend.OpTypeId.MaxPool: - kh, kw, dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op) + kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op) ctx.push_node( make_node( ty.name, @@ -877,10 +884,11 @@ class OnnxStub: pads=[ph, pw, ph, pw], dilations=[dh, dw], strides=[sh, sw], + ceil_mode=ceil_mode, ) ) elif ty == backend.OpTypeId.AveragePool: - kh, kw, 
dh, dw, ph, pw, sh, sw = backend.pool_attrs_of(op) + kh, kw, dh, dw, ph, pw, sh, sw, ceil_mode = backend.pool_attrs_of(op) ctx.push_node( make_node( "AveragePool", @@ -890,6 +898,7 @@ class OnnxStub: kernel_shape=[kh, kw], pads=[ph, pw, ph, pw], strides=[sh, sw], + ceil_mode=ceil_mode, ) ) elif ty in [ diff --git a/src/core/graph.cc b/src/core/graph.cc index 0f844c34..f662cf32 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -210,10 +210,6 @@ void GraphObj::dataMalloc() { tensorToOffset[tensor.get()])); } } - -#ifdef DEBUG_MODE - allocator.info(); -#endif } Tensor GraphObj::addTensor(Shape dim, DataType dtype) { diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc index a804a8c7..7267fddf 100644 --- a/src/core/graph_handler.cc +++ b/src/core/graph_handler.cc @@ -95,30 +95,30 @@ Tensor GraphHandlerObj::batchNormalization(Tensor input, Tensor output, } Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw, - int dh, int dw, int ph, int pw, int sh, - int sw) { + int dh, int dw, int ph, int pw, int sh, int sw, + int ceilMode) { if (output) { g->addOpWithOutputs(std::move(input), output, kh, kw, dh, - dw, ph, pw, sh, sw); + dw, ph, pw, sh, sw, ceilMode); return output; } else { return g ->addOp(std::move(input), output, kh, kw, dh, dw, ph, - pw, sh, sw) + pw, sh, sw, ceilMode) ->getOutput(); } } Tensor GraphHandlerObj::avgPool(Tensor input, Tensor output, int kh, int kw, - int dh, int dw, int ph, int pw, int sh, - int sw) { + int dh, int dw, int ph, int pw, int sh, int sw, + int ceilMode) { if (output) { g->addOpWithOutputs(std::move(input), output, kh, kw, dh, - dw, ph, pw, sh, sw); + dw, ph, pw, sh, sw, ceilMode); return output; } else { return g ->addOp(std::move(input), output, kh, kw, dh, dw, ph, - pw, sh, sw) + pw, sh, sw, ceilMode) ->getOutput(); } } diff --git a/src/core/lazy_allocator.cc b/src/core/lazy_allocator.cc index a5014e5c..c3407320 100644 --- a/src/core/lazy_allocator.cc +++ b/src/core/lazy_allocator.cc @@ -145,10 +145,10 @@ void LazyAllocator::free(size_t addr, size_t size) { void *LazyAllocator::getPtr() { if (this->ptr == nullptr) { this->ptr = runtime->alloc(this->peak); -#ifdef DEBUG_MODE - printf("LazyAllocator really alloc non-weight: %p %lu bytes\n", - this->ptr, peak); -#endif + // #ifdef DEBUG_MODE + // printf("LazyAllocator really alloc non-weight: %p %lu + // bytes\n", this->ptr, peak); + // #endif } return this->ptr; } @@ -156,10 +156,10 @@ void *LazyAllocator::getPtr() { void *LazyAllocator::getWeightPtr() { if (this->weightPtr == nullptr) { this->weightPtr = runtime->alloc(this->weightPeak); -#ifdef DEBUG_MODE - printf("LazyAllocator really alloc weight: %p %lu bytes\n", - this->weightPtr, weightPeak); -#endif + // #ifdef DEBUG_MODE + // printf("LazyAllocator really alloc weight: %p %lu bytes\n", + // this->weightPtr, weightPeak); + // #endif } return this->weightPtr; } diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index bea3f4bc..92feba2a 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -187,14 +187,14 @@ static std::tuple batch_norm_attrs_of(Operator op) { batchnorm->getTrainingMode()); } -static std::tuple +static std::tuple pool_attrs_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::MaxPool || op->getOpType() == OpType::AveragePool); auto pool = dynamic_cast(op.get()); return std::make_tuple(pool->getKh(), pool->getKw(), pool->getDh(), pool->getDw(), pool->getPh(), pool->getPw(), - pool->getSh(), pool->getSw()); + pool->getSh(), pool->getSw(), 
pool->getCeilMode());
}

static std::tuple, std::optional>
diff --git a/src/kernels/bang/pooling.cc b/src/kernels/bang/pooling.cc
index 6f907705..8a91b466 100644
--- a/src/kernels/bang/pooling.cc
+++ b/src/kernels/bang/pooling.cc
@@ -30,6 +30,7 @@ class PoolingCnnl : public BangKernelWithoutConfig {
                                          ph, pw, pw, sh, sw, dh, dw, false));
 
         // get outputs
+        // TODO: verify ceiling mode
         auto outVec = op->getOutput()->getDims();
         int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]};
         cnnlTensorDescriptor_t outDesc;
diff --git a/src/kernels/cpu/pooling.cc b/src/kernels/cpu/pooling.cc
index acb88d9c..1242e14f 100644
--- a/src/kernels/cpu/pooling.cc
+++ b/src/kernels/cpu/pooling.cc
@@ -21,6 +21,7 @@ template class NativePooling : public CpuKernelWithoutConfig {
                 auto inoffset = i * (c * ih * iw) + j * ih * iw;
                 for (auto h = 0; h < oh; h++) {
                     for (auto w = 0; w < ow; w++) {
+                        // TODO: verify ceil mode
                         T val =
                             getPoolingValue(kh, kw, h * sh - ph, w * sw - pw,
                                             ih, iw, inptr + inoffset);
diff --git a/src/kernels/cuda/pooling.cc b/src/kernels/cuda/pooling.cc
index 90aa3944..d8b2e0f8 100644
--- a/src/kernels/cuda/pooling.cc
+++ b/src/kernels/cuda/pooling.cc
@@ -29,17 +29,27 @@ class poolingCudnn : public CudaKernelWithoutConfig {
                                         pw, sh, sw));
 
         // get outputs
-        int outn, outc, outh, outw;
-        checkCudnnError(cudnnGetPooling2dForwardOutputDim(
-            poolingDesc, inDesc, &outn, &outc, &outh, &outw));
+        auto outDims = op->getOutput()->getDims();
+        int outn = outDims[0], outc = outDims[1], outh = outDims[2],
+            outw = outDims[3];
+        // NOTICE: cudnn pooling does not support ceil mode, so the shape
+        // inference of cudnn pooling is not consistent with our framework.
+        // Ceil mode is also supported in PyTorch and ONNX. See
+        // https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
+        // and https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
+        // for reference.
+        // TODO: Make sure the result after considering ceil mode is correct.
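+        // For instance, a 5x5 input pooled with a 2x2 window at stride 2 in
+        // ceil mode gives a 3x3 output from our shape inference, while
+        // cudnnGetPooling2dForwardOutputDim (which floors) would report 2x2
+        // (illustrative sizes).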
+ // int outn, outc, outh, outw; + // checkCudnnError(cudnnGetPooling2dForwardOutputDim(poolingDesc, + // inDesc, &outn, &outc, &outh, &outw)); cudnnTensorDescriptor_t outDesc; checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outn, outc, outh, outw)); - IT_ASSERT((vector{outn, outc, outh, outw}) == - op->getOutput()->getDims(), - "cuDNN output shape mismatches with OP output shape"); + // IT_ASSERT((vector{outn, outc, outh, outw}) == + // op->getOutput()->getDims(), + // "cuDNN output shape mismatches with OP output shape"); float alpha = 1.f, beta = 0.f; checkCudnnError(cudnnPoolingForward(context->cudnnHandle(), poolingDesc, diff --git a/src/operators/pooling.cc b/src/operators/pooling.cc index d7153699..b1bb2e3d 100644 --- a/src/operators/pooling.cc +++ b/src/operators/pooling.cc @@ -4,11 +4,9 @@ namespace infini { PoolingObj::PoolingObj(GraphObj *graph, OpType optype, Tensor input, Tensor output, int kh, int kw, int dh, int dw, int ph, - int pw, int sh, int sw) - : OperatorObj(optype, {input}, {output}), - - kh(kh), kw(kw), dh(dh), dw(dw), ph(ph), pw(pw), sh(sh), sw(sw), - + int pw, int sh, int sw, int ceilMode) + : OperatorObj(optype, {input}, {output}), kh(kh), kw(kw), dh(dh), dw(dw), + ph(ph), pw(pw), sh(sh), sw(sw), ceilMode(ceilMode), n(input->getDims()[0]), c(input->getDims()[1]), h(input->getDims()[2]), w(input->getDims()[3]) { IT_ASSERT(checkValid(graph)); @@ -18,8 +16,14 @@ optional> PoolingObj::inferShape(const TensorVec &inputs) const { const auto &input = inputs[0]; auto h = input->getDims()[input->getRank() - 2], w = input->getDims()[input->getRank() - 1]; - int oh = (h - (kh - sh) + ph * 2) / sh; - int ow = (w - (kw - sw) + pw * 2) / sw; + int oh, ow; + if (ceilMode) { + oh = ceil(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1); + ow = ceil(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1); + } else { + oh = floor(((float)(h + 2 * ph - dh * (kh - 1) - 1)) / sh + 1); + ow = floor(((float)(w + 2 * pw - dw * (kw - 1) - 1)) / sw + 1); + } auto ret = input->getDims(); ret[input->getRank() - 2] = oh; ret[input->getRank() - 1] = ow; @@ -34,17 +38,19 @@ std::string PoolingObj::toString() const { os << "p=[" << ph << "," << pw << "],"; os << "s=[" << sh << "," << sw << "],"; os << "d=[" << dh << "," << dw << "],"; + os << "ceil mode=" << ceilMode << ","; os << "input=" << inputs[0]->getGuid() << ","; os << "output=" << outputs[0]->getGuid() << ")"; return os.str(); } vector PoolingObj::getWorkloadVector() const { - return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw}; + return {type.underlying(), n, c, h, w, kh, kw, ph, pw, sh, sw, dh, dw, + ceilMode}; } vector PoolingObj::getOpAttrVector() const { - return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw}; + return {type.underlying(), kh, kw, ph, pw, sh, sw, dh, dw, ceilMode}; } }; // namespace infini diff --git a/test/core/test_graph_replace.cc b/test/core/test_graph_replace.cc index a77624ec..cada8860 100644 --- a/test/core/test_graph_replace.cc +++ b/test/core/test_graph_replace.cc @@ -208,16 +208,13 @@ TEST(MatchGraph, multi_output) { SubGraph subg0 = make_ref(runtime, TensorVec{i}); { auto maxpool = - subg0->addOp(i, nullptr, 3, 3, 0, 0, 0, 0, 2, 2); + subg0->addOp(i, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 0); Tensor w0 = subg0->addTensor(Shape{64, 192, 1, 1}, DataType::UInt32); auto conv0 = subg0->addOp(maxpool->getOutput(0), w0, nullptr); auto relu0 = subg0->addOp(conv0->getOutput(0), nullptr); - auto pad = 
subg0->addOp(maxpool->getOutput(0), nullptr, - vector{0, 0, 1, 1, 0, 0, 1, 1}, - std::nullopt); - auto avgpool = subg0->addOp(pad->getOutput(0), nullptr, 3, - 3, 0, 0, 0, 0, 1, 1); + auto avgpool = subg0->addOp(maxpool->getOutput(0), nullptr, + 3, 3, 0, 0, 0, 0, 1, 1, 0); subg0->setOutputs( TensorVec{relu0->getOutput(0), avgpool->getOutput(0)}); } @@ -225,8 +222,9 @@ TEST(MatchGraph, multi_output) { SubGraph subg1 = make_ref(runtime, TensorVec{i->clone(runtime)}); { - auto avgpool = subg1->addOp( - subg1->getInputsFromOutside()[0], nullptr, 3, 3, 0, 0, 0, 0, 2, 2); + auto avgpool = + subg1->addOp(subg1->getInputsFromOutside()[0], nullptr, + 3, 3, 1, 1, 0, 0, 2, 2, 0); auto relu0 = subg1->addOp(avgpool->getOutput(0), nullptr); @@ -295,7 +293,7 @@ TEST(MatchGraph, multi_input_output) { Tensor w2 = subg0->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32); auto conv2 = subg0->addOp(relu1->getOutput(0), w2, nullptr); auto maxpool = subg0->addOp(relu1->getOutput(0), nullptr, 3, - 3, 0, 0, 0, 0, 2, 2); + 3, 1, 1, 0, 0, 2, 2, 0); subg0->setOutputs( TensorVec{conv2->getOutput(0), maxpool->getOutput(0)}); } @@ -317,7 +315,7 @@ TEST(MatchGraph, multi_input_output) { Tensor w2 = subg1->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32); auto conv2 = subg1->addOp(relu1->getOutput(0), w2, nullptr); auto maxpool = subg1->addOp(relu1->getOutput(0), nullptr, 3, - 3, 0, 0, 0, 0, 2, 2); + 3, 1, 1, 0, 0, 2, 2, 0); subg1->setOutputs( TensorVec{maxpool->getOutput(0), conv2->getOutput(0)}); } @@ -338,7 +336,7 @@ TEST(MatchGraph, multi_input_output) { Tensor w2 = subg2->addTensor(Shape{128, 256, 1, 1}, DataType::UInt32); auto conv2 = subg2->addOp(relu1->getOutput(0), w2, nullptr); auto avgpool = subg2->addOp(relu1->getOutput(0), nullptr, 3, - 3, 0, 0, 0, 0, 2, 2); + 3, 1, 1, 0, 0, 2, 2, 0); subg2->setOutputs( TensorVec{conv2->getOutput(0), avgpool->getOutput(0)}); } @@ -349,7 +347,7 @@ TEST(MatchGraph, multi_input_output) { auto i = g->addTensor(Shape{1, 64, 112, 112}, DataType::UInt32); auto relu = g->addOp(i, nullptr); auto maxPool = g->addOp(relu->getOutput(0), nullptr, 3, 3, - 0, 0, 1, 1, 2, 2); + 1, 1, 1, 1, 2, 2, 0); auto out0 = v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)}); auto out1 = diff --git a/test/kernels/bang/test_bang_pooling.cc b/test/kernels/bang/test_bang_pooling.cc index 20347ae9..4bbc8091 100644 --- a/test/kernels/bang/test_bang_pooling.cc +++ b/test/kernels/bang/test_bang_pooling.cc @@ -8,7 +8,8 @@ namespace infini { -template +template {}, + int>::type = 0> void testPooling(const std::function &generator, const Shape &shape) { // Runtime @@ -23,7 +24,8 @@ void testPooling(const std::function &generator, // GPU Graph bangGraph = make_ref(bangRuntime); auto inputGpu = bangGraph->cloneTensor(inputCpu); - auto gpuOp = bangGraph->addOp(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2); + auto gpuOp = + bangGraph->addOp(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0); bangGraph->dataMalloc(); bangRuntime->run(bangGraph); auto outputGpu = gpuOp->getOutput(); diff --git a/test/kernels/cuda/test_cuda_inception.cc b/test/kernels/cuda/test_cuda_inception.cc index 31a7b888..b6765648 100644 --- a/test/kernels/cuda/test_cuda_inception.cc +++ b/test/kernels/cuda/test_cuda_inception.cc @@ -29,7 +29,7 @@ TEST(CUDA_Inception_v3_block, run) { TensorVec outputs; vector ops; auto maxpool = - g->addOp(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1); + g->addOp(blockInput, nullptr, 3, 3, 1, 1, 1, 1, 1, 1, 0); auto chainInput = maxpool->getOutput(); for (auto &pathConfig : configs) { int inputChannels = 
initialChannels; @@ -52,7 +52,7 @@ TEST(CUDA_Inception_v3_block, run) { inputChannels = f; } else { // Add AveragePool auto pool = g->addOp(input, nullptr, r, r, 1, 1, - r / 2, r / 2, 1, 1); + r / 2, r / 2, 1, 1, 0); input = pool->getOutput(); ops.back().emplace_back(pool); } diff --git a/test/kernels/cuda/test_cuda_pooling.cc b/test/kernels/cuda/test_cuda_pooling.cc index 7347b951..7e9a6deb 100644 --- a/test/kernels/cuda/test_cuda_pooling.cc +++ b/test/kernels/cuda/test_cuda_pooling.cc @@ -9,7 +9,8 @@ namespace infini { using KDPS = vector; using ExpectOutput = vector; -template +template {}, + int>::type = 0> void testPoolCudnn( const std::function &generator, const Shape &shape, const KDPS &kdps, const ExpectOutput &ansVec) { @@ -24,7 +25,7 @@ void testPoolCudnn( Graph g = make_ref(cudaRuntime); auto i0 = g->cloneTensor(i0cpu); auto pool = g->addOp(i0, nullptr, kdps[0], kdps[1], kdps[2], kdps[3], - kdps[4], kdps[5], kdps[6], kdps[7]); + kdps[4], kdps[5], kdps[6], kdps[7], 0); // allocate CUDA memory g->dataMalloc(); diff --git a/test/operators/test_pooling.cc b/test/operators/test_pooling.cc index 8b4c52ed..1ddefa13 100644 --- a/test/operators/test_pooling.cc +++ b/test/operators/test_pooling.cc @@ -12,16 +12,16 @@ TEST(MaxPool, ShapeInference) { Graph g = make_ref(cpuRuntime); Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32); const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2, - sw = 2; - auto op = - g->addOp(i, nullptr, kh, kw, dh, dw, ph, pw, sh, sw); + sw = 2, ceilMode = 0; + auto op = g->addOp(i, nullptr, kh, kw, dh, dw, ph, pw, sh, + sw, ceilMode); EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 80, 80})); } { // dilation & stride Graph g = make_ref(cpuRuntime); Tensor i = g->addTensor({1, 64, 162, 162}, DataType::UInt32); - auto op = g->addOp(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2); + auto op = g->addOp(i, nullptr, 4, 3, 1, 1, 2, 1, 1, 2, 0); EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 163, 81})); } } @@ -30,7 +30,7 @@ TEST(MaxPool, NaiveCPU) { Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); Graph g = make_ref(cpuRuntime); Tensor i = g->addTensor({1, 2, 5, 5}, DataType::UInt32); - auto op = g->addOp(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2); + auto op = g->addOp(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0); g->dataMalloc(); i->setData(IncrementalGenerator()); @@ -49,7 +49,7 @@ TEST(AvgPool, NaiveCPU) { Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); Graph g = make_ref(cpuRuntime); Tensor i = g->addTensor({1, 2, 5, 5}, DataType::Float32); - auto op = g->addOp(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2); + auto op = g->addOp(i, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0); g->dataMalloc(); i->setData(IncrementalGenerator()); From 7600fe688ce58496c1172012e8cbd93c67c77dae Mon Sep 17 00:00:00 2001 From: PanZezhong1725 <141193946+PanZezhong1725@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:54:56 +0800 Subject: [PATCH 2/3] Add Neg operator and kernel (#152) * Add Neg operator and kernel * handle neg in to_onnx --------- Co-authored-by: Haojie Wang --- include/core/graph_handler.h | 1 + include/cuda/cuda_unary.h | 18 ++--- pyinfinitensor/src/pyinfinitensor/onnx.py | 6 ++ pyinfinitensor/tests/test_onnx.py | 6 ++ src/core/graph_handler.cc | 1 + src/ffi/ffi_infinitensor.cc | 2 + src/kernels/cpu/unary.cc | 6 ++ src/kernels/cuda/unary.cc | 2 + src/kernels/cuda/unary.cu | 85 +++++++++++++---------- test/kernels/cuda/test_cuda_unary.cc | 1 + 10 files changed, 85 insertions(+), 43 deletions(-) diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h 
index c93a355f..1c79f51d 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -51,6 +51,7 @@ class GraphHandlerObj { Tensor softmax(Tensor x, Tensor y, int axis); Tensor abs(Tensor x, Tensor y); Tensor sqrt(Tensor x, Tensor y); + Tensor neg(Tensor x, Tensor y); Tensor shape(Tensor x, Tensor y); Tensor identity(Tensor x, Tensor y); Tensor flatten(Tensor s, Tensor y, int axis); diff --git a/include/cuda/cuda_unary.h b/include/cuda/cuda_unary.h index 0f26c2e3..c538682a 100644 --- a/include/cuda/cuda_unary.h +++ b/include/cuda/cuda_unary.h @@ -3,14 +3,14 @@ #include "operators/unary.h" namespace infini { -// TODO(constroy): num should be size_t. -void softmax_kernel(float *input, float *output, int num); -void relu_kernel(float *input, float *output, int num); -void sigmoid_kernel(float *input, float *output, int num); -void tanh_kernel(float *input, float *output, int num); -void abs_kernel(float *input, float *output, int num); -void sqrt_kernel(float *input, float *output, int num); -void erf_kernel(float *input, float *output, int num); +void softmax_kernel(float *input, float *output, size_t num); +void relu_kernel(float *input, float *output, size_t num); +void sigmoid_kernel(float *input, float *output, size_t num); +void tanh_kernel(float *input, float *output, size_t num); +void abs_kernel(float *input, float *output, size_t num); +void sqrt_kernel(float *input, float *output, size_t num); +void neg_kernel(float *input, float *output, size_t num); +void erf_kernel(float *input, float *output, size_t num); void unary_kernel(const Operator &_op) { auto op = as(_op); @@ -30,6 +30,8 @@ void unary_kernel(const Operator &_op) { abs_kernel(inputData, outputData, num); else if (op->getOpType() == OpType::Sqrt) sqrt_kernel(inputData, outputData, num); + else if (op->getOpType() == OpType::Neg) + neg_kernel(inputData, outputData, num); else if (op->getOpType() == OpType::Erf) erf_kernel(inputData, outputData, num); else diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 149c284f..96d4778d 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -403,6 +403,11 @@ class OnnxStub: tensors[node.input[0]], tensors.get(node.output[0]), ) + elif node.op_type == "Neg": + tensors[node.output[0]] = self.handler.neg( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) elif node.op_type == "Shape": tensors[node.output[0]] = self.handler.shape( tensors[node.input[0]], @@ -916,6 +921,7 @@ class OnnxStub: backend.OpTypeId.PRelu, backend.OpTypeId.Sqrt, backend.OpTypeId.Erf, + backend.OpTypeId.Neg, ]: ctx.push_node(make_node(ty.name, inputs, outputs, name)) elif ty == backend.OpTypeId.Flatten: diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py index 6d041ed2..2d614b48 100644 --- a/pyinfinitensor/tests/test_onnx.py +++ b/pyinfinitensor/tests/test_onnx.py @@ -243,6 +243,12 @@ class TestStringMethods(unittest.TestCase): y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7]) abs = make_node("Abs", ["x"], ["y"], name="abs") make_and_import_model(make_graph([abs], "abs", [x], [y])) + + def test_neg(self): + x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) + y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7]) + neg = make_node("Neg", ["x"], ["y"], name="neg") + make_and_import_model(make_graph([neg], "neg", [x], [y])) def test_identity(self): x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) diff 
--git a/src/core/graph_handler.cc b/src/core/graph_handler.cc index 7267fddf..bbd73e10 100644 --- a/src/core/graph_handler.cc +++ b/src/core/graph_handler.cc @@ -159,6 +159,7 @@ DEFINE_UNARY_METHOD(sigmoid, Sigmoid) DEFINE_UNARY_METHOD(tanh, Tanh) DEFINE_UNARY_METHOD(abs, Abs) DEFINE_UNARY_METHOD(sqrt, Sqrt) +DEFINE_UNARY_METHOD(neg, Neg) DEFINE_UNARY_METHOD(shape, Shape) DEFINE_UNARY_METHOD(erf, Erf) diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 92feba2a..b515164f 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -100,6 +100,7 @@ void export_values(py::module &m) { .VALUE(OpType, Dropout) .VALUE(OpType, Cast) .VALUE(OpType, Sqrt) + .VALUE(OpType, Neg) .VALUE(OpType, Expand) .VALUE(OpType, Erf) .VALUE(OpType, Where) @@ -444,6 +445,7 @@ void init_graph_builder(py::module &m) { .def("softmax", &Handler::softmax, policy::move) .def("abs", &Handler::abs, policy::move) .def("sqrt", &Handler::sqrt, policy::move) + .def("neg", &Handler::neg, policy::move) .def("shape", &Handler::shape, policy::move) .def("identity", &Handler::identity, policy::move) .def("flatten", &Handler::flatten, policy::move) diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc index e559c909..15025115 100644 --- a/src/kernels/cpu/unary.cc +++ b/src/kernels/cpu/unary.cc @@ -64,6 +64,10 @@ template class NaiveErf : public NativeUnary { T doCompute(T val) const override { return std::erf(val); } }; +template class NaiveNeg : public NativeUnary { + T doCompute(T val) const override { return -val; } +}; + template class Clip : public CpuKernelWithoutConfig { void compute(const Operator &_op, const RuntimeObj *context) const override { @@ -103,6 +107,8 @@ REGISTER_KERNEL(Device::CPU, OpType::Sqrt, DataType::Float32, NaiveSqrt, "sqrtNaive_CPU_float32"); REGISTER_KERNEL(Device::CPU, OpType::Erf, DataType::Float32, NaiveErf, "erfNaive_CPU_float32"); +REGISTER_KERNEL(Device::CPU, OpType::Neg, DataType::Float32, NaiveNeg, + "negNaive_CPU_float32"); REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32, NaiveSoftmax, "softmaxNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32, diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc index 897e2c77..cb53bd80 100644 --- a/src/kernels/cuda/unary.cc +++ b/src/kernels/cuda/unary.cc @@ -140,6 +140,8 @@ REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda, "Abs_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda, "Sqrt_CUDA_Float32"); +REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda, + "Neg_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda, "Erf_CUDA_Float32"); diff --git a/src/kernels/cuda/unary.cu b/src/kernels/cuda/unary.cu index 695762b4..061ac63d 100644 --- a/src/kernels/cuda/unary.cu +++ b/src/kernels/cuda/unary.cu @@ -8,7 +8,7 @@ constexpr unsigned int num_threads() { return 32 * 4; } constexpr int thread_work_size() { return 4; } constexpr int block_work_size() { return thread_work_size() * num_threads(); } -__global__ void _softmax_kernel1(float *input, float *output, int n) { +__global__ void _softmax_kernel1(float *input, float *output, size_t n) { float sum = 0.0f; for (size_t i = 0; i < n; ++i) { sum += pow(E_CONSTANT, input[i]); @@ -16,106 +16,121 @@ __global__ void _softmax_kernel1(float *input, float *output, int n) { *output = sum; } -__global__ void _softmax_kernel2(float *input, float *output, int n) { +__global__ void 
_softmax_kernel2(float *input, float *output, size_t n) { float sum = *output; - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = pow(E_CONSTANT, input[i]) / sum; } } -__global__ void _relu_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +__global__ void _relu_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = max(input[i], float(0)); } } -__global__ void _sigmoid_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +__global__ void _sigmoid_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = 1 / (1 + pow(E_CONSTANT, -input[i])); } } -__global__ void _tanh_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +__global__ void _tanh_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = (pow(E_CONSTANT, input[i]) - pow(E_CONSTANT, -input[i])) / (pow(E_CONSTANT, input[i]) + pow(E_CONSTANT, -input[i])); } } -__global__ void _abs_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +__global__ void _abs_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = input[i] < 0 ? 
-input[i] : input[i]; } } -__global__ void _sqrt_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) { +__global__ void _sqrt_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { output[i] = sqrt(input[i]); } } -__global__ void _erf_kernel(float *input, float *output, int n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; +__global__ void _erf_kernel(float *input, float *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; for (int i = index; i < n; i += stride) { output[i] = erf(input[i]); } } +template +__global__ void _neg_kernel(T *input, T *output, size_t n) { + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = index; i < n; i += stride) { + output[i] = -input[i]; + } +} + namespace infini { -void softmax_kernel(float *input, float *output, int num) { +void softmax_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _softmax_kernel1<<<1, 1>>>(input, output, num); _softmax_kernel2<<>>(input, output, num); } -void relu_kernel(float *input, float *output, int num) { +void relu_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _relu_kernel<<>>(input, output, num); } -void sigmoid_kernel(float *input, float *output, int num) { +void sigmoid_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _sigmoid_kernel<<>>(input, output, num); } -void tanh_kernel(float *input, float *output, int num) { +void tanh_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _tanh_kernel<<>>(input, output, num); } -void abs_kernel(float *input, float *output, int num) { +void abs_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _abs_kernel<<>>(input, output, num); } -void sqrt_kernel(float *input, float *output, int num) { +void sqrt_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _sqrt_kernel<<>>(input, output, num); } -void erf_kernel(float *input, float *output, int num) { +void erf_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size(); _erf_kernel<<>>(input, output, num); } +void neg_kernel(float *input, float *output, size_t num) { + + int blocksize = block_work_size(); + int gridsize = (num + block_work_size() - 1) / block_work_size(); + _neg_kernel<<>>(input, output, num); +} }; // namespace infini diff --git a/test/kernels/cuda/test_cuda_unary.cc b/test/kernels/cuda/test_cuda_unary.cc index 5d9f24ec..2f828fab 100644 --- a/test/kernels/cuda/test_cuda_unary.cc +++ b/test/kernels/cuda/test_cuda_unary.cc @@ -46,6 +46,7 @@ TEST(cuDNN_Unary, run) { testUnary(IncrementalGenerator(), Shape{1, 
2, 2, 3}); testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); // more shapes testUnary(IncrementalGenerator(), Shape{13}); From 7f16fa353e76fd51ef22c30a54a410a898dc56b6 Mon Sep 17 00:00:00 2001 From: ChengXiang Qi <18630816527@163.com> Date: Tue, 10 Oct 2023 15:21:13 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E3=80=90Hackathon=20No.108=E3=80=91Add=20G?= =?UTF-8?q?elu=20operator,=20ffi,=20kernel=20for=20cpu=20and=20gpu.=20(#14?= =?UTF-8?q?8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feat: Add Gelu kernel, operator, ffi. --- include/core/graph_handler.h | 1 + include/core/op_type.h | 1 + include/cuda/cuda_unary.h | 3 +++ include/operators/unary.h | 1 + pyinfinitensor/src/pyinfinitensor/onnx.py | 6 ++++++ pyinfinitensor/tests/test_onnx.py | 8 ++++++++ src/core/graph_handler.cc | 1 + src/core/op_type.cc | 7 ++++--- src/ffi/ffi_infinitensor.cc | 2 ++ src/kernels/cpu/unary.cc | 10 ++++++++++ src/kernels/cuda/unary.cc | 2 ++ src/kernels/cuda/unary.cu | 15 +++++++++++++++ test/kernels/cuda/test_cuda_unary.cc | 4 ++++ test/operators/test_unary.cc | 21 +++++++++++++++++++++ 14 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 test/operators/test_unary.cc diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index 1c79f51d..975a78bf 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -45,6 +45,7 @@ class GraphHandlerObj { Tensor max(Tensor a, Tensor b, Tensor c); Tensor relu(Tensor x, Tensor y); + Tensor gelu(Tensor x, Tensor y); Tensor sigmoid(Tensor x, Tensor y); Tensor tanh(Tensor x, Tensor y); Tensor erf(Tensor x, Tensor y); diff --git a/include/core/op_type.h b/include/core/op_type.h index e0146c5f..82439650 100644 --- a/include/core/op_type.h +++ b/include/core/op_type.h @@ -73,6 +73,7 @@ struct OpType { GatherElements, GatherND, Gemm, + Gelu, // Unary GlobalAveragePool, // GlobalPool GlobalLpPool, // GlobalPool GlobalMaxPool, // GlobalPool diff --git a/include/cuda/cuda_unary.h b/include/cuda/cuda_unary.h index c538682a..c839abc6 100644 --- a/include/cuda/cuda_unary.h +++ b/include/cuda/cuda_unary.h @@ -10,6 +10,7 @@ void tanh_kernel(float *input, float *output, size_t num); void abs_kernel(float *input, float *output, size_t num); void sqrt_kernel(float *input, float *output, size_t num); void neg_kernel(float *input, float *output, size_t num); +void gelu_kernel(float *input, float *output, size_t num); void erf_kernel(float *input, float *output, size_t num); void unary_kernel(const Operator &_op) { @@ -30,6 +31,8 @@ void unary_kernel(const Operator &_op) { abs_kernel(inputData, outputData, num); else if (op->getOpType() == OpType::Sqrt) sqrt_kernel(inputData, outputData, num); + else if (op->getOpType() == OpType::Gelu) + gelu_kernel(inputData, outputData, num); else if (op->getOpType() == OpType::Neg) neg_kernel(inputData, outputData, num); else if (op->getOpType() == OpType::Erf) diff --git a/include/operators/unary.h b/include/operators/unary.h index 8a3d9704..8349993c 100644 --- a/include/operators/unary.h +++ b/include/operators/unary.h @@ -258,6 +258,7 @@ class LogObj : public OperatorObj { }; DEFINE_UNARY_OBJ(Relu, OpType::Relu) +DEFINE_UNARY_OBJ(Gelu, OpType::Gelu) DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid) DEFINE_UNARY_OBJ(Tanh, OpType::Tanh) // DEFINE_UNARY_OBJ(Softmax, OpType::Softmax) diff --git 
a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 96d4778d..af1e1f95 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -374,6 +374,11 @@ class OnnxStub: tensors[node.input[0]], tensors.get(node.output[0]), ) + elif node.op_type == "Gelu": + tensors[node.output[0]] = self.handler.gelu( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) elif node.op_type == "Sigmoid": tensors[node.output[0]] = self.handler.sigmoid( tensors[node.input[0]], @@ -913,6 +918,7 @@ class OnnxStub: backend.OpTypeId.Div, backend.OpTypeId.Pow, backend.OpTypeId.Relu, + backend.OpTypeId.Gelu, backend.OpTypeId.Sigmoid, backend.OpTypeId.Tanh, backend.OpTypeId.Softmax, diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py index 2d614b48..3420fa4f 100644 --- a/pyinfinitensor/tests/test_onnx.py +++ b/pyinfinitensor/tests/test_onnx.py @@ -208,6 +208,14 @@ class TestStringMethods(unittest.TestCase): relu = make_node("Relu", ["x"], ["y"], name="relu") make_and_import_model(make_graph([relu], "relu", [x], [y])) + '''Gelu operator is not supported by onnx 14.1 currently.''' + def test_gelu(self): + pass + # x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) + # y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7]) + # gelu = make_node("Gelu", ["x"], ["y"], name="gelu") + # make_and_import_model(make_graph([gelu], "gelu", [x], [y])) + def test_erf(self): x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7]) diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc index bbd73e10..6255c8fd 100644 --- a/src/core/graph_handler.cc +++ b/src/core/graph_handler.cc @@ -155,6 +155,7 @@ DEFINE_ELEMENT_WISE_METHOD(max, Maximum) } DEFINE_UNARY_METHOD(relu, Relu) +DEFINE_UNARY_METHOD(gelu, Gelu) DEFINE_UNARY_METHOD(sigmoid, Sigmoid) DEFINE_UNARY_METHOD(tanh, Tanh) DEFINE_UNARY_METHOD(abs, Abs) diff --git a/src/core/op_type.cc b/src/core/op_type.cc index 38122bf9..5932513f 100644 --- a/src/core/op_type.cc +++ b/src/core/op_type.cc @@ -142,6 +142,7 @@ const char *OpType::toString() const { CASE(ReduceSum); CASE(ReduceSumSquare); CASE(Relu); + CASE(Gelu); CASE(Reshape); CASE(Resize); CASE(ReverseSequence); @@ -232,9 +233,9 @@ const char *OpType::toString() const { bool OpType::isUnary() const { static const std::unordered_set set{ - Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil, - Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not, - Relu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh, + Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Cast, Ceil, + Clip, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Not, + Relu, Gelu, Round, Sigmoid, Sin, Sinh, Sqrt, Tan, Tanh, }; return set.find(type) != set.end(); diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index b515164f..27a1ba81 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -92,6 +92,7 @@ void export_values(py::module &m) { .VALUE(OpType, BatchNormalization) .VALUE(OpType, Softmax) .VALUE(OpType, Relu) + .VALUE(OpType, Gelu) .VALUE(OpType, PRelu) .VALUE(OpType, Sigmoid) .VALUE(OpType, Tanh) @@ -440,6 +441,7 @@ void init_graph_builder(py::module &m) { .def("min", &Handler::min, policy::move) .def("max", &Handler::max, policy::move) .def("relu", &Handler::relu, policy::move) + .def("gelu", &Handler::gelu, policy::move) .def("sigmoid", &Handler::sigmoid, policy::move) .def("tanh", &Handler::tanh, policy::move) 
.def("softmax", &Handler::softmax, policy::move) diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc index 15025115..ec7497c3 100644 --- a/src/kernels/cpu/unary.cc +++ b/src/kernels/cpu/unary.cc @@ -60,6 +60,12 @@ template class NaiveSqrt : public NativeUnary { T doCompute(T val) const override { return std::sqrt(val); } }; +template class NaiveGelu : public NativeUnary { + T doCompute(T val) const override { + return 0.5 * val * (1 + std::erf(val / std::sqrt(2))); + } +}; + template class NaiveErf : public NativeUnary { T doCompute(T val) const override { return std::erf(val); } }; @@ -91,6 +97,10 @@ REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32, NaiveRelu, "reluNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu, "reluNaive_CPU_float32"); +REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::UInt32, NaiveGelu, + "geluNaive_CPU_float32"); +REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::Float32, NaiveGelu, + "geluNaive_CPU_float32"); REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32, NaiveSigmoid, "sigmoidNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32, diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc index cb53bd80..48f6daaa 100644 --- a/src/kernels/cuda/unary.cc +++ b/src/kernels/cuda/unary.cc @@ -140,6 +140,8 @@ REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda, "Abs_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda, "Sqrt_CUDA_Float32"); +REGISTER_KERNEL(Device::CUDA, OpType::Gelu, DataType::Float32, UnaryCuda, + "Gelu_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda, "Neg_CUDA_Float32"); REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda, diff --git a/src/kernels/cuda/unary.cu b/src/kernels/cuda/unary.cu index 061ac63d..2267e6eb 100644 --- a/src/kernels/cuda/unary.cu +++ b/src/kernels/cuda/unary.cu @@ -66,6 +66,15 @@ __global__ void _sqrt_kernel(float *input, float *output, size_t n) { } } +__global__ void _gelu_kernel(float *input, float *output, size_t n) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = index; i < n; i += stride) { + float x = input[i]; + output[i] = 0.5 * x * (1 + erf(x / sqrt(2.0f))); + } +} + __global__ void _erf_kernel(float *input, float *output, size_t n) { size_t index = threadIdx.x + blockIdx.x * blockDim.x; size_t stride = blockDim.x * gridDim.x; @@ -121,6 +130,12 @@ void sqrt_kernel(float *input, float *output, size_t num) { int gridsize = (num + block_work_size() - 1) / block_work_size(); _sqrt_kernel<<>>(input, output, num); } +void gelu_kernel(float *input, float *output, size_t num) { + + int blocksize = block_work_size(); + int gridsize = (num + block_work_size() - 1) / block_work_size(); + _gelu_kernel<<>>(input, output, num); +} void erf_kernel(float *input, float *output, size_t num) { int blocksize = block_work_size(); diff --git a/test/kernels/cuda/test_cuda_unary.cc b/test/kernels/cuda/test_cuda_unary.cc index 2f828fab..09a2255e 100644 --- a/test/kernels/cuda/test_cuda_unary.cc +++ b/test/kernels/cuda/test_cuda_unary.cc @@ -52,6 +52,10 @@ TEST(cuDNN_Unary, run) { testUnary(IncrementalGenerator(), Shape{13}); testUnary(IncrementalGenerator(), Shape{4, 3}); testUnary(IncrementalGenerator(), Shape{2, 3, 4, 5, 6}); + + testUnary(IncrementalGenerator(), Shape{1}); + testUnary(IncrementalGenerator(), Shape{1, 2}); + 
testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3});
 }
 
 } // namespace infini
diff --git a/test/operators/test_unary.cc b/test/operators/test_unary.cc
new file mode 100644
index 00000000..911d815e
--- /dev/null
+++ b/test/operators/test_unary.cc
@@ -0,0 +1,21 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+using ExpectOutput = vector<float>;
+TEST(Unary, ShapeInference) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i0 = g->addTensor({2}, DataType::Float32);
+        auto op = g->addOp<GeluObj>(i0, nullptr);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2}));
+    }
+}
+
+} // namespace infini
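
As a quick sanity check for the new pooling ceil mode, a shape-inference test
in the style of test/operators/test_pooling.cc could look like the sketch
below; the tensor sizes and the test name are illustrative, not taken from the
patches, and only APIs that already appear in the diffs above are used.

    TEST(MaxPool, CeilModeShapeInference) {
        Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
        Graph g = make_ref<GraphObj>(cpuRuntime);
        Tensor i = g->addTensor({1, 64, 6, 6}, DataType::UInt32);
        // kh = kw = 3, dh = dw = 1, ph = pw = 0, sh = sw = 2, ceilMode = 1
        auto op = g->addOp<MaxPoolObj>(i, nullptr, 3, 3, 1, 1, 0, 0, 2, 2, 1);
        // (6 + 2 * 0 - 1 * (3 - 1) - 1) / 2 + 1 = 2.5, ceil -> 3
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 64, 3, 3}));
    }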