diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49d2c5a7..291adf92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -267,6 +267,7 @@ if(BUILD_TEST)
   if(BUILD_TEST_CORE)
     build_test(test/core/*.cc)
     build_test(test/operators/*.cc)
+    build_test(test/kernels/nativecpu/*.cc)
     if (USE_CUDA)
       build_test(test/kernels/cuda/*.cc)
       build_test(test/cuda/*.cc)
diff --git a/include/operators/transpose.h b/include/operators/transpose.h
index 61dc8e5a..c20d0a08 100644
--- a/include/operators/transpose.h
+++ b/include/operators/transpose.h
@@ -19,4 +19,4 @@ class TransposeObj : public OperatorObj {
     vector<int> getWorkloadVector() const override;
     vector<int> getOpAttrVector() const override;
 };
-}; // namespace infini
+} // namespace infini
diff --git a/src/kernels/cpu/concat.cc b/src/kernels/cpu/concat.cc
new file mode 100644
index 00000000..5dd73866
--- /dev/null
+++ b/src/kernels/cpu/concat.cc
@@ -0,0 +1,51 @@
+#include "operators/concat.h"
+#include "core/kernel.h"
+
+namespace infini {
+
+template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *context) const override {
+        auto op = as<ConcatObj>(_op);
+        auto inputs = op->getInputs(), outputs = op->getOutputs();
+        auto dim = op->getDim();
+        auto output = outputs[0];
+        std::vector<Shape> iDims;
+        for (auto input : inputs)
+            iDims.emplace_back(input->getDims());
+        const auto &outDim = output->getDims();
+        size_t blockOffsetInner = 1;
+        for (size_t i = outDim.size() - 1; i > (size_t)dim; --i)
+            blockOffsetInner *= outDim[i];
+        size_t blockOffset = outDim[dim] * blockOffsetInner;
+        for (size_t i = 0; i < inputs.size(); ++i) {
+            auto input = inputs[i];
+            auto dimOffset = 0;
+            auto iDim = iDims[i];
+            for (size_t j = 0; j < i; ++j)
+                dimOffset += iDims[j][dim];
+            size_t localBlockOffset = 1;
+            for (size_t i = iDim.size() - 1;
+                 i >= (size_t)dim && i != (size_t)-1; --i)
+                localBlockOffset *= iDim[i];
+            auto innerOffset = blockOffsetInner * dimOffset;
+            auto inSize = input->size();
+            auto inPtr = input->getRawDataPtr<T *>(),
+                 outPtr = output->getRawDataPtr<T *>();
+#pragma omp parallel for
+            for (size_t iOffset = 0; iOffset < inSize; ++iOffset) {
+                auto oOffset = iOffset % localBlockOffset + innerOffset +
+                               iOffset / localBlockOffset * blockOffset;
+                // output->setData(oOffset, input->getData(iOffset));
+                outPtr[oOffset] = inPtr[iOffset];
+            }
+        }
+    }
+};
+
+REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::UInt32,
+                NaiveConcat<uint32_t>, "ConcatNaive_CPU_uint32");
+REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::Float32,
+                NaiveConcat<float>, "ConcatNaive_CPU_float32");
+
+} // namespace infini
diff --git a/src/kernels/cpu/split.cc b/src/kernels/cpu/split.cc
new file mode 100644
index 00000000..3ef0cea3
--- /dev/null
+++ b/src/kernels/cpu/split.cc
@@ -0,0 +1,50 @@
+#include "operators/split.h"
+#include "core/kernel.h"
+
+namespace infini {
+
+template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *context) const override {
+        auto op = as<SplitObj>(_op);
+        auto inputs = op->getInputs(), outputs = op->getOutputs();
+        auto dim = op->getDim();
+        auto input = inputs[0];
+        const auto &inDim = input->getDims();
+        std::vector<Shape> outDims;
+        for (auto output : outputs)
+            outDims.emplace_back(output->getDims());
+        size_t blockOffsetInner = 1;
+        for (size_t i = inDim.size() - 1; i > (size_t)dim; --i)
+            blockOffsetInner *= inDim[i];
+        size_t blockOffset = inDim[dim] * blockOffsetInner;
+        for (size_t i = 0; i < outputs.size(); ++i) {
+            auto output = outputs[i];
+            auto dimOffset = 0;
+            auto outDim = outDims[i];
+            for (size_t j = 0; j < i; ++j)
+                dimOffset += outDims[j][dim];
+            size_t localBlockOffset = 1;
+            for (size_t i = outDim.size() - 1;
+                 i >= (size_t)dim && i != (size_t)-1; --i)
+                localBlockOffset *= outDim[i];
+            auto innerOffset = blockOffsetInner * dimOffset;
+            auto outSize = output->size();
+            auto inPtr = input->getRawDataPtr<T *>(),
+                 outPtr = output->getRawDataPtr<T *>();
+#pragma omp parallel for
+            for (size_t oOffset = 0; oOffset < outSize; ++oOffset) {
+                auto iOffset = oOffset % localBlockOffset + innerOffset +
+                               oOffset / localBlockOffset * blockOffset;
+                outPtr[oOffset] = inPtr[iOffset];
+            }
+        }
+    }
+};
+
+REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::UInt32,
+                NaiveSplit<uint32_t>, "SplitNaive_CPU_uint32");
+REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::Float32,
+                NaiveSplit<float>, "SplitNaive_CPU_float32");
+
+} // namespace infini
diff --git a/src/kernels/cpu/transpose.cc b/src/kernels/cpu/transpose.cc
new file mode 100644
index 00000000..997c427e
--- /dev/null
+++ b/src/kernels/cpu/transpose.cc
@@ -0,0 +1,45 @@
+#include "operators/transpose.h"
+#include "core/kernel.h"
+
+namespace infini {
+
+inline Shape idx2Pos(const Shape &shape, size_t idx) {
+    Shape pos = Shape(shape.size(), 0);
+    auto rest = idx, curDimId = shape.size() - 1;
+    while (rest > 0) {
+        pos[curDimId] = rest % shape[curDimId];
+        rest /= shape[curDimId];
+        curDimId--;
+    }
+    return pos;
+}
+
+template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *context) const override {
+        auto op = as<TransposeObj>(_op);
+        auto inputs = op->getInputs(), outputs = op->getOutputs();
+        const auto &inDim = inputs[0]->getDims();
+        const auto &perm = op->getPermute();
+
+        size_t inSize = inputs[0]->size();
+        auto inPtr = inputs[0]->getRawDataPtr<T *>(),
+             outPtr = outputs[0]->getRawDataPtr<T *>();
+        // #pragma omp parallel for
+        for (size_t inIdx = 0; inIdx < inSize; ++inIdx) {
+            auto posInput = idx2Pos(inDim, inIdx);
+            int outIdx = 0;
+            for (size_t j = 0, jEnd = perm.size(); j < jEnd; ++j) {
+                outIdx = outIdx * inDim[perm[j]] + posInput[perm[j]];
+            }
+            outPtr[outIdx] = inPtr[inIdx];
+        }
+    }
+};
+
+REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::UInt32,
+                NaiveTranspose<uint32_t>, "TransposeNaive_CPU_uint32");
+REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::Float32,
+                NaiveTranspose<float>, "TransposeNaive_CPU_float32");
+
+} // namespace infini
diff --git a/test/kernels/nativecpu/test_nativecpu_concat.cc b/test/kernels/nativecpu/test_nativecpu_concat.cc
new file mode 100644
index 00000000..fc87fb19
--- /dev/null
+++ b/test/kernels/nativecpu/test_nativecpu_concat.cc
@@ -0,0 +1,28 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/concat.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Concat, NativeCpu) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    auto t1 = g->addTensor({2, 2, 3, 1}, DataType::Float32);
+    auto t2 = g->addTensor({2, 2, 1, 1}, DataType::Float32);
+    auto t3 = g->addTensor({2, 2, 2, 1}, DataType::Float32);
+    auto op = g->addOp<ConcatObj>(TensorVec{t1, t2, t3}, nullptr, 2);
+    g->dataMalloc();
+    t1->setData(IncrementalGenerator());
+    t2->setData(OneGenerator());
+    t3->setData(OneGenerator());
+
+    runtime->run(g);
+    EXPECT_TRUE(op->getOutput()->equalData(
+        vector<float>{0, 1, 2, 1, 1, 1, 3, 4, 5, 1, 1, 1,
+                      6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1}));
+}
+
+} // namespace infini
diff --git a/test/kernels/nativecpu/test_nativecpu_split.cc b/test/kernels/nativecpu/test_nativecpu_split.cc
new file mode 100644
index 00000000..80779e0f
--- /dev/null
+++ b/test/kernels/nativecpu/test_nativecpu_split.cc
@@ -0,0 +1,32 @@
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/split.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Split, NativeCpu) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    auto input = g->addTensor({2, 10, 2, 1}, DataType::Float32);
+    auto op = g->addOp<SplitObj>(input, std::nullopt, 1, 3);
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    runtime->run(g);
+
+    EXPECT_EQ(op->getOutputs().size(), (size_t)3);
+    auto o0 = g->cloneTensor(op->getOutput(0));
+    auto o1 = g->cloneTensor(op->getOutput(1));
+    auto o2 = g->cloneTensor(op->getOutput(2));
+    EXPECT_TRUE(o0->equalData(
+        vector<float>{0, 1, 2, 3, 4, 5, 20, 21, 22, 23, 24, 25}));
+    EXPECT_TRUE(o1->equalData(
+        vector<float>{6, 7, 8, 9, 10, 11, 26, 27, 28, 29, 30, 31}));
+    EXPECT_TRUE(o2->equalData(vector<float>{12, 13, 14, 15, 16, 17, 18, 19, 32,
+                                            33, 34, 35, 36, 37, 38, 39}));
+}
+
+} // namespace infini
diff --git a/test/kernels/nativecpu/test_nativecpu_transpose.cc b/test/kernels/nativecpu/test_nativecpu_transpose.cc
new file mode 100644
index 00000000..db050146
--- /dev/null
+++ b/test/kernels/nativecpu/test_nativecpu_transpose.cc
@@ -0,0 +1,28 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Transpose, NativeCpu) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    Shape permute = {0, 2, 1, 3};
+    auto input = g->addTensor({1, 2, 3, 4}, DataType::Float32);
+    auto op = g->addOp<TransposeObj>(input, nullptr, permute);
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    runtime->run(g);
+
+    auto o = g->cloneTensor(op->getOutput(0));
+    EXPECT_TRUE(o->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
+                                           4, 5, 6, 7, 16, 17, 18, 19,
+                                           8, 9, 10, 11, 20, 21, 22, 23}));
+}
+
+} // namespace infini
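
Note (not part of the patch): the least obvious part of the new kernels is the offset arithmetic shared by NaiveConcat and NaiveSplit. The standalone sketch below reproduces the same mapping for a plain std::vector concatenation along one axis, so it can be compiled and checked in isolation. The helper names (flatSize, concatSmall) are illustrative only and do not exist in the InfiniTensor codebase.

// Minimal sketch of the NaiveConcat offset mapping, independent of InfiniTensor.
#include <cassert>
#include <cstddef>
#include <vector>

using Shape = std::vector<int>;

// Product of dims[d] for d in [from, dims.size()): the size of one "block"
// of elements that starts at dimension `from`.
static size_t flatSize(const Shape &dims, size_t from) {
    size_t s = 1;
    for (size_t d = from; d < dims.size(); ++d)
        s *= dims[d];
    return s;
}

// Concatenate `inputs` (which agree on every dimension except `dim`) along
// `dim`, using the same per-element formula as the kernel:
//   oOffset = iOffset % localBlockOffset + innerOffset
//           + iOffset / localBlockOffset * blockOffset
static std::vector<float>
concatSmall(const std::vector<std::vector<float>> &inputs,
            const std::vector<Shape> &iDims, int dim, const Shape &outDim) {
    std::vector<float> out(flatSize(outDim, 0));
    size_t blockOffsetInner = flatSize(outDim, dim + 1); // elements after `dim`
    size_t blockOffset = outDim[dim] * blockOffsetInner; // one output block
    int dimOffset = 0; // where input i starts along `dim` in the output
    for (size_t i = 0; i < inputs.size(); ++i) {
        size_t localBlockOffset = flatSize(iDims[i], dim); // one input block
        size_t innerOffset = blockOffsetInner * dimOffset;
        for (size_t iOffset = 0; iOffset < inputs[i].size(); ++iOffset) {
            size_t oOffset = iOffset % localBlockOffset + innerOffset +
                             iOffset / localBlockOffset * blockOffset;
            out[oOffset] = inputs[i][iOffset];
        }
        dimOffset += iDims[i][dim];
    }
    return out;
}

int main() {
    // Two 2x2 inputs concatenated along dim 1 give a 2x4 output:
    // [[0,1],[2,3]] ++ [[4,5],[6,7]] -> [[0,1,4,5],[2,3,6,7]].
    std::vector<std::vector<float>> ins = {{0, 1, 2, 3}, {4, 5, 6, 7}};
    std::vector<Shape> dims = {{2, 2}, {2, 2}};
    auto out = concatSmall(ins, dims, /*dim=*/1, /*outDim=*/{2, 4});
    assert((out == std::vector<float>{0, 1, 4, 5, 2, 3, 6, 7}));
    return 0;
}

NaiveSplit in the patch is the same mapping run in reverse: it iterates over each output tensor and reads from the shared input using the identical block arithmetic.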