add transpose, concat and split for native cpu (#158)

Haojie Wang 2023-10-12 10:14:28 +08:00 committed by GitHub
parent 36ae7b7fb6
commit 8e4d88fb9f
8 changed files with 236 additions and 1 deletion


@@ -267,6 +267,7 @@ if(BUILD_TEST)
if(BUILD_TEST_CORE)
build_test(test/core/*.cc)
build_test(test/operators/*.cc)
build_test(test/kernels/nativecpu/*.cc)
if (USE_CUDA)
build_test(test/kernels/cuda/*.cc)
build_test(test/cuda/*.cc)


@@ -19,4 +19,4 @@ class TransposeObj : public OperatorObj {
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
}; // namespace infini
} // namespace infini

src/kernels/cpu/concat.cc

@@ -0,0 +1,51 @@
#include "operators/concat.h"
#include "core/kernel.h"
namespace infini {
template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto op = as<ConcatObj>(_op);
auto inputs = op->getInputs(), outputs = op->getOutputs();
auto dim = op->getDim();
auto output = outputs[0];
std::vector<Shape> iDims;
for (auto input : inputs)
iDims.emplace_back(input->getDims());
const auto &outDim = output->getDims();
size_t blockOffsetInner = 1;
for (size_t i = outDim.size() - 1; i > (size_t)dim; --i)
blockOffsetInner *= outDim[i];
size_t blockOffset = outDim[dim] * blockOffsetInner;
for (size_t i = 0; i < inputs.size(); ++i) {
auto input = inputs[i];
auto dimOffset = 0;
auto iDim = iDims[i];
for (size_t j = 0; j < i; ++j)
dimOffset += iDims[j][dim];
size_t localBlockOffset = 1;
for (size_t i = iDim.size() - 1;
i >= (size_t)dim && i != (size_t)-1; --i)
localBlockOffset *= iDim[i];
auto innerOffset = blockOffsetInner * dimOffset;
auto inSize = input->size();
auto inPtr = input->getRawDataPtr<T *>(),
outPtr = output->getRawDataPtr<T *>();
#pragma omp parallel for
for (size_t iOffset = 0; iOffset < inSize; ++iOffset) {
auto oOffset = iOffset % localBlockOffset + innerOffset +
iOffset / localBlockOffset * blockOffset;
// output->setData(oOffset, input->getData(iOffset));
outPtr[oOffset] = inPtr[iOffset];
}
}
}
};
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::UInt32,
NaiveConcat<uint32_t>, "ConcatNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::Float32,
NaiveConcat<float>, "ConcatNaive_CPU_float32");
} // namespace infini
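
The offset arithmetic above treats everything at and below the concat axis as one contiguous block per input. For reference, a minimal standalone sketch of the same mapping, using plain std::vector instead of the project's Tensor API (the name concatSketch and its signature are illustrative, not part of this commit):

#include <cstddef>
#include <vector>

using Dims = std::vector<size_t>;

// Concatenate dense row-major arrays along `dim`, mirroring NaiveConcat:
// every input element at flat offset iOff lands at output offset
//   iOff % localBlockOffset + innerOffset + iOff / localBlockOffset * blockOffset
std::vector<float> concatSketch(const std::vector<std::vector<float>> &srcs,
                                const std::vector<Dims> &srcDims, size_t dim,
                                const Dims &outDim) {
    size_t blockOffsetInner = 1; // elements strictly below the concat axis
    for (size_t i = outDim.size() - 1; i > dim; --i)
        blockOffsetInner *= outDim[i];
    size_t blockOffset = outDim[dim] * blockOffsetInner; // one output outer block

    size_t outSize = 1;
    for (auto d : outDim)
        outSize *= d;
    std::vector<float> out(outSize);

    size_t dimOffset = 0; // extent already covered along the concat axis
    for (size_t i = 0; i < srcs.size(); ++i) {
        const auto &iDim = srcDims[i];
        size_t localBlockOffset = 1; // one outer block of this input
        for (int j = (int)iDim.size() - 1; j >= (int)dim; --j)
            localBlockOffset *= iDim[j];
        size_t innerOffset = blockOffsetInner * dimOffset;
        for (size_t iOff = 0; iOff < srcs[i].size(); ++iOff)
            out[iOff % localBlockOffset + innerOffset +
                iOff / localBlockOffset * blockOffset] = srcs[i][iOff];
        dimOffset += iDim[dim];
    }
    return out;
}

With the shapes from the Concat unit test later in this diff, concatSketch({t1, t2, t3}, {{2,2,3,1}, {2,2,1,1}, {2,2,2,1}}, 2, {2,2,6,1}) reproduces the expected output layout.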

src/kernels/cpu/split.cc

@@ -0,0 +1,50 @@
#include "operators/split.h"
#include "core/kernel.h"
namespace infini {
template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto op = as<SplitObj>(_op);
auto inputs = op->getInputs(), outputs = op->getOutputs();
auto dim = op->getDim();
auto input = inputs[0];
const auto &inDim = input->getDims();
std::vector<Shape> outDims;
for (auto output : outputs)
outDims.emplace_back(output->getDims());
size_t blockOffsetInner = 1;
for (size_t i = inDim.size() - 1; i > (size_t)dim; --i)
blockOffsetInner *= inDim[i];
size_t blockOffset = inDim[dim] * blockOffsetInner;
for (size_t i = 0; i < outputs.size(); ++i) {
auto output = outputs[i];
auto dimOffset = 0;
auto outDim = outDims[i];
for (size_t j = 0; j < i; ++j)
dimOffset += outDims[j][dim];
size_t localBlockOffset = 1;
for (size_t i = outDim.size() - 1;
i >= (size_t)dim && i != (size_t)-1; --i)
localBlockOffset *= outDim[i];
auto innerOffset = blockOffsetInner * dimOffset;
auto outSize = output->size();
auto inPtr = input->getRawDataPtr<T *>(),
outPtr = output->getRawDataPtr<T *>();
#pragma omp parallel for
for (size_t oOffset = 0; oOffset < outSize; ++oOffset) {
auto iOffset = oOffset % localBlockOffset + innerOffset +
oOffset / localBlockOffset * blockOffset;
outPtr[oOffset] = inPtr[iOffset];
}
}
}
};
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::UInt32,
NaiveSplit<uint32_t>, "SplitNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::Float32,
NaiveSplit<float>, "SplitNaive_CPU_float32");
} // namespace infini
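
NaiveSplit is the inverse mapping of NaiveConcat: each output element at flat offset oOffset reads the input element at oOffset % localBlockOffset + innerOffset + oOffset / localBlockOffset * blockOffset. Below is a worked example with the shapes from the Split unit test later in this diff (input {2, 10, 2, 1}, split along dim = 1 into pieces of sizes {3, 3, 4}); the small check program is hypothetical, not part of the commit:

#include <cassert>
#include <cstddef>

int main() {
    // Input {2, 10, 2, 1}, split axis dim = 1.
    size_t blockOffsetInner = 2 * 1;            // dims after the split axis
    size_t blockOffset = 10 * blockOffsetInner; // one outer block of the input
    // Third output {2, 4, 2, 1}: the first two outputs cover 3 + 3 = 6 slices.
    size_t localBlockOffset = 4 * 2 * 1;        // one outer block of this output
    size_t innerOffset = blockOffsetInner * 6;  // skip the first 6 slices
    // The 10th output element (oOffset = 9) reads input element 33, which is
    // why o2's expected data continues with 32, 33, ... in the second batch.
    size_t oOffset = 9;
    size_t iOffset = oOffset % localBlockOffset + innerOffset +
                     oOffset / localBlockOffset * blockOffset;
    assert(iOffset == 33);
    return 0;
}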


@@ -0,0 +1,45 @@
#include "operators/transpose.h"
#include "core/kernel.h"
namespace infini {
inline Shape idx2Pos(const Shape &shape, size_t idx) {
Shape pos = Shape(shape.size(), 0);
auto rest = idx, curDimId = shape.size() - 1;
while (rest > 0) {
pos[curDimId] = rest % shape[curDimId];
rest /= shape[curDimId];
curDimId--;
}
return pos;
}
template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
auto op = as<TransposeObj>(_op);
auto inputs = op->getInputs(), outputs = op->getOutputs();
const auto &inDim = inputs[0]->getDims();
const auto &perm = op->getPermute();
size_t inSize = inputs[0]->size();
auto inPtr = inputs[0]->getRawDataPtr<T *>(),
outPtr = outputs[0]->getRawDataPtr<T *>();
// #pragma omp parallel for
for (size_t inIdx = 0; inIdx < inSize; ++inIdx) {
auto posInput = idx2Pos(inDim, inIdx);
int outIdx = 0;
for (size_t j = 0, jEnd = perm.size(); j < jEnd; ++j) {
outIdx = outIdx * inDim[perm[j]] + posInput[perm[j]];
}
outPtr[outIdx] = inPtr[inIdx];
}
}
};
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::UInt32,
NaiveTranspose<uint32_t>, "TransposeNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::Float32,
NaiveTranspose<float>, "TransposeNaive_CPU_float32");
} // namespace infini
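
The transpose kernel decomposes each flat input index into a multi-dimensional position and re-linearizes it in permuted order: output dimension j has extent inDim[perm[j]] and coordinate posInput[perm[j]]. A minimal standalone sketch of that mapping, using plain std::vector instead of the project's Tensor API (transposeSketch is an illustrative name, not part of this commit):

#include <cstddef>
#include <vector>

using Dims = std::vector<size_t>;

std::vector<float> transposeSketch(const std::vector<float> &src,
                                   const Dims &inDim, const Dims &perm) {
    std::vector<float> out(src.size());
    for (size_t inIdx = 0; inIdx < src.size(); ++inIdx) {
        // Flat index -> row-major coordinate.
        Dims pos(inDim.size(), 0);
        size_t rest = inIdx;
        for (size_t d = inDim.size(); d-- > 0;) {
            pos[d] = rest % inDim[d];
            rest /= inDim[d];
        }
        // Re-linearize with output dim j spanning inDim[perm[j]].
        size_t outIdx = 0;
        for (size_t j = 0; j < perm.size(); ++j)
            outIdx = outIdx * inDim[perm[j]] + pos[perm[j]];
        out[outIdx] = src[inIdx];
    }
    return out;
}

With inDim = {1, 2, 3, 4} and perm = {0, 2, 1, 3} (the Transpose unit test later in this diff), input element 4 sits at coordinate (0, 0, 1, 0) and lands at output index 8, matching the expected data there.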


@@ -0,0 +1,28 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/concat.h"
#include "test.h"
namespace infini {
TEST(Concat, NativeCpu) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
auto t1 = g->addTensor({2, 2, 3, 1}, DataType::Float32);
auto t2 = g->addTensor({2, 2, 1, 1}, DataType::Float32);
auto t3 = g->addTensor({2, 2, 2, 1}, DataType::Float32);
auto op = g->addOp<ConcatObj>(TensorVec{t1, t2, t3}, nullptr, 2);
g->dataMalloc();
t1->setData(IncrementalGenerator());
t2->setData(OneGenerator());
t3->setData(OneGenerator());
runtime->run(g);
EXPECT_TRUE(op->getOutput()->equalData(
vector<float>{0, 1, 2, 1, 1, 1, 3, 4, 5, 1, 1, 1,
6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1}));
}
} // namespace infini


@@ -0,0 +1,32 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/split.h"
#include "test.h"
namespace infini {
TEST(Split, NativeCpu) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({2, 10, 2, 1}, DataType::Float32);
auto op = g->addOp<SplitObj>(input, std::nullopt, 1, 3);
g->dataMalloc();
input->setData(IncrementalGenerator());
runtime->run(g);
EXPECT_EQ(op->getOutputs().size(), (size_t)3);
auto o0 = g->cloneTensor(op->getOutput(0));
auto o1 = g->cloneTensor(op->getOutput(1));
auto o2 = g->cloneTensor(op->getOutput(2));
EXPECT_TRUE(
o0->equalData(vector<float>{0, 1, 2, 3, 4, 5, 20, 21, 22, 23, 24, 25}));
EXPECT_TRUE(o1->equalData(
vector<float>{6, 7, 8, 9, 10, 11, 26, 27, 28, 29, 30, 31}));
EXPECT_TRUE(o2->equalData(vector<float>{12, 13, 14, 15, 16, 17, 18, 19, 32,
33, 34, 35, 36, 37, 38, 39}));
}
} // namespace infini


@@ -0,0 +1,28 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/transpose.h"
#include "test.h"
namespace infini {
TEST(Transpose, NativeCpu) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Shape permute = {0, 2, 1, 3};
auto input = g->addTensor({1, 2, 3, 4}, DataType::Float32);
auto op = g->addOp<TransposeObj>(input, nullptr, permute);
g->dataMalloc();
input->setData(IncrementalGenerator());
runtime->run(g);
auto o = g->cloneTensor(op->getOutput(0));
EXPECT_TRUE(o->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
4, 5, 6, 7, 16, 17, 18, 19,
8, 9, 10, 11, 20, 21, 22, 23}));
}
} // namespace infini