forked from jiuyuan/InfiniTensor
add transpose, concat and split for native cpu (#158)
This commit is contained in:
parent
36ae7b7fb6
commit
8e4d88fb9f
|
@ -267,6 +267,7 @@ if(BUILD_TEST)
|
|||
if(BUILD_TEST_CORE)
|
||||
build_test(test/core/*.cc)
|
||||
build_test(test/operators/*.cc)
|
||||
build_test(test/kernels/nativecpu/*.cc)
|
||||
if (USE_CUDA)
|
||||
build_test(test/kernels/cuda/*.cc)
|
||||
build_test(test/cuda/*.cc)
|
||||
|
|
|
@ -19,4 +19,4 @@ class TransposeObj : public OperatorObj {
|
|||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
}; // namespace infini
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
#include "operators/concat.h"
|
||||
#include "core/kernel.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<ConcatObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
auto dim = op->getDim();
|
||||
auto output = outputs[0];
|
||||
std::vector<Shape> iDims;
|
||||
for (auto input : inputs)
|
||||
iDims.emplace_back(input->getDims());
|
||||
const auto &outDim = output->getDims();
|
||||
size_t blockOffsetInner = 1;
|
||||
for (size_t i = outDim.size() - 1; i > (size_t)dim; --i)
|
||||
blockOffsetInner *= outDim[i];
|
||||
size_t blockOffset = outDim[dim] * blockOffsetInner;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
auto input = inputs[i];
|
||||
auto dimOffset = 0;
|
||||
auto iDim = iDims[i];
|
||||
for (size_t j = 0; j < i; ++j)
|
||||
dimOffset += iDims[j][dim];
|
||||
size_t localBlockOffset = 1;
|
||||
for (size_t i = iDim.size() - 1;
|
||||
i >= (size_t)dim && i != (size_t)-1; --i)
|
||||
localBlockOffset *= iDim[i];
|
||||
auto innerOffset = blockOffsetInner * dimOffset;
|
||||
auto inSize = input->size();
|
||||
auto inPtr = input->getRawDataPtr<T *>(),
|
||||
outPtr = output->getRawDataPtr<T *>();
|
||||
#pragma omp parallel for
|
||||
for (size_t iOffset = 0; iOffset < inSize; ++iOffset) {
|
||||
auto oOffset = iOffset % localBlockOffset + innerOffset +
|
||||
iOffset / localBlockOffset * blockOffset;
|
||||
// output->setData(oOffset, input->getData(iOffset));
|
||||
outPtr[oOffset] = inPtr[iOffset];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Kernel registrations for Concat on Device::CPU: NaiveConcat<uint32_t> is
// registered for DataType::UInt32 and NaiveConcat<float> for
// DataType::Float32. The string literal is the name handed to
// REGISTER_KERNEL (the macro itself is defined outside this file).
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::UInt32,
|
||||
NaiveConcat<uint32_t>, "ConcatNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::Float32,
|
||||
NaiveConcat<float>, "ConcatNaive_CPU_float32");
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,50 @@
|
|||
#include "operators/split.h"
|
||||
#include "core/kernel.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<SplitObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
auto dim = op->getDim();
|
||||
auto input = inputs[0];
|
||||
const auto &inDim = input->getDims();
|
||||
std::vector<Shape> outDims;
|
||||
for (auto output : outputs)
|
||||
outDims.emplace_back(output->getDims());
|
||||
size_t blockOffsetInner = 1;
|
||||
for (size_t i = inDim.size() - 1; i > (size_t)dim; --i)
|
||||
blockOffsetInner *= inDim[i];
|
||||
size_t blockOffset = inDim[dim] * blockOffsetInner;
|
||||
for (size_t i = 0; i < outputs.size(); ++i) {
|
||||
auto output = outputs[i];
|
||||
auto dimOffset = 0;
|
||||
auto outDim = outDims[i];
|
||||
for (size_t j = 0; j < i; ++j)
|
||||
dimOffset += outDims[j][dim];
|
||||
size_t localBlockOffset = 1;
|
||||
for (size_t i = outDim.size() - 1;
|
||||
i >= (size_t)dim && i != (size_t)-1; --i)
|
||||
localBlockOffset *= outDim[i];
|
||||
auto innerOffset = blockOffsetInner * dimOffset;
|
||||
auto outSize = output->size();
|
||||
auto inPtr = input->getRawDataPtr<T *>(),
|
||||
outPtr = output->getRawDataPtr<T *>();
|
||||
#pragma omp parallel for
|
||||
for (size_t oOffset = 0; oOffset < outSize; ++oOffset) {
|
||||
auto iOffset = oOffset % localBlockOffset + innerOffset +
|
||||
oOffset / localBlockOffset * blockOffset;
|
||||
outPtr[oOffset] = inPtr[iOffset];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Kernel registrations for Split on Device::CPU: NaiveSplit<uint32_t> is
// registered for DataType::UInt32 and NaiveSplit<float> for
// DataType::Float32. The string literal is the name handed to
// REGISTER_KERNEL (the macro itself is defined outside this file).
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::UInt32,
|
||||
NaiveSplit<uint32_t>, "SplitNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::Float32,
|
||||
NaiveSplit<float>, "SplitNaive_CPU_float32");
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,45 @@
|
|||
#include "operators/transpose.h"
|
||||
#include "core/kernel.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
/**
 * Convert a flat element index into a per-dimension position.
 *
 * The last dimension varies fastest: the index is repeatedly divided by the
 * extent of each dimension, starting from the last one.
 *
 * @param shape extents of each dimension.
 * @param idx   flat index to decompose.
 * @return a position with one entry per dimension of `shape`; dimensions not
 *         reached before the remainder hits zero stay 0. If `idx` exceeds the
 *         number of elements described by `shape`, the excess is dropped
 *         instead of being written past the start of the position vector.
 */
inline Shape idx2Pos(const Shape &shape, size_t idx) {
    Shape pos(shape.size(), 0);
    size_t rest = idx;
    // Walk from the last dimension to the first. The loop is bounded by the
    // rank so an empty shape or an out-of-range index cannot underflow the
    // dimension counter.
    for (size_t d = shape.size(); d-- > 0 && rest > 0;) {
        pos[d] = rest % shape[d];
        rest /= shape[d];
    }
    return pos;
}
|
||||
|
||||
template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<TransposeObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
const auto &inDim = inputs[0]->getDims();
|
||||
const auto &perm = op->getPermute();
|
||||
|
||||
size_t inSize = inputs[0]->size();
|
||||
auto inPtr = inputs[0]->getRawDataPtr<T *>(),
|
||||
outPtr = outputs[0]->getRawDataPtr<T *>();
|
||||
// #pragma omp parallel for
|
||||
for (size_t inIdx = 0; inIdx < inSize; ++inIdx) {
|
||||
auto posInput = idx2Pos(inDim, inIdx);
|
||||
int outIdx = 0;
|
||||
for (size_t j = 0, jEnd = perm.size(); j < jEnd; ++j) {
|
||||
outIdx = outIdx * inDim[perm[j]] + posInput[perm[j]];
|
||||
}
|
||||
outPtr[outIdx] = inPtr[inIdx];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Kernel registrations for Transpose on Device::CPU: NaiveTranspose<uint32_t>
// is registered for DataType::UInt32 and NaiveTranspose<float> for
// DataType::Float32. The string literal is the name handed to
// REGISTER_KERNEL (the macro itself is defined outside this file).
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::UInt32,
|
||||
NaiveTranspose<uint32_t>, "TransposeNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::Float32,
|
||||
NaiveTranspose<float>, "TransposeNaive_CPU_float32");
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,28 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/concat.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Concatenates three Float32 tensors along axis 2 on the native CPU runtime
// and compares the output buffer against the hand-computed result.
TEST(Concat, NativeCpu) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph graph = make_ref<GraphObj>(cpu);

    // The inputs differ only in the extent of axis 2: 3, 1 and 2.
    auto counting = graph->addTensor({2, 2, 3, 1}, DataType::Float32);
    auto onesNarrow = graph->addTensor({2, 2, 1, 1}, DataType::Float32);
    auto onesWide = graph->addTensor({2, 2, 2, 1}, DataType::Float32);
    auto concat = graph->addOp<ConcatObj>(
        TensorVec{counting, onesNarrow, onesWide}, nullptr, 2);

    graph->dataMalloc();
    counting->setData(IncrementalGenerator());
    onesNarrow->setData(OneGenerator());
    onesWide->setData(OneGenerator());

    cpu->run(graph);

    // Each group of six is three incremental values followed by three ones.
    const vector<float> expected{0, 1, 2, 1, 1, 1, 3, 4,  5,  1, 1, 1,
                                 6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1};
    EXPECT_TRUE(concat->getOutput()->equalData(expected));
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,32 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/split.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Splits a {2, 10, 2, 1} Float32 tensor into three pieces along axis 1 on the
// native CPU runtime and compares each piece with the hand-computed result.
TEST(Split, NativeCpu) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph graph = make_ref<GraphObj>(cpu);

    auto source = graph->addTensor({2, 10, 2, 1}, DataType::Float32);
    auto split = graph->addOp<SplitObj>(source, std::nullopt, 1, 3);
    graph->dataMalloc();
    source->setData(IncrementalGenerator());

    cpu->run(graph);

    EXPECT_EQ(split->getOutputs().size(), static_cast<size_t>(3));
    auto first = graph->cloneTensor(split->getOutput(0));
    auto second = graph->cloneTensor(split->getOutput(1));
    auto third = graph->cloneTensor(split->getOutput(2));

    // The expected data gives the first two pieces 3 rows of axis 1 and the
    // last piece the remaining 4.
    const vector<float> expectedFirst{0,  1,  2,  3,  4,  5,
                                      20, 21, 22, 23, 24, 25};
    const vector<float> expectedSecond{6,  7,  8,  9,  10, 11,
                                       26, 27, 28, 29, 30, 31};
    const vector<float> expectedThird{12, 13, 14, 15, 16, 17, 18, 19,
                                      32, 33, 34, 35, 36, 37, 38, 39};
    EXPECT_TRUE(first->equalData(expectedFirst));
    EXPECT_TRUE(second->equalData(expectedSecond));
    EXPECT_TRUE(third->equalData(expectedThird));
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,28 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/transpose.h"
|
||||
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
// Transposes a {1, 2, 3, 4} Float32 tensor with permutation {0, 2, 1, 3} on
// the native CPU runtime and compares the output with the hand-computed
// result.
TEST(Transpose, NativeCpu) {
    Runtime cpu = NativeCpuRuntimeObj::getInstance();
    Graph graph = make_ref<GraphObj>(cpu);

    // Swap the two middle axes; the first and last axes stay in place.
    Shape axesOrder = {0, 2, 1, 3};
    auto source = graph->addTensor({1, 2, 3, 4}, DataType::Float32);
    auto transpose = graph->addOp<TransposeObj>(source, nullptr, axesOrder);
    graph->dataMalloc();
    source->setData(IncrementalGenerator());

    cpu->run(graph);

    auto result = graph->cloneTensor(transpose->getOutput(0));
    // Rows of four stay contiguous because the last axis is not permuted.
    const vector<float> expected{0, 1, 2,  3,  12, 13, 14, 15,
                                 4, 5, 6,  7,  16, 17, 18, 19,
                                 8, 9, 10, 11, 20, 21, 22, 23};
    EXPECT_TRUE(result->equalData(expected));
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue