forked from jiuyuan/InfiniTensor
Fix XPU code so that it can run.
This commit is contained in:
parent a69390e310
commit d18d40a2e9
@@ -12,14 +12,14 @@ class CastXdnn : public XPUKernelWithoutConfig {
         void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();
-        CastObj::CastType type = op->getType();
+        CastType type = op->getType();

         int ret = 0;
         switch (type) {
-        case CastObj::Float2Int32:
+        case CastType::Float2Int32:
             ret = baidu::xpu::api::cast<float,int>(context->XPUHandle(), (float*)aData, (int*)cData, len);
             break;
-        case CastObj::Int322Int8:
+        case CastType::Int322Int8:
             ret = baidu::xpu::api::cast<int,float>(context->XPUHandle(), (int*)aData, (float*)cData, len);
             break;
         default:
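The switch above dispatches on the op's `CastType` and forwards raw buffers to the templated xdnn cast. The sketch below is a self-contained host-side analogue of that dispatch, not the kernel itself: `xdnnCastStub` is a hypothetical stand-in for `baidu::xpu::api::cast` (the real call takes the runtime's XPU handle and device pointers), and only the two cases this kernel handles are shown.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for baidu::xpu::api::cast<TIn, TOut>; the real
// xdnn call runs on the device through a context handle.
template <typename TIn, typename TOut>
int xdnnCastStub(const TIn *src, TOut *dst, size_t len) {
    for (size_t i = 0; i < len; ++i)
        dst[i] = static_cast<TOut>(src[i]);
    return 0; // xdnn convention: 0 on success
}

enum class CastType { Float2Int32, Int322Int8 };

// Mirrors the switch in CastXdnn::compute above (host-side sketch only).
int castDispatch(CastType type, void *aData, void *cData, size_t len) {
    int ret = 0;
    switch (type) {
    case CastType::Float2Int32:
        ret = xdnnCastStub<float, int>((float *)aData, (int *)cData, len);
        break;
    case CastType::Int322Int8:
        // Note: the kernel in this diff casts int -> float for this case.
        ret = xdnnCastStub<int, float>((int *)aData, (float *)cData, len);
        break;
    default:
        ret = -1; // unsupported cast
    }
    return ret;
}

int main() {
    std::vector<float> a{0.f, 1.5f, 2.9f};
    std::vector<int> c(a.size());
    assert(castDispatch(CastType::Float2Int32, a.data(), c.data(), a.size()) == 0);
    assert(c[2] == 2); // static_cast truncates toward zero
    return 0;
}
```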
@@ -174,30 +174,6 @@ class EqualXdnn : public XPUKernelWithoutConfig {
     }
 };

-class NotEqualXdnn : public XPUKernelWithoutConfig {
-    void compute(const Operator &_op,
-                 const RuntimeObj *_context) const override {
-        auto op = as<ElementWiseObj>(_op);
-        auto context = dynamic_cast<const XPURuntimeObj *>(_context);
-
-        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
-        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
-        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
-        size_t len = op->getOutput()->size();
-        XPUPtr wsData = context->getWorkspace(len);
-
-        auto aDim = op->getInputs(0)->getDims();
-        auto bDim = op->getInputs(1)->getDims();
-        if (aDim.size() != 4 || bDim.size() != 4)
-            IT_TODO_HALT();
-        auto ret = baidu::xpu::api::broadcast_not_equal<float>(context->XPUHandle(), (float*)aData, (float*)bData, (bool*)wsData, aDim, bDim);
-        ret = baidu::xpu::api::cast<bool, float>(context->XPUHandle(), (bool*)wsData, (float*)cData, len);
-        assert(ret == 0);
-        return;
-    }
-};
-
 class GreaterEqualXdnn : public XPUKernelWithoutConfig {
     void compute(const Operator &_op,
                  const RuntimeObj *_context) const override {
@@ -304,20 +280,18 @@ REGISTER_KERNEL(Device::XPU, OpType::Div, DataType::Float32, DivXdnn,
                 "Div_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Pow, DataType::Float32, PowXdnn,
                 "Pow_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::Maximum, DataType::Float32, MaxXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::Max, DataType::Float32, MaxXdnn,
                 "Max_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::Minimum, DataType::Float32, MinXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::Min, DataType::Float32, MinXdnn,
                 "Min_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Equal, DataType::Float32, EqualXdnn,
                 "Equal_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::NotEqual, DataType::Float32, NotEqualXdnn,
                 "NotEqual_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::GreaterEqual, DataType::Float32, GreaterEqualXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::GreaterOrEqual, DataType::Float32, GreaterEqualXdnn,
                 "GreaterEqual_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::GreaterThan, DataType::Float32, GreaterThanXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::Greater, DataType::Float32, GreaterThanXdnn,
                 "GreaterThan_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::LessEqual, DataType::Float32, LessEqualXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::LessOrEqual, DataType::Float32, LessEqualXdnn,
                 "LessEqual_xdnn_XPU_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::LessThan, DataType::Float32, LessThanXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::Less, DataType::Float32, LessThanXdnn,
                 "LessThan_xdnn_XPU_Float32");
 }; // namespace infini
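REGISTER_KERNEL files each kernel under a (device, op type, data type) triple, and the runtime fetches it with the same triple (see the runWithoutSync hunk below), so the OpType enumerators used here must be exactly the ones the graph produces; the spellings above (Max, Min, Greater, GreaterOrEqual, Less, LessOrEqual) follow the ONNX operator names. Below is a minimal, hypothetical model of that registration/lookup mechanism, with toy types standing in for the project's real ones.

```cpp
#include <cassert>
#include <map>
#include <tuple>

// Hypothetical miniature of the registry behind REGISTER_KERNEL: kernels
// are filed under a (device, op type, data type) key at static-init time
// and fetched with the same key at run time. All names are illustrative.
enum class Device { CPU, XPU };
enum class OpType { Max, Min, Greater, Less };
enum class DataType { Float32 };

using KernelFn = int (*)();
using KernelAttrs = std::tuple<Device, OpType, DataType>;

std::map<KernelAttrs, KernelFn> &registry() {
    static std::map<KernelAttrs, KernelFn> r;
    return r;
}

struct Registrar {
    Registrar(Device d, OpType o, DataType t, KernelFn f) {
        registry()[{d, o, t}] = f;
    }
};

int maxXdnnStub() { return 0; } // placeholder for MaxXdnn::compute

// Analogue of REGISTER_KERNEL(Device::XPU, OpType::Max, DataType::Float32,
// MaxXdnn, "Max_xdnn_XPU_Float32");
static Registrar regMax{Device::XPU, OpType::Max, DataType::Float32,
                        maxXdnnStub};

int main() {
    // Analogue of kernelRegistry.getKernel(kernelAttrs): a kernel filed
    // under a stale enumerator spelling would simply never be found.
    auto it = registry().find({Device::XPU, OpType::Max, DataType::Float32});
    assert(it != registry().end());
    return it->second();
}
```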
@@ -30,6 +30,6 @@ class MatmulXdnn : public XPUKernelWithoutConfig {
     }
 };

-REGISTER_KERNEL(Device::XPU, OpType::Matmul, DataType::Float32, MatmulXdnn,
+REGISTER_KERNEL(Device::XPU, OpType::MatMul, DataType::Float32, MatmulXdnn,
                 "Matmul_xdnn_XPU_Float32");
 }; // namespace infini
@@ -57,6 +57,6 @@ class MaxPooling : public XPUKernelWithoutConfig {

 REGISTER_KERNEL(Device::XPU, OpType::MaxPool, DataType::Float32, MaxPooling,
                 "MaxPool_xdnn_Float32");
-REGISTER_KERNEL(Device::XPU, OpType::AvgPool, DataType::Float32, AvgPooling,
+REGISTER_KERNEL(Device::XPU, OpType::AveragePool, DataType::Float32, AvgPooling,
                 "AvgPool_xdnn_Float32");
 }; // namespace infini
@@ -201,9 +201,9 @@ class CopyXdnn : public XPUKernelWithoutConfig {
         void *const cData = (op->getOutput()->getRawDataPtr<void *>());
         auto len = op->getInputs(0)->size();

         auto ret = baidu::xpu::api::copy<float>(context->XPUHandle(), (float*)aData, (float*)cData, len);
         assert(ret == 0);
         return;
     }
 };
@@ -248,8 +248,6 @@ REGISTER_KERNEL(Device::XPU, OpType::Floor, DataType::Float32, FloorXdnn,
                 "Floor_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Neg, DataType::Float32, NegXdnn,
                 "Neg_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Copy, DataType::Float32, CopyXdnn,
                 "Copy_xdnn_XPU_Float32");
 REGISTER_KERNEL(Device::XPU, OpType::Reciprocal, DataType::Float32, ReciprocalXdnn,
                 "Reciprocal_xdnn_XPU_Float32");
@@ -14,7 +14,7 @@ void XPURuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
         auto kernelAttrs =
-            KernelAttrs{device, op->getOpType(), DataType::Float32};
+            KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
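The lookup key is now built from the operator's own data type rather than a hardcoded DataType::Float32, and `getOpType().underlying()` implies OpType is a small wrapper type whose raw value serves as a key component. A hypothetical sketch of such a wrapper follows; the names and layout are assumptions for illustration, not the project's real definition.

```cpp
#include <cstdint>

// Hypothetical OpType wrapper consistent with op->getOpType().underlying():
// a scoped id whose raw value can be extracted for use in a lookup key
// such as KernelAttrs. Enumerator values are illustrative.
class OpType {
  public:
    using underlying_t = std::uint16_t;
    enum class Id : underlying_t { Unknown = 0, MatMul, Max, Min };

    constexpr explicit OpType(Id id) : id_(id) {}
    // Raw value suitable for hashing or tuple keys.
    constexpr underlying_t underlying() const {
        return static_cast<underlying_t>(id_);
    }

  private:
    Id id_;
};

int main() {
    constexpr OpType t{OpType::Id::MatMul};
    static_assert(t.underlying() == 1, "raw id usable as a map key");
    return 0;
}
```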
@@ -19,12 +19,8 @@ void testAdd(
     // Build input data on CPU
     Tensor inputCpu1 =
         make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
-    inputCpu1->dataMalloc();
-    inputCpu1->setData(generator);
     Tensor inputCpu2 =
         make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
-    inputCpu2->dataMalloc();
-    inputCpu2->setData(generator);

     // GPU
     Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
@@ -36,12 +32,16 @@ void testAdd(
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // CPU
-    // Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
-    // auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
-    // cpuGraph->dataMalloc();
-    // cpuRuntime->run(cpuGraph);
-    // auto outputCpu = cpuOp->getOutput();
-    // Check
+    // Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    // auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
+    // cpuGraph->addTensor(inputCpu1);
+    // cpuGraph->addTensor(inputCpu2);
+    // cpuGraph->dataMalloc();
+    // inputCpu1->setData(generator);
+    // inputCpu2->setData(generator);
+    // cpuRuntime->run(cpuGraph);
+    // auto outputCpu = cpuOp->getOutput();
+    // // Check
+    // outputCpu->printData();
     outputGpu2Cpu->printData();
     // EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu));
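The test hunks here and below all converge on the same order: clone the tensors into the graph, call `dataMalloc()` on the graph, and only then call `setData(generator)` on each input. A plausible reading is that graph-level allocation replaces tensor storage, so data written before `dataMalloc()` would be lost. The toy below models that ordering hazard under exactly that assumption, with illustrative types that are not InfiniTensor's.

```cpp
#include <cassert>
#include <vector>

// Toy model of the ordering issue these test changes work around: if the
// graph (re)allocates tensor storage, anything written before dataMalloc()
// is discarded, so setData() must come after allocation.
struct ToyTensor {
    std::vector<float> buf;
    void setData(float v) { buf.assign(buf.size(), v); }
};

struct ToyGraph {
    std::vector<ToyTensor *> tensors;
    void dataMalloc() {
        for (auto *t : tensors)
            t->buf.assign(4, 0.0f); // fresh storage, old contents gone
    }
};

int main() {
    ToyTensor in;
    ToyGraph g;
    g.tensors.push_back(&in);

    in.buf.assign(4, 7.0f);  // "setData" before dataMalloc
    g.dataMalloc();          // wipes it
    assert(in.buf[0] == 0.0f);

    g.dataMalloc();
    in.setData(7.0f);        // correct order: allocate, then fill
    assert(in.buf[0] == 7.0f);
    return 0;
}
```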
@@ -54,7 +54,6 @@ TEST(xpu_add, run) {
     testAdd<MulObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<DivObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<EqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<NotEqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<GreaterEqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<GreaterThanObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
     testAdd<LessEqualObj>(IncrementalGenerator(), Shape{1, 1, 1, 30});
@@ -32,6 +32,8 @@ void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
     auto gpuOp =
         xpuGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
     xpuGraph->dataMalloc();
+    inputGpu1->setData(generator);
+    inputGpu2->setData(generator);
     xpuRuntime->run(xpuGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
@@ -19,13 +19,8 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
     // Build input data on CPU
     Tensor inputCpu1 =
         make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
-    inputCpu1->dataMalloc();
-    inputCpu1->setData(generatorA);
     Tensor inputCpu2 =
         make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
-    inputCpu2->dataMalloc();
-    inputCpu2->setData(generatorB);

     // MLU
     Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
     auto inputMlu1 = xpuGraph->cloneTensor(inputCpu1);
@@ -38,9 +33,13 @@ void testConv(const std::function<void(void *, size_t, DataType)> &generatorA,
     auto outputXpu2Cpu = outputXpu->clone(cpuRuntime);
     // CPU
     Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
+    cpuGraph->addTensor(inputCpu1);
+    cpuGraph->addTensor(inputCpu2);
     auto cpuOp =
         cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr, 1, 1, 1, 1, 1, 1);
     cpuGraph->dataMalloc();
+    inputCpu1->setData(generatorA);
+    inputCpu2->setData(generatorB);
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     outputCpu->print();
@@ -20,12 +20,8 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
     // Build input data on CPU
     Tensor inputCpu1 =
         make_ref<TensorObj>(shapeA, DataType::Float32, cpuRuntime);
-    inputCpu1->dataMalloc();
-    inputCpu1->setData(generatorA);
     Tensor inputCpu2 =
         make_ref<TensorObj>(shapeB, DataType::Float32, cpuRuntime);
-    inputCpu2->dataMalloc();
-    inputCpu2->setData(generatorB);

     // MLU
     Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
@@ -33,13 +29,19 @@ void testMatmul(const std::function<void(void *, size_t, DataType)> &generatorA,
     auto inputMlu2 = xpuGraph->cloneTensor(inputCpu2);
     auto mluOp = xpuGraph->addOp<T>(inputMlu1, inputMlu2, nullptr);
     xpuGraph->dataMalloc();
+    inputMlu1->setData(generatorA);
+    inputMlu2->setData(generatorB);
     xpuRuntime->run(xpuGraph);
     auto outputMlu = mluOp->getOutput();
     auto outputMlu2Cpu = outputMlu->clone(cpuRuntime);
     // CPU
     Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
     auto cpuOp = cpuGraph->addOp<T>(inputCpu1, inputCpu2, nullptr);
+    cpuGraph->addTensor(inputCpu1);
+    cpuGraph->addTensor(inputCpu2);
     cpuGraph->dataMalloc();
+    inputCpu1->setData(generatorA);
+    inputCpu2->setData(generatorB);
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     outputCpu->print();
@@ -17,21 +17,22 @@ void testUnary(const std::function<void(void *, size_t, DataType)> &generator,

     // Build input data on CPU
     Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
-    inputCpu->dataMalloc();
-    inputCpu->setData(generator);

     // GPU
     Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
     auto inputGpu = xpuGraph->cloneTensor(inputCpu);
     auto gpuOp = xpuGraph->addOp<T>(inputGpu, nullptr);
     xpuGraph->dataMalloc();
+    inputGpu->setData(generator);
     xpuRuntime->run(xpuGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // CPU
     Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
     auto cpuOp = cpuGraph->addOp<T>(inputCpu, nullptr);
+    cpuGraph->addTensor(inputCpu);
     cpuGraph->dataMalloc();
+    inputCpu->setData(generator);
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
@@ -46,8 +47,6 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,

     // Build input data on CPU
     Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
-    inputCpu->dataMalloc();
-    inputCpu->setData(generator);
     float min = 1.0;
     float max = 5.0;
@@ -56,13 +55,16 @@ void testClip(const std::function<void(void *, size_t, DataType)> &generator,
     auto inputGpu = xpuGraph->cloneTensor(inputCpu);
     auto gpuOp = xpuGraph->addOp<ClipObj>(inputGpu, nullptr, min, max);
     xpuGraph->dataMalloc();
+    inputGpu->setData(generator);
     xpuRuntime->run(xpuGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // CPU
     Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
     auto cpuOp = cpuGraph->addOp<ClipObj>(inputCpu, nullptr, min, max);
+    cpuGraph->addTensor(inputCpu);
     cpuGraph->dataMalloc();
+    inputCpu->setData(generator);
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
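For reference, Clip clamps every element into [min, max], so with the bounds used in this test (1.0 and 5.0) an incremental input 0, 1, 2, ... comes out as 1, 1, 2, 3, 4, 5, 5, ... A host-side check of that semantic using std::clamp, independent of the XPU kernel:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    // Mirrors the test's bounds: min = 1.0f, max = 5.0f.
    const float lo = 1.0f, hi = 5.0f;
    std::vector<float> v{0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    for (auto &x : v)
        x = std::clamp(x, lo, hi); // clamp into [lo, hi]
    assert(v.front() == 1.0f && v.back() == 5.0f);
    return 0;
}
```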
@@ -77,21 +79,22 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,

     // Build input data on CPU
     Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
-    inputCpu->dataMalloc();
-    inputCpu->setData(generator);

     // GPU
     Graph xpuGraph = make_ref<GraphObj>(xpuRuntime);
     auto inputGpu = xpuGraph->cloneTensor(inputCpu);
-    auto gpuOp = xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastObj::Float2Int32);
+    auto gpuOp = xpuGraph->addOp<CastObj>(inputGpu, nullptr, CastType::Float2Int32);
     xpuGraph->dataMalloc();
+    inputGpu->setData(generator);
     xpuRuntime->run(xpuGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
     // CPU
     Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
-    auto cpuOp = cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastObj::Float2Int32);
+    auto cpuOp = cpuGraph->addOp<CastObj>(inputCpu, nullptr, CastType::Float2Int32);
+    cpuGraph->addTensor(inputCpu);
     cpuGraph->dataMalloc();
+    inputCpu->setData(generator);
     cpuRuntime->run(cpuGraph);
     auto outputCpu = cpuOp->getOutput();
     // Check
@@ -102,17 +105,6 @@ TEST(xdnn_Unary, run) {
     testUnary<ReluObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     testUnary<SigmoidObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
     testUnary<TanhObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<SquareObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<SqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<RsqrtObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<ExpObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<CeilObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<FloorObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<NegObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testClip(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<CopyObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testUnary<ReciprocalObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
-    //testCast(IncrementalGenerator(), Shape{1, 2, 2, 3});
 }

 } // namespace infini