forked from jiuyuan/InfiniTensor
Add bangSoftmax; compare the cnnl and handwritten BANG C implementations
This commit is contained in:
parent
186a6f37f2
commit
1ed4b36db2
@@ -1,11 +1,11 @@
 #pragma once
 #include "bang/bang_runtime.h"
-#include "bang_highSoftmax.h"
+#include "bang_bangSoftmax.h"
 #include "operators/softmax.h"
 namespace infini {

 void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
-    auto op = as<SoftmaxObj>(_op);
+    auto op = as<BangSoftmaxObj>(_op);
     void *const mlu_src = (op->getInputs(0)->getRawDataPtr<void *>());
     void *const mlu_destination = (op->getOutput()->getRawDataPtr<void *>());
@@ -31,7 +31,7 @@ void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
             othersize *= shape[s];
         }
     }
-    if (op->getOpType() == OpType::Softmax)
+    if (op->getOpType() == OpType::BangSoftmax)
        softmaxKernel(context->cnnlHandle(), (float *)mlu_destination,
                      (float *)mlu_src, nDim, axis, othersize, frontsize,
                      dimsize, stride);
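For readers tracking the size arguments threaded into softmaxKernel above: a minimal sketch of how othersize, frontsize, dimsize, and stride are conventionally derived from the tensor shape and the softmax axis. The function name and exact formulas below are assumptions inferred from the parameter names and the `othersize *= shape[s]` loop visible in this hunk; they are not code from this commit.

```cpp
#include <vector>

// Hypothetical illustration of the kernel's size parameters for a
// softmax along `axis` of a tensor with dimensions `shape`.
void softmaxSizes(const std::vector<int> &shape, int axis) {
    int nDim = static_cast<int>(shape.size());
    int dimsize = shape[axis]; // extent of the reduced (softmax) axis
    int stride = 1;            // element stride between steps along `axis`
    for (int i = axis + 1; i < nDim; ++i)
        stride *= shape[i];
    int frontsize = 1; // product of the dimensions in front of `axis`
    for (int i = 0; i < axis; ++i)
        frontsize *= shape[i];
    // Number of independent rows the kernel must normalize.
    int othersize = frontsize * stride;
    (void)dimsize;
    (void)othersize;
}
```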
@@ -59,6 +59,7 @@ class GraphHandlerObj {
     Tensor tanh(Tensor x, Tensor y);
     Tensor erf(Tensor x, Tensor y);
     Tensor softmax(Tensor x, Tensor y, int axis);
+    Tensor bangSoftmax(Tensor x, Tensor y, int axis);
     Tensor abs(Tensor x, Tensor y);
     Tensor sqrt(Tensor x, Tensor y);
     Tensor neg(Tensor x, Tensor y);
@@ -180,6 +180,7 @@ struct OpType {
     Size,
     Slice,
     Softmax,
+    BangSoftmax,
     SoftmaxCrossEntropyLoss,
     Softplus,
     Softsign,
@@ -24,4 +24,26 @@ class SoftmaxObj : public OperatorObj {
     vector<int> getWorkloadVector() const override;
     vector<int> getOpAttrVector() const override;
 };
+class BangSoftmaxObj : public OperatorObj {
+    int axis;
+
+  public:
+    BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int axis);
+
+    OP_CLONE(BangSoftmaxObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
+        return {{inputs[0]->getDims()}};
+    };
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+    int getAxis() const { return axis; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
 } // namespace infini
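The new operator mirrors SoftmaxObj's interface: one input, one output, and an axis attribute, with the output shape inferred to equal the input shape. A minimal usage sketch, assuming the graph API exercised by the tests later in this commit; the include path "core/graph.h", the addTensor signature, and the shape/axis values are illustrative assumptions, not part of this diff.

```cpp
#include "bang/bang_runtime.h"
#include "core/graph.h"          // assumed location of GraphObj
#include "operators/softmax.h"   // declares SoftmaxObj and BangSoftmaxObj

using namespace infini;

int main() {
    // Build a one-op graph on the BANG runtime, as the tests below do.
    auto bangRuntime = make_ref<BangRuntimeObj>();
    Graph g = make_ref<GraphObj>(bangRuntime);
    Tensor x = g->addTensor(Shape{2, 4}, DataType::Float32);
    // Same (input, output, axis) argument list as SoftmaxObj; passing
    // nullptr lets the graph allocate the output via inferShape.
    auto op = g->addOp<BangSoftmaxObj>(x, nullptr, /*axis=*/1);
    g->dataMalloc();
    bangRuntime->run(g);
    return 0;
}
```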
@@ -227,6 +227,15 @@ Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
             ->getOutput();
     }
 }
+Tensor GraphHandlerObj::bangSoftmax(Tensor input, Tensor output, int axis) {
+    if (output) {
+        g->addOpWithOutputs<BangSoftmaxObj>(std::move(input), output, axis);
+        return output;
+    } else {
+        return g->addOp<BangSoftmaxObj>(std::move(input), output, axis)
+            ->getOutput();
+    }
+}

 Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
     if (output) {
@@ -522,6 +522,7 @@ void init_graph_builder(py::module &m) {
         .def("hardSigmoid", &Handler::hardSigmoid, policy::move)
         .def("hardSwish", &Handler::hardSwish, policy::move)
         .def("softmax", &Handler::softmax, policy::move)
+        .def("bangSoftmax", &Handler::bangSoftmax, policy::move)
         .def("abs", &Handler::abs, policy::move)
        .def("sqrt", &Handler::sqrt, policy::move)
         .def("neg", &Handler::neg, policy::move)
@@ -246,7 +246,8 @@ REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
                 "Sigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
+
 REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
                 "Softmax_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
                 "HardSigmoid_cnnl_BANG");
 REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,
@@ -10,5 +10,5 @@ class SoftmaxBang : public BangKernelWithoutConfig {
     }
 };

-REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxBang, "Softmax_BANG");
+REGISTER_KERNEL(Device::BANG, OpType::BangSoftmax, SoftmaxBang, "Softmax_BANG");
 }; // namespace infini
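The two hunks above show why the commit introduces a new op type rather than a second registration for OpType::Softmax: the CNNL wrapper (SoftmaxCnnl) and the handwritten BANG kernel (SoftmaxBang) would otherwise both bind to the same (Device::BANG, OpType::Softmax) key, and the kernel registry presumably admits only one kernel per key. Routing the handwritten kernel through its own OpType::BangSoftmax lets both implementations live in one build, which the comparison test at the end of this commit depends on.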
@@ -1,5 +1,5 @@
-#include "bang_highSoftmax.h"
-#include "highSoftmax.h"
+#include "bang_bangSoftmax.h"
+#include "bangSoftmax.h"

 namespace infini{
 void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src, int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){
@@ -31,4 +31,33 @@ vector<int> SoftmaxObj::getWorkloadVector() const {
 vector<int> SoftmaxObj::getOpAttrVector() const {
     return {type.underlying(), axis};
 }
+BangSoftmaxObj::BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output,
+                               int _axis)
+    : OperatorObj(OpType::BangSoftmax, {input}, {output}) {
+    int rank = input->getRank();
+    axis = get_real_axis(_axis, rank);
+    IT_ASSERT(checkValid(graph));
+}
+
+std::string BangSoftmaxObj::toString() const {
+    std::ostringstream os;
+    os << type.toString() << "[" << getGuid() << "]";
+    os << "(";
+    os << vecToString(inputs[0]->getDims()) << ",";
+    os << "input=" << inputs[0]->getGuid() << ",";
+    os << "output=" << outputs[0]->getGuid() << ",";
+    os << "axis=" << axis << ")";
+    return os.str();
+}
+
+vector<int> BangSoftmaxObj::getWorkloadVector() const {
+    vector<int> ret{type.underlying(), axis};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> BangSoftmaxObj::getOpAttrVector() const {
+    return {type.underlying(), axis};
+}
 } // namespace infini
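The constructor normalizes the axis with get_real_axis before validating the operator. A minimal sketch of the assumed semantics (ONNX-style negative axes count from the back); this stand-in is an illustration, not the project's actual helper:

```cpp
// Hypothetical stand-in for get_real_axis: fold a possibly negative
// axis into the range [0, rank).
int getRealAxisSketch(int axis, int rank) {
    return axis < 0 ? axis + rank : axis;
}
// getRealAxisSketch(-1, 4) == 3; getRealAxisSketch(1, 4) == 1
```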
@@ -5,134 +5,121 @@
 #include "operators/softmax.h"
 #include "test.h"
+#include <cmath>
+#include <sys/time.h>
 namespace infini {
 double eps = 3e-3;
-TEST(cuDNN_Softmax, run_axis1) {
-    // Runtime
+void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
+                      int axis, const vector<float> &expectData) {
     Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
     auto bangRuntime = make_ref<BangRuntimeObj>();

     // Build input data on CPU
     Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);

     // GPU
     Graph bangGraph = make_ref<GraphObj>(bangRuntime);
     auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
     bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
+    inputGpu->copyin(inputData);
     bangRuntime->run(bangGraph);
     auto outputGpu = gpuOp->getOutput();
     auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->copyin(inputData);
+    bangRuntime->run(bangGraph);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
     // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
+    EXPECT_TRUE(outputGpu2Cpu->equalData(expectData, eps));     // cnnlSoftmax
+    EXPECT_TRUE(bangOutputGpu2Cpu->equalData(expectData, eps)); // bangSoftmax
 }
+double get_walltime() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    return (double)(tp.tv_sec + tp.tv_usec * 1e-6);
+}
+float err(float *x, float *y, const Shape &inputShape, int nDim) {
+    int size = 1;
+    for (int i = 0; i < nDim; i++) {
+        size *= inputShape[i];
+    }
+    float error = 0;
+    for (int i = 0; i < size; i++) {
+        if (fabs(x[i] - y[i]) > error) {
+            error = fabs(x[i] - y[i]);
+        }
+    }
+    return error;
+}
+void test_compareSoftmaxFp32(
+    int axis, const Shape &inputShape, int nDim,
+    const std::function<void(void *, size_t, DataType)> &generator) {
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu =
+        make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    // cnnlSoftmax
+    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double cnnlst, cnnlela;
+    cnnlst = get_walltime();
+    bangRuntime->run(bangGraph);
+    cnnlela = 1000 * (get_walltime() - cnnlst);
+    auto outputGpu = gpuOp->getOutput();
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    // bangSoftmax
+    auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
+    bangGraph->dataMalloc();
+    inputGpu->setData(generator);
+    double bangst, bangela;
+    bangst = get_walltime();
+    bangRuntime->run(bangGraph);
+    bangela = 1000 * (get_walltime() - bangst);
+    auto bangOutputGpu = bangGpuOp->getOutput();
+    auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
+    // Check: max absolute difference between the cnnl and bang results
+    float error =
+        err(outputGpu2Cpu->getRawDataPtr<float *>(),
+            bangOutputGpu2Cpu->getRawDataPtr<float *>(), inputShape, nDim);
+    printf("axis:%d. bang time:%.2f ms, cnnl time:%.2f ms, err:%.4e\n", axis,
+           bangela, cnnlela, error);
+}
+TEST(BANG_SoftmaxFp32, run) {
+    test_softmaxFp32(
+        Shape{2, 3, 2, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
+                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
+    test_softmaxFp32(
+        Shape{2, 4}, vector<float>{0., 1., 2., 3., 1000, 1001, 1002, 1003}, 1,
         vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
-                      0.032058604, 0.08714432, 0.23688284, 0.6439143},
-        eps));
+                      0.032058604, 0.08714432, 0.23688284, 0.6439143});
 }
+TEST(BANG_CompareSoftmaxFp32, run) {
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, RandomGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, IncrementalGenerator());
+    test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, IncrementalGenerator());
+}
-
-TEST(cuDNN_Softmax, run_axis0) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
-    bangGraph->dataMalloc();
-    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0., 0., 0., 0., 1, 1, 1, 1}, eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis1) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
-                      0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
-                      0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
-                      0.9820138},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis2) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029,
-                      0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
-                      0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971,
-                      0.8807971},
-        eps));
-}
-
-TEST(cuDNN_Softmax2, run_axis3) {
-    // Runtime
-    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
-    auto bangRuntime = make_ref<BangRuntimeObj>();
-
-    // Build input data on CPU
-    Tensor inputCpu =
-        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
-
-    // GPU
-    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
-    auto inputGpu = bangGraph->cloneTensor(inputCpu);
-    auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
-    bangGraph->dataMalloc();
-    inputGpu->setData(IncrementalGenerator());
-    bangRuntime->run(bangGraph);
-    auto outputGpu = gpuOp->getOutput();
-    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
-    // Check
-    EXPECT_TRUE(outputGpu2Cpu->equalData(
-        vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
-                      0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
-                      0.7310586},
-        eps));
-}
 } // namespace infini
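As a sanity check on the expected test data: softmax is invariant under shifting every input by a constant, which is why the rows {0, 1, 2, 3} and {1000, ..., 1003} share the same expected output. Working the first row through the numerically stabilized definition:

```latex
\[
\mathrm{softmax}(x)_i=\frac{e^{\,x_i-\max(x)}}{\sum_j e^{\,x_j-\max(x)}},\qquad
\mathrm{softmax}(0,1,2,3)=\frac{(1,\,e,\,e^2,\,e^3)}{1+e+e^2+e^3}
\approx(0.03206,\;0.08714,\;0.23688,\;0.64391).
\]
```

The axis-0 expectations for Shape{2, 3, 2, 2} follow the same way: the paired elements differ by 12, and e^{-12}/(1 + e^{-12}) ≈ 6.1442e-06, matching the expected 6.14417422e-06.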