add bangSoftmax, compare cnnl and bang C kernels

xgqdut2016 2024-02-28 03:14:58 +00:00
parent 186a6f37f2
commit 1ed4b36db2
14 changed files with 169 additions and 118 deletions

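For reference, the operation both backends implement is softmax along a chosen axis. The max subtraction below is the usual numerically stabilized form; that the bang C kernel applies it is an assumption here, though the test inputs near 1000 added later in this commit suggest it does:

\[ \mathrm{softmax}(x)_i = \frac{e^{x_i - \max_j x_j}}{\sum_k e^{x_k - \max_j x_j}} \]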

@@ -1,11 +1,11 @@
#pragma once
#include "bang/bang_runtime.h"
#include "bang_highSoftmax.h"
#include "bang_bangSoftmax.h"
#include "operators/softmax.h"
namespace infini {
void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
auto op = as<SoftmaxObj>(_op);
auto op = as<BangSoftmaxObj>(_op);
void *const mlu_src = (op->getInputs(0)->getRawDataPtr<void *>());
void *const mlu_destination = (op->getOutput()->getRawDataPtr<void *>());
@@ -31,7 +31,7 @@ void softmax_kernel(const RuntimeObj *obj, const Operator &_op) {
othersize *= shape[s];
}
}
if (op->getOpType() == OpType::Softmax)
if (op->getOpType() == OpType::BangSoftmax)
softmaxKernel(context->cnnlHandle(), (float *)mlu_destination,
(float *)mlu_src, nDim, axis, othersize, frontsize,
dimsize, stride);

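The (othersize, frontsize, dimsize, stride) parameters passed to softmaxKernel describe the standard axis decomposition: frontsize is the product of dimensions before the axis, dimsize is the axis length, stride is the product of dimensions after it. A minimal CPU sketch of the indexing scheme these parameters imply; softmaxRef is hypothetical, for illustration only, and is not the BANG implementation:

#include <algorithm>
#include <cmath>

void softmaxRef(float *dst, const float *src, int frontsize, int dimsize,
                int stride) {
    // Element (f, i, s) lives at f * dimsize * stride + i * stride + s;
    // the reduction runs over i, the softmax axis.
    for (int f = 0; f < frontsize; ++f) {
        for (int s = 0; s < stride; ++s) {
            const float *in = src + f * dimsize * stride + s;
            float *out = dst + f * dimsize * stride + s;
            float maxv = in[0];
            for (int i = 1; i < dimsize; ++i)
                maxv = std::max(maxv, in[i * stride]);
            float sum = 0.0f;
            for (int i = 0; i < dimsize; ++i)
                sum += std::exp(in[i * stride] - maxv);
            for (int i = 0; i < dimsize; ++i)
                out[i * stride] = std::exp(in[i * stride] - maxv) / sum;
        }
    }
}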

@@ -59,6 +59,7 @@ class GraphHandlerObj {
Tensor tanh(Tensor x, Tensor y);
Tensor erf(Tensor x, Tensor y);
Tensor softmax(Tensor x, Tensor y, int axis);
Tensor bangSoftmax(Tensor x, Tensor y, int axis);
Tensor abs(Tensor x, Tensor y);
Tensor sqrt(Tensor x, Tensor y);
Tensor neg(Tensor x, Tensor y);


@@ -180,6 +180,7 @@ struct OpType {
Size,
Slice,
Softmax,
BangSoftmax,
SoftmaxCrossEntropyLoss,
Softplus,
Softsign,


@@ -24,4 +24,26 @@ class SoftmaxObj : public OperatorObj {
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
class BangSoftmaxObj : public OperatorObj {
int axis;
public:
BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output, int axis);
OP_CLONE(BangSoftmaxObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
return {{inputs[0]->getDims()}};
};
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
int getAxis() const { return axis; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini


@@ -227,6 +227,15 @@ Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) {
->getOutput();
}
}
Tensor GraphHandlerObj::bangSoftmax(Tensor input, Tensor output, int axis) {
if (output) {
g->addOpWithOutputs<BangSoftmaxObj>(std::move(input), output, axis);
return output;
} else {
return g->addOp<BangSoftmaxObj>(std::move(input), output, axis)
->getOutput();
}
}
Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) {
if (output) {

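A minimal sketch of how the two handler paths can be exercised side by side, mirroring the test added later in this commit; inputCpu and the runtime setup follow that test's conventions and are not part of this hunk:

// Build one BANG graph holding both softmax variants on the same input.
auto bangRuntime = make_ref<BangRuntimeObj>();
Graph g = make_ref<GraphObj>(bangRuntime);
auto x = g->cloneTensor(inputCpu); // inputCpu: a Float32 tensor built on CPU
auto cnnlOp = g->addOp<SoftmaxObj>(x, nullptr, /*axis=*/1);     // cnnl kernel
auto bangOp = g->addOp<BangSoftmaxObj>(x, nullptr, /*axis=*/1); // bang C kernel
g->dataMalloc();
bangRuntime->run(g);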

@@ -522,6 +522,7 @@ void init_graph_builder(py::module &m) {
.def("hardSigmoid", &Handler::hardSigmoid, policy::move)
.def("hardSwish", &Handler::hardSwish, policy::move)
.def("softmax", &Handler::softmax, policy::move)
.def("bangSoftmax", &Handler::bangSoftmax, policy::move)
.def("abs", &Handler::abs, policy::move)
.def("sqrt", &Handler::sqrt, policy::move)
.def("neg", &Handler::neg, policy::move)


@@ -246,7 +246,8 @@ REGISTER_KERNEL(Device::BANG, OpType::PRelu, PReluCnnl, "PRelu_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, SigmoidCnnl,
"Sigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Round, RoundCnnl, "Round_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxCnnl,
"Softmax_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSigmoid, HardSigmoidCnnl,
"HardSigmoid_cnnl_BANG");
REGISTER_KERNEL(Device::BANG, OpType::HardSwish, HardSwishCnnl,


@@ -10,5 +10,5 @@ class SoftmaxBang : public BangKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::BANG, OpType::Softmax, SoftmaxBang, "Softmax_BANG");
REGISTER_KERNEL(Device::BANG, OpType::BangSoftmax, SoftmaxBang, "Softmax_BANG");
}; // namespace infini

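With these registrations, OpType::Softmax now dispatches to the cnnl kernel (SoftmaxCnnl) while OpType::BangSoftmax dispatches to the hand-written bang C kernel (SoftmaxBang), so a single graph can hold both implementations and compare their outputs, as the test below does.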

@@ -1,5 +1,5 @@
#include "bang_highSoftmax.h"
#include "highSoftmax.h"
#include "bang_bangSoftmax.h"
#include "bangSoftmax.h"
namespace infini{
void softmaxKernel(cnnlHandle_t handle, float *mlu_destination, float *mlu_src, int nDim, int axis, int othersize, int frontsize, int dimsize, int stride){


@@ -31,4 +31,33 @@ vector<int> SoftmaxObj::getWorkloadVector() const {
vector<int> SoftmaxObj::getOpAttrVector() const {
return {type.underlying(), axis};
}
BangSoftmaxObj::BangSoftmaxObj(GraphObj *graph, Tensor input, Tensor output,
int _axis)
: OperatorObj(OpType::BangSoftmax, {input}, {output}) {
int rank = input->getRank();
axis = get_real_axis(_axis, rank);
IT_ASSERT(checkValid(graph));
}
std::string BangSoftmaxObj::toString() const {
std::ostringstream os;
os << type.toString() << "[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ",";
os << "axis=" << axis << ")";
return os.str();
}
vector<int> BangSoftmaxObj::getWorkloadVector() const {
vector<int> ret{type.underlying(), axis};
const Shape shape = outputs[0]->getDims();
ret.insert(ret.end(), shape.begin(), shape.end());
return ret;
}
vector<int> BangSoftmaxObj::getOpAttrVector() const {
return {type.underlying(), axis};
}
} // namespace infini

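As a concrete reading of getWorkloadVector: for a BangSoftmax whose output has shape {1, 32, 128, 5} and axis 3, the workload vector is {type.underlying(), 3, 1, 32, 128, 5}, while getOpAttrVector stops at {type.underlying(), 3}.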

@@ -5,134 +5,121 @@
#include "operators/softmax.h"
#include "test.h"
#include <cmath>
#include <sys/time.h>
namespace infini {
double eps = 3e-3;
TEST(cuDNN_Softmax, run_axis1) {
// Runtime
void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
int axis, const vector<float> &expectData) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
// cnnlSoftmax
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
bangGraph->dataMalloc();
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
inputGpu->copyin(inputData);
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// bangSoftmax
auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
bangGraph->dataMalloc();
inputGpu->copyin(inputData);
bangRuntime->run(bangGraph);
auto bangOutputGpu = bangGpuOp->getOutput();
auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(
EXPECT_TRUE(outputGpu2Cpu->equalData(expectData, eps)); // cnnlSoftmax
EXPECT_TRUE(bangOutputGpu2Cpu->equalData(expectData, eps)); // bangSoftmax
}
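// Wall-clock time in seconds, via gettimeofday; used for coarse kernel timing.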
double get_walltime() {
struct timeval tp;
gettimeofday(&tp, NULL);
return (double)(tp.tv_sec + tp.tv_usec * 1e-6);
}
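// Maximum elementwise absolute difference between two equally shaped buffers.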
float err(float *x, float *y, const Shape &inputShape, int nDim) {
int size = 1;
for (int i = 0; i < nDim; i++) {
size *= inputShape[i];
}
float error = 0;
for (int i = 0; i < size; i++) {
if (fabs(x[i] - y[i]) > error) {
error = fabs(x[i] - y[i]);
}
}
return error;
}
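// Feeds identical data to the cnnl softmax and the bang C softmax, times the
// two graph runs, and reports the worst-case elementwise divergence.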
void test_compareSoftmaxFp32(
int axis, const Shape &inputShape, int nDim,
const std::function<void(void *, size_t, DataType)> &generator) {
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(inputShape, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
// cnnlSoftmax
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
bangGraph->dataMalloc();
inputGpu->setData(generator);
double cnnlst, cnnlela;
cnnlst = get_walltime();
bangRuntime->run(bangGraph); // the graph holds only the cnnl op at this point
cnnlela = 1000 * (get_walltime() - cnnlst);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// bangSoftmax
auto bangGpuOp = bangGraph->addOp<BangSoftmaxObj>(inputGpu, nullptr, axis);
bangGraph->dataMalloc();
inputGpu->setData(generator);
double bothst, bothela;
bothst = get_walltime();
bangRuntime->run(bangGraph); // rerun: the graph now holds the cnnl + bang ops
bothela = 1000 * (get_walltime() - bothst);
auto bangOutputGpu = bangGpuOp->getOutput();
auto bangOutputGpu2Cpu = bangOutputGpu->clone(cpuRuntime);
// Check
float error =
err(outputGpu2Cpu->getRawDataPtr<float *>(),
bangOutputGpu2Cpu->getRawDataPtr<float *>(), inputShape, nDim);
printf("axis:%d. bang time:%.2f ms, cnnl time:%.2f ms, err:%.4e\n", axis,
bangela, cnnlela, error);
}
TEST(BANG_SoftmaxFp32, run) {
test_softmaxFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
test_softmaxFp32(
Shape{2, 4}, vector<float>{0., 1., 2., 3., 1000, 1001, 1002, 1003}, 1,
vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
0.032058604, 0.08714432, 0.23688284, 0.6439143},
eps));
0.032058604, 0.08714432, 0.23688284, 0.6439143});
}
TEST(BANG_CompareSoftmaxFp32, run) {
test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, RandomGenerator());
test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, RandomGenerator());
test_compareSoftmaxFp32(3, Shape{1, 32, 1, 5}, 4, IncrementalGenerator());
test_compareSoftmaxFp32(3, Shape{1, 32, 128, 5}, 4, IncrementalGenerator());
}
TEST(cuDNN_Softmax, run_axis0) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
bangGraph->dataMalloc();
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(
vector<float>{0., 0., 0., 0., 1, 1, 1, 1}, eps));
}
TEST(cuDNN_Softmax2, run_axis1) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
bangGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(
vector<float>{0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138,
0.9820138, 0.9820138, 0.9820138, 0.0179862, 0.0179862,
0.0179862, 0.0179862, 0.9820138, 0.9820138, 0.9820138,
0.9820138},
eps));
}
TEST(cuDNN_Softmax2, run_axis2) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
bangGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(
vector<float>{0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029,
0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971,
0.8807971},
eps));
}
TEST(cuDNN_Softmax2, run_axis3) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto bangRuntime = make_ref<BangRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
bangGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
bangRuntime->run(bangGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(
vector<float>{0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414,
0.7310586},
eps));
}
} // namespace infini
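As a quick sanity check on the expected data in test_softmaxFp32: for the row \([0, 1, 2, 3]\) with axis 1,

\[ \sum_k e^{x_k} = 1 + e + e^2 + e^3 \approx 1 + 2.7183 + 7.3891 + 20.0855 = 31.1929, \]

giving \(1/31.1929 \approx 0.03206\), \(2.7183/31.1929 \approx 0.08714\), \(7.3891/31.1929 \approx 0.23688\), and \(20.0855/31.1929 \approx 0.64391\), matching the expected vector. The row \([1000, 1001, 1002, 1003]\) yields the same values because softmax is invariant under adding a constant to every element.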