Add Addcdiv and Addcmul operations

This commit is contained in:
wanghailu 2023-01-30 06:36:36 +00:00
parent c51b19b198
commit 1fcab531ec
7 changed files with 342 additions and 0 deletions

View File

@ -104,6 +104,8 @@ enum class OpType {
Or,
Xor,
Not,
Addcdiv,
Addcmul,
//
MemBound = 300,
};
@ -216,6 +218,8 @@ class OpRegistry {
FOP(Or);
FOP(Xor);
FOP(Not);
FOP(Addcdiv);
FOP(Addcmul);
//
FOP(MemBound);
default:

View File

@ -65,6 +65,40 @@ class MulNObj : public OperatorObj {
vector<int> getOpAttrVector() const override;
};
/// @brief Element-wise fused op: output = input0 + alpha * (input1 / input2).
/// All three inputs are expected to share the same shape (no broadcasting yet).
class AddcdivObj : public OperatorObj {
  public:
    /// @param graph  Owning graph; used to validate the op at construction.
    /// @param alpha  Scalar multiplier applied to the quotient input1/input2.
    /// @param input0 Addend tensor.
    /// @param input1 Numerator tensor.
    /// @param input2 Denominator tensor.
    /// @param output Result tensor (may be nullptr; inferred by the graph).
    AddcdivObj(GraphObj *graph, float alpha, Tensor input0, Tensor input1,
               Tensor input2, Tensor output);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
    int numInputs() const override { return 3; }
    int numOutputs() const override { return 1; }
    // Fix: marked const — the accessor does not mutate the op, and kernels
    // may hold the operator through a const path.
    float getAlpha() const { return alphaValue; }

  private:
    float alphaValue; // scalar multiplier for the input1/input2 term

    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
/// @brief Element-wise fused op: output = input0 + alpha * (input1 * input2).
/// All three inputs are expected to share the same shape (no broadcasting yet).
class AddcmulObj : public OperatorObj {
  public:
    /// @param graph  Owning graph; used to validate the op at construction.
    /// @param alpha  Scalar multiplier applied to the product input1*input2.
    /// @param input0 Addend tensor.
    /// @param input1 First factor tensor.
    /// @param input2 Second factor tensor.
    /// @param output Result tensor (may be nullptr; inferred by the graph).
    AddcmulObj(GraphObj *graph, float alpha, Tensor input0, Tensor input1,
               Tensor input2, Tensor output);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    std::string toString() const override;
    int numInputs() const override { return 3; }
    int numOutputs() const override { return 1; }
    // Fix: marked const — the accessor does not mutate the op, and kernels
    // may hold the operator through a const path.
    float getAlpha() const { return alphaValue; }

  private:
    float alphaValue; // scalar multiplier for the input1*input2 term

    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};
#define DEFINE_ELEMENT_WISE_OBJ(prefix, type) \
class prefix##Obj : public ElementWiseObj { \
public: \

View File

@ -623,6 +623,110 @@ class SquaredDifferenceCnnl : public BangKernelWithoutConfig {
}
};
/// BANG kernel dispatching to cnnlAddcdiv: o = a + alpha * (b / c).
class AddcdivCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcdivObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        // Only rank-4 (NCHW) tensors are handled for now.
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // Input descriptors — all three inputs share the same shape.
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // Output descriptor.
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcdivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcdiv(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);

        // Fix: destroy descriptors unconditionally — the previous early
        // return on failure leaked all four descriptors.
        // Destroys in BANG do not require sync, but cnnl does not state
        // whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));

        if (stat != CNNL_STATUS_SUCCESS)
            return;
    }
};
/// BANG kernel dispatching to cnnlAddcmul: o = a + alpha * (b * c).
class AddcmulCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AddcmulObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const oData = (op->getOutput()->getRawDataPtr<void *>());
        float alpha = op->getAlpha();

        cnnlTensorDescriptor_t aDesc, bDesc, cDesc, oDesc;
        auto dim = op->getInputs(0)->getDims();
        // Only rank-4 (NCHW) tensors are handled for now.
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        // Input descriptors — all three inputs share the same shape.
        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
        checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
        checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
        checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));
        // Output descriptor.
        checkCnnlError(cnnlCreateTensorDescriptor(&oDesc));
        checkCnnlError(cnnlSetTensorDescriptor(oDesc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        size_t wsSize;
        cnnlGetAddcmulWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc,
                                    &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);

        cnnlStatus_t stat =
            cnnlAddcmul(context->cnnlHandle(), aDesc, aData, &alpha, bDesc,
                        bData, cDesc, cData, wsData, wsSize, oDesc, oData);

        // Fix: destroy descriptors unconditionally — the previous early
        // return on failure leaked all four descriptors.
        // Destroys in BANG do not require sync, but cnnl does not state
        // whether sync is required before destroying.
        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
        checkCnnlError(cnnlDestroyTensorDescriptor(oDesc));

        if (stat != CNNL_STATUS_SUCCESS)
            return;
    }
};
// class FloorModTruncCnnl : public BangKernelWithoutConfig {
// void compute(const Operator &_op,
// const RuntimeObj *_context) const override {
@ -779,6 +883,11 @@ REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl,
"Xor_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl,
"Not_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Addcdiv, DataType::Float32, AddcdivCnnl,
"Addcdiv_cnnl_BANG_Float32");
REGISTER_KERNEL(Device::BANG, OpType::Addcmul, DataType::Float32, AddcmulCnnl,
"Addcmul_cnnl_BANG_Float32");
// REGISTER_KERNEL(Device::BANG, OpType::FloorModTrunc, DataType::Float32,
// FloorModTruncCnnl,
// "FloorModTrunc_cnnl_BANG_Float32");

View File

@ -184,4 +184,90 @@ vector<int> MulNObj::getOpAttrVector() const {
return {enum_to_underlying(type)};
}
// Constructs Addcdiv with three inputs and one output, then validates the
// operator (shape/dtype inference) against the owning graph.
AddcdivObj::AddcdivObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcdiv, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}
// Infers the output shape. For now only identical input shapes are
// supported; broadcasting is left to the optimization layer.
optional<vector<Shape>>
AddcdivObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    // Fix: also validate the third input, which the kernel reads; the
    // original only compared inputs[0] and inputs[1]. (The separate
    // rank check was redundant — vector operator== covers it.)
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};
    return {{A->getDims()}};
}
// Human-readable description: op name, guid, input/output shapes and guids.
std::string AddcdivObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    // Fix: label was "input1=" twice; the third input is input2.
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}
// Workload key: the op type tag followed by the output dimensions.
// (Output dims are used rather than input dims; they coincide today
// because inputs must share one shape.)
vector<int> AddcdivObj::getWorkloadVector() const {
    vector<int> workload{enum_to_underlying(type)};
    const auto &dims = outputs[0]->getDims();
    workload.insert(workload.end(), dims.begin(), dims.end());
    return workload;
}
// Attribute key: only the op type distinguishes this operator.
vector<int> AddcdivObj::getOpAttrVector() const {
    vector<int> attrs = {enum_to_underlying(type)};
    return attrs;
}
// Constructs Addcmul with three inputs and one output, then validates the
// operator (shape/dtype inference) against the owning graph.
AddcmulObj::AddcmulObj(GraphObj *graph, float alpha, Tensor input0,
                       Tensor input1, Tensor input2, Tensor output)
    : OperatorObj(OpType::Addcmul, {input0, input1, input2}, {output}),
      alphaValue(alpha) {
    IT_ASSERT(checkValid(graph));
}
// Infers the output shape. For now only identical input shapes are
// supported; broadcasting is left to the optimization layer.
optional<vector<Shape>>
AddcmulObj::inferShape(const TensorVec &inputs) const {
    const auto A = inputs[0], B = inputs[1], C = inputs[2];
    // Fix: also validate the third input, which the kernel reads; the
    // original only compared inputs[0] and inputs[1]. (The separate
    // rank check was redundant — vector operator== covers it.)
    if (A->getDims() != B->getDims() || A->getDims() != C->getDims())
        return {};
    return {{A->getDims()}};
}
// Human-readable description: op name, guid, input/output shapes and guids.
std::string AddcmulObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "input1=" << inputs[1]->getGuid() << ",";
    // Fix: label was "input1=" twice; the third input is input2.
    os << "input2=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}
// Workload key: the op type tag followed by the output dimensions.
// (Output dims are used rather than input dims; they coincide today
// because inputs must share one shape.)
vector<int> AddcmulObj::getWorkloadVector() const {
    vector<int> workload{enum_to_underlying(type)};
    const auto &dims = outputs[0]->getDims();
    workload.insert(workload.end(), dims.begin(), dims.end());
    return workload;
}
// Attribute key: only the op type distinguishes this operator.
vector<int> AddcmulObj::getOpAttrVector() const {
    vector<int> attrs = {enum_to_underlying(type)};
    return attrs;
}
}; // namespace infini

View File

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "test.h"
namespace infini {
// Smoke-test driver for the Addcdiv BANG kernel: builds three same-shape
// float inputs on the CPU, clones them to the device, runs the op, and
// copies the result back. Output is only printed, not numerically checked
// (no CPU reference kernel exists yet).
template <class T>
void testAddcdiv(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtimes: host for data prep, BANG device for execution.
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Host-side inputs, all filled by the same generator.
    auto makeInput = [&]() {
        Tensor t = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
        t->dataMalloc();
        t->setData(generator);
        return t;
    };
    Tensor inputCpu1 = makeInput();
    Tensor inputCpu2 = makeInput();
    Tensor inputCpu3 = makeInput();

    // Device graph: clone inputs, add the op, run.
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);

    // Bring the result back to the host for inspection.
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}
// Smoke test: runs Addcdiv on the BANG device with a small 1x2x2x3 tensor
// filled by IncrementalGenerator. Only checks that execution completes.
TEST(cnnl_addcdiv, run) {
testAddcdiv<AddcdivObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

View File

@ -0,0 +1,54 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "test.h"
namespace infini {
// Smoke-test driver for the Addcmul BANG kernel: builds three same-shape
// float inputs on the CPU, clones them to the device, runs the op, and
// copies the result back. Output is only printed, not numerically checked
// (no CPU reference kernel exists yet).
template <class T>
void testAddcmul(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtimes: host for data prep, BANG device for execution.
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Host-side inputs, all filled by the same generator.
    auto makeInput = [&]() {
        Tensor t = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
        t->dataMalloc();
        t->setData(generator);
        return t;
    };
    Tensor inputCpu1 = makeInput();
    Tensor inputCpu2 = makeInput();
    Tensor inputCpu3 = makeInput();

    // Device graph: clone inputs, add the op, run.
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    auto inputGpu3 = bangGraph->cloneTensor(inputCpu3);
    float alpha = 1.1;
    auto gpuOp =
        bangGraph->addOp<T>(alpha, inputGpu1, inputGpu2, inputGpu3, nullptr);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);

    // Bring the result back to the host for inspection.
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    inputCpu1->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
}
// Smoke test: runs Addcmul on the BANG device with a small 1x2x2x3 tensor
// filled by IncrementalGenerator. Only checks that execution completes.
TEST(cnnl_addcmul, run) {
testAddcmul<AddcmulObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini

View File

@ -38,6 +38,7 @@ void testLogicOp(
inputCpu1->printData();
inputCpu2->printData();
outputGpu2Cpu->printData();
EXPECT_TRUE(1);
}
TEST(cnnl_LogicOp, run) {