add muln operation

wanghailu 2022-12-27 08:22:50 +00:00
parent 45ea5c83f6
commit 5329e66d0f
4 changed files with 157 additions and 0 deletions


@@ -49,6 +49,21 @@ class AddNObj : public OperatorObj {
    vector<int> getOpAttrVector() const override;
};

class MulNObj : public OperatorObj {
  public:
    MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    std::string toString() const override;
    int numInputs() const override { return num; }
    int numOutputs() const override { return 1; }

  private:
    int num;
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
};

#define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                                  \
    class prefix##Obj : public ElementWiseObj {                                \
      public:                                                                  \
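
For context, a hedged sketch of how this variadic constructor is reached through GraphObj::addOp, following the call pattern of the test added by this commit (addTensor and its signature are assumed from the surrounding codebase):

auto runtime = make_ref<BangRuntimeObj>(); // or any other RuntimeObj
Graph g = make_ref<GraphObj>(runtime);
auto a = g->addTensor(Shape{1, 2, 2, 3}, DataType::Float32);
auto b = g->addTensor(Shape{1, 2, 2, 3}, DataType::Float32);
// tensorNum = 2, output = nullptr (created by the operator), inputs a, b
auto mul = g->addOp<MulNObj>(2, nullptr, a, b);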

src/kernels/bang/muln.cc (new file)

@@ -0,0 +1,49 @@
#include "operators/element_wise.h"
#include "bang/bang_kernel_without_config.h"
#include "bang/bang_runtime.h"

namespace infini {
class MulNCnnl : public BangKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<MulNObj>(_op);
        auto context = dynamic_cast<const BangRuntimeObj *>(_context);

        // Gather the raw device pointers of all inputs.
        int num = op->numInputs();
        void *argv[num];
        for (int i = 0; i < num; ++i) {
            argv[i] = op->getInputs(i)->getRawDataPtr<void *>();
        }
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        // All inputs share one shape; only 4-D NCHW tensors are handled.
        cnnlTensorDescriptor_t desc;
        auto dim = op->getInputs(0)->getDims();
        if (dim.size() != 4)
            IT_TODO_HALT();

        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
        checkCnnlError(cnnlCreateTensorDescriptor(&desc));
        checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW,
                                               CNNL_DTYPE_FLOAT, 4, dim_array));

        // One descriptor per input, all identical to the output's.
        cnnlTensorDescriptor_t descArray[num];
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i]));
            checkCnnlError(cnnlSetTensorDescriptor(descArray[i],
                                                   CNNL_LAYOUT_NCHW,
                                                   CNNL_DTYPE_FLOAT, 4,
                                                   dim_array));
        }

        cnnlStatus_t stat = cnnlMulN(context->cnnlHandle(), descArray, argv,
                                     num, desc, cData);
        if (stat != CNNL_STATUS_SUCCESS)
            return; // Note: this early return leaks the descriptors above.

        // Destroying descriptors in BANG does not require a sync, but CNNL
        // does not state whether a sync is required before destruction.
        for (int i = 0; i < num; ++i) {
            checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i]));
        }
        checkCnnlError(cnnlDestroyTensorDescriptor(desc));
    }
};

REGISTER_KERNEL(Device::BANG, OpType::MulN, DataType::Float32, MulNCnnl,
                "MulN_cnnl_BANG_Float32");
}; // namespace infini
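
The N-way product itself is delegated to cnnlMulN. For checking results by hand, here is a minimal CPU sketch of the same elementwise computation (a hypothetical helper, not part of this commit):

#include <cstddef>
#include <vector>

// out[j] = in[0][j] * in[1][j] * ... * in[n-1][j] for j in [0, size)
void mulNRef(const std::vector<const float *> &in, float *out, size_t size) {
    for (size_t j = 0; j < size; ++j) {
        float acc = 1.0f;
        for (const float *p : in)
            acc *= p[j];
        out[j] = acc;
    }
}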


@@ -142,4 +142,48 @@ vector<int> AddNObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}

MulNObj::MulNObj(GraphObj *graph, int tensorNum, Tensor output, ...)
    : OperatorObj(OpType::MulN), num(tensorNum) {
    // Walk the variadic Tensor arguments that follow `output`. Caution:
    // this assumes the arguments are laid out contiguously after `output`,
    // which standard C++ does not guarantee.
    TensorVec temp;
    Tensor *start = &output;
    ++start;
    for (int i = 0; i < num; ++i) {
        temp.push_back(*start);
        start++;
    }
    setOutputs({output});
    setInputs(temp);
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> MulNObj::inferShape(const TensorVec &inputs) const {
    // For now we only handle inputs with identical shapes; broadcasting
    // will be considered in the optimization layer.
    const auto A = inputs[0];
    return {{A->getDims()}};
}

std::string MulNObj::toString() const {
    std::ostringstream os;
    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << vecToString(inputs[1]->getDims()) << ",";
    os << "input0=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

// Use the output dims or the input dims? (Identical for now, since
// inferShape copies the shape of inputs[0].)
vector<int> MulNObj::getWorkloadVector() const {
    vector<int> ret = outputs[0]->getDims();
    ret.emplace(ret.begin(), enum_to_underlying(type));
    return ret;
}

vector<int> MulNObj::getOpAttrVector() const {
    return {enum_to_underlying(type)};
}
}; // namespace infini
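
Since the pointer walk in the constructor above relies on an argument layout the language does not guarantee, a non-variadic alternative is sketched here; the signature is hypothetical, and callers such as the test below would need updating to match:

// Hypothetical constructor: inputs passed explicitly as a TensorVec.
MulNObj::MulNObj(GraphObj *graph, TensorVec inputs, Tensor output)
    : OperatorObj(OpType::MulN), num(int(inputs.size())) {
    setOutputs({output});
    setInputs(std::move(inputs));
    IT_ASSERT(checkValid(graph));
}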


@@ -0,0 +1,49 @@
#include "bang/bang_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "test.h"

namespace infini {

template <class T>
void testmulN(const std::function<void(void *, size_t, DataType)> &generator,
              const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
    auto bangRuntime = make_ref<BangRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu1 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu1->dataMalloc();
    inputCpu1->setData(generator);
    Tensor inputCpu2 =
        make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu2->dataMalloc();
    inputCpu2->setData(generator);

    // Build the graph on the BANG (MLU) device
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu1 = bangGraph->cloneTensor(inputCpu1);
    auto inputGpu2 = bangGraph->cloneTensor(inputCpu2);
    // tensorNum = 2, output = nullptr (created by the operator)
    auto gpuOp = bangGraph->addOp<T>(2, nullptr, inputGpu1, inputGpu2);
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);

    // Check (currently a smoke test: print the data and pass)
    inputCpu1->printData();
    inputCpu2->printData();
    outputGpu2Cpu->printData();
    EXPECT_TRUE(1);
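
    // Hedged sketch (not in the original commit): a stronger check for the
    // 2-input case. IncrementalGenerator fills both inputs with 0, 1, 2, ...,
    // so each output element should be the square of its flat index. Assumes
    // getRawDataPtr<float *> behaves as in the BANG kernel above.
    float *outData = outputGpu2Cpu->getRawDataPtr<float *>();
    for (size_t i = 0; i < 12; ++i) // Shape{1, 2, 2, 3} -> 12 elements
        EXPECT_FLOAT_EQ(outData[i], float(i) * float(i));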
}

TEST(cnnl_mulN, run) {
    testmulN<MulNObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
}
} // namespace infini