add AnyOp and cuda kernel

This commit is contained in:
whjthu 2023-04-23 00:16:03 +08:00
parent acc64fd32c
commit 1ab2118716
10 changed files with 294 additions and 17 deletions

View File

@ -102,6 +102,7 @@ enum class OpType {
Dropout,
//
MemBound = 300,
Any,
};
using KernelAttrs = std::tuple<Device, OpType, DataType>;
@ -209,6 +210,7 @@ class OpRegistry {
FOP(BitRightShift);
//
FOP(MemBound);
FOP(Any);
default:
IT_ASSERT(false, "Unknown OpType " +
std::to_string(enum_to_underlying(opType)));

10
include/cuda/cuda_any.h Normal file
View File

@ -0,0 +1,10 @@
#pragma once
#include "operators/any.h"

namespace infini {
// Dispatches an AnyObj's raw float device pointers to the concrete CUDA
// kernel selected by `kernel_name`. `attr` is a flat list of integer
// attributes whose layout is defined by the target kernel (see the
// dispatcher in src/kernels/cuda/any.cc).
void any_kernel_mapping(vector<float *> input, vector<float *> output,
                        const string &kernel_name, const vector<int> &attr);
} // namespace infini

View File

@ -8,4 +8,4 @@ namespace infini {
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray strides, SmallArray outputShape);
}; // namespace infini
} // namespace infini

29
include/operators/any.h Normal file
View File

@ -0,0 +1,29 @@
#pragma once
#include "core/operator.h"

namespace infini {

// Escape-hatch operator wrapping an arbitrary, hand-written kernel.
// The concrete kernel is selected at runtime by `kernelName`; `attr` is an
// opaque list of integer attributes interpreted by that kernel. Output
// tensors (and therefore their shapes) must be supplied at construction
// time, because no shape inference is possible for an arbitrary kernel.
class AnyObj : public OperatorObj {
  private:
    string kernelName; // name of the kernel to dispatch to
    vector<int> attr;  // kernel-specific integer attributes

  public:
    AnyObj(GraphObj *graph, const TensorVec &inputs, const TensorVec &outputs,
           string &kernelName, const vector<int> &attr);

    OP_CLONE(AnyObj);

    string toString() const override;

    // Echoes the dims of the pre-assigned output tensors; `inputs` is unused.
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return outputs.size(); }

    const string getKernelName() const;
    vector<int> getOpAttrVector() const override;
    vector<int> getWorkloadVector() const override;
};

} // namespace infini

View File

@ -15,13 +15,17 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
// HACK: set correct data type
auto kernelAttrs =
KernelAttrs{device, op->getOpType(), DataType::Float32};
std::cout << 1 << std::endl;
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
std::cout << 2 << std::endl;
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
std::cout << 3 << std::endl;
auto perfData = perfEngine.getPerfData(perfKey);
if (!perfData && !tune) {
kernel->compute(op, this);
continue;
}
std::cout << 4 << std::endl;
PerfRecord record;
if (!perfData) {
@ -29,6 +33,7 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
perfEngine.setPerfData(perfKey, record);
} else
record = perfData;
std::cout << 5 << std::endl;
double t = record->time;
totalTime += t;

52
src/kernels/cuda/any.cc Normal file
View File

@ -0,0 +1,52 @@
#include "operators/any.h"
#include "cuda/cuda_any.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class AnyCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<AnyObj>(_op);
auto inputs = op->getInputs();
auto outputs = op->getOutputs();
vector<float *> inputsRawPtr;
for (auto &input : inputs) {
inputsRawPtr.emplace_back(input->getRawDataPtr<float *>());
}
vector<float *> outputsRawPtr;
for (auto &output : outputs) {
outputsRawPtr.emplace_back(output->getRawDataPtr<float *>());
}
any_kernel_mapping(inputsRawPtr, outputsRawPtr, op->getKernelName(),
op->getOpAttrVector());
}
};
// Maps a kernel name to the concrete CUDA kernel launcher.
// `inputs`/`outputs` carry raw device pointers; `attr` is a flat list of
// integer attributes whose layout is defined by the target kernel.
// Halts (IT_TODO_HALT) on an unknown kernel name.
void any_kernel_mapping(vector<float *> inputs, vector<float *> outputs,
                        const string &kernelName, const vector<int> &attr) {
    if (kernelName == "conv2dreduce_kernel") {
        // attr layout (15 ints):
        // [PReLU, n, h, w, f, r, s, oh, ow, ph, pw, sh, sw, dh, dw]
        IT_ASSERT(attr.size() == 15);
        // Second input (bias) is optional.
        IT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
        IT_ASSERT(outputs.size() == 1);
        conv2dreduce_kernel(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
                            outputs[0], attr[0] != 0, attr[1], attr[2], attr[3],
                            attr[4], attr[5], attr[6], attr[7], attr[8],
                            attr[9], attr[10], attr[11], attr[12], attr[13],
                            attr[14]);
    } else {
        std::cout << "Unimplemented AnyOp cuda kernel: " << kernelName
                  << std::endl;
        IT_TODO_HALT();
    }
}
REGISTER_KERNEL(Device::CUDA, OpType::Any, DataType::Float32, AnyCuda,
"Any_CUDA_Float32");
} // namespace infini

74
src/operators/any.cc Normal file
View File

@ -0,0 +1,74 @@
#include "operators/any.h"
namespace infini {
// Wraps a named, hand-written kernel as a graph operator. `kernelName`
// selects the kernel at runtime; `attr` is interpreted by that kernel.
AnyObj::AnyObj(GraphObj *graph, const TensorVec &inputs,
               const TensorVec &outputs, string &kernelName,
               const vector<int> &attr)
    : OperatorObj(OpType::Any, inputs, outputs), kernelName(kernelName),
      attr(attr) {
    IT_ASSERT(checkValid(graph));
    // Outputs must be assigned when constructing an AnyObj: shape inference
    // can only echo their pre-set dims, so they must exist and be non-empty.
    IT_ASSERT(!outputs.empty());
    for (auto &output : outputs)
        IT_ASSERT(output != nullptr && output->size() > 0);
}
// Human-readable description: guid, input/output tensor guids, kernel name
// and the attribute list.
string AnyObj::toString() const {
    std::ostringstream os;
    os << "Any[" << getGuid() << "](";
    for (size_t i = 0; i < inputs.size(); ++i)
        os << "i" << i << "=" << inputs[i]->getGuid()
           << (i + 1 == inputs.size() ? "" : " ");
    os << ", ";
    for (size_t i = 0; i < outputs.size(); ++i)
        os << "o" << i << "=" << outputs[i]->getGuid()
           << (i + 1 == outputs.size() ? "" : " ");
    os << ", ";
    os << "kernel name: " << kernelName << ", ";
    os << "attr = [";
    for (size_t i = 0; i < attr.size(); ++i)
        os << attr[i] << (i + 1 == attr.size() ? "" : ", ");
    os << "])\n";
    return os.str();
}
// Shape inference is impossible for an arbitrary kernel, so the shapes of
// the pre-assigned output tensors are returned verbatim; `inputs` is
// intentionally unused.
optional<vector<Shape>> AnyObj::inferShape(const TensorVec &inputs) const {
    vector<Shape> ret;
    for (auto output : outputs) {
        ret.emplace_back(output->getDims());
    }
    return ret;
}
// Name of the kernel this op dispatches to.
const string AnyObj::getKernelName() const { return kernelName; }

// The attribute vector doubles as the op's attr key.
// (Dropped the stray ';' after the function body — it was an empty
// declaration that trips -Wextra-semi.)
vector<int> AnyObj::getOpAttrVector() const { return attr; }
// Workload key for the perf engine: all input dims, all output dims, the
// kernel name (as character codes) and the attribute values, flattened into
// a single int vector.
vector<int> AnyObj::getWorkloadVector() const {
    vector<int> key;
    for (auto &t : inputs) {
        const auto &dims = t->getDims();
        key.insert(key.end(), dims.begin(), dims.end());
    }
    for (auto &t : outputs) {
        const auto &dims = t->getDims();
        key.insert(key.end(), dims.begin(), dims.end());
    }
    for (char c : kernelName)
        key.emplace_back(c);
    key.insert(key.end(), attr.begin(), attr.end());
    return key;
}
} // namespace infini

View File

@ -48,7 +48,7 @@ TEST(SubGraphRewriter, subGraphMatch1) {
SubGraphRewriter v(g);
vector<MatchGraph> subgs = v.findMatch(subG);
EXPECT_TRUE(subgs.size() == 2);
EXPECT_TRUE(subgs.size() == 2u);
}
TEST(MatchGraph, single_input) {
@ -116,12 +116,12 @@ TEST(MatchGraph, single_input) {
auto o4 = v.addSubGraph(subG, TensorVec{add1->getOutput(0)});
EXPECT_EQ(g->getOperators().size(), 52);
EXPECT_EQ(g->getOperators().size(), 52u);
vector<MatchGraph> subgs = v.findMatch(subG);
EXPECT_TRUE(subgs.size() == 5);
EXPECT_TRUE(subgs.size() == 5u);
vector<MatchGraph> subgs1 = v.findMatch(subG1);
EXPECT_TRUE(subgs1.size() == 4);
EXPECT_TRUE(subgs1.size() == 4u);
// test replace
Tensor sii0 =
@ -135,7 +135,7 @@ TEST(MatchGraph, single_input) {
}
v.replaceSubGraph(subG, subG2);
EXPECT_EQ(g->getOperators().size(), 37);
EXPECT_EQ(g->getOperators().size(), 37u);
}
TEST(MatchGraph, multi_input) {
@ -186,17 +186,17 @@ TEST(MatchGraph, multi_input) {
nullptr);
auto matches = v.findMatch(subG);
EXPECT_EQ(2, matches.size());
EXPECT_EQ(2u, matches.size());
auto div0 = g->addOp<DivObj>(reduce1->getOutput(0), i2, nullptr);
auto add1 =
g->addOp<AddObj>(sub0->getOutput(), div0->getOutput(), nullptr);
matches = v.findMatch(subG);
EXPECT_EQ(1, matches.size());
EXPECT_EQ(1u, matches.size());
// two matched subgraphs overlapped, so only one subgraph was replaced
v.replaceSubGraph(subG, replaceG);
EXPECT_EQ(1, v.findMatch(replaceG).size());
EXPECT_EQ(1u, v.findMatch(replaceG).size());
}
}
@ -240,7 +240,7 @@ TEST(MatchGraph, multi_output) {
{
auto input = g->cloneTensor(i);
auto outs = v.addSubGraph(subg0, {input});
EXPECT_EQ(2, outs.size());
EXPECT_EQ(2u, outs.size());
Tensor w0 = g->addTensor(Shape{96, 64, 3, 3}, DataType::UInt32);
auto conv0 = g->addOp<ConvObj>(outs[0], w0, nullptr, 1, 1);
auto relu0 = g->addOp<ReluObj>(conv0->getOutput(0), nullptr);
@ -263,11 +263,11 @@ TEST(MatchGraph, multi_output) {
}
auto matches = v.findMatch(subg0);
EXPECT_EQ(1, matches.size());
EXPECT_EQ(1u, matches.size());
v.replaceSubGraph(subg0, subg1);
auto matches2 = v.findMatch(subg1);
EXPECT_EQ(1, matches2.size());
EXPECT_EQ(1u, matches2.size());
}
// gcn
@ -354,16 +354,16 @@ TEST(MatchGraph, multi_input_output) {
v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
auto out1 =
v.addSubGraph(subg1, {maxPool->getOutput(0), relu->getOutput(0)});
EXPECT_EQ(2, out0.size());
EXPECT_EQ(2, out1.size());
EXPECT_EQ(2u, out0.size());
EXPECT_EQ(2u, out1.size());
auto div = g->addOp<DivObj>(out0[0], out1[1], nullptr);
auto sub = g->addOp<SubObj>(out0[1], out1[0], nullptr);
}
EXPECT_EQ(2, v.findMatch(subg0).size());
EXPECT_EQ(2, v.findMatch(subg1).size());
EXPECT_EQ(2u, v.findMatch(subg0).size());
EXPECT_EQ(2u, v.findMatch(subg1).size());
v.replaceSubGraph(subg0, subg2);
EXPECT_EQ(v.findMatch(subg2).size(), 2);
EXPECT_EQ(v.findMatch(subg2).size(), 2u);
}
/* One Node having two or more successors is not supported yet.

View File

@ -0,0 +1,57 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"
#include "test.h"
namespace infini {
TEST(cuda_Any, anyKernel) {
    // Runs conv2dreduce through the generic AnyObj operator on CUDA.
    {
        // Construct Runtime and graph for CPU and CUDA
        Runtime cpu =
            NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
        Graph gCpu = make_ref<GraphObj>(cpu);
        Runtime cuda = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cuda);
        auto generator = IncrementalGenerator();
        // conv2dreduce_kernel attribute layout (15 ints, see
        // any_kernel_mapping): [PReLU, n, h, w, f, r, s, oh, ow, ph, pw,
        //                       sh, sw, dh, dw]
        int PRelu = 0, n = 1, h = 4, w = 4, f = 2, r = 3, s = 3, oh = 4, ow = 4,
            ph = 1, pw = 1, sh = 1, sw = 1, dh = 1, dw = 1;
        string kernelName = "conv2dreduce_kernel";
        vector<int> attr{PRelu, n, h, w, f, r, s, oh,
                         ow, ph, pw, sh, sw, dh, dw};
        // Build input data on CPU
        Tensor i0Cpu = gCpu->addTensor({n, 1, h, w}, DataType::Float32);
        Tensor w0Cpu = gCpu->addTensor({f, 1, r, s}, DataType::Float32);
        // Malloc data for all tensors in a graph. Do we need implicit
        // allocation?
        gCpu->dataMalloc();
        i0Cpu->setData(generator);
        w0Cpu->setData(generator);
        // Copy input tensors from CPU to CUDA
        Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
        Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
        Tensor o0Cuda = gCuda->addTensor({n, f, oh, ow});
        auto anyOp = gCuda->addOpWithOutputs<AnyObj>(
            TensorVec{i0Cuda, w0Cuda}, TensorVec{o0Cuda}, kernelName, attr);
        anyOp->print();
        // allocate CUDA memory
        gCuda->dataMalloc();
        std::cout << "data malloc success..." << std::endl;
        // Execute on CUDA
        cuda->run(gCuda);
        std::cout << "cuda run success..." << std::endl;
        // copy output from CUDA to CPU
        auto o0Cpu = gCpu->cloneTensor(anyOp->getOutput());
        // TODO(review): placeholder assertion — no numerical check is done.
        // Compare o0Cpu against a CPU reference of conv2dreduce instead.
        EXPECT_TRUE(1);
        // print a tensor/operator/graph by print()
        gCuda->print();
    }
}
} // namespace infini

View File

@ -0,0 +1,48 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"
#include "test.h"
using namespace infini;
using namespace std;
// AnyObj cannot infer shapes; it must echo the shapes of the output tensors
// supplied at construction time.
TEST(Any, ShapeInference) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    vector<int> attr;
    string kernelName = "fake_kernel_name";
    {
        // Single output: its pre-assigned shape is reported back.
        Graph g = make_ref<GraphObj>(runtime);
        Tensor in0 = g->addTensor({1, 2, 3}, DataType::Float32);
        Tensor in1 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor out0 = g->addTensor({3, 2, 3}, DataType::Float32);
        auto op = g->addOpWithOutputs<AnyObj>(TensorVec{in0, in1},
                                              TensorVec{out0}, kernelName,
                                              attr);
        EXPECT_TRUE(op->getOutputs().size() == 1);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 2, 3}));
    }
    {
        // Two outputs: both shapes are echoed, in order.
        Graph g = make_ref<GraphObj>(runtime);
        Tensor in0 = g->addTensor({1, 2, 3}, DataType::Float32);
        Tensor in1 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor out0 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor out1 = g->addTensor({1, 2, 3}, DataType::Float32);
        auto op = g->addOpWithOutputs<AnyObj>(TensorVec{in0, in1},
                                              TensorVec{out0, out1},
                                              kernelName, attr);
        EXPECT_TRUE(op->getOutputs().size() == 2);
        EXPECT_EQ(op->getOutput(0)->getDims(), (Shape{2, 2, 3}));
        EXPECT_EQ(op->getOutput(1)->getDims(), (Shape{1, 2, 3}));
    }
}
// Verifies that AnyObj stores and exposes its attribute vector unchanged.
TEST(Any, Attr) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    string kernelName = "fake_kernel_name";
    vector<int> attr = {2, 3, 2, 1, 4, 4};
    Graph g = make_ref<GraphObj>(runtime);
    Tensor in0 = g->addTensor({1, 2, 3}, DataType::Float32);
    Tensor in1 = g->addTensor({2, 2, 3}, DataType::Float32);
    Tensor out0 = g->addTensor({3, 2, 3}, DataType::Float32);
    auto op = g->addOpWithOutputs<AnyObj>(TensorVec{in0, in1},
                                          TensorVec{out0}, kernelName, attr);
    EXPECT_EQ(op->getOpAttrVector(), attr);
}