forked from jiuyuan/InfiniTensor
add AnyOp and cuda kernel
This commit is contained in:
parent
acc64fd32c
commit
1ab2118716
|
@ -102,6 +102,7 @@ enum class OpType {
|
|||
Dropout,
|
||||
//
|
||||
MemBound = 300,
|
||||
Any,
|
||||
};
|
||||
|
||||
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
||||
|
@ -209,6 +210,7 @@ class OpRegistry {
|
|||
FOP(BitRightShift);
|
||||
//
|
||||
FOP(MemBound);
|
||||
FOP(Any);
|
||||
default:
|
||||
IT_ASSERT(false, "Unknown OpType " +
|
||||
std::to_string(enum_to_underlying(opType)));
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#pragma once

#include "operators/any.h"

namespace infini {

// Dispatches the CUDA kernel selected by `kernel_name` for an AnyObj.
// `input` / `output` hold raw device pointers to float tensor data, in the
// same order as the operator's input/output tensors. `attr` carries
// kernel-specific integer attributes whose layout depends on the kernel
// (see the definition in the CUDA kernel source for each supported name).
// NOTE(review): the pointer vectors are taken by value — a
// `const vector<float *> &` would avoid the copies, but the definition's
// signature must be changed in lock-step; confirm before changing.
void any_kernel_mapping(vector<float *> input, vector<float *> output,
                        const string &kernel_name, const vector<int> &attr);

} // namespace infini
|
|
@ -8,4 +8,4 @@ namespace infini {
|
|||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
SmallArray strides, SmallArray outputShape);
|
||||
|
||||
}; // namespace infini
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
#include "core/operator.h"

namespace infini {

// Escape-hatch operator wrapping an arbitrary hand-written kernel.
// The concrete kernel is looked up by `kernelName` at execution time and
// `attr` carries its integer attributes. Because the kernel is opaque,
// output shapes cannot be inferred from the inputs: the output tensors
// passed to the constructor define them, and inferShape merely echoes them.
class AnyObj : public OperatorObj {
  private:
    string kernelName; // runtime key used to select the concrete kernel
    vector<int> attr;  // kernel-specific attributes; layout depends on kernel

  public:
    // Outputs must be fully assigned (non-empty, non-null, non-zero size)
    // at construction time — see class comment.
    AnyObj(GraphObj *graph, const TensorVec &inputs, const TensorVec &outputs,
           string &kernelName, const vector<int> &attr);

    OP_CLONE(AnyObj);

    string toString() const override;

    // Echoes the pre-assigned output shapes; `inputs` is intentionally
    // unused.
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;

    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return outputs.size(); }

    const string getKernelName() const;
    vector<int> getOpAttrVector() const override;
    vector<int> getWorkloadVector() const override;
};

} // namespace infini
|
|
@ -15,13 +15,17 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
|
|||
// HACK: set correct data type
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType(), DataType::Float32};
|
||||
std::cout << 1 << std::endl;
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
std::cout << 2 << std::endl;
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
std::cout << 3 << std::endl;
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
if (!perfData && !tune) {
|
||||
kernel->compute(op, this);
|
||||
continue;
|
||||
}
|
||||
std::cout << 4 << std::endl;
|
||||
|
||||
PerfRecord record;
|
||||
if (!perfData) {
|
||||
|
@ -29,6 +33,7 @@ void BangRuntimeObj::runWithoutSync(const Graph &graph, bool tune = false,
|
|||
perfEngine.setPerfData(perfKey, record);
|
||||
} else
|
||||
record = perfData;
|
||||
std::cout << 5 << std::endl;
|
||||
|
||||
double t = record->time;
|
||||
totalTime += t;
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
#include "operators/any.h"
#include "cuda/cuda_any.h"
#include "cuda/cuda_conv2dreduce.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {

// CUDA kernel adapter for AnyObj: gathers the raw device pointers of all
// input/output tensors and forwards them, together with the kernel name
// and attribute vector, to any_kernel_mapping for dispatch.
class AnyCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<AnyObj>(_op);

        auto inputs = op->getInputs();
        auto outputs = op->getOutputs();

        vector<float *> inputsRawPtr;
        inputsRawPtr.reserve(inputs.size());
        for (auto &input : inputs) {
            inputsRawPtr.emplace_back(input->getRawDataPtr<float *>());
        }
        vector<float *> outputsRawPtr;
        outputsRawPtr.reserve(outputs.size());
        for (auto &output : outputs) {
            outputsRawPtr.emplace_back(output->getRawDataPtr<float *>());
        }

        any_kernel_mapping(inputsRawPtr, outputsRawPtr, op->getKernelName(),
                           op->getOpAttrVector());
    }
};

// Maps a kernel name to a concrete CUDA kernel launch.
// For "conv2dreduce_kernel" the attr layout is:
//   [0] PRelu flag, [1]=n, [2]=h, [3]=w, [4]=f, [5]=r, [6]=s,
//   [7]=oh, [8]=ow, [9]=ph, [10]=pw, [11]=sh, [12]=sw, [13]=dh, [14]=dw.
// Unknown names halt via IT_TODO_HALT.
void any_kernel_mapping(vector<float *> inputs, vector<float *> outputs,
                        const string &kernelName, const vector<int> &attr) {
    if (kernelName == "conv2dreduce_kernel") {
        IT_ASSERT(attr.size() == 15);
        // Bias (inputs[1]) is optional.
        IT_ASSERT(inputs.size() == 1 || inputs.size() == 2); // fixed: ';' was missing
        IT_ASSERT(outputs.size() == 1);
        conv2dreduce_kernel(inputs[0], inputs.size() > 1 ? inputs[1] : nullptr,
                            outputs[0], attr[0] != 0, attr[1], attr[2], attr[3],
                            attr[4], attr[5], attr[6], attr[7], attr[8],
                            attr[9], attr[10], attr[11], attr[12], attr[13],
                            attr[14]);
    } else {
        std::cout << "Unimplemented AnyOp cuda kernel: " << kernelName
                  << std::endl;
        IT_TODO_HALT();
    }
}

REGISTER_KERNEL(Device::CUDA, OpType::Any, DataType::Float32, AnyCuda,
                "Any_CUDA_Float32");

} // namespace infini
|
|
@ -0,0 +1,74 @@
|
|||
#include "operators/any.h"

namespace infini {

AnyObj::AnyObj(GraphObj *graph, const TensorVec &inputs,
               const TensorVec &outputs, string &kernelName,
               const vector<int> &attr)
    : OperatorObj(OpType::Any, inputs, outputs), kernelName(kernelName),
      attr(attr) {
    // Outputs must be assigned when constructing AnyObj: shapes cannot be
    // inferred for an arbitrary kernel. Validate them BEFORE checkValid(),
    // whose shape inference reads the output tensors.
    IT_ASSERT(!outputs.empty());
    for (auto &output : outputs)
        IT_ASSERT(output != nullptr && output->size() > 0);
    IT_ASSERT(checkValid(graph));
}

// Human-readable dump: input/output guids, kernel name and attributes.
string AnyObj::toString() const {
    std::ostringstream os;
    os << "Any[" << getGuid() << "](";
    for (size_t i = 0; i < inputs.size(); ++i) {
        os << "i" << i << "=" << inputs[i]->getGuid();
        if (i != inputs.size() - 1)
            os << " ";
    }
    os << ", ";
    for (size_t i = 0; i < outputs.size(); ++i) {
        os << "o" << i << "=" << outputs[i]->getGuid();
        if (i != outputs.size() - 1)
            os << " ";
    }
    os << ", ";
    os << "kernel name: " << kernelName << ", ";
    os << "attr = [";
    for (size_t i = 0; i < attr.size(); ++i) {
        os << attr[i];
        if (i != attr.size() - 1)
            os << ", ";
    }
    os << "])\n";
    return os.str();
}

// Output shapes are fixed at construction; echo them. `inputs` is unused.
optional<vector<Shape>> AnyObj::inferShape(const TensorVec &inputs) const {
    vector<Shape> ret;
    ret.reserve(outputs.size());
    for (const auto &output : outputs) {
        ret.emplace_back(output->getDims());
    }
    return ret;
}

const string AnyObj::getKernelName() const { return kernelName; }

vector<int> AnyObj::getOpAttrVector() const { return attr; } // removed stray ';'

// Workload key: every input/output dim, the kernel name (one int per char)
// and the attribute vector — everything that distinguishes one Any op's
// performance profile from another's.
vector<int> AnyObj::getWorkloadVector() const {
    vector<int> ret;
    for (auto &input : inputs) {
        auto inputDims = input->getDims();
        ret.insert(ret.end(), inputDims.begin(), inputDims.end());
    }
    for (auto &output : outputs) {
        auto outputDims = output->getDims();
        ret.insert(ret.end(), outputDims.begin(), outputDims.end());
    }
    for (auto c : kernelName) {
        ret.emplace_back(c);
    }
    for (auto at : attr) {
        ret.emplace_back(at);
    }
    return ret;
}

} // namespace infini
|
|
@ -48,7 +48,7 @@ TEST(SubGraphRewriter, subGraphMatch1) {
|
|||
SubGraphRewriter v(g);
|
||||
vector<MatchGraph> subgs = v.findMatch(subG);
|
||||
|
||||
EXPECT_TRUE(subgs.size() == 2);
|
||||
EXPECT_TRUE(subgs.size() == 2u);
|
||||
}
|
||||
|
||||
TEST(MatchGraph, single_input) {
|
||||
|
@ -116,12 +116,12 @@ TEST(MatchGraph, single_input) {
|
|||
|
||||
auto o4 = v.addSubGraph(subG, TensorVec{add1->getOutput(0)});
|
||||
|
||||
EXPECT_EQ(g->getOperators().size(), 52);
|
||||
EXPECT_EQ(g->getOperators().size(), 52u);
|
||||
vector<MatchGraph> subgs = v.findMatch(subG);
|
||||
EXPECT_TRUE(subgs.size() == 5);
|
||||
EXPECT_TRUE(subgs.size() == 5u);
|
||||
|
||||
vector<MatchGraph> subgs1 = v.findMatch(subG1);
|
||||
EXPECT_TRUE(subgs1.size() == 4);
|
||||
EXPECT_TRUE(subgs1.size() == 4u);
|
||||
|
||||
// test replace
|
||||
Tensor sii0 =
|
||||
|
@ -135,7 +135,7 @@ TEST(MatchGraph, single_input) {
|
|||
}
|
||||
|
||||
v.replaceSubGraph(subG, subG2);
|
||||
EXPECT_EQ(g->getOperators().size(), 37);
|
||||
EXPECT_EQ(g->getOperators().size(), 37u);
|
||||
}
|
||||
|
||||
TEST(MatchGraph, multi_input) {
|
||||
|
@ -186,17 +186,17 @@ TEST(MatchGraph, multi_input) {
|
|||
nullptr);
|
||||
|
||||
auto matches = v.findMatch(subG);
|
||||
EXPECT_EQ(2, matches.size());
|
||||
EXPECT_EQ(2u, matches.size());
|
||||
|
||||
auto div0 = g->addOp<DivObj>(reduce1->getOutput(0), i2, nullptr);
|
||||
auto add1 =
|
||||
g->addOp<AddObj>(sub0->getOutput(), div0->getOutput(), nullptr);
|
||||
matches = v.findMatch(subG);
|
||||
EXPECT_EQ(1, matches.size());
|
||||
EXPECT_EQ(1u, matches.size());
|
||||
|
||||
// two matched subgraphs overlaped,so only replaced one sub graph
|
||||
v.replaceSubGraph(subG, replaceG);
|
||||
EXPECT_EQ(1, v.findMatch(replaceG).size());
|
||||
EXPECT_EQ(1u, v.findMatch(replaceG).size());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -240,7 +240,7 @@ TEST(MatchGraph, multi_output) {
|
|||
{
|
||||
auto input = g->cloneTensor(i);
|
||||
auto outs = v.addSubGraph(subg0, {input});
|
||||
EXPECT_EQ(2, outs.size());
|
||||
EXPECT_EQ(2u, outs.size());
|
||||
Tensor w0 = g->addTensor(Shape{96, 64, 3, 3}, DataType::UInt32);
|
||||
auto conv0 = g->addOp<ConvObj>(outs[0], w0, nullptr, 1, 1);
|
||||
auto relu0 = g->addOp<ReluObj>(conv0->getOutput(0), nullptr);
|
||||
|
@ -263,11 +263,11 @@ TEST(MatchGraph, multi_output) {
|
|||
}
|
||||
|
||||
auto matches = v.findMatch(subg0);
|
||||
EXPECT_EQ(1, matches.size());
|
||||
EXPECT_EQ(1u, matches.size());
|
||||
|
||||
v.replaceSubGraph(subg0, subg1);
|
||||
auto matches2 = v.findMatch(subg1);
|
||||
EXPECT_EQ(1, matches2.size());
|
||||
EXPECT_EQ(1u, matches2.size());
|
||||
}
|
||||
|
||||
// gcn
|
||||
|
@ -354,16 +354,16 @@ TEST(MatchGraph, multi_input_output) {
|
|||
v.addSubGraph(subg0, {relu->getOutput(0), maxPool->getOutput(0)});
|
||||
auto out1 =
|
||||
v.addSubGraph(subg1, {maxPool->getOutput(0), relu->getOutput(0)});
|
||||
EXPECT_EQ(2, out0.size());
|
||||
EXPECT_EQ(2, out1.size());
|
||||
EXPECT_EQ(2u, out0.size());
|
||||
EXPECT_EQ(2u, out1.size());
|
||||
auto div = g->addOp<DivObj>(out0[0], out1[1], nullptr);
|
||||
auto sub = g->addOp<SubObj>(out0[1], out1[0], nullptr);
|
||||
}
|
||||
|
||||
EXPECT_EQ(2, v.findMatch(subg0).size());
|
||||
EXPECT_EQ(2, v.findMatch(subg1).size());
|
||||
EXPECT_EQ(2u, v.findMatch(subg0).size());
|
||||
EXPECT_EQ(2u, v.findMatch(subg1).size());
|
||||
v.replaceSubGraph(subg0, subg2);
|
||||
EXPECT_EQ(v.findMatch(subg2).size(), 2);
|
||||
EXPECT_EQ(v.findMatch(subg2).size(), 2u);
|
||||
}
|
||||
|
||||
/* One Node having two or more successors is not supported yet.
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"

#include "test.h"

namespace infini {
// Smoke test: build an AnyObj wrapping conv2dreduce_kernel and check that
// it constructs, allocates and executes on CUDA without crashing.
TEST(cuda_Any, anyKernel) {
    // conv2dreduce
    {
        // Construct Runtime and graph for CPU and CUDA
        Runtime cpu =
            NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
        Graph gCpu = make_ref<GraphObj>(cpu);
        Runtime cuda = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cuda);

        auto generator = IncrementalGenerator();

        // Attribute layout expected by conv2dreduce_kernel (15 ints).
        int PRelu = 0, n = 1, h = 4, w = 4, f = 2, r = 3, s = 3, oh = 4, ow = 4,
            ph = 1, pw = 1, sh = 1, sw = 1, dh = 1, dw = 1;
        string kernelName = "conv2dreduce_kernel";
        vector<int> attr{PRelu, n, h, w, f, r, s, oh,
                         ow, ph, pw, sh, sw, dh, dw};

        // Build input data on CPU
        Tensor i0Cpu = gCpu->addTensor({n, 1, h, w}, DataType::Float32);
        Tensor w0Cpu = gCpu->addTensor({f, 1, r, s}, DataType::Float32);
        // Malloc data for all tensors in a graph. Do we need implicit
        // allocation?
        gCpu->dataMalloc();
        i0Cpu->setData(generator);
        w0Cpu->setData(generator);
        // Copy input tensors from CPU to CUDA
        Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
        Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
        Tensor o0Cuda = gCuda->addTensor({n, f, oh, ow});
        auto anyOp = gCuda->addOpWithOutputs<AnyObj>(
            TensorVec{i0Cuda, w0Cuda}, TensorVec{o0Cuda}, kernelName, attr);
        // allocate CUDA memory
        gCuda->dataMalloc();
        // Execute on CUDA
        cuda->run(gCuda);
        // copy output from CUDA to CPU
        auto o0Cpu = gCpu->cloneTensor(anyOp->getOutput());
        // TODO(review): compare o0Cpu against a CPU reference of
        // conv2dreduce — EXPECT_TRUE(1) verifies nothing numerically.
        EXPECT_TRUE(1);
    }
}
} // namespace infini
|
|
@ -0,0 +1,48 @@
|
|||
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/any.h"
#include "test.h"
using namespace infini;
using namespace std;

// AnyObj::inferShape must echo the shapes of the pre-assigned output
// tensors, regardless of the input shapes, for both single- and
// multi-output cases.
TEST(Any, ShapeInference) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    vector<int> attr;
    string kernelName = "fake_kernel_name";
    {
        // Single output: shape comes from o0, not from i0/i1.
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
        Tensor i1 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor o0 = g->addTensor({3, 2, 3}, DataType::Float32);
        auto anyOp = g->addOpWithOutputs<AnyObj>(
            TensorVec{i0, i1}, TensorVec{o0}, kernelName, attr);
        EXPECT_TRUE(anyOp->getOutputs().size() == 1);
        EXPECT_EQ(anyOp->getOutput()->getDims(), (Shape{3, 2, 3}));
    }
    {
        // Two outputs: each keeps its own pre-assigned shape.
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
        Tensor i1 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor o0 = g->addTensor({2, 2, 3}, DataType::Float32);
        Tensor o1 = g->addTensor({1, 2, 3}, DataType::Float32);
        auto anyOp = g->addOpWithOutputs<AnyObj>(
            TensorVec{i0, i1}, TensorVec{o0, o1}, kernelName, attr);
        EXPECT_TRUE(anyOp->getOutputs().size() == 2);
        EXPECT_EQ(anyOp->getOutput(0)->getDims(), (Shape{2, 2, 3}));
        EXPECT_EQ(anyOp->getOutput(1)->getDims(), (Shape{1, 2, 3}));
    }
}

// getOpAttrVector must return exactly the attribute vector passed to the
// constructor.
TEST(Any, Attr) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    string kernelName = "fake_kernel_name";
    vector<int> attr = {2, 3, 2, 1, 4, 4};
    Graph g = make_ref<GraphObj>(runtime);
    Tensor i0 = g->addTensor({1, 2, 3}, DataType::Float32);
    Tensor i1 = g->addTensor({2, 2, 3}, DataType::Float32);
    Tensor o0 = g->addTensor({3, 2, 3}, DataType::Float32);
    auto anyOp = g->addOpWithOutputs<AnyObj>(TensorVec{i0, i1}, TensorVec{o0},
                                             kernelName, attr);
    EXPECT_EQ(anyOp->getOpAttrVector(), attr);
}
|
Loading…
Reference in New Issue