forked from jiuyuan/InfiniTensor
add: graph for bert.
parent 5ed540be6e
commit 3046dd5901
@@ -0,0 +1,148 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
                              float *tensor_ptr_4) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 2 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 15627264; loop_idx += 216) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 15627264 * 32;
        offset_src_buf /= 15627264;
    }
}
// Kernel
__global__ void kernel_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                              float *tensor_ptr_6, float *tensor_ptr_7,
                              float *tensor_ptr_8) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[48];
    for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 256 * 256;
        offset_src_buf /= 256;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_7[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_8[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
// Kernel
__global__ void kernel_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
                              float *tensor_ptr_10) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 2 + warp_id;
    float buf[48];
    for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 128 * 512;
        offset_src_buf /= 128;
    }
}
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                              float *tensor_ptr_12) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 256 * 256;
        offset_src_buf /= 256;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
void invoke_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
                   float *tensor_ptr_4) {
    dim3 gridDim(108, 1);
    dim3 blockDim(64, 1);
    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_3, tensor_ptr_2,
                                         tensor_ptr_4);
    cudaCheckError();
}
void invoke_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                   float *tensor_ptr_6, float *tensor_ptr_7,
                   float *tensor_ptr_8) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_6<<<gridDim, blockDim>>>(
        tensor_ptr_4, tensor_ptr_5, tensor_ptr_6, tensor_ptr_7, tensor_ptr_8);
    cudaCheckError();
}
void invoke_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
                   float *tensor_ptr_10) {
    dim3 gridDim(108, 1);
    dim3 blockDim(64, 1);
    kernel_func_7<<<gridDim, blockDim>>>(tensor_ptr_8, tensor_ptr_9,
                                         tensor_ptr_10);
    cudaCheckError();
}
void invoke_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                   float *tensor_ptr_12) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_10, tensor_ptr_11,
                                         tensor_ptr_12);
    cudaCheckError();
}
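A minimal host-side sketch of driving the generated wrappers (not part of this commit; the 65536-float buffer size is inferred from kernel_func_6's index arithmetic — loop_idx < 256 with offsets of loop_idx * 256 and 8 unrolled accesses of 32 lanes — matching the 1 x 128 x 512 tensors in the test at the bottom of this diff):

// Hypothetical driver; assumes it is linked against the generated file above.
#include <cuda_runtime.h>
#include <initializer_list>
#include <vector>

void invoke_func_6(float *, float *, float *, float *, float *); // generated

int main() {
    const size_t n = 256 * 256; // 65536 floats = 1 x 128 x 512 (inferred)
    float *t4, *t5, *t6, *t7, *t8;
    for (float **p : {&t4, &t5, &t6, &t7, &t8})
        cudaMalloc((void **)p, n * sizeof(float));

    std::vector<float> host(n, 1.0f);
    for (float *p : {t4, t5, t7})
        cudaMemcpy(p, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    invoke_func_6(t4, t5, t6, t7, t8); // t6 = t4 + t5, then t8 = t6 + t7
    cudaDeviceSynchronize();

    cudaMemcpy(host.data(), t8, n * sizeof(float), cudaMemcpyDeviceToHost);
    // Every element of host should now be 3.0f.

    for (float *p : {t4, t5, t6, t7, t8})
        cudaFree(p);
    return 0;
}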
@@ -16,7 +16,11 @@ enum OpType {
    RELU,
    ADD,
    SUB,
    REDUCEMEAN,
    REDUCESUM,
    GATHER,
    TRANSPOSE,
    BROADCAST,
};

enum MemType {
@@ -15,4 +15,18 @@ instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
    const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
    const std::vector<size_t> &shape, const std::vector<size_t> &perm);
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
                  const std::vector<std::shared_ptr<Pointer>> &ptrs,
                  const std::vector<size_t> &inputShape,
                  const std::vector<size_t> &indexShape,
                  const std::vector<size_t> &outputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
                  const std::vector<std::shared_ptr<Pointer>> &ptrs,
                  const std::vector<size_t> &inputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>> instantiateBroadcast(
    const OpType opType, const std::vector<std::shared_ptr<Pointer>> &ptrs,
    const std::vector<size_t> &inputShape, const size_t axis, const size_t num);

} // namespace memb
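For a sense of how these declarations get called, here is a hypothetical snippet mirroring the ReduceMean in the test at the end of this diff, where the {1, 128, 512} input is apparently stored reversed as {512, 128, 1} by convertShape and the reduced axis maps to 0 (inGuid and outGuid are stand-ins for real tensor guids):

auto metaOps = memb::instantiateReduce(
    memb::REDUCEMEAN,
    {memb::Pointer::buildPtrByTensorGuid(inGuid),
     memb::Pointer::buildPtrByTensorGuid(outGuid)},
    /*inputShape=*/{512, 128, 1}, /*axis=*/0);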
@@ -191,4 +191,102 @@ std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
    return metaOps;
}

std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
                  const std::vector<std::shared_ptr<Pointer>> &ptrs,
                  const std::vector<size_t> &inputShape,
                  const std::vector<size_t> &indexShape,
                  const std::vector<size_t> &outputShape, const size_t axis) {
    std::vector<std::shared_ptr<MetaOp>> metaOps;
    size_t par_size = 1;
    for (size_t i = 0; i < outputShape.size() - 1; i++) {
        par_size *= inputShape[i];
    }
    size_t seq_size = inputShape[outputShape.size() - 1];

    auto metaOp = std::make_shared<MetaOp>();
    metaOp->main_loop_st = 0;
    metaOp->main_loop_ed = par_size;
    metaOp->numBlocks = 108;
    metaOp->numWarps = 2;
    metaOp->numReg = 24;
    metaOp->numSmem = 0;

    metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
        std::string("src"), std::vector<size_t>({seq_size, par_size}),
        std::vector<size_t>({1})));

    metaOp->ptrs = ptrs;
    auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
    auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
    auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");

    metaOps.emplace_back(metaOp);
    return metaOps;
}

std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
                  const std::vector<std::shared_ptr<Pointer>> &ptrs,
                  const std::vector<size_t> &inputShape, const size_t axis) {
    std::vector<std::shared_ptr<MetaOp>> metaOps;
    size_t par_size = 1;
    for (size_t i = 0; i < inputShape.size(); i++) {
        if (i != axis) {
            par_size *= inputShape[i];
        }
    }
    size_t seq_size = inputShape[axis];

    auto metaOp = std::make_shared<MetaOp>();
    metaOp->main_loop_st = 0;
    metaOp->main_loop_ed = par_size;
    metaOp->numBlocks = 108;
    metaOp->numWarps = 2;
    metaOp->numReg = 24;
    metaOp->numSmem = 0;

    metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
        std::string("src"), std::vector<size_t>({seq_size, par_size}),
        std::vector<size_t>({1})));

    metaOp->ptrs = ptrs;
    auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
    auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
    auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");

    metaOps.emplace_back(metaOp);
    return metaOps;
}

std::vector<std::shared_ptr<MetaOp>>
instantiateBroadcast(const OpType opType,
                     const std::vector<std::shared_ptr<Pointer>> &ptrs,
                     const std::vector<size_t> &inputShape, const size_t axis,
                     const size_t num) {
    std::vector<std::shared_ptr<MetaOp>> metaOps;
    size_t par_size = getSize(inputShape);
    size_t seq_size = num;

    auto metaOp = std::make_shared<MetaOp>();
    metaOp->main_loop_st = 0;
    metaOp->main_loop_ed = par_size;
    metaOp->numBlocks = 108;
    metaOp->numWarps = 2;
    metaOp->numReg = 24;
    metaOp->numSmem = 0;

    metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
        std::string("src"), std::vector<size_t>({seq_size, par_size}),
        std::vector<size_t>({1})));

    metaOp->ptrs = ptrs;
    auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
    auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
    auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");

    metaOps.emplace_back(metaOp);
    return metaOps;
}

} // namespace memb
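A worked example of the par/seq decomposition these instantiators perform (numbers taken from the ReduceMean in the test below): a {1, 128, 512} input reduced over its last axis arrives converted as {512, 128, 1} with axis 0, giving par_size = 128 * 1 = 128 and seq_size = 512, exactly the loop bound and stride visible in kernel_func_7 above. A self-contained sketch of that split:

#include <cstddef>
#include <vector>

// The split instantiateReduce performs: non-reduced dimensions form the
// parallel iteration space; the reduced axis becomes the sequential extent.
static void reduceSplit(const std::vector<std::size_t> &shape,
                        std::size_t axis, std::size_t &par, std::size_t &seq) {
    par = 1;
    for (std::size_t i = 0; i < shape.size(); i++)
        if (i != axis)
            par *= shape[i];
    seq = shape[axis];
}
// reduceSplit({512, 128, 1}, 0, par, seq) yields par = 128, seq = 512.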
@@ -1,4 +1,7 @@
#include "core/graph.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"

#include "pfusion/instantiate.h"
@@ -36,6 +39,8 @@ std::vector<size_t> convertShape(const std::vector<int> &_shape) {
    return shape;
}

size_t convertIndex(const size_t idx, const size_t size) { return size - idx; }

std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
    std::vector<size_t> perm;
    for (int i = int(_perm.size()); i > 0; i--) {
@@ -44,54 +49,110 @@ std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
    return perm;
}

void convertTranspose(std::shared_ptr<memb::SearchGraph> searchGraph,
                      infini::Operator op) {
    searchGraph->addNode(memb::instantiateTranspose(
        memb::TRANSPOSE,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getOutputs()[0]->getDims()),
        convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
}

void convertUnary(std::shared_ptr<memb::SearchGraph> searchGraph,
                  infini::Operator op, memb::OpType opType) {
    searchGraph->addNode(memb::instantiateUnary(
        opType,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getOutputs()[0]->getDims())));
}

void convertBinary(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op, memb::OpType opType) {
    searchGraph->addNode(memb::instantiateBinary(
        opType,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getOutputs()[0]->getDims())));
}
void convertGather(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op) {
    searchGraph->addNode(memb::instantiateGather(
        memb::GATHER,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getInputs()[0]->getDims()),
        convertShape(op->getInputs()[1]->getDims()),
        convertShape(op->getOutputs()[0]->getDims()),
        convertIndex(infini::as<infini::GatherObj>(op)->getAxis(),
                     op->getInputs()[0]->getDims().size())));
}
void convertReduce(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op, memb::OpType opType) {
    auto reduceMeanOp = infini::as<infini::ReduceMeanObj>(op);
    int axis = -1,
        dimSize = int(reduceMeanOp->getInputs()[0]->getDims().size());
    for (int i = 0; i < dimSize; i++) {
        if (reduceMeanOp->isReduced(i)) {
            if (axis != -1) {
                IT_ASSERT(false);
            } else {
                axis = dimSize - i - 1;
            }
        }
    }
    IT_ASSERT(axis != -1);
    searchGraph->addNode(memb::instantiateReduce(
        opType,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getInputs()[0]->getDims()), axis));
}

void convertBroadcast(std::shared_ptr<memb::SearchGraph> searchGraph,
                      infini::Operator op) {
    auto extendOp = infini::as<infini::ExtendObj>(op);
    IT_ASSERT(op->getInputs()[0]->getDims()[extendOp->getDim()] == 1);
    searchGraph->addNode(memb::instantiateBroadcast(
        memb::BROADCAST,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getInputs()[0]->getDims()), extendOp->getDim(),
        extendOp->getNum() + 1));
}

std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
    auto metaGraph = std::make_shared<memb::SearchGraph>();
    auto searchGraph = std::make_shared<memb::SearchGraph>();
    std::unordered_map<int, int> opMap;
    int id = 0;
    for (auto op : graph->getOperators()) {
        switch (op->getOpType()) {
        case infini::OpType::Transpose:
            metaGraph->addNode(memb::instantiateTranspose(
                memb::TRANSPOSE,
                {memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[0]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getOutputs()[0]->getGuid())},
                convertShape(op->getOutputs()[0]->getDims()),
                convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
            convertTranspose(searchGraph, op);
            break;
        case infini::OpType::Relu:
            metaGraph->addNode(memb::instantiateUnary(
                memb::RELU,
                {memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[0]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getOutputs()[0]->getGuid())},
                convertShape(op->getOutputs()[0]->getDims())));
            convertUnary(searchGraph, op, memb::RELU);
            break;
        case infini::OpType::Add:
            metaGraph->addNode(memb::instantiateBinary(
                memb::ADD,
                {memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[0]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[1]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getOutputs()[0]->getGuid())},
                convertShape(op->getOutputs()[0]->getDims())));
            convertBinary(searchGraph, op, memb::ADD);
            break;
        case infini::OpType::Sub:
            metaGraph->addNode(memb::instantiateBinary(
                memb::SUB,
                {memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[0]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getInputs()[1]->getGuid()),
                 memb::Pointer::buildPtrByTensorGuid(
                     op->getOutputs()[0]->getGuid())},
                convertShape(op->getOutputs()[0]->getDims())));
            convertBinary(searchGraph, op, memb::SUB);
            break;
        case infini::OpType::Gather:
            convertGather(searchGraph, op);
            break;
        case infini::OpType::ReduceMean:
            convertReduce(searchGraph, op, memb::REDUCEMEAN);
            break;
        case infini::OpType::Extend:
            convertBroadcast(searchGraph, op);
            break;
        default:
            std::cout << int(op->getOpType()) << std::endl;
            IT_ASSERT(false);
        }
        IT_ASSERT(opMap.find(op->getGuid()) == opMap.end());
@@ -102,10 +163,11 @@ std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
        for (auto nextOp : op->getSuccessors()) {
            assert(opMap.find(op->getGuid()) != opMap.end());
            assert(opMap.find(nextOp->getGuid()) != opMap.end());
            metaGraph->addEdge(opMap[op->getGuid()], opMap[nextOp->getGuid()]);
            searchGraph->addEdge(opMap[op->getGuid()],
                                 opMap[nextOp->getGuid()]);
        }
    }
    return metaGraph;
    return searchGraph;
}

std::string infini::MemoryCodegen::generate(Graph graph) {
@@ -23,6 +23,8 @@ std::string TensorMapping::genOffset() {
}

void MetaOp::optimize() {
    if (microOps.size() == 0)
        return;
    std::vector<std::shared_ptr<MicroOp>> ops;
    int numOp = microOps.size();
    int cur = 0;
@@ -13,7 +13,7 @@ std::shared_ptr<MicroOp> MicroOp::merge(std::shared_ptr<MicroOp> op0,
            return std::make_shared<EmptyOp>();
        } else {
            // TODO: gen reg to reg.
            IT_ASSERT(false);
            // IT_ASSERT(false);
        }
    }
}
@@ -0,0 +1,49 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"

#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, bert_0) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1, 128}, DataType::Float32);
    Tensor t1 = g->addTensor({30522, 512}, DataType::Float32);
    Tensor t2 = g->addTensor({1, 128, 512}, DataType::Float32);

    Tensor t3 = g->addTensor({1, 128, 512}, DataType::Float32);
    Tensor t4 = g->addTensor({1, 128, 512}, DataType::Float32);

    Tensor t5 = g->addTensor({1, 128, 512}, DataType::Float32);
    Tensor t6 = g->addTensor({1, 128, 512}, DataType::Float32);

    Tensor t7 = g->addTensor({1, 128, 1}, DataType::Float32);

    Tensor t8 = g->addTensor({1, 128, 512}, DataType::Float32);

    Tensor t9 = g->addTensor({1, 128, 512}, DataType::Float32);
    Tensor t10 = g->addTensor({1, 128, 512}, DataType::Float32);

    g->dataMalloc();
    g->addOpWithOutputs<GatherObj>(t1, t0, t2, 0);
    g->addOpWithOutputs<AddObj>(t2, t3, t4);
    g->addOpWithOutputs<AddObj>(t4, t5, t6);
    g->addOpWithOutputs<ReduceMeanObj>(t6, t7, Shape({2}));
    g->addOpWithOutputs<ExtendObj>(t7, t8, 2, 511);
    g->addOpWithOutputs<SubObj>(t8, t9, t10);

    MemoryCodegen codegen;
    codegen.exportCode(g, "bert_0.cu");
}

} // namespace infini
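Reading this test against the generated file at the top of the diff, the kernels appear to line up with the graph's ops as follows (an inference from shapes and loop bounds, not something the commit states):

// Presumed op-to-kernel mapping for bert_0.cu (inferred, not authoritative):
//   GatherObj (t1, t0) -> t2     : kernel_func_0 (bound 15627264 == 30522 * 512)
//   AddObj x2 (t2+t3, t4+t5)     : kernel_func_6 (both adds fused, 128 * 512 elems)
//   ReduceMean / Extend stage    : kernel_func_7 (128 iterations, stride 512;
//                                  loop body not generated yet)
//   SubObj (t8 - t9) -> t10      : kernel_func_5 (elementwise, 128 * 512 elems)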