add: graph for BERT.

This commit is contained in:
mazx 2022-10-29 00:18:00 +08:00
parent 5ed540be6e
commit 3046dd5901
10 changed files with 414 additions and 37 deletions

148
generated_code/bert_0.cu Normal file
View File

@ -0,0 +1,148 @@
#include "cuda_utils.h"
// Kernel
// Generated kernel (gather stage of the BERT graph codegen).
// Intended launch: <<<108, 64>>> -- 2 warps per block, 216 warps total
// (see invoke_func_0).
__global__ void kernel_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
float *tensor_ptr_4) {
// Lane/warp decomposition of the 64-thread block.
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[24];
// Grid-stride loop over work items. 15627264 = 30522 * 512, which matches
// the embedding-table element count in the bert_0 test graph -- presumably
// one item per 32-float chunk; TODO confirm against the codegen.
for (int loop_idx = parallel_idx; loop_idx < 15627264; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
// % binds tighter than *: offset_src = (loop_idx % 15627264) * 32.
offset_src += offset_src_buf % 15627264 * 32;
offset_src_buf /= 15627264;
// NOTE(review): the loop computes offsets but performs no loads or
// stores, and lane_id/buf are never used -- this looks like incomplete
// code generation for the gather op; verify upstream.
}
}
// Kernel
// Generated fused element-wise kernel performing two chained additions:
//   tensor_ptr_6 = tensor_ptr_4 + tensor_ptr_5
//   tensor_ptr_8 = tensor_ptr_6 + tensor_ptr_7
// Work is tiled as 256 tiles of 256 floats; one warp owns a tile and each
// lane moves 8 floats. Intended launch: <<<108, 256>>> -- 8 warps/block,
// 864 warps in flight (see invoke_func_6).
__global__ void kernel_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                              float *tensor_ptr_6, float *tensor_ptr_7,
                              float *tensor_ptr_8) {
    const int lane = threadIdx.x % 32;
    const int warp = threadIdx.x / 32;
    float regs[48]; // three groups of 8 values per lane: in0, in1, result
    // Grid-stride loop over tiles, one tile per warp per iteration.
    for (int tile = blockIdx.x * 8 + warp; tile < 256; tile += 864) {
        const int base = tile % 256 * 256; // element offset of this tile

        // Stage 1: tensor_ptr_6 = tensor_ptr_4 + tensor_ptr_5
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k] = tensor_ptr_4[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 8] = tensor_ptr_5[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 16] = regs[k] + regs[k + 8];
#pragma unroll
        for (int k = 0; k < 8; k++)
            tensor_ptr_6[base + k * 32 + lane] = regs[k + 16];

        // Stage 2: tensor_ptr_8 = tensor_ptr_6 + tensor_ptr_7
        // (re-reads tensor_ptr_6 from global memory, exactly as the
        // generated original does).
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k] = tensor_ptr_6[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 8] = tensor_ptr_7[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 16] = regs[k] + regs[k + 8];
#pragma unroll
        for (int k = 0; k < 8; k++)
            tensor_ptr_8[base + k * 32 + lane] = regs[k + 16];
    }
}
// Kernel
// Generated kernel (reduce-mean stage of the BERT graph codegen).
// Intended launch: <<<108, 64>>> -- 2 warps per block, 216 warps total
// (see invoke_func_7).
__global__ void kernel_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
float *tensor_ptr_10) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[48];
// Grid-stride loop over 128 rows of 512 floats (offset = row * 512).
for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 128 * 512;
offset_src_buf /= 128;
// NOTE(review): offsets are computed but no memory is touched, and
// lane_id/buf are never used -- looks like incomplete code generation
// for the reduce op; verify upstream.
}
}
// Kernel
// Generated element-wise subtraction kernel:
//   tensor_ptr_12 = tensor_ptr_10 - tensor_ptr_11
// Work is tiled as 256 tiles of 256 floats; one warp owns a tile and each
// lane moves 8 floats. Intended launch: <<<108, 256>>> -- 8 warps/block,
// 864 warps in flight (see invoke_func_5).
__global__ void kernel_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                              float *tensor_ptr_12) {
    const int lane = threadIdx.x % 32;
    const int warp = threadIdx.x / 32;
    float regs[24]; // three groups of 8 values per lane: in0, in1, result
    // Grid-stride loop over tiles, one tile per warp per iteration.
    for (int tile = blockIdx.x * 8 + warp; tile < 256; tile += 864) {
        const int base = tile % 256 * 256; // element offset of this tile
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k] = tensor_ptr_10[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 8] = tensor_ptr_11[base + k * 32 + lane];
#pragma unroll
        for (int k = 0; k < 8; k++)
            regs[k + 16] = regs[k] - regs[k + 8];
#pragma unroll
        for (int k = 0; k < 8; k++)
            tensor_ptr_12[base + k * 32 + lane] = regs[k + 16];
    }
}
// Host launcher for kernel_func_0: 108 blocks x 64 threads (2 warps/block).
void invoke_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
                   float *tensor_ptr_4) {
    kernel_func_0<<<dim3(108, 1), dim3(64, 1)>>>(tensor_ptr_3, tensor_ptr_2,
                                                 tensor_ptr_4);
    cudaCheckError();
}
// Host launcher for kernel_func_6: 108 blocks x 256 threads (8 warps/block).
void invoke_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                   float *tensor_ptr_6, float *tensor_ptr_7,
                   float *tensor_ptr_8) {
    kernel_func_6<<<dim3(108, 1), dim3(256, 1)>>>(
        tensor_ptr_4, tensor_ptr_5, tensor_ptr_6, tensor_ptr_7, tensor_ptr_8);
    cudaCheckError();
}
// Host launcher for kernel_func_7: 108 blocks x 64 threads (2 warps/block).
void invoke_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
                   float *tensor_ptr_10) {
    kernel_func_7<<<dim3(108, 1), dim3(64, 1)>>>(tensor_ptr_8, tensor_ptr_9,
                                                 tensor_ptr_10);
    cudaCheckError();
}
// Host launcher for kernel_func_5: 108 blocks x 256 threads (8 warps/block).
void invoke_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                   float *tensor_ptr_12) {
    kernel_func_5<<<dim3(108, 1), dim3(256, 1)>>>(tensor_ptr_10, tensor_ptr_11,
                                                  tensor_ptr_12);
    cudaCheckError();
}

View File

@ -16,7 +16,11 @@ enum OpType {
RELU,
ADD,
SUB,
REDUCEMEAN,
REDUCESUM,
GATHER,
TRANSPOSE,
BROADCAST,
};
enum MemType {

View File

@ -15,4 +15,18 @@ instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape, const std::vector<size_t> &perm);
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>> instantiateBroadcast(
const OpType opType, const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis, const size_t num);
} // namespace memb

View File

@ -191,4 +191,102 @@ std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
return metaOps;
}
// Builds the MetaOp list for a GATHER node.
// ptrs is expected to hold {data, indices, output} pointers; shapes and axis
// use the fusion layer's reversed-dimension convention (see convertShape /
// convertIndex in the graph converter).
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
// Parallel size: product over all but the last output dimension.
// NOTE(review): the loop bound comes from outputShape but it multiplies
// inputShape entries -- confirm this mixing of shapes is intentional.
size_t par_size = 1;
for (size_t i = 0; i < outputShape.size() - 1; i++) {
par_size *= inputShape[i];
}
size_t seq_size = inputShape[outputShape.size() - 1];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
// Fixed launch/resource configuration shared by the generated kernels.
metaOp->numBlocks = 108;
metaOp->numWarps = 2;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
// NOTE(review): buf0/buf1/buf2 are built but never attached to any
// micro-op, and opType/indexShape/axis are unused -- the op body appears
// not to be emitted yet (matches the empty generated gather kernel).
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
// Builds the MetaOp list for a reduction node (e.g. REDUCEMEAN).
// ptrs is expected to hold {input, output}; axis uses the fusion layer's
// reversed-dimension convention.
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
// Parallel size: product of all non-reduced dimensions; seq_size is the
// length of the reduced dimension.
size_t par_size = 1;
for (size_t i = 0; i < inputShape.size(); i++) {
if (i != axis) {
par_size *= inputShape[i];
}
}
size_t seq_size = inputShape[axis];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
// Fixed launch/resource configuration shared by the generated kernels.
metaOp->numBlocks = 108;
metaOp->numWarps = 2;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
// NOTE(review): buf0/buf1/buf2 are built but never used, and opType is
// ignored -- the reduction body appears not to be emitted yet (matches
// the empty generated reduce kernel).
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
// Builds the MetaOp list for a BROADCAST node that replicates the input
// num times along axis. ptrs is expected to hold {input, output}.
std::vector<std::shared_ptr<MetaOp>>
instantiateBroadcast(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis,
const size_t num) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
// One parallel item per input element; each is replicated num times.
size_t par_size = getSize(inputShape);
size_t seq_size = num;
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
// Fixed launch/resource configuration shared by the generated kernels.
metaOp->numBlocks = 108;
metaOp->numWarps = 2;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(std::make_shared<TensorMapping>(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
// NOTE(review): buf0/buf1/buf2 are built but never used, and opType/axis
// are ignored -- the broadcast body appears not to be emitted yet.
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
} // namespace memb

View File

@ -1,4 +1,7 @@
#include "core/graph.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "pfusion/instantiate.h"
@ -36,6 +39,8 @@ std::vector<size_t> convertShape(const std::vector<int> &_shape) {
return shape;
}
size_t convertIndex(const size_t idx, const size_t size) { return size - idx; }
std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
std::vector<size_t> perm;
for (int i = int(_perm.size()); i > 0; i--) {
@ -44,54 +49,110 @@ std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
return perm;
}
// Adds a TRANSPOSE node for op to the search graph.
void convertTranspose(std::shared_ptr<memb::SearchGraph> searchGraph,
                      infini::Operator op) {
    auto src =
        memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid());
    auto dst =
        memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid());
    auto shape = convertShape(op->getOutputs()[0]->getDims());
    auto perm = convertPerm(infini::as<infini::TransposeObj>(op)->getPerm());
    searchGraph->addNode(
        memb::instantiateTranspose(memb::TRANSPOSE, {src, dst}, shape, perm));
}
// Adds a unary element-wise node (e.g. RELU) for op to the search graph.
void convertUnary(std::shared_ptr<memb::SearchGraph> searchGraph,
                  infini::Operator op, memb::OpType opType) {
    auto src =
        memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid());
    auto dst =
        memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid());
    auto shape = convertShape(op->getOutputs()[0]->getDims());
    searchGraph->addNode(memb::instantiateUnary(opType, {src, dst}, shape));
}
// Adds a binary element-wise node (e.g. ADD/SUB) for op to the search graph.
void convertBinary(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op, memb::OpType opType) {
    auto lhs =
        memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid());
    auto rhs =
        memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid());
    auto dst =
        memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid());
    auto shape = convertShape(op->getOutputs()[0]->getDims());
    searchGraph->addNode(
        memb::instantiateBinary(opType, {lhs, rhs, dst}, shape));
}
// Adds a GATHER node for op to the search graph; inputs are {data, indices}.
void convertGather(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op) {
    auto data = op->getInputs()[0];
    auto indices = op->getInputs()[1];
    auto out = op->getOutputs()[0];
    // The gather axis is translated into the fusion layer's reversed
    // dimension-index convention.
    auto axis = convertIndex(infini::as<infini::GatherObj>(op)->getAxis(),
                             data->getDims().size());
    searchGraph->addNode(memb::instantiateGather(
        memb::GATHER,
        {memb::Pointer::buildPtrByTensorGuid(data->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(indices->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(out->getGuid())},
        convertShape(data->getDims()), convertShape(indices->getDims()),
        convertShape(out->getDims()), axis));
}
// Adds a reduction node (e.g. REDUCEMEAN) for op to the search graph.
// Exactly one reduced dimension is supported.
void convertReduce(std::shared_ptr<memb::SearchGraph> searchGraph,
                   infini::Operator op, memb::OpType opType) {
    auto reduceOp = infini::as<infini::ReduceMeanObj>(op);
    const int rank = int(reduceOp->getInputs()[0]->getDims().size());
    // Find the single reduced axis, translated into the fusion layer's
    // reversed dimension-index convention.
    int axis = -1;
    for (int i = 0; i < rank; i++) {
        if (!reduceOp->isReduced(i))
            continue;
        IT_ASSERT(axis == -1); // more than one reduce axis is unsupported
        axis = rank - i - 1;
    }
    IT_ASSERT(axis != -1); // at least one axis must be reduced
    searchGraph->addNode(memb::instantiateReduce(
        opType,
        {memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
         memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
        convertShape(op->getInputs()[0]->getDims()), axis));
}
// Adds a BROADCAST node for an Extend op to the search graph.
void convertBroadcast(std::shared_ptr<memb::SearchGraph> searchGraph,
                      infini::Operator op) {
    auto extendOp = infini::as<infini::ExtendObj>(op);
    auto dim = extendOp->getDim();
    // Broadcasting is only valid along a dimension of size 1.
    IT_ASSERT(op->getInputs()[0]->getDims()[dim] == 1);
    auto src =
        memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid());
    auto dst =
        memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid());
    // Extend's "num" counts extra copies, so total replication is num + 1.
    searchGraph->addNode(memb::instantiateBroadcast(
        memb::BROADCAST, {src, dst},
        convertShape(op->getInputs()[0]->getDims()), dim,
        extendOp->getNum() + 1));
}
std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
auto metaGraph = std::make_shared<memb::SearchGraph>();
auto searchGraph = std::make_shared<memb::SearchGraph>();
std::unordered_map<int, int> opMap;
int id = 0;
for (auto op : graph->getOperators()) {
switch (op->getOpType()) {
case infini::OpType::Transpose:
metaGraph->addNode(memb::instantiateTranspose(
memb::TRANSPOSE,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims()),
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
convertTranspose(searchGraph, op);
break;
case infini::OpType::Relu:
metaGraph->addNode(memb::instantiateUnary(
memb::RELU,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
convertUnary(searchGraph, op, memb::RELU);
break;
case infini::OpType::Add:
metaGraph->addNode(memb::instantiateBinary(
memb::ADD,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
convertBinary(searchGraph, op, memb::ADD);
break;
case infini::OpType::Sub:
metaGraph->addNode(memb::instantiateBinary(
memb::SUB,
{memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(
op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
convertBinary(searchGraph, op, memb::SUB);
break;
case infini::OpType::Gather:
convertGather(searchGraph, op);
break;
case infini::OpType::ReduceMean:
convertReduce(searchGraph, op, memb::REDUCEMEAN);
break;
case infini::OpType::Extend:
convertBroadcast(searchGraph, op);
break;
default:
std::cout << int(op->getOpType()) << std::endl;
IT_ASSERT(false);
}
IT_ASSERT(opMap.find(op->getGuid()) == opMap.end());
@ -102,10 +163,11 @@ std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
for (auto nextOp : op->getSuccessors()) {
assert(opMap.find(op->getGuid()) != opMap.end());
assert(opMap.find(nextOp->getGuid()) != opMap.end());
metaGraph->addEdge(opMap[op->getGuid()], opMap[nextOp->getGuid()]);
searchGraph->addEdge(opMap[op->getGuid()],
opMap[nextOp->getGuid()]);
}
}
return metaGraph;
return searchGraph;
}
std::string infini::MemoryCodegen::generate(Graph graph) {

View File

@ -23,6 +23,8 @@ std::string TensorMapping::genOffset() {
}
void MetaOp::optimize() {
if (microOps.size() == 0)
return;
std::vector<std::shared_ptr<MicroOp>> ops;
int numOp = microOps.size();
int cur = 0;

View File

@ -13,7 +13,7 @@ std::shared_ptr<MicroOp> MicroOp::merge(std::shared_ptr<MicroOp> op0,
return std::make_shared<EmptyOp>();
} else {
// TODO: gen reg to reg.
IT_ASSERT(false);
// IT_ASSERT(false);
}
}
}

49
test/pfusion/test_bert.cc Normal file
View File

@ -0,0 +1,49 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
// Builds the first fused subgraph of BERT's embedding stage and runs the
// memory codegen on it, emitting generated_code/bert_0.cu.
TEST(Graph, bert_0) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
// t0: token ids (1 x 128); t1: embedding table (30522 x 512); the rest are
// {1, 128, 512} activations, except t7 which holds the per-token mean.
Tensor t0 = g->addTensor({1, 128}, DataType::Float32);
Tensor t1 = g->addTensor({30522, 512}, DataType::Float32);
Tensor t2 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t3 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t4 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t5 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t6 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t7 = g->addTensor({1, 128, 1}, DataType::Float32);
Tensor t8 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t9 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t10 = g->addTensor({1, 128, 512}, DataType::Float32);
g->dataMalloc();
// t2 = gather(t1, t0) along axis 0 -- an embedding-lookup-style access.
g->addOpWithOutputs<GatherObj>(t1, t0, t2, 0);
g->addOpWithOutputs<AddObj>(t2, t3, t4);
g->addOpWithOutputs<AddObj>(t4, t5, t6);
// t7 = mean of t6 over axis 2 (the hidden dimension).
g->addOpWithOutputs<ReduceMeanObj>(t6, t7, Shape({2}));
// t8 = t7 extended along axis 2 by 511 extra copies (back to width 512).
g->addOpWithOutputs<ExtendObj>(t7, t8, 2, 511);
g->addOpWithOutputs<SubObj>(t8, t9, t10);
MemoryCodegen codegen;
codegen.exportCode(g, "bert_0.cu");
}
} // namespace infini