forked from jiuyuan/InfiniTensor
ADD: batch norm operator and cuda kernel. (#44)
fix numInputs of batchNorm, add new line in file ending. ADD: batch norm operator and cuda kernel. add training remove comments. fix compile error. add batch norm operator and cuda kernel.
This commit is contained in:
parent
1152adc94a
commit
a4d6426589
|
@ -0,0 +1,28 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
class BatchNormObj : public OperatorObj {
|
||||
float momentum, eps;
|
||||
bool training;
|
||||
|
||||
public:
|
||||
BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean,
|
||||
Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
|
||||
float eps = 1e-5, bool training = false);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
std::string toString() const override;
|
||||
|
||||
// output size will be 3 when training
|
||||
int numInputs() const override { return 5; }
|
||||
int numOutputs() const override { return outputs.size(); }
|
||||
float getEps() const { return eps; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
};
|
||||
} // namespace infini
|
|
@ -38,14 +38,14 @@ class IncrementalGenerator : public DataGenerator {
|
|||
void fill(float *data, size_t size) override { fill<float>(data, size); }
|
||||
};
|
||||
|
||||
class OneGenerator : public DataGenerator {
|
||||
template <int val> class ValGenerator : public DataGenerator {
|
||||
public:
|
||||
virtual ~OneGenerator() {}
|
||||
virtual ~ValGenerator() {}
|
||||
|
||||
private:
|
||||
template <typename T> void fill(T *data, size_t size) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
data[i] = 1;
|
||||
data[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -54,4 +54,6 @@ class OneGenerator : public DataGenerator {
|
|||
}
|
||||
void fill(float *data, size_t size) override { fill<float>(data, size); }
|
||||
};
|
||||
typedef ValGenerator<1> OneGenerator;
|
||||
typedef ValGenerator<0> ZeroGenerator;
|
||||
} // namespace infini
|
|
@ -0,0 +1,64 @@
|
|||
#include "operators/batch_norm.h"
|
||||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
namespace infini {
|
||||
class BatchNormCudnn : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<BatchNormObj>(_op);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
cudnnStatus_t stat;
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
void *const meanData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const varData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const scaleData = (op->getInputs(3)->getRawDataPtr<void *>());
|
||||
void *const biasData = (op->getInputs(4)->getRawDataPtr<void *>());
|
||||
|
||||
auto dims = op->getInputs(0)->getDims();
|
||||
if (dims.size() == 2)
|
||||
IT_TODO_HALT();
|
||||
// Only 4D and 5D tensors are supported by
|
||||
// cudnnBatchNormalizationForwardInference
|
||||
IT_ASSERT(dims.size() == 4 || dims.size() == 5);
|
||||
|
||||
int dimArray[CUDNN_DIM_MAX], strideArray[CUDNN_DIM_MAX],
|
||||
dimPArray[CUDNN_DIM_MAX], stridePArray[CUDNN_DIM_MAX];
|
||||
for (size_t i = 0; i < dims.size(); ++i) {
|
||||
dimArray[i] = dims[i];
|
||||
strideArray[i] = op->getInputs(0)->getStride()[i];
|
||||
dimPArray[i] = op->getInputs(1)->getDims()[i];
|
||||
stridePArray[i] = op->getInputs(1)->getStride()[i];
|
||||
}
|
||||
// get inputs
|
||||
cudnnTensorDescriptor_t inDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
inDesc, CUDNN_DATA_FLOAT, dims.size(), dimArray, strideArray));
|
||||
|
||||
// get bnScaleBiasMeanVarDesc
|
||||
cudnnTensorDescriptor_t paraDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(¶Desc));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
paraDesc, CUDNN_DATA_FLOAT, dims.size(), dimPArray, stridePArray));
|
||||
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
// This mode is intended for use after convolutional layers
|
||||
stat = cudnnBatchNormalizationForwardInference(
|
||||
context->cudnnHandle(), CUDNN_BATCHNORM_SPATIAL, &alpha, &beta,
|
||||
inDesc, inData, inDesc, outData, paraDesc, scaleData, biasData,
|
||||
meanData, varData, op->getEps());
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
// Destories in CUDA does not require sync. But cuDNN does not state
|
||||
// whether sync is required before destories.
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(paraDesc));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::BatchNorm, DataType::Float32,
|
||||
BatchNormCudnn, "BatchNorm_cuDNN_CUDA_Float32");
|
||||
} // namespace infini
|
|
@ -4,8 +4,9 @@
|
|||
#include "cuda/gather.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void initGatherMetaData(GatherMetaData &metaData, const Operator &_op) {
|
||||
class GatherCuda : public CudaKernelWithoutConfig {
|
||||
void initGatherMetaData(GatherMetaData &metaData,
|
||||
const Operator &_op) const {
|
||||
memset(&metaData, 0, sizeof(metaData));
|
||||
auto op = as<GatherObj>(_op);
|
||||
auto in = op->getInputs(0);
|
||||
|
@ -27,7 +28,6 @@ void initGatherMetaData(GatherMetaData &metaData, const Operator &_op) {
|
|||
}
|
||||
}
|
||||
|
||||
class GatherCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &op,
|
||||
const RuntimeObj *_context) const override {
|
||||
|
||||
|
|
|
@ -14,4 +14,3 @@ void _sgbmml(float *__restrict__ q, float *__restrict__ k,
|
|||
}
|
||||
|
||||
} // namespace infini
|
||||
|
||||
|
|
|
@ -6,8 +6,9 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
class CudaCompute {
|
||||
void initComposedTensorMetadata(ComposedTensorMetadata &metadata,
|
||||
Tensor tensor) {
|
||||
Tensor tensor) const {
|
||||
int nDims = tensor->getDims().size();
|
||||
auto strides = tensor->getStride();
|
||||
IT_ASSERT(strides.size() == (size_t)nDims);
|
||||
|
@ -20,7 +21,7 @@ void initComposedTensorMetadata(ComposedTensorMetadata &metadata,
|
|||
|
||||
void initElementTensorMetadata(ElementTensorMetadata &metadata,
|
||||
TensorVec tensors, int idx, int dim,
|
||||
int &dimBgIdx, int &batchCounter) {
|
||||
int &dimBgIdx, int &batchCounter) const {
|
||||
int nTensors = tensors.size();
|
||||
for (; batchCounter < BATCH_SIZE && idx + batchCounter < nTensors;
|
||||
++batchCounter) {
|
||||
|
@ -34,7 +35,6 @@ void initElementTensorMetadata(ElementTensorMetadata &metadata,
|
|||
}
|
||||
}
|
||||
|
||||
class CudaCompute {
|
||||
public:
|
||||
void do_compute(Tensor composedTensor, TensorVec elementsTensor, int dim,
|
||||
int nDims, bool isSplit) const {
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
#include "operators/batch_norm.h"
|
||||
|
||||
namespace infini {
|
||||
BatchNormObj::BatchNormObj(GraphObj *graph, Tensor input, Tensor output,
|
||||
Tensor mean, Tensor var, Tensor scale, Tensor bias,
|
||||
float momentum, float eps, bool training)
|
||||
: OperatorObj(OpType::BatchNorm, {input, mean, var, scale, bias}, {output}),
|
||||
momentum(momentum), eps(eps), training(training) {
|
||||
if (training)
|
||||
IT_TODO_HALT();
|
||||
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>>
|
||||
BatchNormObj::inferShape(const TensorVec &inputs) const {
|
||||
auto input = inputs[0];
|
||||
auto mean = inputs[1];
|
||||
auto var = inputs[2];
|
||||
auto scale = inputs[3];
|
||||
auto bias = inputs[4];
|
||||
if (input->getDims().size() < 2)
|
||||
return {};
|
||||
Shape dims(input->getDims().size(), 1);
|
||||
dims[1] = input->getDims()[1]; //
|
||||
if (mean->getDims() != dims || var->getDims() != dims ||
|
||||
scale->getDims() != dims || bias->getDims() != dims)
|
||||
return {};
|
||||
return {{input->getDims()}};
|
||||
}
|
||||
|
||||
vector<DataType> BatchNormObj::inferDataType(const TensorVec &inputs) const {
|
||||
IT_ASSERT(inputs.size() == 5);
|
||||
auto index = inputs[1];
|
||||
IT_ASSERT(inputs[1]->getDType() == DataType::Float32);
|
||||
IT_ASSERT(inputs[2]->getDType() == DataType::Float32);
|
||||
IT_ASSERT(inputs[3]->getDType() == DataType::Float32);
|
||||
IT_ASSERT(inputs[4]->getDType() == DataType::Float32);
|
||||
return {inputs[0]->getDType()};
|
||||
}
|
||||
|
||||
std::string BatchNormObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << "BatchNorm[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << "momentum=" << momentum << ",";
|
||||
os << "eps=" << eps << ",";
|
||||
os << "input=" << inputs[0]->getGuid() << ",";
|
||||
os << "mean=" << inputs[1]->getGuid() << ",";
|
||||
os << "var=" << inputs[2]->getGuid() << ",";
|
||||
os << "scale=" << inputs[3]->getGuid() << ",";
|
||||
os << "bias=" << inputs[4]->getGuid() << ",";
|
||||
os << "output=";
|
||||
for (auto output : outputs)
|
||||
os << output->getGuid() << ",";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// need eps and momentum?
|
||||
vector<int> BatchNormObj::getWorkloadVector() const {
|
||||
vector<int> ret = inputs[0]->getDims();
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// need eps and momentum?
|
||||
vector<int> BatchNormObj::getOpAttrVector() const {
|
||||
return {enum_to_underlying(type)};
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,54 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include "operators/batch_norm.h"
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
TEST(CUDA_BatchNorm, run) {
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
|
||||
// Build cpu graph
|
||||
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
|
||||
auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
|
||||
auto meanCpu = gCpu->addTensor(Shape{1, 3, 1, 1}, DataType::Float32);
|
||||
auto varCpu = gCpu->addTensor(Shape{1, 3, 1, 1}, DataType::Float32);
|
||||
auto scaleCpu = gCpu->addTensor(Shape{1, 3, 1, 1}, DataType::Float32);
|
||||
auto biasCpu = gCpu->addTensor(Shape{1, 3, 1, 1}, DataType::Float32);
|
||||
|
||||
// Build input data on CPU
|
||||
gCpu->dataMalloc();
|
||||
iCpu->setData(IncrementalGenerator());
|
||||
meanCpu->copyData(vector<float>{1, 6, 9});
|
||||
varCpu->copyData(vector<float>{4, 1, 9});
|
||||
scaleCpu->setData(OneGenerator());
|
||||
biasCpu->setData(ZeroGenerator());
|
||||
|
||||
// Build CUDA graph
|
||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
||||
auto i = g->cloneTensor(iCpu);
|
||||
auto mean = g->cloneTensor(meanCpu);
|
||||
auto var = g->cloneTensor(varCpu);
|
||||
auto scale = g->cloneTensor(scaleCpu);
|
||||
auto bias = g->cloneTensor(biasCpu);
|
||||
auto op =
|
||||
g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
|
||||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
|
||||
// Execute on CUDA
|
||||
cudaRuntime->run(g);
|
||||
|
||||
// clone CUDA output to CPU
|
||||
auto o = op->getOutput();
|
||||
auto ocpu = o->clone(cpuRuntime);
|
||||
|
||||
// check results on CPU
|
||||
EXPECT_TRUE(ocpu->equalData(vector<float>{
|
||||
-0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.333333, 0.666667}));
|
||||
}
|
||||
} // namespace infini
|
|
@ -0,0 +1,21 @@
|
|||
#include "core/graph.h"
|
||||
#include "core/runtime.h"
|
||||
#include "operators/batch_norm.h"
|
||||
#include "test.h"
|
||||
|
||||
namespace infini {
|
||||
TEST(BatchNorm, ShapeInference) {
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(cpuRuntime);
|
||||
Tensor i = g->addTensor({1, 3, 2, 2}, DataType::UInt32);
|
||||
Tensor mean = g->addTensor({1, 3, 1, 1}, DataType::Float32);
|
||||
Tensor var = g->addTensor({1, 3, 1, 1}, DataType::Float32);
|
||||
Tensor scaler = g->addTensor({1, 3, 1, 1}, DataType::Float32);
|
||||
Tensor bias = g->addTensor({1, 3, 1, 1}, DataType::Float32);
|
||||
auto op = g->addOp<BatchNormObj>(i, nullptr, mean, var, scaler, bias,
|
||||
0.9, 1e-5);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
Loading…
Reference in New Issue