forked from jiuyuan/InfiniTensor
add where fp16
This commit is contained in:
parent
a000cb0304
commit
8b2e3b8e19
|
@ -3,10 +3,15 @@
|
|||
#include "utils/small_array.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
void whereKernel(const float *inputX, const float *inputY,
|
||||
const uint8_t *condition, float *output, int nDims,
|
||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize);
|
||||
|
||||
void whereKernel(const half *inputX, const half *inputY,
|
||||
const uint8_t *condition, half *output, int nDims,
|
||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize);
|
||||
}; // namespace infini
|
||||
|
|
|
@ -91,6 +91,12 @@ template <int val> class ValGenerator : public DataGenerator {
|
|||
fill<uint32_t>(data, size);
|
||||
}
|
||||
void fill(float *data, size_t size) override { fill<float>(data, size); }
|
||||
void fill_fp16(uint16_t *data, size_t size) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
float x = 1.0f * val;
|
||||
data[i] = float_to_fp16(x);
|
||||
}
|
||||
}
|
||||
};
|
||||
typedef ValGenerator<1> OneGenerator;
|
||||
typedef ValGenerator<0> ZeroGenerator;
|
||||
|
|
|
@ -36,10 +36,17 @@ class WhereCuda : public CudaKernelWithoutConfig {
|
|||
broadcastShape(opInputYShape, inputYShape, nDims, ySize);
|
||||
broadcastShape(opConditionShape, conditionShape, nDims, cSize);
|
||||
|
||||
whereKernel((float *)inputXData, (float *)inputYData,
|
||||
(uint8_t *)conditionData, (float *)outputData, nDims,
|
||||
outputsize, inputXShape, inputYShape, conditionShape,
|
||||
outputShape, xSize, ySize, cSize);
|
||||
if (op->getDType() == DataType::Float32) {
|
||||
whereKernel((float *)inputXData, (float *)inputYData,
|
||||
(uint8_t *)conditionData, (float *)outputData, nDims,
|
||||
outputsize, inputXShape, inputYShape, conditionShape,
|
||||
outputShape, xSize, ySize, cSize);
|
||||
} else if (op->getDType() == DataType::Float16) {
|
||||
whereKernel((half *)inputXData, (half *)inputYData,
|
||||
(uint8_t *)conditionData, (half *)outputData, nDims,
|
||||
outputsize, inputXShape, inputYShape, conditionShape,
|
||||
outputShape, xSize, ySize, cSize);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -17,13 +17,13 @@ __device__ int inferIndex(infini::SmallArray inputShape,
|
|||
}
|
||||
return inputIdx;
|
||||
}
|
||||
__global__ void _whereKernel(const float *inputX, const float *inputY,
|
||||
const uint8_t *condition, float *output, int nDims,
|
||||
int outputsize, infini::SmallArray inputXShape,
|
||||
infini::SmallArray inputYShape,
|
||||
infini::SmallArray conditionShape,
|
||||
infini::SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize) {
|
||||
template <typename T>
|
||||
__global__ void
|
||||
_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
|
||||
T *output, int nDims, int outputsize,
|
||||
infini::SmallArray inputXShape, infini::SmallArray inputYShape,
|
||||
infini::SmallArray conditionShape, infini::SmallArray outputShape,
|
||||
int xSize, int ySize, int cSize) {
|
||||
|
||||
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (outputIdx < outputsize) {
|
||||
|
@ -61,7 +61,31 @@ void whereKernel(const float *inputX, const float *inputY,
|
|||
blocksize = 32;
|
||||
}
|
||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
||||
_whereKernel<<<gridsize, blocksize>>>(
|
||||
_whereKernel<float><<<gridsize, blocksize>>>(
|
||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
||||
}
|
||||
void whereKernel(const half *inputX, const half *inputY,
|
||||
const uint8_t *condition, half *output, int nDims,
|
||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||
int ySize, int cSize) {
|
||||
int blocksize;
|
||||
if (outputsize > 511) {
|
||||
blocksize = 1024;
|
||||
} else if (outputsize > 255) {
|
||||
blocksize = 512;
|
||||
} else if (outputsize > 127) {
|
||||
blocksize = 256;
|
||||
} else if (outputsize > 63) {
|
||||
blocksize = 128;
|
||||
} else if (outputsize > 31) {
|
||||
blocksize = 64;
|
||||
} else {
|
||||
blocksize = 32;
|
||||
}
|
||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
||||
_whereKernel<half><<<gridsize, blocksize>>>(
|
||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ void test_softmaxFp16(
|
|||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||
|
||||
auto input = gCpu->addTensor(inputShape, DataType::Float32);
|
||||
auto input = gCpu->addTensor(inputShape, DataType::Float16);
|
||||
|
||||
gCpu->dataMalloc();
|
||||
|
||||
|
@ -115,40 +115,20 @@ TEST(CUDA_SoftmaxFP32, run) {
|
|||
0.26894143, 0.73105860, 0.26894143, 0.73105860});
|
||||
} // python output
|
||||
TEST(CUDA_SoftmaxFP16, run) {
|
||||
test_softmaxFp16(Shape{2, 3, 2, 2}, ValGenerator<2>(), 0,
|
||||
vector<float>{0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
|
||||
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
|
||||
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
|
||||
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
|
||||
0.5000, 0.5000, 0.5000, 0.5000});
|
||||
test_softmaxFp16(
|
||||
Shape{2, 3, 2, 2}, IncrementalGenerator(), 0,
|
||||
vector<float>{
|
||||
6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
|
||||
6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
|
||||
6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
|
||||
9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
|
||||
9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
|
||||
9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
|
||||
test_softmaxFp16(
|
||||
Shape{2, 3, 2, 2}, IncrementalGenerator(), 1,
|
||||
vector<float>{
|
||||
3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
|
||||
1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02,
|
||||
9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01,
|
||||
3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
|
||||
1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02,
|
||||
9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01});
|
||||
test_softmaxFp16(
|
||||
Shape{2, 3, 2, 2}, IncrementalGenerator(), 2,
|
||||
vector<float>{0.11920292, 0.11920292, 0.88079703, 0.88079703,
|
||||
0.11920292, 0.11920292, 0.88079703, 0.88079703,
|
||||
0.11920292, 0.11920292, 0.88079703, 0.88079703,
|
||||
0.11920292, 0.11920292, 0.88079703, 0.88079703,
|
||||
0.11920292, 0.11920292, 0.88079703, 0.88079703,
|
||||
0.11920292, 0.11920292, 0.88079703, 0.88079703});
|
||||
test_softmaxFp16(
|
||||
Shape{2, 3, 2, 2}, IncrementalGenerator(), 3,
|
||||
vector<float>{0.26894143, 0.73105860, 0.26894143, 0.73105860,
|
||||
0.26894143, 0.73105860, 0.26894143, 0.73105860,
|
||||
0.26894143, 0.73105860, 0.26894143, 0.73105860,
|
||||
0.26894143, 0.73105860, 0.26894143, 0.73105860,
|
||||
0.26894143, 0.73105860, 0.26894143, 0.73105860,
|
||||
0.26894143, 0.73105860, 0.26894143, 0.73105860});
|
||||
Shape{2, 3, 2, 2}, ValGenerator<2>(), 1, // data accuracy down
|
||||
vector<float>{0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
|
||||
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
|
||||
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
|
||||
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
|
||||
0.333252, 0.333252, 0.333252, 0.333252});
|
||||
|
||||
} // python output
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -8,11 +8,11 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
void test_where(const Shape &inputXShape, const vector<float> &inputXData,
|
||||
const Shape &inputYShape, const vector<float> &inputYData,
|
||||
const Shape &conditionShape,
|
||||
const vector<uint8_t> &conditionData,
|
||||
const vector<float> &ExpectData) {
|
||||
void test_whereFp32(const Shape &inputXShape, const vector<float> &inputXData,
|
||||
const Shape &inputYShape, const vector<float> &inputYData,
|
||||
const Shape &conditionShape,
|
||||
const vector<uint8_t> &conditionData,
|
||||
const vector<float> &ExpectData) {
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||
auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
|
||||
|
@ -43,22 +43,62 @@ void test_where(const Shape &inputXShape, const vector<float> &inputXData,
|
|||
oCpu->printData(); //->printData
|
||||
EXPECT_TRUE(oCpu->equalData(ExpectData));
|
||||
}
|
||||
void test_whereFp16(
|
||||
const Shape &inputXShape,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorX,
|
||||
const Shape &inputYShape,
|
||||
const std::function<void(void *, size_t, DataType)> &generatorY,
|
||||
const Shape &conditionShape, const vector<uint8_t> &conditionData,
|
||||
const vector<float> &ExpectData) {
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||
|
||||
TEST(CUDA_Where, run) {
|
||||
test_where(
|
||||
auto inputX = gCpu->addTensor(inputXShape, DataType::Float16);
|
||||
auto inputY = gCpu->addTensor(inputYShape, DataType::Float16);
|
||||
auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
|
||||
gCpu->dataMalloc();
|
||||
|
||||
inputX->setData(generatorX);
|
||||
inputY->setData(generatorY);
|
||||
condition->copyin(conditionData); //
|
||||
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||
|
||||
auto inputXGpu = gCuda->cloneTensor(inputX);
|
||||
auto inputYGpu = gCuda->cloneTensor(inputY);
|
||||
auto conditionGpu = gCuda->cloneTensor(condition);
|
||||
|
||||
auto op = gCuda->addOp<WhereObj>(inputXGpu, inputYGpu, conditionGpu,
|
||||
nullptr); // WhereObj
|
||||
gCuda->dataMalloc();
|
||||
|
||||
inputXGpu->setData(generatorX);
|
||||
inputYGpu->setData(generatorY);
|
||||
conditionGpu->copyin(conditionData);
|
||||
cudaRuntime->run(gCuda);
|
||||
|
||||
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
|
||||
oCpu->printData(); //->printData
|
||||
EXPECT_TRUE(oCpu->equalData(ExpectData));
|
||||
}
|
||||
|
||||
TEST(CUDA_WhereFp32, run) {
|
||||
test_whereFp32(
|
||||
Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
|
||||
Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
Shape{2, 2, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
|
||||
vector<float>{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.});
|
||||
|
||||
test_where(Shape{2, 1, 1, 3}, // inputx
|
||||
vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
|
||||
vector<float>{1, 1}, Shape{2, 1, 3, 1}, // condition
|
||||
vector<uint8_t>{0, 1, 1, 0, 0, 0},
|
||||
vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2., 1., 1., 1.,
|
||||
0., 1., 2., 0., 1., 2., 1., 1., 1., 1., 1., 1.,
|
||||
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
|
||||
test_where(
|
||||
test_whereFp32(Shape{2, 1, 1, 3}, // inputx
|
||||
vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
|
||||
vector<float>{1, 1}, Shape{2, 1, 3, 1}, // condition
|
||||
vector<uint8_t>{0, 1, 1, 0, 0, 0},
|
||||
vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2.,
|
||||
1., 1., 1., 0., 1., 2., 0., 1., 2.,
|
||||
1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||
1., 1., 1., 1., 1., 1., 1., 1., 1.});
|
||||
test_whereFp32(
|
||||
Shape{
|
||||
3,
|
||||
},
|
||||
|
@ -68,7 +108,7 @@ TEST(CUDA_Where, run) {
|
|||
vector<float>{0., 0., 0., 0., 1., 2., 0., 1., 2., 3., 3., 3.,
|
||||
0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
|
||||
2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
|
||||
test_where(
|
||||
test_whereFp32(
|
||||
Shape{
|
||||
3,
|
||||
},
|
||||
|
@ -80,6 +120,30 @@ TEST(CUDA_Where, run) {
|
|||
0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
|
||||
2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
|
||||
|
||||
} // python output
|
||||
TEST(CUDA_WhereFp16, run) {
|
||||
test_whereFp16(
|
||||
Shape{
|
||||
3,
|
||||
},
|
||||
ValGenerator<1>(), // inputX
|
||||
Shape{2, 3, 1}, ValGenerator<2>(), // inputY
|
||||
Shape{2, 1, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0}, // condition
|
||||
vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
|
||||
1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
|
||||
2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
|
||||
test_whereFp16(
|
||||
Shape{
|
||||
3,
|
||||
},
|
||||
ValGenerator<1>(), // inputX
|
||||
Shape{2, 3, 1}, ValGenerator<2>(), // inputY
|
||||
Shape{2, 1, 3, 1},
|
||||
vector<uint8_t>{false, true, true, false, false, false}, // condition
|
||||
vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
|
||||
1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
|
||||
2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
|
||||
|
||||
} // python output
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
TEST(Where, ShapeInference) {
|
||||
TEST(WhereFp32, ShapeInference) {
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
|
@ -42,5 +42,39 @@ TEST(Where, ShapeInference) {
|
|||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(WhereFp16, ShapeInference) {
|
||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor x = g->addTensor({2, 2}, DataType::Float16);
|
||||
Tensor y = g->addTensor({2, 2}, DataType::Float16);
|
||||
Tensor con = g->addTensor({2, 2}, DataType::Bool);
|
||||
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 2}));
|
||||
}
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor x = g->addTensor({1, 12, 224, 224}, DataType::Float16);
|
||||
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
|
||||
Tensor con = g->addTensor({1, 224, 1}, DataType::Bool);
|
||||
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
|
||||
}
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor x = g->addTensor({12, 224, 224}, DataType::Float16);
|
||||
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
|
||||
Tensor con = g->addTensor({1, 224}, DataType::Bool);
|
||||
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
|
||||
}
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor x = g->addTensor({12, 224, 224}, DataType::Float16);
|
||||
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
|
||||
Tensor con = g->addTensor({2, 1, 1, 1, 224}, DataType::Bool);
|
||||
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
Loading…
Reference in New Issue