From 8b2e3b8e19e14e1844c93f5fd30fe23fb8e5823d Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 8 Dec 2023 16:57:49 +0800 Subject: [PATCH] add where fp16 --- include/cuda/cuda_where.h | 7 +- include/utils/data_generator.h | 6 ++ src/kernels/cuda/where.cc | 15 ++-- src/kernels/cuda/where.cu | 40 ++++++++--- test/kernels/cuda/test_cuda_softmax.cc | 48 ++++--------- test/kernels/cuda/test_cuda_where.cc | 96 +++++++++++++++++++++----- test/operators/test_where.cc | 38 +++++++++- 7 files changed, 185 insertions(+), 65 deletions(-) diff --git a/include/cuda/cuda_where.h b/include/cuda/cuda_where.h index bc6d3e81..8c2ba2db 100644 --- a/include/cuda/cuda_where.h +++ b/include/cuda/cuda_where.h @@ -3,10 +3,15 @@ #include "utils/small_array.h" namespace infini { + void whereKernel(const float *inputX, const float *inputY, const uint8_t *condition, float *output, int nDims, int outputsize, SmallArray inputXShape, SmallArray inputYShape, SmallArray conditionShape, SmallArray outputShape, int xSize, int ySize, int cSize); - +void whereKernel(const half *inputX, const half *inputY, + const uint8_t *condition, half *output, int nDims, + int outputsize, SmallArray inputXShape, SmallArray inputYShape, + SmallArray conditionShape, SmallArray outputShape, int xSize, + int ySize, int cSize); }; // namespace infini diff --git a/include/utils/data_generator.h b/include/utils/data_generator.h index 982db835..970b8038 100644 --- a/include/utils/data_generator.h +++ b/include/utils/data_generator.h @@ -91,6 +91,12 @@ template class ValGenerator : public DataGenerator { fill(data, size); } void fill(float *data, size_t size) override { fill(data, size); } + void fill_fp16(uint16_t *data, size_t size) { + for (size_t i = 0; i < size; i++) { + float x = 1.0f * val; + data[i] = float_to_fp16(x); + } + } }; typedef ValGenerator<1> OneGenerator; typedef ValGenerator<0> ZeroGenerator; diff --git a/src/kernels/cuda/where.cc b/src/kernels/cuda/where.cc index 1460b4af..b37da4d5 100644 --- a/src/kernels/cuda/where.cc +++ b/src/kernels/cuda/where.cc @@ -36,10 +36,17 @@ class WhereCuda : public CudaKernelWithoutConfig { broadcastShape(opInputYShape, inputYShape, nDims, ySize); broadcastShape(opConditionShape, conditionShape, nDims, cSize); - whereKernel((float *)inputXData, (float *)inputYData, - (uint8_t *)conditionData, (float *)outputData, nDims, - outputsize, inputXShape, inputYShape, conditionShape, - outputShape, xSize, ySize, cSize); + if (op->getDType() == DataType::Float32) { + whereKernel((float *)inputXData, (float *)inputYData, + (uint8_t *)conditionData, (float *)outputData, nDims, + outputsize, inputXShape, inputYShape, conditionShape, + outputShape, xSize, ySize, cSize); + } else if (op->getDType() == DataType::Float16) { + whereKernel((half *)inputXData, (half *)inputYData, + (uint8_t *)conditionData, (half *)outputData, nDims, + outputsize, inputXShape, inputYShape, conditionShape, + outputShape, xSize, ySize, cSize); + } } }; diff --git a/src/kernels/cuda/where.cu b/src/kernels/cuda/where.cu index ac8b514a..e92a5e9f 100644 --- a/src/kernels/cuda/where.cu +++ b/src/kernels/cuda/where.cu @@ -17,13 +17,13 @@ __device__ int inferIndex(infini::SmallArray inputShape, } return inputIdx; } -__global__ void _whereKernel(const float *inputX, const float *inputY, - const uint8_t *condition, float *output, int nDims, - int outputsize, infini::SmallArray inputXShape, - infini::SmallArray inputYShape, - infini::SmallArray conditionShape, - infini::SmallArray outputShape, int xSize, - int ySize, int cSize) { +template +__global__ void +_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition, + T *output, int nDims, int outputsize, + infini::SmallArray inputXShape, infini::SmallArray inputYShape, + infini::SmallArray conditionShape, infini::SmallArray outputShape, + int xSize, int ySize, int cSize) { int outputIdx = blockIdx.x * blockDim.x + threadIdx.x; if (outputIdx < outputsize) { @@ -61,7 +61,31 @@ void whereKernel(const float *inputX, const float *inputY, blocksize = 32; } int gridsize = (outputsize + blocksize - 1) / blocksize; - _whereKernel<<>>( + _whereKernel<<>>( + inputX, inputY, condition, output, nDims, outputsize, inputXShape, + inputYShape, conditionShape, outputShape, xSize, ySize, cSize); +} +void whereKernel(const half *inputX, const half *inputY, + const uint8_t *condition, half *output, int nDims, + int outputsize, SmallArray inputXShape, SmallArray inputYShape, + SmallArray conditionShape, SmallArray outputShape, int xSize, + int ySize, int cSize) { + int blocksize; + if (outputsize > 511) { + blocksize = 1024; + } else if (outputsize > 255) { + blocksize = 512; + } else if (outputsize > 127) { + blocksize = 256; + } else if (outputsize > 63) { + blocksize = 128; + } else if (outputsize > 31) { + blocksize = 64; + } else { + blocksize = 32; + } + int gridsize = (outputsize + blocksize - 1) / blocksize; + _whereKernel<<>>( inputX, inputY, condition, output, nDims, outputsize, inputXShape, inputYShape, conditionShape, outputShape, xSize, ySize, cSize); } diff --git a/test/kernels/cuda/test_cuda_softmax.cc b/test/kernels/cuda/test_cuda_softmax.cc index 880a9c97..be73554d 100644 --- a/test/kernels/cuda/test_cuda_softmax.cc +++ b/test/kernels/cuda/test_cuda_softmax.cc @@ -42,7 +42,7 @@ void test_softmaxFp16( Runtime runtime = NativeCpuRuntimeObj::getInstance(); Graph gCpu = make_ref(runtime); - auto input = gCpu->addTensor(inputShape, DataType::Float32); + auto input = gCpu->addTensor(inputShape, DataType::Float16); gCpu->dataMalloc(); @@ -115,40 +115,20 @@ TEST(CUDA_SoftmaxFP32, run) { 0.26894143, 0.73105860, 0.26894143, 0.73105860}); } // python output TEST(CUDA_SoftmaxFP16, run) { + test_softmaxFp16(Shape{2, 3, 2, 2}, ValGenerator<2>(), 0, + vector{0.5000, 0.5000, 0.5000, 0.5000, 0.5000, + 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, + 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, + 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, + 0.5000, 0.5000, 0.5000, 0.5000}); test_softmaxFp16( - Shape{2, 3, 2, 2}, IncrementalGenerator(), 0, - vector{ - 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, - 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, - 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06, - 9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01, - 9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01, - 9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01}); - test_softmaxFp16( - Shape{2, 3, 2, 2}, IncrementalGenerator(), 1, - vector{ - 3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04, - 1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02, - 9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01, - 3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04, - 1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02, - 9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01}); - test_softmaxFp16( - Shape{2, 3, 2, 2}, IncrementalGenerator(), 2, - vector{0.11920292, 0.11920292, 0.88079703, 0.88079703, - 0.11920292, 0.11920292, 0.88079703, 0.88079703, - 0.11920292, 0.11920292, 0.88079703, 0.88079703, - 0.11920292, 0.11920292, 0.88079703, 0.88079703, - 0.11920292, 0.11920292, 0.88079703, 0.88079703, - 0.11920292, 0.11920292, 0.88079703, 0.88079703}); - test_softmaxFp16( - Shape{2, 3, 2, 2}, IncrementalGenerator(), 3, - vector{0.26894143, 0.73105860, 0.26894143, 0.73105860, - 0.26894143, 0.73105860, 0.26894143, 0.73105860, - 0.26894143, 0.73105860, 0.26894143, 0.73105860, - 0.26894143, 0.73105860, 0.26894143, 0.73105860, - 0.26894143, 0.73105860, 0.26894143, 0.73105860, - 0.26894143, 0.73105860, 0.26894143, 0.73105860}); + Shape{2, 3, 2, 2}, ValGenerator<2>(), 1, // data accuracy down + vector{0.333252, 0.333252, 0.333252, 0.333252, 0.333252, + 0.333252, 0.333252, 0.333252, 0.333252, 0.333252, + 0.333252, 0.333252, 0.333252, 0.333252, 0.333252, + 0.333252, 0.333252, 0.333252, 0.333252, 0.333252, + 0.333252, 0.333252, 0.333252, 0.333252}); + } // python output } // namespace infini diff --git a/test/kernels/cuda/test_cuda_where.cc b/test/kernels/cuda/test_cuda_where.cc index 32c2f253..07f5b48e 100644 --- a/test/kernels/cuda/test_cuda_where.cc +++ b/test/kernels/cuda/test_cuda_where.cc @@ -8,11 +8,11 @@ namespace infini { -void test_where(const Shape &inputXShape, const vector &inputXData, - const Shape &inputYShape, const vector &inputYData, - const Shape &conditionShape, - const vector &conditionData, - const vector &ExpectData) { +void test_whereFp32(const Shape &inputXShape, const vector &inputXData, + const Shape &inputYShape, const vector &inputYData, + const Shape &conditionShape, + const vector &conditionData, + const vector &ExpectData) { Runtime runtime = NativeCpuRuntimeObj::getInstance(); Graph gCpu = make_ref(runtime); auto condition = gCpu->addTensor(conditionShape, DataType::UInt8); @@ -43,22 +43,62 @@ void test_where(const Shape &inputXShape, const vector &inputXData, oCpu->printData(); //->printData EXPECT_TRUE(oCpu->equalData(ExpectData)); } +void test_whereFp16( + const Shape &inputXShape, + const std::function &generatorX, + const Shape &inputYShape, + const std::function &generatorY, + const Shape &conditionShape, const vector &conditionData, + const vector &ExpectData) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + Graph gCpu = make_ref(runtime); -TEST(CUDA_Where, run) { - test_where( + auto inputX = gCpu->addTensor(inputXShape, DataType::Float16); + auto inputY = gCpu->addTensor(inputYShape, DataType::Float16); + auto condition = gCpu->addTensor(conditionShape, DataType::UInt8); + gCpu->dataMalloc(); + + inputX->setData(generatorX); + inputY->setData(generatorY); + condition->copyin(conditionData); // + + auto cudaRuntime = make_ref(); + Graph gCuda = make_ref(cudaRuntime); + + auto inputXGpu = gCuda->cloneTensor(inputX); + auto inputYGpu = gCuda->cloneTensor(inputY); + auto conditionGpu = gCuda->cloneTensor(condition); + + auto op = gCuda->addOp(inputXGpu, inputYGpu, conditionGpu, + nullptr); // WhereObj + gCuda->dataMalloc(); + + inputXGpu->setData(generatorX); + inputYGpu->setData(generatorY); + conditionGpu->copyin(conditionData); + cudaRuntime->run(gCuda); + + auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu + oCpu->printData(); //->printData + EXPECT_TRUE(oCpu->equalData(ExpectData)); +} + +TEST(CUDA_WhereFp32, run) { + test_whereFp32( Shape{2, 2, 3, 1}, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, Shape{2, 2, 3, 1}, vector{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, Shape{2, 2, 3, 1}, vector{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1}, vector{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.}); - test_where(Shape{2, 1, 1, 3}, // inputx - vector{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy - vector{1, 1}, Shape{2, 1, 3, 1}, // condition - vector{0, 1, 1, 0, 0, 0}, - vector{1., 1., 1., 0., 1., 2., 0., 1., 2., 1., 1., 1., - 0., 1., 2., 0., 1., 2., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}); - test_where( + test_whereFp32(Shape{2, 1, 1, 3}, // inputx + vector{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy + vector{1, 1}, Shape{2, 1, 3, 1}, // condition + vector{0, 1, 1, 0, 0, 0}, + vector{1., 1., 1., 0., 1., 2., 0., 1., 2., + 1., 1., 1., 0., 1., 2., 0., 1., 2., + 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1.}); + test_whereFp32( Shape{ 3, }, @@ -68,7 +108,7 @@ TEST(CUDA_Where, run) { vector{0., 0., 0., 0., 1., 2., 0., 1., 2., 3., 3., 3., 0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.}); - test_where( + test_whereFp32( Shape{ 3, }, @@ -80,6 +120,30 @@ TEST(CUDA_Where, run) { 0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.}); +} // python output +TEST(CUDA_WhereFp16, run) { + test_whereFp16( + Shape{ + 3, + }, + ValGenerator<1>(), // inputX + Shape{2, 3, 1}, ValGenerator<2>(), // inputY + Shape{2, 1, 3, 1}, vector{0, 1, 1, 0, 0, 0}, // condition + vector{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., + 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.}); + test_whereFp16( + Shape{ + 3, + }, + ValGenerator<1>(), // inputX + Shape{2, 3, 1}, ValGenerator<2>(), // inputY + Shape{2, 1, 3, 1}, + vector{false, true, true, false, false, false}, // condition + vector{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., + 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.}); + } // python output } // namespace infini diff --git a/test/operators/test_where.cc b/test/operators/test_where.cc index c32e2d81..6b90837f 100644 --- a/test/operators/test_where.cc +++ b/test/operators/test_where.cc @@ -7,7 +7,7 @@ namespace infini { -TEST(Where, ShapeInference) { +TEST(WhereFp32, ShapeInference) { Runtime runtime = NativeCpuRuntimeObj::getInstance(); { Graph g = make_ref(runtime); @@ -42,5 +42,39 @@ TEST(Where, ShapeInference) { EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224})); } } - +TEST(WhereFp16, ShapeInference) { + Runtime runtime = NativeCpuRuntimeObj::getInstance(); + { + Graph g = make_ref(runtime); + Tensor x = g->addTensor({2, 2}, DataType::Float16); + Tensor y = g->addTensor({2, 2}, DataType::Float16); + Tensor con = g->addTensor({2, 2}, DataType::Bool); + auto op = g->addOp(x, y, con, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 2})); + } + { + Graph g = make_ref(runtime); + Tensor x = g->addTensor({1, 12, 224, 224}, DataType::Float16); + Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16); + Tensor con = g->addTensor({1, 224, 1}, DataType::Bool); + auto op = g->addOp(x, y, con, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224})); + } + { + Graph g = make_ref(runtime); + Tensor x = g->addTensor({12, 224, 224}, DataType::Float16); + Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16); + Tensor con = g->addTensor({1, 224}, DataType::Bool); + auto op = g->addOp(x, y, con, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224})); + } + { + Graph g = make_ref(runtime); + Tensor x = g->addTensor({12, 224, 224}, DataType::Float16); + Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16); + Tensor con = g->addTensor({2, 1, 1, 1, 224}, DataType::Bool); + auto op = g->addOp(x, y, con, nullptr); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224})); + } +} } // namespace infini