From 8b2e3b8e19e14e1844c93f5fd30fe23fb8e5823d Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 8 Dec 2023 16:57:49 +0800
Subject: [PATCH] add where fp16

---
 include/cuda/cuda_where.h              |  7 +-
 include/utils/data_generator.h         |  6 ++
 src/kernels/cuda/where.cc              | 15 ++--
 src/kernels/cuda/where.cu              | 40 ++++++++---
 test/kernels/cuda/test_cuda_softmax.cc | 48 ++++---------
 test/kernels/cuda/test_cuda_where.cc   | 96 +++++++++++++++++++++-----
 test/operators/test_where.cc           | 38 +++++++++-
 7 files changed, 185 insertions(+), 65 deletions(-)
diff --git a/include/cuda/cuda_where.h b/include/cuda/cuda_where.h
index bc6d3e81..8c2ba2db 100644
--- a/include/cuda/cuda_where.h
+++ b/include/cuda/cuda_where.h
@@ -3,10 +3,15 @@
 #include "utils/small_array.h"
 
 namespace infini {
+
 void whereKernel(const float *inputX, const float *inputY,
                  const uint8_t *condition, float *output, int nDims,
                  int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                  SmallArray conditionShape, SmallArray outputShape, int xSize,
                  int ySize, int cSize);
-
+void whereKernel(const half *inputX, const half *inputY,
+                 const uint8_t *condition, half *output, int nDims,
+                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
+                 SmallArray conditionShape, SmallArray outputShape, int xSize,
+                 int ySize, int cSize);
 }; // namespace infini
diff --git a/include/utils/data_generator.h b/include/utils/data_generator.h
index 982db835..970b8038 100644
--- a/include/utils/data_generator.h
+++ b/include/utils/data_generator.h
@@ -91,6 +91,12 @@ template <int val> class ValGenerator : public DataGenerator {
         fill<uint32_t>(data, size);
     }
     void fill(float *data, size_t size) override { fill<float>(data, size); }
+    void fill_fp16(uint16_t *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            float x = 1.0f * val;
+            data[i] = float_to_fp16(x);
+        }
+    }
 };
 typedef ValGenerator<1> OneGenerator;
 typedef ValGenerator<0> ZeroGenerator;
diff --git a/src/kernels/cuda/where.cc b/src/kernels/cuda/where.cc
index 1460b4af..b37da4d5 100644
--- a/src/kernels/cuda/where.cc
+++ b/src/kernels/cuda/where.cc
@@ -36,10 +36,17 @@ class WhereCuda : public CudaKernelWithoutConfig {
         broadcastShape(opInputYShape, inputYShape, nDims, ySize);
         broadcastShape(opConditionShape, conditionShape, nDims, cSize);
 
-        whereKernel((float *)inputXData, (float *)inputYData,
-                    (uint8_t *)conditionData, (float *)outputData, nDims,
-                    outputsize, inputXShape, inputYShape, conditionShape,
-                    outputShape, xSize, ySize, cSize);
+        if (op->getDType() == DataType::Float32) {
+            whereKernel((float *)inputXData, (float *)inputYData,
+                        (uint8_t *)conditionData, (float *)outputData, nDims,
+                        outputsize, inputXShape, inputYShape, conditionShape,
+                        outputShape, xSize, ySize, cSize);
+        } else if (op->getDType() == DataType::Float16) {
+            whereKernel((half *)inputXData, (half *)inputYData,
+                        (uint8_t *)conditionData, (half *)outputData, nDims,
+                        outputsize, inputXShape, inputYShape, conditionShape,
+                        outputShape, xSize, ySize, cSize);
+        }
     }
 };
 
diff --git a/src/kernels/cuda/where.cu b/src/kernels/cuda/where.cu
index ac8b514a..e92a5e9f 100644
--- a/src/kernels/cuda/where.cu
+++ b/src/kernels/cuda/where.cu
@@ -17,13 +17,13 @@ __device__ int inferIndex(infini::SmallArray inputShape,
     }
     return inputIdx;
 }
-__global__ void _whereKernel(const float *inputX, const float *inputY,
-                             const uint8_t *condition, float *output, int nDims,
-                             int outputsize, infini::SmallArray inputXShape,
-                             infini::SmallArray inputYShape,
-                             infini::SmallArray conditionShape,
-                             infini::SmallArray outputShape, int xSize,
-                             int ySize, int cSize) {
+template <typename T>
+__global__ void
+_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
+             T *output, int nDims, int outputsize,
+             infini::SmallArray inputXShape, infini::SmallArray inputYShape,
+             infini::SmallArray conditionShape, infini::SmallArray outputShape,
+             int xSize, int ySize, int cSize) {
 
     int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
     if (outputIdx < outputsize) {
@@ -61,7 +61,31 @@ void whereKernel(const float *inputX, const float *inputY,
         blocksize = 32;
     }
     int gridsize = (outputsize + blocksize - 1) / blocksize;
-    _whereKernel<<<gridsize, blocksize>>>(
+    _whereKernel<float><<<gridsize, blocksize>>>(
+        inputX, inputY, condition, output, nDims, outputsize, inputXShape,
+        inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
+}
+void whereKernel(const half *inputX, const half *inputY,
+                 const uint8_t *condition, half *output, int nDims,
+                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
+                 SmallArray conditionShape, SmallArray outputShape, int xSize,
+                 int ySize, int cSize) {
+    int blocksize;
+    if (outputsize > 511) {
+        blocksize = 1024;
+    } else if (outputsize > 255) {
+        blocksize = 512;
+    } else if (outputsize > 127) {
+        blocksize = 256;
+    } else if (outputsize > 63) {
+        blocksize = 128;
+    } else if (outputsize > 31) {
+        blocksize = 64;
+    } else {
+        blocksize = 32;
+    }
+    int gridsize = (outputsize + blocksize - 1) / blocksize;
+    _whereKernel<half><<<gridsize, blocksize>>>(
         inputX, inputY, condition, output, nDims, outputsize, inputXShape,
         inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
 }
diff --git a/test/kernels/cuda/test_cuda_softmax.cc b/test/kernels/cuda/test_cuda_softmax.cc
index 880a9c97..be73554d 100644
--- a/test/kernels/cuda/test_cuda_softmax.cc
+++ b/test/kernels/cuda/test_cuda_softmax.cc
@@ -42,7 +42,7 @@ void test_softmaxFp16(
     Runtime runtime = NativeCpuRuntimeObj::getInstance();
     Graph gCpu = make_ref<GraphObj>(runtime);
 
-    auto input = gCpu->addTensor(inputShape, DataType::Float32);
+    auto input = gCpu->addTensor(inputShape, DataType::Float16);
 
     gCpu->dataMalloc();
 
@@ -115,40 +115,20 @@ TEST(CUDA_SoftmaxFP32, run) {
                          0.26894143, 0.73105860, 0.26894143, 0.73105860});
 } // python output
 TEST(CUDA_SoftmaxFP16, run) {
+    test_softmaxFp16(Shape{2, 3, 2, 2}, ValGenerator<2>(), 0,
+                     vector<float>{0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
+                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
+                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
+                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
+                                   0.5000, 0.5000, 0.5000, 0.5000});
     test_softmaxFp16(
-        Shape{2, 3, 2, 2}, IncrementalGenerator(), 0,
-        vector<float>{
-            6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
-            6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
-            6.14417422e-06, 6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
-            9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
-            9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
-            9.99993801e-01, 9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
-    test_softmaxFp16(
-        Shape{2, 3, 2, 2}, IncrementalGenerator(), 1,
-        vector<float>{
-            3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
-            1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02,
-            9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01,
-            3.29320435e-04, 3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
-            1.79802869e-02, 1.79802869e-02, 1.79802869e-02, 1.79802869e-02,
-            9.81690347e-01, 9.81690347e-01, 9.81690347e-01, 9.81690347e-01});
-    test_softmaxFp16(
-        Shape{2, 3, 2, 2}, IncrementalGenerator(), 2,
-        vector<float>{0.11920292, 0.11920292, 0.88079703, 0.88079703,
-                      0.11920292, 0.11920292, 0.88079703, 0.88079703,
-                      0.11920292, 0.11920292, 0.88079703, 0.88079703,
-                      0.11920292, 0.11920292, 0.88079703, 0.88079703,
-                      0.11920292, 0.11920292, 0.88079703, 0.88079703,
-                      0.11920292, 0.11920292, 0.88079703, 0.88079703});
-    test_softmaxFp16(
-        Shape{2, 3, 2, 2}, IncrementalGenerator(), 3,
-        vector<float>{0.26894143, 0.73105860, 0.26894143, 0.73105860,
-                      0.26894143, 0.73105860, 0.26894143, 0.73105860,
-                      0.26894143, 0.73105860, 0.26894143, 0.73105860,
-                      0.26894143, 0.73105860, 0.26894143, 0.73105860,
-                      0.26894143, 0.73105860, 0.26894143, 0.73105860,
-                      0.26894143, 0.73105860, 0.26894143, 0.73105860});
+        Shape{2, 3, 2, 2}, ValGenerator<2>(), 1, // data accuracy down
+        vector<float>{0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
+                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
+                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
+                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
+                      0.333252, 0.333252, 0.333252, 0.333252});
+
 } // python output
 
 } // namespace infini
diff --git a/test/kernels/cuda/test_cuda_where.cc b/test/kernels/cuda/test_cuda_where.cc
index 32c2f253..07f5b48e 100644
--- a/test/kernels/cuda/test_cuda_where.cc
+++ b/test/kernels/cuda/test_cuda_where.cc
@@ -8,11 +8,11 @@
 
 namespace infini {
 
-void test_where(const Shape &inputXShape, const vector<float> &inputXData,
-                const Shape &inputYShape, const vector<float> &inputYData,
-                const Shape &conditionShape,
-                const vector<uint8_t> &conditionData,
-                const vector<float> &ExpectData) {
+void test_whereFp32(const Shape &inputXShape, const vector<float> &inputXData,
+                    const Shape &inputYShape, const vector<float> &inputYData,
+                    const Shape &conditionShape,
+                    const vector<uint8_t> &conditionData,
+                    const vector<float> &ExpectData) {
     Runtime runtime = NativeCpuRuntimeObj::getInstance();
     Graph gCpu = make_ref<GraphObj>(runtime);
     auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
@@ -43,22 +43,62 @@ void test_where(const Shape &inputXShape, const vector<float> &inputXData,
     oCpu->printData();                              //->printData
     EXPECT_TRUE(oCpu->equalData(ExpectData));
 }
+void test_whereFp16(
+    const Shape &inputXShape,
+    const std::function<void(void *, size_t, DataType)> &generatorX,
+    const Shape &inputYShape,
+    const std::function<void(void *, size_t, DataType)> &generatorY,
+    const Shape &conditionShape, const vector<uint8_t> &conditionData,
+    const vector<float> &ExpectData) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(runtime);
 
-TEST(CUDA_Where, run) {
-    test_where(
+    auto inputX = gCpu->addTensor(inputXShape, DataType::Float16);
+    auto inputY = gCpu->addTensor(inputYShape, DataType::Float16);
+    auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
+    gCpu->dataMalloc();
+
+    inputX->setData(generatorX);
+    inputY->setData(generatorY);
+    condition->copyin(conditionData); //
+
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+    Graph gCuda = make_ref<GraphObj>(cudaRuntime);
+
+    auto inputXGpu = gCuda->cloneTensor(inputX);
+    auto inputYGpu = gCuda->cloneTensor(inputY);
+    auto conditionGpu = gCuda->cloneTensor(condition);
+
+    auto op = gCuda->addOp<WhereObj>(inputXGpu, inputYGpu, conditionGpu,
+                                     nullptr); // WhereObj
+    gCuda->dataMalloc();
+
+    inputXGpu->setData(generatorX);
+    inputYGpu->setData(generatorY);
+    conditionGpu->copyin(conditionData);
+    cudaRuntime->run(gCuda);
+
+    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
+    oCpu->printData();                              //->printData
+    EXPECT_TRUE(oCpu->equalData(ExpectData));
+}
+
+TEST(CUDA_WhereFp32, run) {
+    test_whereFp32(
         Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
         Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
         Shape{2, 2, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
         vector<float>{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.});
 
-    test_where(Shape{2, 1, 1, 3},                                  // inputx
-               vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
-               vector<float>{1, 1}, Shape{2, 1, 3, 1},             // condition
-               vector<uint8_t>{0, 1, 1, 0, 0, 0},
-               vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2., 1., 1., 1.,
-                             0., 1., 2., 0., 1., 2., 1., 1., 1., 1., 1., 1.,
-                             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
-    test_where(
+    test_whereFp32(Shape{2, 1, 1, 3},                                  // inputx
+                   vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
+                   vector<float>{1, 1}, Shape{2, 1, 3, 1}, // condition
+                   vector<uint8_t>{0, 1, 1, 0, 0, 0},
+                   vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2.,
+                                 1., 1., 1., 0., 1., 2., 0., 1., 2.,
+                                 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+                                 1., 1., 1., 1., 1., 1., 1., 1., 1.});
+    test_whereFp32(
         Shape{
             3,
         },
@@ -68,7 +108,7 @@ TEST(CUDA_Where, run) {
         vector<float>{0., 0., 0., 0., 1., 2., 0., 1., 2., 3., 3., 3.,
                       0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
                       2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
-    test_where(
+    test_whereFp32(
         Shape{
             3,
         },
@@ -80,6 +120,30 @@ TEST(CUDA_Where, run) {
                       0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
                       2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
 
+} // python output
+TEST(CUDA_WhereFp16, run) {
+    test_whereFp16(
+        Shape{
+            3,
+        },
+        ValGenerator<1>(),                                    // inputX
+        Shape{2, 3, 1}, ValGenerator<2>(),                    // inputY
+        Shape{2, 1, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0}, // condition
+        vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
+                      1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
+                      2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
+    test_whereFp16(
+        Shape{
+            3,
+        },
+        ValGenerator<1>(),                 // inputX
+        Shape{2, 3, 1}, ValGenerator<2>(), // inputY
+        Shape{2, 1, 3, 1},
+        vector<uint8_t>{false, true, true, false, false, false}, // condition
+        vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
+                      1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
+                      2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
+
 } // python output
 
 } // namespace infini
diff --git a/test/operators/test_where.cc b/test/operators/test_where.cc
index c32e2d81..6b90837f 100644
--- a/test/operators/test_where.cc
+++ b/test/operators/test_where.cc
@@ -7,7 +7,7 @@
 
 namespace infini {
 
-TEST(Where, ShapeInference) {
+TEST(WhereFp32, ShapeInference) {
     Runtime runtime = NativeCpuRuntimeObj::getInstance();
     {
         Graph g = make_ref<GraphObj>(runtime);
@@ -42,5 +42,39 @@ TEST(Where, ShapeInference) {
         EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
     }
 }
-
+TEST(WhereFp16, ShapeInference) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor x = g->addTensor({2, 2}, DataType::Float16);
+        Tensor y = g->addTensor({2, 2}, DataType::Float16);
+        Tensor con = g->addTensor({2, 2}, DataType::Bool);
+        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 2}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor x = g->addTensor({1, 12, 224, 224}, DataType::Float16);
+        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
+        Tensor con = g->addTensor({1, 224, 1}, DataType::Bool);
+        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor x = g->addTensor({12, 224, 224}, DataType::Float16);
+        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
+        Tensor con = g->addTensor({1, 224}, DataType::Bool);
+        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor x = g->addTensor({12, 224, 224}, DataType::Float16);
+        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float16);
+        Tensor con = g->addTensor({2, 1, 1, 1, 224}, DataType::Bool);
+        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
+    }
+}
 } // namespace infini