InfiniTensor/test/kernels/cuda/test_cuda_where.cc

150 lines
6.1 KiB
C++
Raw Permalink Normal View History

#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/where.h"
#include "test.h"
namespace infini {
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
void test_whereFp32(const Shape &inputXShape, const vector<float> &inputXData,
const Shape &inputYShape, const vector<float> &inputYData,
const Shape &conditionShape,
const vector<uint8_t> &conditionData,
const vector<float> &ExpectData) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
auto inputX = gCpu->addTensor(inputXShape, DataType::Float32);
auto inputY = gCpu->addTensor(inputYShape, DataType::Float32);
gCpu->dataMalloc();
condition->copyin(conditionData); //
inputX->copyin(inputXData);
inputY->copyin(inputYData); //
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto conditionGpu = gCuda->cloneTensor(condition);
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputYGpu = gCuda->cloneTensor(inputY);
auto op = gCuda->addOp<WhereObj>(inputXGpu, inputYGpu, conditionGpu,
nullptr); // WhereObj
gCuda->dataMalloc();
conditionGpu->copyin(conditionData);
inputXGpu->copyin(inputXData);
inputYGpu->copyin(inputYData);
cudaRuntime->run(gCuda);
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
void test_whereFp16(
const Shape &inputXShape,
const std::function<void(void *, size_t, DataType)> &generatorX,
const Shape &inputYShape,
const std::function<void(void *, size_t, DataType)> &generatorY,
const Shape &conditionShape, const vector<uint8_t> &conditionData,
const vector<float> &ExpectData) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto inputX = gCpu->addTensor(inputXShape, DataType::Float16);
auto inputY = gCpu->addTensor(inputYShape, DataType::Float16);
auto condition = gCpu->addTensor(conditionShape, DataType::UInt8);
gCpu->dataMalloc();
inputX->setData(generatorX);
inputY->setData(generatorY);
condition->copyin(conditionData); //
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputYGpu = gCuda->cloneTensor(inputY);
auto conditionGpu = gCuda->cloneTensor(condition);
auto op = gCuda->addOp<WhereObj>(inputXGpu, inputYGpu, conditionGpu,
nullptr); // WhereObj
gCuda->dataMalloc();
inputXGpu->setData(generatorX);
inputYGpu->setData(generatorY);
conditionGpu->copyin(conditionData);
cudaRuntime->run(gCuda);
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
TEST(CUDA_WhereFp32, run) {
test_whereFp32(
Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Shape{2, 2, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
vector<float>{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.});
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
test_whereFp32(Shape{2, 1, 1, 3}, // inputx
vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
vector<float>{1, 1}, Shape{2, 1, 3, 1}, // condition
vector<uint8_t>{0, 1, 1, 0, 0, 0},
vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2.,
1., 1., 1., 0., 1., 2., 0., 1., 2.,
1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1.});
test_whereFp32(
Shape{
3,
},
vector<float>{0, 1, 2}, // inputX
Shape{2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5}, // inputY
Shape{2, 1, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0}, // condition
vector<float>{0., 0., 0., 0., 1., 2., 0., 1., 2., 3., 3., 3.,
0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
test_whereFp32(
Shape{
3,
},
vector<float>{0, 1, 2}, // inputX
Shape{2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5}, // inputY
Shape{2, 1, 3, 1},
vector<uint8_t>{false, true, true, false, false, false}, // condition
vector<float>{0., 0., 0., 0., 1., 2., 0., 1., 2., 3., 3., 3.,
0., 1., 2., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5.});
Modify kernel registration & support fp16 (#205) * - Remove dataType from the kernel registration. * - support fp16 for conv * - cpu kernel: adapt the new registration mechanism * modified all register kernel * add where fp16 * add layernorm fp16 * add split_concat fp16 * - element_wise support fp16 * feat: support transpose fp16 * feat: support sliceOp fp16 * - unary support fp16 * - feat: support reduceOp fp16 * feat: support matmulOp/expandOp fp16 * feat: support powOp int8 * add cuda cast & support half-precision for gather * style: fix style * feat:support int8 for gather * style:fix style * modified test_cuda_conv_transposed * fix: fix dist code to support fp16 * fix(graph.cc): fix topo_sort * fix: fix recv and send kernel registration * feat: add field tensors for stub * refactor(frontend): 先排序后构图 Signed-off-by: YdrMaster <ydrml@hotmail.com> * fix: 为中间结果提供tensor到node的mapping * fix (slice): add guard for area out of range * fix: fix matmul fp16 * fix: fix re-dataMalloc for weight tensor and use of naive allocator * feat: add dataType filter for cuda kernel * feat: bang kernel adapt the new registration mechanism * fix: fix some error on mlu * feat: intelcpu kernel adapt the new registration mechanism * feat: modify kernel registration on kunlun * fix intelcpu compiler bug * feat: bang reshape support all dataType * fix: fix bang reduce * fix(all_reduce.cc): fix as reviewer suggessted * fix: fix style and restore unary test codes --------- Signed-off-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com> Co-authored-by: zhangyunze <z13785159769@163.com> Co-authored-by: OdinaryWord <sx-hz@163.com> Co-authored-by: YdrMaster <ydrml@hotmail.com> Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
2024-01-15 11:02:13 +08:00
} // python output
TEST(CUDA_WhereFp16, run) {
test_whereFp16(
Shape{
3,
},
ValGenerator<1>(), // inputX
Shape{2, 3, 1}, ValGenerator<2>(), // inputY
Shape{2, 1, 3, 1}, vector<uint8_t>{0, 1, 1, 0, 0, 0}, // condition
vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
test_whereFp16(
Shape{
3,
},
ValGenerator<1>(), // inputX
Shape{2, 3, 1}, ValGenerator<2>(), // inputY
Shape{2, 1, 3, 1},
vector<uint8_t>{false, true, true, false, false, false}, // condition
vector<float>{2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.});
} // python output
} // namespace infini