From 26f0d13c26bf5b264b8d9ca65e3034bdb34aa552 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Tue, 18 Apr 2023 15:10:33 +0800 Subject: [PATCH] Dev for 202303ddl (#66) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add activation operatiopn relu, tanh, sigmoid on mlu * commit for format * add activation backward operation * add test for activation_backward * add test * add convbpfilter * fix * add transpsoe code and test * add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh * add copy operation on mlu * add ceil operation and floor operation * add operation clip * add operation cnnl div, test and test for divdemo bangc kernel * add divnonan operation and test * add erf operation * add exp operation * add operation fill * add log operation * add log1p operation * add l2loss operation * add maximum and minimum operation * add mseloss operation * add negTensor operation * add power operation * add reciprocal operation * add sqrt and rsqrt operation * add transform operation * add addn operation * add muln operation * cherrry pick some operation * add floordiv operation and floordivtrunc operation * add floormod operation * add cumsum operation * add det operation * add pad operation * format * add concat operation * format * add split operation * fix concat and split operation * add round operation * add pooling operation * add square operation * add squaredDifference operation * code format fix * add flip operation * code format fix * add hardtanh operation * add logic operation * add addcdiv and addcmul operation * add arange operation * add bitcompute operation * add net test * fmt Signed-off-by: YdrMaster * style: rename Signed-off-by: YdrMaster * fix: 用 NativeCpuRuntime 替换 CpuRuntime Signed-off-by: YdrMaster * fix code * fix code * fix code by review suggestion * remove operation which is not the onnx operation * fix format * clang format * refactor: tensor 的 print 加一层模板的 dataToString Signed-off-by: 
YdrMaster * fix: onnx 导出 Signed-off-by: YdrMaster * feat: 增加计算图优化接口 Signed-off-by: YdrMaster * add clip operation * feat: 支持导入 clip Signed-off-by: YdrMaster * test: 导入导出测试加入 ci Signed-off-by: YdrMaster * fix batch norm * feat: 增加 Shape 算子 Signed-off-by: YdrMaster * feat: 支持导入 unsqueeze Signed-off-by: YdrMaster * fix: 修正 clip 接口 feat: 支持导入 transpose Signed-off-by: YdrMaster * add broadcast operation * fix elementwise-broadcast * fix elementwise broadcast * add broadcast for gpu elementsie * feat: pad 支持 axes 负数 feat: 不支持的 padding 导出为独立的 pad 算子 feat: 支持导入 onnxsim 过的 inception Signed-off-by: YdrMaster * fix: 修正池化的测试 Signed-off-by: YdrMaster * feat: 导出 pads,支持 inception 导入导出,已加入 ci Signed-off-by: YdrMaster * feat: 支持 densenet 导入导出,并加入 ci Signed-off-by: YdrMaster * feat: 导入 squeeze Signed-off-by: YdrMaster * fix softmax * feat: 导出 clip 和 transpose Signed-off-by: YdrMaster * feat: 支持 Conv 的 bias Signed-off-by: YdrMaster * fix: bias of conv Signed-off-by: YdrMaster * fix: bias of conv Signed-off-by: YdrMaster * feat: 导入 split Signed-off-by: YdrMaster * feat: 导出 split Signed-off-by: YdrMaster * fix: conv Signed-off-by: YdrMaster * fix: conv group Signed-off-by: YdrMaster * fix: matmul 的 bias 没有放在输入里,修正 Signed-off-by: YdrMaster * fix exmaple * fix: 改正 reduce_mean 导出 Signed-off-by: YdrMaster * refactor: 修改 slice 实现与 onnx 一致 Signed-off-by: YdrMaster * style: 不导出两个 runtime 函数 Signed-off-by: YdrMaster * doc: 中文使用指南 Signed-off-by: YdrMaster * doc: 补全指南 Signed-off-by: YdrMaster * fix: 修复导入数据的问题 Signed-off-by: YdrMaster * fmt Signed-off-by: YdrMaster * feat: 添加 Dropout 基本结构,但不支持两个输出是不同的类型 Signed-off-by: YdrMaster * feat: 重新导出优化接口 feat: dropout 导入 Signed-off-by: YdrMaster * build: BANG 选项加入 Makefile Signed-off-by: YdrMaster * fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime. chaneg of include/bang/bang_runtime is for the cntoolkit upgrade. 
* feat: 导出 bang runtime Signed-off-by: YdrMaster * add USE_BANG=1 * fix matmul * fix reshape * fix * fix activation * fix transpose * format * format * update Makefile Signed-off-by: YdrMaster * feat: 支持导入导出 ConvTranspose Signed-off-by: YdrMaster * add prelu on mlu * fix: ConvTranspose Signed-off-by: YdrMaster * feat: 支持导入导出 PRelu Signed-off-by: YdrMaster * add convtrans on mlu * fmt Signed-off-by: YdrMaster * docs: 更新 README_CN.md Signed-off-by: YdrMaster * fix code by review suggestions * style Signed-off-by: YdrMaster * fix: Softmax 的 axis 可以用默认值?感觉是 onnx 不标准 Signed-off-by: YdrMaster * fix cuda & intelcpu bugs after merging --------- Signed-off-by: YdrMaster Co-authored-by: wanghailu Co-authored-by: wanghailu Co-authored-by: whjthu --- .github/workflows/build.yml | 56 +- .github/workflows/clang-format-check.yml | 3 + .gitmodules | 3 + CMakeLists.txt | 20 +- Makefile | 15 +- README.md | 4 +- README_CN.md | 221 +++++++ example | 1 + include/bang/bang_element_wise.h | 22 - include/core/constants.h | 2 +- include/core/data_type.h | 10 + include/core/graph.h | 2 + include/core/graph_handler.h | 12 + include/core/hash.h | 2 +- include/core/object.h | 2 +- include/core/operator.h | 123 +++- include/core/perf_engine.h | 2 +- include/core/ref.h | 2 +- include/core/tensor.h | 43 +- include/cuda/cuda_clip.h | 9 + include/cuda/cuda_element_wise.h | 6 +- include/nnet/Pass/MatchComputationKernel.h | 2 +- include/nnet/Pass/MatchMemBoundKernel.h | 2 +- include/nnet/Pass/Pass.h | 2 +- include/nnet/Pass/Rule1VariableSplit.h | 2 +- include/nnet/Pass/Rule2VariableMerging.h | 2 +- include/nnet/Pass/Rule3StageSplit.h | 2 +- include/nnet/Pass/Rule4StageMerging.h | 2 +- include/nnet/Pass/Rule5RangeRelaxation.h | 2 +- include/nnet/Pass/Rule6KenerlMatching.h | 2 +- include/nnet/Pass/Rule7DLT.h | 2 +- include/nnet/Pass/Rule8GuidedDLT.h | 2 +- include/nnet/Pass/Rule90TwoStageElementWise.h | 2 +- include/nnet/Pass/Rule91MergeStagesWithSum.h | 2 +- include/nnet/Pass/Rule9RangeMagnify.h | 2 
+- include/nnet/ReplaceKit.h | 2 +- include/nnet/dbg.h | 4 +- include/nnet/dlt.h | 2 +- include/nnet/nmutator.h | 2 +- include/nnet/permutation.h | 2 +- include/nnet/ref.h | 2 +- include/nnet/test.h | 2 +- include/operators/activation_backward.h | 32 + include/operators/batch_norm.h | 8 +- include/operators/conv.h | 24 + include/operators/det.h | 22 + include/operators/dropout.h | 52 ++ include/operators/element_wise.h | 41 ++ include/operators/matmul.h | 4 +- include/operators/slice.h | 26 +- include/operators/transpose.h | 22 + include/operators/unary.h | 262 ++++++++ pyinfinitensor/pyproject.toml | 2 +- pyinfinitensor/src/pyinfinitensor/onnx.py | 468 ++++++++++--- pyinfinitensor/tests/test_onnx.py | 66 +- src/core/graph.cc | 9 + src/core/graph_handler.cc | 64 ++ src/core/tensor.cc | 121 ++-- src/cuda/cuda_runtime.cc | 8 +- src/ffi/ffi_infinitensor.cc | 119 +++- src/kernels/bang/activation.cc | 208 ++++++ src/kernels/bang/activation_backward.cc | 94 +++ src/kernels/bang/cast.cc | 185 ++++++ src/kernels/bang/ceil.cc | 46 ++ src/kernels/bang/clip.cc | 42 ++ src/kernels/bang/concat.cc | 68 ++ src/kernels/bang/conv_trans.cc | 88 +++ src/kernels/bang/convbpfilter.cc | 159 +++++ src/kernels/bang/copy.cc | 46 ++ src/kernels/bang/det.cc | 53 ++ src/kernels/bang/element_wise.cc | 616 +++++++++++++++++- src/kernels/bang/erf.cc | 47 ++ src/kernels/bang/exp.cc | 47 ++ src/kernels/bang/fill.cc | 40 ++ src/kernels/bang/floor.cc | 46 ++ src/kernels/bang/hardtanh.cc | 42 ++ src/kernels/bang/l2loss.cc | 40 ++ src/kernels/bang/log.cc | 62 ++ src/kernels/bang/matmul.cc | 26 +- src/kernels/bang/negtensor.cc | 46 ++ src/kernels/bang/pad.cc | 65 ++ src/kernels/bang/pooling.cc | 73 +++ src/kernels/bang/reciprocal.cc | 46 ++ src/kernels/bang/reshape.cc | 42 ++ src/kernels/bang/rsqrt.cc | 47 ++ src/kernels/bang/split.cc | 69 ++ src/kernels/bang/sqrt.cc | 47 ++ src/kernels/bang/transpose.cc | 60 ++ src/kernels/bang/trigon.cc | 184 ++++++ src/kernels/cpu/element_wise.cc | 42 +- 
src/kernels/cpu/unary.cc | 21 + src/kernels/cuda/batch_norm.cc | 9 +- src/kernels/cuda/clip.cc | 27 + src/kernels/cuda/clip.cu | 32 + src/kernels/cuda/element_wise.cc | 53 +- src/kernels/cuda/element_wise.cu | 59 +- src/kernels/cuda/pad_slice.cc | 2 +- src/kernels/cuda/unary.cc | 46 ++ src/kernels/intelcpu/slice.cc | 2 +- src/kernels/mlu/CMakeLists.txt | 46 -- src/kernels/mlu/include/bang_div.h | 7 - src/kernels/mlu/include/div.h | 7 - src/kernels/mlu/src/div.mlu | 24 - src/kernels/mlu/src/div_device.mlu | 50 -- src/operators/activation_backward.cc | 37 ++ src/operators/batch_norm.cc | 6 +- src/operators/conv.cc | 75 +++ src/operators/det.cc | 43 ++ src/operators/dropout.cc | 40 ++ src/operators/element_wise.cc | 78 ++- src/operators/matmul.cc | 5 +- src/operators/pad.cc | 3 +- src/operators/reduce_mean.cc | 7 +- src/operators/slice.cc | 124 ++-- src/operators/transpose.cc | 50 ++ src/operators/unary.cc | 284 ++++++++ test/core/test_graph_replace.cc | 6 +- test/core/test_tensor_save.cc | 2 + .../bang/test_bang_activation_backward.cc | 56 ++ test/kernels/bang/test_bang_bitcompute.cc | 51 ++ test/kernels/bang/test_bang_cast.cc | 40 ++ test/kernels/bang/test_bang_ceil.cc | 40 ++ test/kernels/bang/test_bang_clip.cc | 42 ++ test/kernels/bang/test_bang_concat.cc | 52 ++ test/kernels/bang/test_bang_copy.cc | 40 ++ test/kernels/bang/test_bang_det.cc | 41 ++ test/kernels/bang/test_bang_erf.cc | 40 ++ test/kernels/bang/test_bang_exp.cc | 40 ++ test/kernels/bang/test_bang_fill.cc | 40 ++ test/kernels/bang/test_bang_floor.cc | 40 ++ test/kernels/bang/test_bang_floordiv.cc | 49 ++ test/kernels/bang/test_bang_floormod.cc | 49 ++ test/kernels/bang/test_bang_hardtanh.cc | 43 ++ test/kernels/bang/test_bang_l2loss.cc | 40 ++ test/kernels/bang/test_bang_log.cc | 42 ++ test/kernels/bang/test_bang_logic.cc | 56 ++ test/kernels/bang/test_bang_maximum.cc | 46 ++ test/kernels/bang/test_bang_minimum.cc | 46 ++ test/kernels/bang/test_bang_mseloss.cc | 57 ++ 
test/kernels/bang/test_bang_neg.cc | 40 ++ test/kernels/bang/test_bang_net.cc | 47 ++ test/kernels/bang/test_bang_optensor.cc | 3 + test/kernels/bang/test_bang_pad.cc | 44 ++ test/kernels/bang/test_bang_pooling.cc | 41 ++ test/kernels/bang/test_bang_pow.cc | 46 ++ test/kernels/bang/test_bang_prelu.cc | 46 ++ test/kernels/bang/test_bang_reciprocal.cc | 41 ++ test/kernels/bang/test_bang_round.cc | 40 ++ test/kernels/bang/test_bang_rsqrt.cc | 40 ++ test/kernels/bang/test_bang_split.cc | 48 ++ test/kernels/bang/test_bang_sqrt.cc | 40 ++ test/kernels/bang/test_bang_square.cc | 40 ++ .../bang/test_bang_squaredDifference.cc | 48 ++ test/kernels/bang/test_bang_transpose.cc | 43 ++ test/kernels/bang/test_bang_trigon.cc | 52 ++ test/kernels/bang/test_bang_unary.cc | 47 ++ test/kernels/cuda/test_cuda_clip.cc | 48 ++ test/kernels/cuda/test_cuda_slice.cc | 2 +- test/kernels/intelcpu/test_mkl_slice.cc | 2 +- test/operators/test_clip.cc | 38 ++ test/operators/test_slice.cc | 6 +- 161 files changed, 6913 insertions(+), 614 deletions(-) create mode 100644 README_CN.md create mode 160000 example delete mode 100644 include/bang/bang_element_wise.h create mode 100644 include/cuda/cuda_clip.h create mode 100644 include/operators/activation_backward.h create mode 100644 include/operators/det.h create mode 100644 include/operators/dropout.h create mode 100644 include/operators/transpose.h create mode 100644 src/kernels/bang/activation.cc create mode 100644 src/kernels/bang/activation_backward.cc create mode 100644 src/kernels/bang/cast.cc create mode 100644 src/kernels/bang/ceil.cc create mode 100644 src/kernels/bang/clip.cc create mode 100644 src/kernels/bang/concat.cc create mode 100644 src/kernels/bang/conv_trans.cc create mode 100644 src/kernels/bang/convbpfilter.cc create mode 100644 src/kernels/bang/copy.cc create mode 100644 src/kernels/bang/det.cc create mode 100644 src/kernels/bang/erf.cc create mode 100644 src/kernels/bang/exp.cc create mode 100644 src/kernels/bang/fill.cc 
create mode 100644 src/kernels/bang/floor.cc create mode 100644 src/kernels/bang/hardtanh.cc create mode 100644 src/kernels/bang/l2loss.cc create mode 100644 src/kernels/bang/log.cc create mode 100644 src/kernels/bang/negtensor.cc create mode 100644 src/kernels/bang/pad.cc create mode 100644 src/kernels/bang/pooling.cc create mode 100644 src/kernels/bang/reciprocal.cc create mode 100644 src/kernels/bang/reshape.cc create mode 100644 src/kernels/bang/rsqrt.cc create mode 100644 src/kernels/bang/split.cc create mode 100644 src/kernels/bang/sqrt.cc create mode 100644 src/kernels/bang/transpose.cc create mode 100644 src/kernels/bang/trigon.cc create mode 100644 src/kernels/cuda/clip.cc create mode 100644 src/kernels/cuda/clip.cu delete mode 100644 src/kernels/mlu/CMakeLists.txt delete mode 100644 src/kernels/mlu/include/bang_div.h delete mode 100644 src/kernels/mlu/include/div.h delete mode 100644 src/kernels/mlu/src/div.mlu delete mode 100644 src/kernels/mlu/src/div_device.mlu create mode 100644 src/operators/activation_backward.cc create mode 100644 src/operators/det.cc create mode 100644 src/operators/dropout.cc create mode 100644 src/operators/transpose.cc create mode 100644 test/kernels/bang/test_bang_activation_backward.cc create mode 100644 test/kernels/bang/test_bang_bitcompute.cc create mode 100644 test/kernels/bang/test_bang_cast.cc create mode 100644 test/kernels/bang/test_bang_ceil.cc create mode 100644 test/kernels/bang/test_bang_clip.cc create mode 100644 test/kernels/bang/test_bang_concat.cc create mode 100644 test/kernels/bang/test_bang_copy.cc create mode 100644 test/kernels/bang/test_bang_det.cc create mode 100644 test/kernels/bang/test_bang_erf.cc create mode 100644 test/kernels/bang/test_bang_exp.cc create mode 100644 test/kernels/bang/test_bang_fill.cc create mode 100644 test/kernels/bang/test_bang_floor.cc create mode 100644 test/kernels/bang/test_bang_floordiv.cc create mode 100644 test/kernels/bang/test_bang_floormod.cc create mode 100644 
test/kernels/bang/test_bang_hardtanh.cc create mode 100644 test/kernels/bang/test_bang_l2loss.cc create mode 100644 test/kernels/bang/test_bang_log.cc create mode 100644 test/kernels/bang/test_bang_logic.cc create mode 100644 test/kernels/bang/test_bang_maximum.cc create mode 100644 test/kernels/bang/test_bang_minimum.cc create mode 100644 test/kernels/bang/test_bang_mseloss.cc create mode 100644 test/kernels/bang/test_bang_neg.cc create mode 100644 test/kernels/bang/test_bang_net.cc create mode 100644 test/kernels/bang/test_bang_pad.cc create mode 100644 test/kernels/bang/test_bang_pooling.cc create mode 100644 test/kernels/bang/test_bang_pow.cc create mode 100644 test/kernels/bang/test_bang_prelu.cc create mode 100644 test/kernels/bang/test_bang_reciprocal.cc create mode 100644 test/kernels/bang/test_bang_round.cc create mode 100644 test/kernels/bang/test_bang_rsqrt.cc create mode 100644 test/kernels/bang/test_bang_split.cc create mode 100644 test/kernels/bang/test_bang_sqrt.cc create mode 100644 test/kernels/bang/test_bang_square.cc create mode 100644 test/kernels/bang/test_bang_squaredDifference.cc create mode 100644 test/kernels/bang/test_bang_transpose.cc create mode 100644 test/kernels/bang/test_bang_trigon.cc create mode 100644 test/kernels/bang/test_bang_unary.cc create mode 100644 test/kernels/cuda/test_cuda_clip.cc create mode 100644 test/operators/test_clip.cc diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 142bf78b..c595a3b6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,6 +2,9 @@ name: Build and test cpu on: push: branch: 'master' + paths-ignore: + - '**.md' + - 'LICENSE' pull_request: paths-ignore: - '**.md' @@ -11,8 +14,11 @@ env: protobuf-download: https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protobuf-cpp-3.21.12.tar.gz protobuf-version: "3.21.12" python-version: "3.10" + resnet-download: 
https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx - resnet-file: resnet18-v2-7.onnx + inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx + densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx + efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx jobs: build: @@ -31,28 +37,28 @@ jobs: - name: Install libdw run: sudo apt-get update && sudo apt-get install libdw-dev - - name: Cache protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: protobuf-${{ env.protobuf-version }} - key: protobuf-${{ env.protobuf-version }} + # - name: Cache protobuf + # id: cache-protobuf + # uses: actions/cache@v3 + # with: + # path: protobuf-${{ env.protobuf-version }} + # key: protobuf-${{ env.protobuf-version }} - - name: Download and compile protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - wget ${{ env.protobuf-download }} - tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz - cd protobuf-${{ env.protobuf-version }} - ./autogen.sh - ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC" - make -j8 + # - name: Download and compile protobuf + # if: steps.cache-protobuf.outputs.cache-hit != 'true' + # run: | + # wget ${{ env.protobuf-download }} + # tar xf protobuf-cpp-${{ env.protobuf-version }}.tar.gz + # cd protobuf-${{ env.protobuf-version }} + # ./autogen.sh + # ./configure CFLAGS="-fPIC" CXXFLAGS="-fPIC" + # make -j8 - - name: Install protobuf - run: | - cd protobuf-${{ env.protobuf-version }} - sudo make install - sudo ldconfig + # - name: Install protobuf + # run: | + # cd protobuf-${{ env.protobuf-version }} + # sudo make install + # sudo ldconfig - name: Build run: make @@ -65,8 +71,12 @@ jobs: python -m pip install --upgrade pip make 
install-python - - name: Download test model - run: wget ${{ env.resnet-download }} + - name: Download test models + run: | + wget ${{ env.resnet-download }} + wget ${{ env.inception-download }} + wget ${{ env.densenet-download }} + wget ${{ env.efficientnet-download }} - name: Test onnx frontend run: make test-onnx diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 02f6cd1c..dfd15b07 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -2,6 +2,9 @@ name: clang-format Check on: push: branch: 'master' + paths-ignore: + - '**.md' + - 'LICENSE' pull_request: paths-ignore: - '**.md' diff --git a/.gitmodules b/.gitmodules index a40171ca..02a80785 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "3rd-party/backward-cpp"] path = 3rd-party/backward-cpp url = git@github.com:bombela/backward-cpp.git +[submodule "example"] + path = example + url = git@github.com:wanghailu0717/NNmodel.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f3c06283..f5fd43ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,8 @@ option(USE_CUDA "Support CUDA GPU" OFF) option(USE_BANG "Support BANG MLU" OFF) option(USE_INTELCPU "Support INTELCPU" OFF) option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON) -option(USE_PROTOBUF "Serialize and deserialize tensors" ON) -option(BUILD_TEST "Build tests" ON) +option(USE_PROTOBUF "Serialize and deserialize tensors" OFF) +option(BUILD_TEST "Build tests" OFF) cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF) cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF) @@ -78,7 +78,7 @@ if(BUILD_TEST_EINNET) include_directories(${DMLC_INCLUDE_DIR}) include_directories(${DLPACK_INCLUDE_DIR}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ") - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels endif() if(BUILD_TEST) @@ -142,7 +142,7 @@ if(USE_BACKTRACE) endif() if(USE_INTELCPU) - add_compile_definitions(USE_INTELCPU=1) + add_compile_definitions(USE_INTELCPU=1) find_package(MKL CONFIG REQUIRED) # Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html @@ -150,10 +150,11 @@ if(USE_INTELCPU) set(DNNL_CONFIGURATION "cpu_gomp") find_package(dnnl CONFIG REQUIRED) - if(dnnl_FOUND) + if(dnnl_FOUND) + add_compile_definitions(USE_MKL=1) include_directories(BEFORE ${dnnl_DIR}/../../../cpu_gomp/include/) - link_directories(${dnnl_DIR}/../../../cpu_gomp/lib) - target_link_libraries(InfiniTensor dnnl) + link_directories(${dnnl_DIR}/../../../cpu_gomp/lib) + target_link_libraries(InfiniTensor dnnl) else() message(FATAL_ERROR "dnnl library not found") endif() @@ -161,7 +162,7 @@ if(USE_INTELCPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMKL_ILP64 -qmkl=parallel -Werror ${WNO_ERRORS}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DMKL_ILP64 -qmkl=parallel ${WNO_ERRORS}") # Enable assertion - + find_package(IntelDPCPP REQUIRED) endif() @@ -179,6 +180,7 @@ if(USE_CUDA) endif() if(USE_BANG) + add_compile_definitions(USE_BANG=1) include_directories(src/kernels/mlu/include) ################################################################################ # Neuware Evironment @@ -212,10 +214,8 @@ if(USE_BANG) ################################################################################ # BangC Kernels ################################################################################ - add_subdirectory(src/kernels/mlu) target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++) - 
target_link_libraries(InfiniTensor bangops) endif() # # Python bindings diff --git a/Makefile b/Makefile index 6b5fa090..0edf6c20 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,17 @@ .PHONY : build clean install-python test-cpp test-onnx TYPE ?= release -CUDA ?= off +CUDA ?= OFF +BANG ?= OFF INTELCPU ?= off +BACKTRACE ?= ON +TEST ?= ON -CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) - -ifeq ($(CUDA), ON) - CMAKE_OPT += -DUSE_CUDA=ON -endif +CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE) +CMAKE_OPT += -DUSE_CUDA=$(CUDA) +CMAKE_OPT += -DUSE_BANG=$(BANG) +CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE) +CMAKE_OPT += -DBUILD_TEST=$(TEST) ifeq ($(INTELCPU), ON) CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp diff --git a/README.md b/README.md index ea404705..231831a3 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ cmake -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp .. && make -j 12 --- -> Sets env: `CUDA=ON` to enable cuda. +> - Sets env: `TEST=OFF` to accelerate compiling. +> - Sets env: `CUDA=ON` to enable cuda. +> - Sets env: `BANG=ON` to enable bang. ### CMake Options diff --git a/README_CN.md b/README_CN.md new file mode 100644 index 00000000..5480987b --- /dev/null +++ b/README_CN.md @@ -0,0 +1,221 @@ +# 使用指南 + +## 目录 + +- [编译](#编译) +- [使用](#使用) +- [python-前端应用指南](#python-前端应用指南) + - [导入-onnx-模型](#导入-onnx-模型) + - [导出-onnx-模型](#导出-onnx-模型) + - [执行推理](#执行推理) +- [测试](#测试) + +## 编译 + +推荐使用 Ubuntu-22.04,本文以此环境为例。 + +1. 使用 apt 安装依赖 + + > 如果不使用 Ubuntu-22.04,部分软件版本可能不够高。 + + ```bash + sudo apt-get install make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev + ``` + +2. 更新 pip 并换清华源 + + ```bash + python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + ``` + +3. 
编译并安装 python 库 + + > 第一次执行会同时安装 python 依赖库,比较慢 + + 仅编译 CPU 部分: + + ```bash + make install-python + ``` + + 编译 GPU 部分: + + ```bash + make install-python CUDA=ON + ``` + +## 使用 + +项目管理功能已写到 [Makefile](Makefile),支持下列功能: + +- 编译项目:`make`/`make build` +- 清理生成文件:`make clean` +- 安装 python 库:`make install-python` +- 测试 c++ 后端:`make test-cpp` +- 测试 python 前端:`make test-onnx` + +并使用下列环境变量传递选项参数: + +- `TYPE`:编译模式(`debug`/`release`),默认值为 `release` +- `CUDA`:是否编译 CUDA 后端,默认为 `OFF`,`ON` 打开 +- `BANG`:是否编译寒武纪后端,默认为 `OFF`,`ON` 打开 +- `BACKTRACE`:是否启用栈回溯,默认为 `ON`,`OFF` 关闭,建议调试时打开 +- `TEST`:是否编译 `googletest`,默认为 `ON`,`OFF` 关闭,只有 `test-cpp` 时必要 + +## python 前端应用指南 + +`make install-python` 会将项目的 python 前端以 `pyinfinitensor` 为名字安装到系统目录,可以直接 `import pyinfinitensor` 来使用。现阶段,项目的主要用法是从 onnx 导入模型进行优化,然后可以再导出优化后的模型到 onnx,也可以直接运行推理。 + +### 导入 onnx 模型 + +支持的模型: + +- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx) +- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx) +- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx) +- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx) + +```python +import onnx +from pyinfinitensor.onnx import OnnxStub +from pyinfinitensor import backend + +stub = OnnxStub(onnx.load("model_file"), backend.cpu_runtime()) +``` + +[`onnx.load`](https://onnx.ai/onnx/api/serialization.html#load-a-model) 是 onnx 提供的加载函数,将 onnx 文件读取为保存在内存中的 onnx 模型。 + +`OnnxStub` 是 onnx 模型在项目中的表示,通过构造这个对象,将 onnx 模型导入到项目中。其构造器的第一个参数是 onnx 模型文件;第二个参数是模型运行的后端运行时,可以是 `backend.cpu_runtime()`、`backend.cuda_runtime()` 或 `backend.bang_runtime()`。 + +构造出的 stub 对象可以用于操作项目中的模型和运行时。 + +### 优化 + +TODO + +### 导出 onnx 模型 + +优化后的模型可以导出成 onnx 文件提供给其他运行时。 + +```python +with open("optimized.onnx", "wb") 
as f: + f.write(stub.to_onnx("optimized").SerializeToString()) +``` + +`stub.to_onnx()` 将模型转换为 onnx 模型对象,`` 将填写到 onnx 模型的 `name` 字段。序列化到文件的代码见[官方示例](https://onnx.ai/onnx/intro/python.html#model-serialization)。 + +要可视化检查导出的模型文件,可以利用 [onnx 提供的功能](https://onnx.ai/onnx/api/shape_inference.html#infer-shapes)将所有的张量的形状推理出来再导出: + +```python +from onnx.shape_inference import infer_shapes + +with open("optimized.onnx", "wb") as f: + f.write(infer_shapes(stub.to_onnx("optimized")).SerializeToString()) +``` + +然后用 [Netron](https://netron.app/) 绘制计算图。 + +### 执行推理 + +也可以使用项目的运行时执行推理。 + +第一步是将数据传入计算图。`OnnxStub.inputs` 是一个 `Dict[str, Tensor]`,保存着模型的所有输入的名字和对象。可以用 [`items()`](https://docs.python.org/zh-cn/3/library/stdtypes.html#dict.items) 来遍历。 + +这个代码片段显示了如何打印出模型所有输入张量的名字、形状和对象指针: + +```python +for name, tensor in stub.inputs.items(): + print(name, tensor.shape(), tensor) +``` + +对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出: + +```plaintext +data [1, 3, 224, 224] +``` + +当然,地址是随机的。这个输出表明需要输入一个名为 “data”,形为 1×3×224×224 的数据。通常来说,这表示一张 224×224 的 rgb 图片。而这个模型是一个 1000 分类的图像分类模型。 + +为了方便,这里我们向模型传入一个随机的数据。 + +```python +import numpy + +stub.init() +for name, tensor in stub.inputs.items(): + print(name, tensor.shape(), tensor) + input = numpy.random.random(tensor.shape()).astype(numpy.float32) + tensor.copyin_float(input.flatten().tolist()) +``` + +`stub.init()` 为所有张量分配空间。空间是预分配的,所以不支持动态 size 的模型。 + +`tensor.copyin_float()` 向张量传入数据。其参数必须是一个 `List[float]`,即压平的数据。类似的函数还有 `copyin_int32()` 和 `copyin_int64()` + +然后,调用 `stub.run()` 执行推理: + +```python +stub.run() +``` + +最后,将结果拷贝出来,传入类似: + +```python +stub.init() +for name, tensor in stub.outputs.items(): + print(name, tensor.shape(), tensor) + print(tensor.copyout_float()) +``` + +## 测试 + +除了单元测试 `make test-cpp` 和 `make test-onnx` 之外,还可以用其他方式来测试单个模型导入导出和优化的正确性。 + +这个脚本利用 onnxruntime 来测试导出的模型是否与导入的模型等价: + +```python +import onnx +import numpy +import sys +from onnx 
import ModelProto, ValueInfoProto +from pyinfinitensor.onnx import OnnxStub +from pyinfinitensor import backend +from onnxruntime import InferenceSession + + +def infer(model: ModelProto, input) -> dict: + collection = set() + for node in model.graph.node: + for output in node.output: + collection.add(output) + model.graph.output.extend([ValueInfoProto(name=x) for x in collection]) + session = InferenceSession(model.SerializeToString()) + i = session.get_inputs()[0].name + return dict( + zip( + [x.name for x in session.get_outputs()], + [x.flatten() for x in session.run(None, {i: input})], + ) + ) + + +model0 = onnx.load(sys.argv[1]) +model1 = OnnxStub(model0, backend.cpu_runtime()).to_onnx("new") + +input_shape = [x.dim_value for x in model1.graph.input[0].type.tensor_type.shape.dim] +input = numpy.random.random(input_shape).astype(numpy.float32) + +output0 = infer(model0, input)[model0.graph.output[0].name] +output1 = infer(model1, input)[model1.graph.output[0].name] + +print("error =", sum((output1 - output0) ** 2) / len(output0)) +``` + +要运行脚本,先安装 onnxruntime: + +```bash +pip install onnxruntime +``` + +打印出的 `error = ...` 是两个模型输出张量的均方误差。对于不同的模型,这个误差最小为 0,最大不超过 1e-9。 diff --git a/example b/example new file mode 160000 index 00000000..d6ac8c8c --- /dev/null +++ b/example @@ -0,0 +1 @@ +Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9 diff --git a/include/bang/bang_element_wise.h b/include/bang/bang_element_wise.h deleted file mode 100644 index a0bf03c4..00000000 --- a/include/bang/bang_element_wise.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include "bang/bang_runtime.h" -#include "bang_div.h" -#include "operators/element_wise.h" -namespace infini { - -void element_wise_kernel(const RuntimeObj *obj, const Operator &_op) { - auto op = as(_op); - float *const aData = (op->getInputs(0)->getRawDataPtr()); - float *const bData = (op->getInputs(1)->getRawDataPtr()); - float *const cData = (op->getOutput()->getRawDataPtr()); - - auto dim = 
op->getInputs(0)->getDims(); - auto context = dynamic_cast(obj); - int n = dim[0], c = dim[1], h = dim[2], w = dim[3]; - if (op->getOpType() == OpType::Div) - div_kernel(context->cnnlHandle(), aData, bData, cData, n * c * h * w); - else - IT_TODO_HALT(); -} - -}; // namespace infini diff --git a/include/core/constants.h b/include/core/constants.h index 655c8989..bf74f47d 100644 --- a/include/core/constants.h +++ b/include/core/constants.h @@ -2,4 +2,4 @@ namespace infini { constexpr double E_CONSTANT = 2.718281828459; -} \ No newline at end of file +} diff --git a/include/core/data_type.h b/include/core/data_type.h index 72699e22..878f4bdb 100644 --- a/include/core/data_type.h +++ b/include/core/data_type.h @@ -52,4 +52,14 @@ template <> inline DataType DataType::get() { return Int16; } template <> inline DataType DataType::get() { return Int32; } template <> inline DataType DataType::get() { return Int64; } +template struct DT {}; +template <> struct DT<0> { using t = float; }; +template <> struct DT<1> { using t = uint32_t; }; +template <> struct DT<2> { using t = uint8_t; }; +template <> struct DT<3> { using t = int8_t; }; +template <> struct DT<4> { using t = uint16_t; }; +template <> struct DT<5> { using t = int16_t; }; +template <> struct DT<6> { using t = int32_t; }; +template <> struct DT<7> { using t = int64_t; }; + } // namespace infini diff --git a/include/core/graph.h b/include/core/graph.h index 3d62be30..dab31d79 100644 --- a/include/core/graph.h +++ b/include/core/graph.h @@ -59,6 +59,8 @@ class GraphObj : public Object { */ bool topo_sort(); + void optimize(); + void dataMalloc(); /** diff --git a/include/core/graph_handler.h b/include/core/graph_handler.h index a76197de..7af4bbe1 100644 --- a/include/core/graph_handler.h +++ b/include/core/graph_handler.h @@ -46,6 +46,9 @@ class GraphHandlerObj { Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw, int sh, int sw, int dh, int dw); + Tensor convTransposed2d(Tensor input, Tensor 
weight, Tensor output, int ph, + int pw, int sh, int sw, int dh, int dw, int oph, + int opw); Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB, Tensor bias, ActType act); Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var, @@ -68,10 +71,17 @@ class GraphHandlerObj { Tensor tanh(Tensor x, Tensor y); Tensor softmax(Tensor x, Tensor y, int axis); Tensor abs(Tensor x, Tensor y); + Tensor shape(Tensor x, Tensor y); Tensor identity(Tensor x, Tensor y); Tensor flatten(Tensor s, Tensor y, int axis); + Tensor pRelu(Tensor x, Tensor slope, Tensor y); + Tensor clip(Tensor x, Tensor y, std::optional min, + std::optional max); + Tensor transpose(Tensor data, Tensor transposed, Shape perm); Tensor reshape(Tensor data, Tensor reshaped, Shape shape); Tensor concat(TensorVec inputs, Tensor output, int dim); + TensorVec split(Tensor input, std::optional outputs, int axis, + int num_outputs); Tensor gather(Tensor data, Tensor indices, Tensor output, int axis); Tensor reduceMean(Tensor data, Tensor reduced, const optional> &axes, bool keepdims); @@ -85,6 +95,8 @@ class GraphHandlerObj { inline bool topo_sort() { return g->topo_sort(); } + inline void optimize() { g->optimize(); } + //------ runtime inline void data_malloc() { g->dataMalloc(); } diff --git a/include/core/hash.h b/include/core/hash.h index 3963af91..79e64163 100644 --- a/include/core/hash.h +++ b/include/core/hash.h @@ -15,4 +15,4 @@ template inline HashType hashVector(const vector &vec) { return ret; } -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/include/core/object.h b/include/core/object.h index ee5430e1..2db50ad7 100644 --- a/include/core/object.h +++ b/include/core/object.h @@ -68,4 +68,4 @@ inline std::ostream &operator<<(std::ostream &os, const Ref &obj) { return os; } -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/include/core/operator.h b/include/core/operator.h index 87069096..cca67297 
100644 --- a/include/core/operator.h +++ b/include/core/operator.h @@ -6,6 +6,8 @@ enum class OpType { Unknown = 0, // linear Conv = 100, + ConvBackwardFilter, + ConvBackwardData, Matmul, ConvTrans, ConvTransNHWC, @@ -34,10 +36,70 @@ enum class OpType { Softmax, Activation, Relu, + ReluBackward, + PRelu, Sigmoid, + SigmoidBackward, Tanh, + TanhBackward, Abs, + Sin, + Cos, + Tan, + ASin, + ACos, + ATan, + SinH, + CosH, + TanH, + ASinH, + ACosH, + ATanH, Resize, + Arange, + Shape, + Copy, + Ceil, + Floor, + Clip, + Erf, + Exp, + Fill, + Log, + L2Loss, + Maximum, + Minimum, + MSELoss, + Neg, + Power, + Reciprocal, + Sqrt, + Rsqrt, + Cast, + FloorDiv, + FloorMod, + Det, + Round, + Square, + SquaredDifference, + Hardtanh, + Equal, + NotEqual, + GreaterThan, + GreaterEqual, + LessThan, + LessEqual, + And, + Or, + Xor, + Not, + BitAnd, + BitOr, + BitXor, + BitNot, + BitLeftShift, + BitRightShift, + Dropout, // MemBound = 300, }; @@ -55,6 +117,8 @@ class OpRegistry { FOP(Unknown); // linear FOP(Conv); + FOP(ConvBackwardFilter); + FOP(ConvBackwardData); FOP(Matmul); FOP(ConvTrans); FOP(G2BMM); @@ -76,15 +140,72 @@ class OpRegistry { FOP(ReduceMean); FOP(Reshape); FOP(Identity); + FOP(Shape); // element wise FOP(BatchNorm); FOP(Softmax); FOP(Activation); FOP(Relu); + FOP(ReluBackward); + FOP(PRelu); FOP(Sigmoid); + FOP(SigmoidBackward); FOP(Tanh); + FOP(TanhBackward); FOP(Abs); - FOP(ConvTransNHWC); + FOP(Sin); + FOP(Cos); + FOP(Tan); + FOP(ASin); + FOP(ACos); + FOP(ATan); + FOP(SinH); + FOP(CosH); + FOP(TanH); + FOP(ASinH); + FOP(ACosH); + FOP(ATanH); + FOP(Copy); + FOP(Ceil); + FOP(Floor); + FOP(Clip); + FOP(Erf); + FOP(Exp); + FOP(Fill); + FOP(Log); + FOP(L2Loss); + FOP(Maximum); + FOP(Minimum); + FOP(MSELoss); + FOP(Neg); + FOP(Power); + FOP(Reciprocal); + FOP(Sqrt); + FOP(Rsqrt); + FOP(Cast); + FOP(FloorDiv); + FOP(FloorMod); + FOP(Det); + FOP(Round); + FOP(Square); + FOP(SquaredDifference); + FOP(Hardtanh); + FOP(Equal); + FOP(NotEqual); + FOP(GreaterThan); + 
FOP(GreaterEqual); + FOP(LessThan); + FOP(LessEqual); + FOP(And); + FOP(Or); + FOP(Xor); + FOP(Not); + FOP(BitAnd); + FOP(BitOr); + FOP(BitXor); + FOP(BitNot); + FOP(BitLeftShift); + FOP(BitRightShift); // FOP(MemBound); default: diff --git a/include/core/perf_engine.h b/include/core/perf_engine.h index 58659134..fb65da34 100644 --- a/include/core/perf_engine.h +++ b/include/core/perf_engine.h @@ -49,4 +49,4 @@ class PerfEngine { void to_json(json &j, const PerfEngine &p); void from_json(const json &j, PerfEngine &p); -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/include/core/ref.h b/include/core/ref.h index b88bca99..825b0f2c 100644 --- a/include/core/ref.h +++ b/include/core/ref.h @@ -40,4 +40,4 @@ std::vector> wrefs_to_refs(const std::vector> &wrefs) { return refs; } -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/include/core/tensor.h b/include/core/tensor.h index 8f8fa356..a1081e15 100644 --- a/include/core/tensor.h +++ b/include/core/tensor.h @@ -73,7 +73,7 @@ class TensorObj : public TensorBaseObj { // FIXME: std::fucntion copies the generator instead of passing it by ref. // Thus the internal state of generator cannot be updated. 
void setData( - const std::function &generator) const; + std::function const &generator) const; Tensor clone() const { auto obj = make_ref(*this); obj->freeData(); @@ -100,26 +100,53 @@ class TensorObj : public TensorBaseObj { template bool equalData(const vector &dataVector) { IT_ASSERT(DataType::get() == dtype); IT_ASSERT(size() == dataVector.size()); - return equalDataImpl(getRawDataPtr(), dataVector.data(), size(), - 1e-6); + return equalDataImpl(getRawDataPtr(), dataVector.data(), size()); } size_t getOffsetByBroadcastOffset(size_t bcOffset, Shape bcShape) const; private: - void printDataFloat(float *ptr) const; - void printDataUint32_t(uint32_t *ptr) const; + template string dataToString() const { + std::stringstream builder; + builder << "Tensor: " << guid << std::endl; + + auto numDims = shape.size(); + auto dimSzVec = vector(numDims, 1); + auto ptr = data->getPtr(); + dimSzVec[numDims - 1] = shape[numDims - 1]; + + for (int i = numDims - 1; i != 0; --i) + dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; + + for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { + for (size_t j = 0; j < numDims; ++j) + if (i % dimSzVec[j] == 0) + builder << "["; + + builder << ptr[i]; + for (size_t j = 0; j < numDims; ++j) + if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) + builder << "]"; + + if (i != size() - 1) + builder << ", "; + + auto column = (size_t)dimSzVec[numDims - 1]; + if (i % column == column - 1) + builder << std::endl; + } + return builder.str(); + } template - bool equalDataImpl(const T *a, const T *b, size_t size, - double relativeError) const { + bool equalDataImpl(const T *a, const T *b, size_t size) const { for (size_t i = 0; i < size; ++i) { if constexpr (std::is_integral_v) { if (a[i] != b[i]) return false; } else if constexpr (std::is_floating_point_v) { if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) > - relativeError) { + 1e-6) { printf("Error on %lu: %f %f\n", i, a[i], b[i]); return false; } diff --git a/include/cuda/cuda_clip.h 
b/include/cuda/cuda_clip.h new file mode 100644 index 00000000..bb602803 --- /dev/null +++ b/include/cuda/cuda_clip.h @@ -0,0 +1,9 @@ +#pragma once + +#include "operators/unary.h" + +namespace infini { +void clip_kernel(float *input, float *output, int num, float minValue, + float maxValue); + +}; // namespace infini diff --git a/include/cuda/cuda_element_wise.h b/include/cuda/cuda_element_wise.h index 39de0ea3..eb3b99a2 100644 --- a/include/cuda/cuda_element_wise.h +++ b/include/cuda/cuda_element_wise.h @@ -1,6 +1,8 @@ #pragma once namespace infini { -void div_kernel(float *a, float *b, float *c, int num); -void pow_kernel(float *a, float *b, float *c, int num); +void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, + int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3); +void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, + int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3); }; // namespace infini diff --git a/include/nnet/Pass/MatchComputationKernel.h b/include/nnet/Pass/MatchComputationKernel.h index 4b182a89..270b6e7f 100644 --- a/include/nnet/Pass/MatchComputationKernel.h +++ b/include/nnet/Pass/MatchComputationKernel.h @@ -12,4 +12,4 @@ class MatchComputationKernel : public Pass { virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/MatchMemBoundKernel.h b/include/nnet/Pass/MatchMemBoundKernel.h index 6b0a4bec..2f2bc2ce 100644 --- a/include/nnet/Pass/MatchMemBoundKernel.h +++ b/include/nnet/Pass/MatchMemBoundKernel.h @@ -12,4 +12,4 @@ class MatchMemBoundKernel : public Pass { virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Pass.h b/include/nnet/Pass/Pass.h index a8d517fa..9bf077be 100644 --- a/include/nnet/Pass/Pass.h +++ 
b/include/nnet/Pass/Pass.h @@ -38,4 +38,4 @@ class Pass { const VecExpr &getTransformations(); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule1VariableSplit.h b/include/nnet/Pass/Rule1VariableSplit.h index 06f2bb4e..397cc514 100644 --- a/include/nnet/Pass/Rule1VariableSplit.h +++ b/include/nnet/Pass/Rule1VariableSplit.h @@ -15,4 +15,4 @@ class Rule1VariableSplit : public Pass { Expr replaceIters(Expr cur, const Replace &replace); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule2VariableMerging.h b/include/nnet/Pass/Rule2VariableMerging.h index 1f277f46..5acbb951 100644 --- a/include/nnet/Pass/Rule2VariableMerging.h +++ b/include/nnet/Pass/Rule2VariableMerging.h @@ -26,4 +26,4 @@ class Rule2VariableMerging : public Pass { pair pb); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule3StageSplit.h b/include/nnet/Pass/Rule3StageSplit.h index 99e172cf..3cc06eb3 100644 --- a/include/nnet/Pass/Rule3StageSplit.h +++ b/include/nnet/Pass/Rule3StageSplit.h @@ -16,4 +16,4 @@ class Rule3StageSplit : public Pass { vector> getSplitSummationIters(RangeOp rangeOp); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule4StageMerging.h b/include/nnet/Pass/Rule4StageMerging.h index 13f11074..a69e5754 100644 --- a/include/nnet/Pass/Rule4StageMerging.h +++ b/include/nnet/Pass/Rule4StageMerging.h @@ -19,4 +19,4 @@ class Rule4StageMerging : public Pass { virtual void transform(Formula &origin, int depth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule5RangeRelaxation.h b/include/nnet/Pass/Rule5RangeRelaxation.h index c1719dfc..364c9e9a 100644 --- a/include/nnet/Pass/Rule5RangeRelaxation.h +++ b/include/nnet/Pass/Rule5RangeRelaxation.h @@ -13,4 +13,4 @@ class Rule5RangeRelaxation 
: public Pass { virtual void transform(Formula &origin, int depth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule6KenerlMatching.h b/include/nnet/Pass/Rule6KenerlMatching.h index 19648eaf..53867b42 100644 --- a/include/nnet/Pass/Rule6KenerlMatching.h +++ b/include/nnet/Pass/Rule6KenerlMatching.h @@ -14,4 +14,4 @@ class Rule6KenerlMatching : public Pass { VecExpr matchElementWise(const RangeOp &rangeOp); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule7DLT.h b/include/nnet/Pass/Rule7DLT.h index d2ce0ecc..d05f7014 100644 --- a/include/nnet/Pass/Rule7DLT.h +++ b/include/nnet/Pass/Rule7DLT.h @@ -13,4 +13,4 @@ class Rule7DLT : public Pass { vector getFactors(); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule8GuidedDLT.h b/include/nnet/Pass/Rule8GuidedDLT.h index e6536df4..3167b337 100644 --- a/include/nnet/Pass/Rule8GuidedDLT.h +++ b/include/nnet/Pass/Rule8GuidedDLT.h @@ -45,4 +45,4 @@ class Rule8GuidedDLT : public Pass { vector tensorDimAxes, vector newShape); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule90TwoStageElementWise.h b/include/nnet/Pass/Rule90TwoStageElementWise.h index ab37cf99..fb049041 100644 --- a/include/nnet/Pass/Rule90TwoStageElementWise.h +++ b/include/nnet/Pass/Rule90TwoStageElementWise.h @@ -13,4 +13,4 @@ class Rule90TwoStageElementWise : public Pass { VecExpr matchTwoStageElementWise(const RangeOp &rangeOp); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule91MergeStagesWithSum.h b/include/nnet/Pass/Rule91MergeStagesWithSum.h index 221e1772..a9e861e0 100644 --- a/include/nnet/Pass/Rule91MergeStagesWithSum.h +++ b/include/nnet/Pass/Rule91MergeStagesWithSum.h @@ -12,4 +12,4 @@ class Rule91MergeStagesWithSum : public Pass { 
virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/Pass/Rule9RangeMagnify.h b/include/nnet/Pass/Rule9RangeMagnify.h index 613de158..890d2dbb 100644 --- a/include/nnet/Pass/Rule9RangeMagnify.h +++ b/include/nnet/Pass/Rule9RangeMagnify.h @@ -12,4 +12,4 @@ class Rule9RangeMagnify : public Pass { virtual void transform(Formula &origin, int dfsDepth, Expr &rCur) override; }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/ReplaceKit.h b/include/nnet/ReplaceKit.h index 19f2ec0e..bd28cf1e 100644 --- a/include/nnet/ReplaceKit.h +++ b/include/nnet/ReplaceKit.h @@ -43,4 +43,4 @@ class ReplaceKit { const Expr &replacement); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/dbg.h b/include/nnet/dbg.h index f5894e4b..2727c928 100644 --- a/include/nnet/dbg.h +++ b/include/nnet/dbg.h @@ -231,7 +231,7 @@ template std::string type_list_to_string() { result.pop_back(); } return result; -} +} // namespace dbg template std::string get_type_name(type_tag>) { return "std::tuple<" + type_list_to_string() + ">"; @@ -855,4 +855,4 @@ auto identity(T &&, U &&...u) -> last_t { #define dbg(...) 
dbg::identity(__VA_ARGS__) #endif // DBG_MACRO_DISABLE -#endif // DBG_MACRO_DBG_H \ No newline at end of file +#endif // DBG_MACRO_DBG_H diff --git a/include/nnet/dlt.h b/include/nnet/dlt.h index 4e5e56ce..ecc9c6b2 100644 --- a/include/nnet/dlt.h +++ b/include/nnet/dlt.h @@ -50,4 +50,4 @@ class DLT { RangeOp rangeOp); }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/nmutator.h b/include/nnet/nmutator.h index c3009b10..74834f7e 100644 --- a/include/nnet/nmutator.h +++ b/include/nnet/nmutator.h @@ -55,4 +55,4 @@ class NMutator : public Mutator { // Graph transformConv1xk(Operator op); }; -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/include/nnet/permutation.h b/include/nnet/permutation.h index 300a5d6b..4b1917b9 100644 --- a/include/nnet/permutation.h +++ b/include/nnet/permutation.h @@ -35,4 +35,4 @@ template class SubsetGenerator { } }; -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/nnet/ref.h b/include/nnet/ref.h index d4c6ef63..b12f6ffb 100644 --- a/include/nnet/ref.h +++ b/include/nnet/ref.h @@ -115,4 +115,4 @@ constexpr Ref<_Tp> make_ref_from_tuple(_Tuple &&__t) { // } // }; -// } // namespace nnet \ No newline at end of file +// } // namespace nnet diff --git a/include/nnet/test.h b/include/nnet/test.h index dbc32c34..6cf75873 100644 --- a/include/nnet/test.h +++ b/include/nnet/test.h @@ -27,4 +27,4 @@ namespace nnet { int matchExprResult(Derivator &derivator, string fn); bool checkExprLogSame(string fnPrefix, int start, int end); bool checkExprsEquvivalence(VecExpr exprs); -} // namespace nnet \ No newline at end of file +} // namespace nnet diff --git a/include/operators/activation_backward.h b/include/operators/activation_backward.h new file mode 100644 index 00000000..5f55d8cc --- /dev/null +++ b/include/operators/activation_backward.h @@ -0,0 +1,32 @@ +#pragma once +#include "core/operator.h" + +namespace infini { +class 
ActivationBackwardObj : public OperatorObj { + public: + ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y, + Tensor x, Tensor diff_x); + OP_CLONE(ActivationBackwardObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 3; } + int numOutputs() const override { return 1; } + + private: + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +#define DEFINE_ACTIVATION_BACKWARD_OBJ(prefix, type) \ + class prefix##Obj : public ActivationBackwardObj { \ + public: \ + prefix##Obj(GraphObj *graph, Tensor y, Tensor diff_y, Tensor x, \ + Tensor diff_x) \ + : ActivationBackwardObj(type, graph, y, diff_y, x, diff_x) {} \ + }; + +DEFINE_ACTIVATION_BACKWARD_OBJ(ReluBackward, OpType::ReluBackward) +DEFINE_ACTIVATION_BACKWARD_OBJ(SigmoidBackward, OpType::SigmoidBackward) +DEFINE_ACTIVATION_BACKWARD_OBJ(TanhBackward, OpType::TanhBackward) +}; // namespace infini diff --git a/include/operators/batch_norm.h b/include/operators/batch_norm.h index 76f1eff4..cfacf2ca 100644 --- a/include/operators/batch_norm.h +++ b/include/operators/batch_norm.h @@ -9,7 +9,7 @@ namespace infini { */ class BatchNormObj : public OperatorObj { float momentum, eps; - bool training; + bool trainingMode; public: /** @@ -28,11 +28,11 @@ class BatchNormObj : public OperatorObj { * Default is 0.9. * @param eps The epsilon value to use to avoid division by zero. Default is * 1e-5. - * @param training Set to true when used for training. + * @param trainingMode Set to true when used for training. 
*/ BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean, Tensor var, Tensor scale, Tensor bias, float momentum = 0.9, - float eps = 1e-5, bool training = false); + float eps = 1e-5, bool trainingMode = false); OP_CLONE(BatchNormObj); optional> inferShape(const TensorVec &inputs) const override; std::string toString() const override; @@ -42,7 +42,7 @@ class BatchNormObj : public OperatorObj { int numOutputs() const override { return outputs.size(); } float getMomentum() const { return momentum; } float getEps() const { return eps; } - bool getTraining() const { return training; } + bool getTrainingMode() const { return trainingMode; } private: vector getWorkloadVector() const override; diff --git a/include/operators/conv.h b/include/operators/conv.h index 793afd48..449f4334 100644 --- a/include/operators/conv.h +++ b/include/operators/conv.h @@ -149,6 +149,29 @@ class ConvObj : public ConvBaseObj { void setAuxilaryAttributes(PaddingMode mode) override; }; +class ConvBackwardFilterObj : public ConvBaseObj { + private: + ActType act; + + public: + ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY, + Tensor diffW, int ph, int pw, int sh = 1, int sw = 1, + int dh = 1, int dw = 1, Tensor bias = nullptr, + ActType act = ActType::None); + // Constructors for setting padding mode + ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, Tensor diffY, + Tensor diffW, PaddingMode mode = PaddingMode::Same, + int sh = 1, int sw = 1, int dh = 1, int dw = 1, + Tensor bias = nullptr, ActType act = ActType::None); + + optional> inferShape(const TensorVec &inputs) const override; + ActType getAct() const { return act; } + int getNumGroups() const override { return c / getChannelPerGroup(); } + + private: + void setAuxilaryAttributes(PaddingMode mode) override; +}; + class ConvTransposed2dObj : public ConvBaseObj { private: int oph, opw; @@ -170,6 +193,7 @@ class ConvTransposed2dObj : public ConvBaseObj { optional> inferShape(const TensorVec &inputs) 
const override; int getNumGroups() const override { return group; } + std::pair getOutputPadding() const { return {oph, opw}; } private: void setAuxilaryAttributes(PaddingMode mode) override; diff --git a/include/operators/det.h b/include/operators/det.h new file mode 100644 index 00000000..d5e887c1 --- /dev/null +++ b/include/operators/det.h @@ -0,0 +1,22 @@ +#pragma once +#include "core/operator.h" + +namespace infini { +class DetObj : public OperatorObj { + public: + enum Mode { NormalDet = 0, LogDet }; + DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode); + OP_CLONE(DetObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + Mode getMode() const { return modeValue; } + + private: + Mode modeValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; +}; // namespace infini diff --git a/include/operators/dropout.h b/include/operators/dropout.h new file mode 100644 index 00000000..8c4c7300 --- /dev/null +++ b/include/operators/dropout.h @@ -0,0 +1,52 @@ +#pragma once +#include "core/operator.h" + +namespace infini { +/** + * @brief Copy a tensor along a centain dimension for multiple times. + */ +class DropoutObj : public OperatorObj { + float ratio; + // bool training_mode; // TODO must be false. + + public: + /** + * @brief Dropout takes an input floating-point tensor, an input ratio + * (floating-point scalar) and an input training_mode (boolean scalar). It + * produces two tensor outputs, output (floating-point tensor) and mask + * (bool tensor). If training_mode is true then the output Y will be a + * random dropout; Note that this Dropout scales the masked input data by + * the following equation, so to convert the trained model into inference + * mode, the user can simply not pass training_mode input or set it to + * false. 
+ * + * @param graph The computation graph that this operator belongs to. + * @param data The input tensor. + * @param output The output tensor. + * @param mask The mask tensor. + * @param ratio The ratio of random dropout, with value in [0, 1). If this + * input was not set, or if it was set to 0, the output would be a simple + * copy of the input. If it’s non-zero, output will be a random dropout of + * the scaled input, which is typically the case during training. + * @param training_mode If set to true then it indicates dropout is being + * used for training. It is an optional value hence unless specified + * explicitly, it is false. If it is false, ratio is ignored and the + * operation mimics inference mode where nothing will be dropped from the + * input data and if mask is requested as output it will contain all ones. + */ + DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask, + float ratio, bool training_mode); + OP_CLONE(DropoutObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 2; } + float getRatio() const { return ratio; } + bool getTrainingMode() const { return false; } + + private: + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; +} // namespace infini diff --git a/include/operators/element_wise.h b/include/operators/element_wise.h index c3ffe9f1..8bda60de 100644 --- a/include/operators/element_wise.h +++ b/include/operators/element_wise.h @@ -32,6 +32,25 @@ class ElementWiseObj : public OperatorObj { vector getOpAttrVector() const override; }; +class MSELossObj : public OperatorObj { + public: + enum Reduction { None = 0, Sum, Mean }; + MSELossObj(GraphObj *graph, Tensor input0, Tensor input1, + Reduction reduction, Tensor output); + OP_CLONE(MSELossObj); + optional> inferShape(const TensorVec &inputs) const override; + + Reduction 
getReduction() const { return reductionMode; } + std::string toString() const override; + int numInputs() const override { return 2; } + int numOutputs() const override { return 1; } + + private: + Reduction reductionMode; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + #define DEFINE_ELEMENT_WISE_OBJ(prefix, type) \ class prefix##Obj : public ElementWiseObj { \ public: \ @@ -46,4 +65,26 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub) DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul) DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div) DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow) +DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum) +DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum) +DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power) +DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv) +DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod) +DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference) +DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal) +DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual) +DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan) +DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual) +DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan) +DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual) +DEFINE_ELEMENT_WISE_OBJ(And, OpType::And) +DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or) +DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor) +DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not) +DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd) +DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr) +DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor) +DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot) +DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift) +DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift) }; // namespace infini diff --git a/include/operators/matmul.h b/include/operators/matmul.h index a1c57cfe..91a0a57c 100644 --- a/include/operators/matmul.h +++ b/include/operators/matmul.h @@ -47,10 +47,10 @@ class MatmulObj : public OperatorObj { 
std::string toString() const override; optional> inferShape(const TensorVec &inputs) const override; - int numInputs() const override { return 2; } + int numInputs() const override { return inputs.size(); } int numOutputs() const override { return 1; } - Tensor getBias() const { return inputs[2]; } + Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; } ActType getAct() const { return act; } auto getBMNKTransAB() const { return tuple(b, m, n, k, transA, transB); } bool getTransA() const { return transA; } diff --git a/include/operators/slice.h b/include/operators/slice.h index 7aeb0941..55acf505 100644 --- a/include/operators/slice.h +++ b/include/operators/slice.h @@ -7,7 +7,8 @@ namespace infini { * */ class SliceObj : public OperatorObj { - vector starts, ends; // the start no. and end no. for all dims. + template struct range_t { T start, end, step; }; + vector> axes; public: /** @@ -33,9 +34,26 @@ class SliceObj : public OperatorObj { optional> inferShape(const TensorVec &inputs) const override; std::string toString() const override; - int numInputs() const override { return 1; } - int numOutputs() const override { return 1; } - Shape getStart() const { return starts; } + inline int numInputs() const override { return 1; } + inline int numOutputs() const override { return 1; } + inline Shape getStarts() const { + Shape ans(axes.size()); + std::transform(axes.begin(), axes.end(), ans.begin(), + [](auto x) { return x.start; }); + return ans; + } + inline Shape getEnds() const { + Shape ans(axes.size()); + std::transform(axes.begin(), axes.end(), ans.begin(), + [](auto x) { return x.end; }); + return ans; + } + inline Shape getSteps() const { + Shape ans(axes.size()); + std::transform(axes.begin(), axes.end(), ans.begin(), + [](auto x) { return x.step; }); + return ans; + } private: vector getWorkloadVector() const override; diff --git a/include/operators/transpose.h b/include/operators/transpose.h new file mode 100644 index 
00000000..b26ed49a --- /dev/null +++ b/include/operators/transpose.h @@ -0,0 +1,22 @@ +#pragma once +#include "core/operator.h" + +namespace infini { +class TransposeObj : public OperatorObj { + public: + TransposeObj(GraphObj *graph, Tensor input, Tensor output, + vector permute); + OP_CLONE(TransposeObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + std::vector getPermute() const { return transposePermute; } + + private: + vector transposePermute = {1, 1, 1, 1}; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; +}; // namespace infini diff --git a/include/operators/unary.h b/include/operators/unary.h index e75025c5..1df2b4a7 100644 --- a/include/operators/unary.h +++ b/include/operators/unary.h @@ -28,6 +28,244 @@ class UnaryObj : public OperatorObj { vector getOpAttrVector() const override; }; +class ClipObj : public OperatorObj { + public: + ClipObj(GraphObj *graph, Tensor input, Tensor output, + std::optional min, std::optional max); + OP_CLONE(ClipObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + std::optional getMin() const { return minValue; }; + std::optional getMax() const { return maxValue; }; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + std::optional minValue, maxValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class HardtanhObj : public OperatorObj { + public: + HardtanhObj(GraphObj *graph, Tensor input, Tensor output, float min, + float max); + OP_CLONE(HardtanhObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + float getMin() const { return minValue; }; + float getMax() const { return maxValue; }; + int 
numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + float minValue, maxValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class FlipObj : public OperatorObj { + public: + FlipObj(GraphObj *graph, Tensor input, Tensor output, vector axis); + OP_CLONE(FlipObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + vector getAxis() const { return axisValue; }; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + vector axisValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class FillObj : public OperatorObj { + public: + FillObj(GraphObj *graph, Tensor input, Tensor output, float value); + OP_CLONE(FillObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + float getValue() const { return setValue; }; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + float setValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class L2LossObj : public OperatorObj { + public: + L2LossObj(GraphObj *graph, Tensor input, Tensor output); + OP_CLONE(L2LossObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class TransformObj : public OperatorObj { + public: + TransformObj(GraphObj *graph, Tensor input, Tensor output, float alpha, + float beta); + OP_CLONE(TransformObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + float getAlpha() const { 
return alphaValue; } + float getBeta() const { return betaValue; } + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + float alphaValue, betaValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class CastObj : public OperatorObj { + public: + enum CastType { + Float2Half = 0, + Float2Int64, + Float2Int32, + Float2Int16, + Float2Int8, + Int322Float, + Int322Int8, + Int322Int16, + Int162Float, + Int162Int32, + Int82Float, + Int82Int16, + Int82Int32, + Uint82Float, + Uint82Int32, + Uint82Int64, + Int322Int64, + Int642Int32, + Int642Uint32, + Int642Float, + Uint322Int64, + }; + CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type); + OP_CLONE(CastObj); + optional> inferShape(const TensorVec &inputs) const override; + vector inferDataType(const TensorVec &inputs) const override; + + std::string toString() const override; + CastType getType() const { return castType; } + DataType getOutputDataType() const; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + CastType castType; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class CumsumObj : public OperatorObj { + public: + CumsumObj(GraphObj *graph, Tensor input, Tensor output, int axis, + bool exclusive, bool reverse); + OP_CLONE(CumsumObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int getAxis() const { return axisValue; } + float getExclusive() const { return exclusiveValue; } + float getReverse() const { return reverseValue; } + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + int axisValue; + bool exclusiveValue, reverseValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class ArangeObj : public OperatorObj { + public: + 
ArangeObj(GraphObj *graph, float start, float step, int length, + Tensor output); + OP_CLONE(ArangeObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 0; } + int numOutputs() const override { return 1; } + float getStartValue() { return startValue; } + float getStepValue() { return stepValue; } + int getLength() { return lengthValue; } + + private: + float startValue, stepValue; + int lengthValue; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class ShapeObj : public OperatorObj { + public: + ShapeObj(GraphObj *graph, Tensor input, Tensor output); + OP_CLONE(ShapeObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } +}; + +class PReluObj : public OperatorObj { + public: + PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output); + OP_CLONE(PReluObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + int numInputs() const override { return 2; } + int numOutputs() const override { return 1; } + + private: + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + +class LogObj : public OperatorObj { + public: + enum LogType { + LogE = 0, + Log2, + Log10, + }; + LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type); + OP_CLONE(LogObj); + optional> inferShape(const TensorVec &inputs) const override; + + std::string toString() const override; + LogType getType() const { return logType; } + int numInputs() const override { return 1; } + int numOutputs() const override { return 1; } + + private: + LogType logType; + vector getWorkloadVector() const override; + vector getOpAttrVector() const override; +}; + #define DEFINE_UNARY_OBJ(prefix, type) \ class 
prefix##Obj : public UnaryObj { \ public: \ @@ -42,4 +280,28 @@ DEFINE_UNARY_OBJ(Tanh, OpType::Tanh) // DEFINE_UNARY_OBJ(Softmax, OpType::Softmax) DEFINE_UNARY_OBJ(Abs, OpType::Abs) +DEFINE_UNARY_OBJ(Sin, OpType::Sin) +DEFINE_UNARY_OBJ(Cos, OpType::Cos) +DEFINE_UNARY_OBJ(Tan, OpType::Tan) +DEFINE_UNARY_OBJ(ASin, OpType::ASin) +DEFINE_UNARY_OBJ(ACos, OpType::ACos) +DEFINE_UNARY_OBJ(ATan, OpType::ATan) +DEFINE_UNARY_OBJ(SinH, OpType::SinH) +DEFINE_UNARY_OBJ(CosH, OpType::CosH) +DEFINE_UNARY_OBJ(TanH, OpType::TanH) +DEFINE_UNARY_OBJ(ASinH, OpType::ASinH) +DEFINE_UNARY_OBJ(ACosH, OpType::ACosH) +DEFINE_UNARY_OBJ(ATanH, OpType::ATanH) + +DEFINE_UNARY_OBJ(Copy, OpType::Copy) +DEFINE_UNARY_OBJ(Ceil, OpType::Ceil) +DEFINE_UNARY_OBJ(Floor, OpType::Floor) +DEFINE_UNARY_OBJ(Erf, OpType::Erf) +DEFINE_UNARY_OBJ(Exp, OpType::Exp) +DEFINE_UNARY_OBJ(Neg, OpType::Neg) +DEFINE_UNARY_OBJ(Reciprocal, OpType::Reciprocal) +DEFINE_UNARY_OBJ(Sqrt, OpType::Sqrt) +DEFINE_UNARY_OBJ(Rsqrt, OpType::Rsqrt) +DEFINE_UNARY_OBJ(Round, OpType::Round) +DEFINE_UNARY_OBJ(Square, OpType::Square) }; // namespace infini diff --git a/pyinfinitensor/pyproject.toml b/pyinfinitensor/pyproject.toml index aca83ee3..8c438812 100644 --- a/pyinfinitensor/pyproject.toml +++ b/pyinfinitensor/pyproject.toml @@ -8,7 +8,7 @@ version = "0.0.0" authors = [{ name = "YdrMaster", email = "ydrml@hotmail.com" }] description = "Python frontend of InfiniTensor" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.7" keywords = ["optimizer"] license = { text = "Apache" } classifiers = ["Programming Language :: Python :: 3"] diff --git a/pyinfinitensor/src/pyinfinitensor/onnx.py b/pyinfinitensor/src/pyinfinitensor/onnx.py index 360f5aaa..a450b281 100644 --- a/pyinfinitensor/src/pyinfinitensor/onnx.py +++ b/pyinfinitensor/src/pyinfinitensor/onnx.py @@ -22,12 +22,17 @@ from onnx.checker import ( check_tensor, ) from onnx.shape_inference import infer_shapes +from onnx.numpy_helper import to_array from typing 
import Dict, List, Any, Tuple, Sequence, Union, Optional from functools import reduce -runtime = backend.runtime() class OnnxStub: + """ + The Onnx model imported into infinitensor. + It can be generated from an Onnx model object. + """ + inputs: Dict[str, backend.Tensor] = {} outputs: Dict[str, backend.Tensor] = {} initializer: Dict[int, TensorProto] = {} @@ -53,6 +58,8 @@ class OnnxStub: ) for initializer in model.graph.initializer: + dims = [d for d in initializer.dims] + tensors[initializer.name] = self.handler.tensor(dims, initializer.data_type) data[initializer.name] = initializer for node in model.graph.node: @@ -61,14 +68,81 @@ class OnnxStub: node, { "dilations": [1, 1], - "pads": [0, 0], + "pads": [0, 0, 0, 0], "strides": [1, 1], }, ) (d, p, s) = ( attributes[name] for name in ["dilations", "pads", "strides"] ) - tensors[node.output[0]] = self.handler.conv( + if p[0] != p[2] or p[1] != p[3]: + adapt = "{}-adapt".format(node.output[0]) + tensors[adapt] = self.handler.pad( + tensors[node.input[0]], None, p, [-2, -1] + ) + p = [0, 0, 0, 0] + else: + adapt = node.input[0] + + if len(node.input) > 2: + bias = "{}-bias".format(node.output[0]) + reshape = "{}-reshape".format(node.output[0]) + tensors[bias] = self.handler.conv( + tensors[adapt], + tensors[node.input[1]], + None, + p[0], + p[1], + s[0], + s[1], + d[0], + d[1], + ) + tensors[reshape] = self.handler.reshape( + tensors[node.input[2]], + None, + [ + 1, + reduce( + lambda acc, x: acc * x, + _search_shape(model, node.input[2]), + ), + 1, + 1, + ], + ) + tensors[node.output[0]] = self.handler.add( + tensors[bias], + tensors[reshape], + tensors.get(node.output[0]), + ) + else: + tensors[node.output[0]] = self.handler.conv( + tensors[adapt], + tensors[node.input[1]], + tensors.get(node.output[0]), + p[0], + p[1], + s[0], + s[1], + d[0], + d[1], + ) + elif node.op_type == "ConvTranspose": + attributes = _parse_attribute( + node, + { + "dilations": [1, 1], + "pads": [0, 0], + "strides": [1, 1], + 
"output_padding": [0, 0], + }, + ) + (d, p, s, op) = ( + attributes[name] + for name in ["dilations", "pads", "strides", "output_padding"] + ) + tensors[node.output[0]] = self.handler.convTransposed2d( tensors[node.input[0]], tensors[node.input[1]], tensors.get(node.output[0]), @@ -78,6 +152,8 @@ class OnnxStub: s[1], d[0], d[1], + op[0], + op[1], ) elif node.op_type == "MatMul": tensors[node.output[0]] = self.handler.matmul( @@ -129,7 +205,7 @@ class OnnxStub: { "kernel_shape": None, "dilations": [1, 1], - "pads": [0, 0], + "pads": [0, 0, 0, 0], "strides": [1, 1], }, ) @@ -137,56 +213,80 @@ class OnnxStub: attributes[name] for name in ["kernel_shape", "dilations", "pads", "strides"] ) - tensors[node.output[0]] = self.handler.maxPool( - tensors[node.input[0]], - tensors.get(node.output[0]), - k[0], - k[1], - d[0], - d[1], - p[0], - p[1], - s[0], - s[1], - ) + if p[0] != p[2] or p[1] != p[3]: + adapt = "{}-adapt".format(node.output[0]) + tensors[adapt] = self.handler.pad( + tensors.get(node.input[0]), None, p, [-2, -1] + ) + tensors[node.output[0]] = self.handler.maxPool( + tensors[adapt], + tensors.get(node.output[0]), + k[0], + k[1], + d[0], + d[1], + 0, + 0, + s[0], + s[1], + ) + else: + tensors[node.output[0]] = self.handler.maxPool( + tensors[node.input[0]], + tensors.get(node.output[0]), + k[0], + k[1], + d[0], + d[1], + p[0], + p[1], + s[0], + s[1], + ) elif node.op_type == "AveragePool": attributes = _parse_attribute( node, { "kernel_shape": None, - "pads": [0, 0], + "pads": [0, 0, 0, 0], "strides": [1, 1], }, ) (k, p, s) = ( attributes[name] for name in ["kernel_shape", "pads", "strides"] ) - tensors[node.output[0]] = self.handler.avgPool( - tensors[node.input[0]], - tensors.get(node.output[0]), - k[0], - k[1], - 1, - 1, - p[0], - p[1], - s[0], - s[1], - ) + if p[0] != p[2] or p[1] != p[3]: + adapt = "{}-adapt".format(node.output[0]) + tensors[adapt] = self.handler.pad( + tensors.get(node.input[0]), None, p, [-2, -1] + ) + tensors[node.output[0]] = 
self.handler.avgPool( + tensors[adapt], + tensors.get(node.output[0]), + k[0], + k[1], + 1, + 1, + 0, + 0, + s[0], + s[1], + ) + else: + tensors[node.output[0]] = self.handler.avgPool( + tensors[node.input[0]], + tensors.get(node.output[0]), + k[0], + k[1], + 1, + 1, + p[0], + p[1], + s[0], + s[1], + ) elif node.op_type == "GlobalAveragePool": - shape = next( - ( - value.type.tensor_type.shape - for value in model.graph.value_info - if value.name == node.input[0] - ), - None, - ) or next( - input.type.tensor_type.shape - for input in model.graph.input - if input.name == node.input[0] - ) - [_, _, h, w] = _take_shape_dim(shape) + [_, _, h, w] = _search_shape(model, node.input[0]) tensors[node.output[0]] = self.handler.avgPool( tensors[node.input[0]], tensors.get(node.output[0]), @@ -248,58 +348,123 @@ class OnnxStub: tensors[node.output[0]] = self.handler.softmax( tensors[node.input[0]], tensors.get(node.output[0]), - next((attr.i for attr in node.attribute if attr.name == "axis")), + next( + (attr.i for attr in node.attribute if attr.name == "axis"), -1 + ), ) elif node.op_type == "Abs": tensors[node.output[0]] = self.handler.abs( tensors[node.input[0]], tensors.get(node.output[0]), ) + elif node.op_type == "Shape": + tensors[node.output[0]] = self.handler.shape( + tensors[node.input[0]], + tensors.get(node.output[0]), + ) elif node.op_type == "Identity": tensors[node.output[0]] = self.handler.identity( tensors[node.input[0]], tensors.get(node.output[0]), ) elif node.op_type == "Flatten": - tensors[node.output[0]] = self.handler.flatten( tensors[node.input[0]], tensors.get(node.output[0]), next((attr.i for attr in node.attribute if attr.name == "axis")), ) - elif node.op_type == "Reshape": - input_shape = next( - ( - value.type.tensor_type.shape - for value in model.graph.value_info - if value.name == node.input[0] - ), - None, - ) or next( - input.type.tensor_type.shape - for input in model.graph.input - if input.name == node.input[0] + elif node.op_type == 
"PRelu": + tensors[node.output[0]] = self.handler.pRelu( + tensors[node.input[0]], + tensors[node.input[1]], + tensors.get(node.output[0]), ) - dims = _take_shape_dim(input_shape) + elif node.op_type == "Clip": + tensors[node.output[0]] = self.handler.clip( + tensors[node.input[0]], + tensors.get(node.output[0]), + next(_parse_data(data[node.input[1]]).__iter__(), None) + if len(node.input) > 1 + else None, + next(_parse_data(data[node.input[2]]).__iter__(), None) + if len(node.input) > 2 + else None, + ) + elif node.op_type == "Transpose": + perm = next( + (attr.ints for attr in node.attribute if attr.name == "perm"), None + ) + tensors[node.output[0]] = self.handler.transpose( + tensors[node.input[0]], + tensors.get(node.output[0]), + perm, + ) + elif node.op_type == "Reshape": + dims = _search_shape(model, node.input[0]) size = reduce(lambda acc, x: acc * x, dims) - output_shape = [int(i) for i in data[node.input[1]].int64_data] - for i, x in enumerate(output_shape): + input_shape = _parse_data(data[node.input[1]]) + for i, x in enumerate(input_shape): if x == 0: - output_shape[i] = dims[i] - temp = reduce(lambda acc, x: acc * x, output_shape) + input_shape[i] = dims[i] + temp = reduce(lambda acc, x: acc * x, input_shape, 1) if temp < 0: - output_shape[output_shape.index(-1)] = size // -temp + input_shape[input_shape.index(-1)] = size // -temp + tensors[node.output[0]] = self.handler.reshape( + tensors[node.input[0]], + tensors.get(node.output[0]), + input_shape, + ) + elif node.op_type == "Squeeze": + input_shape = _search_shape(model, node.input[0]) + axes = set( + [int(i) for i in data[node.input[1]].int64_data] + if len(node.input) > 1 + else _parse_attribute(node, {"axes": None})["axes"] + ) + assert all(input_shape[d] == 1 for d in axes) + output_shape = [] + for i, x in enumerate(input_shape): + if i not in axes: + output_shape.append(x) tensors[node.output[0]] = self.handler.reshape( tensors[node.input[0]], tensors.get(node.output[0]), output_shape, ) + 
elif node.op_type == "Unsqueeze": + input_shape = _search_shape(model, node.input[0]) + axes = ( + [int(i) for i in data[node.input[1]].int64_data] + if len(node.input) > 1 + else _parse_attribute(node, {"axes": None})["axes"] + ) + for i in axes: + input_shape.insert(i, 1) + tensors[node.output[0]] = self.handler.reshape( + tensors[node.input[0]], + tensors.get(node.output[0]), + input_shape, + ) elif node.op_type == "Concat": tensors[node.output[0]] = self.handler.concat( [tensors[name] for name in node.input], tensors.get(node.output[0]), next((attr.i for attr in node.attribute if attr.name == "axis")), ) + elif node.op_type == "Split": + for name, tensor in zip( + node.output, + self.handler.split( + tensors[node.input[0]], + None, + next( + (attr.i for attr in node.attribute if attr.name == "axis"), + 0, + ), + len(node.output), + ), + ): + tensors[name] = tensor elif node.op_type == "Gather": tensors[node.output[0]] = self.handler.gather( tensors[node.input[0]], @@ -331,6 +496,22 @@ class OnnxStub: _parse_data(data[node.input[1]]), _parse_data(data[node.input[3]]) if len(node.input) > 3 else None, ) + elif node.op_type == "Dropout": + for name, tensor in zip( + node.output, + self.handler.dropout( + tensors[node.input[0]], + tensors.get(node.output[0]), + tensors.get(node.output[1]) if len(node.output) > 1 else None, + _parse_data(data[node.input[1]])[0] + if len(node.input) > 1 + else 0.5, + _parse_data(data[node.input[2]])[0] + if len(node.input) > 2 + else False, + ), + ): + tensors[name] = tensor else: raise Exception('Unsupported operator "{}"'.format(node.op_type)) @@ -344,11 +525,11 @@ class OnnxStub: else: self.initializer[obj.fuid()] = tensor if tensor.data_type == TensorProto.INT32: - obj.copyin_int32([int(i) for i in tensor.int32_data]) + obj.copyin_int32(_parse_data(tensor)) elif tensor.data_type == TensorProto.INT64: - obj.copyin_int64([int(i) for i in tensor.int64_data]) + obj.copyin_int64(_parse_data(tensor)) elif tensor.data_type == 
TensorProto.FLOAT: - obj.copyin_float([int(i) for i in tensor.float_data]) + obj.copyin_float(_parse_data(tensor)) else: assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) @@ -398,14 +579,15 @@ class OnnxStub: self.count_in += 1 name = "input{}".format(self.count_in) self.names[tensor] = name - shape = tensor.shape() - dtype = backend.tensor_dtype(tensor) - value_info = make_tensor_value_info(name, dtype, shape) - check_value_info(value_info) - self.inputs.append(value_info) if init != None: init.name = name self.initializers.append(init) + else: + shape = tensor.shape() + dtype = backend.tensor_dtype(tensor) + value_info = make_tensor_value_info(name, dtype, shape) + check_value_info(value_info) + self.inputs.append(value_info) return name def push_data_input( @@ -417,11 +599,8 @@ class OnnxStub: vals: Any, ) -> str: name = "{}_{}".format(node_name, attr_name) - value_info = make_tensor_value_info(name, elem_type, shape) tensor = make_tensor(name, elem_type, shape, vals) - check_value_info(value_info) check_tensor(tensor) - self.inputs.append(value_info) self.initializers.append(tensor) return name @@ -459,20 +638,40 @@ class OnnxStub: for (i, it) in enumerate(op.outputs()) ] if ty == backend.OpType.Conv: - ph, pw, sh, sw, dh, dw = backend.conv_attrs_of(op) + ph, pw, dh, dw, sh, sw = backend.conv_attrs_of(op) ctx.push_node( make_node( ty.name, inputs, outputs, name, + pads=[ph, pw, ph, pw], + strides=[sh, sw], + dilations=[dh, dw], + group=op.inputs()[0].shape()[1] // op.inputs()[1].shape()[1], + ) + ) + elif ty == backend.OpType.ConvTrans: + ph, pw, sh, sw, dh, dw, oph, opw = backend.conv_trans_attrs_of(op) + ctx.push_node( + make_node( + "ConvTranspose", + inputs, + outputs, + name, pads=[ph, pw], strides=[sh, sw], dilations=[dh, dw], + output_padding=[oph, opw], ) ) elif ty == backend.OpType.Matmul: - ctx.push_node(make_node("MatMul", inputs, outputs, name)) + transA, transB = backend.matmul_attrs_of(op) + ctx.push_node( + make_node( + "Gemm", 
inputs, outputs, name, transA=transA, transB=transB + ) + ) elif ty == backend.OpType.BatchNorm: inputs = [inputs[i] for i in [0, 3, 4, 1, 2]] momentum, eps, training = backend.batch_norm_attrs_of(op) @@ -496,7 +695,7 @@ class OnnxStub: outputs, name, kernel_shape=[kh, kw], - pads=[ph, pw], + pads=[ph, pw, ph, pw], dilations=[dh, dw], strides=[sh, sw], ) @@ -510,7 +709,7 @@ class OnnxStub: outputs, name, kernel_shape=[kh, kw], - pads=[ph, pw], + pads=[ph, pw, ph, pw], strides=[sh, sw], ) ) @@ -526,17 +725,21 @@ class OnnxStub: backend.OpType.Softmax, backend.OpType.Abs, backend.OpType.Identity, + backend.OpType.PRelu, ]: ctx.push_node(make_node(ty.name, inputs, outputs, name)) elif ty == backend.OpType.Flatten: raise Exception("TODO") + elif ty == backend.OpType.Transpose: + perm = backend.transpose_permute_of(op) + ctx.push_node(make_node(ty.name, inputs, outputs, name, perm=perm)) elif ty == backend.OpType.Reshape: shape = backend.reshape_shape_of(op) inputs.append( ctx.push_data_input( name, "shape", - TensorProto.INT32, + TensorProto.INT64, [len(shape)], shape, ) @@ -545,29 +748,81 @@ class OnnxStub: elif ty == backend.OpType.Concat: axis = backend.concat_axis_of(op) ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis)) + elif ty == backend.OpType.Split: + axis = backend.split_axis_of(op) + num_outputs = len(outputs) + split = op.inputs()[0].shape()[axis] // num_outputs + inputs.append( + ctx.push_data_input( + name, + "split", + TensorProto.INT64, + [len(outputs)], + [split for _ in range(0, num_outputs)], + ) + ) + ctx.push_node( + make_node( + ty.name, + inputs, + outputs, + name, + axis=axis, + ) + ) elif ty == backend.OpType.Gather: axis = backend.gather_axis_of(op) ctx.push_node(make_node(ty.name, inputs, outputs, name, axis=axis)) elif ty == backend.OpType.ReduceMean: - axes = backend.reduce_mean_axes_of(op) + axes, keepdims = backend.reduce_mean_attrs_of(op) inputs.append( ctx.push_data_input( - name, "axes", TensorProto.INT32, 
[len(axes)], axes + name, "axes", TensorProto.INT64, [len(axes)], axes ) ) - ctx.push_node(make_node(ty.name, inputs, outputs, name, keepdims=1)) + ctx.push_node( + make_node(ty.name, inputs, outputs, name, keepdims=keepdims) + ) elif ty == backend.OpType.Slice: raise Exception("TODO") elif ty == backend.OpType.Pad: - raise Exception("TODO") + pads = backend.pad_pads_of(op) + inputs.append( + ctx.push_data_input( + name, "pads", TensorProto.INT64, [len(pads)], pads + ) + ) + ctx.push_node(make_node(ty.name, inputs, outputs, name)) + elif ty == backend.OpType.Clip: + min, max = backend.clip_attrs_of(op) + if min != None: + inputs.append( + ctx.push_data_input(name, "min", TensorProto.FLOAT, [], [min]) + ) + else: + inputs.append( + ctx.push_data_input(name, "min", TensorProto.FLOAT, [], []) + ) + if max != None: + inputs.append( + ctx.push_data_input(name, "max", TensorProto.FLOAT, [], [max]) + ) + else: + inputs.append( + ctx.push_data_input(name, "max", TensorProto.FLOAT, [], []) + ) + ctx.push_node(make_node(ty.name, inputs, outputs, name)) else: - raise Exception("Unsupported OpType {}".format(ty.name)) + raise Exception("Unsupported OpType", ty) return ctx.build(name) def init(self) -> None: self.handler.data_malloc() + def optimize(self) -> None: + self.handler.optimize() + def run(self) -> None: self.handler.run() @@ -576,9 +831,39 @@ def from_onnx(model: ModelProto, runtime): stub = OnnxStub(model, runtime) return stub.inputs, stub.outputs, stub.handler -def run_onnx(model: ModelProto, runtime): - stub = OnnxStub(model, runtime) - stub.run() + +def _search_shape(model: ModelProto, name: str) -> List[int]: + ans = ( + next( + ( + [ + (d.dim_value if d.dim_value > 0 else 1) + for d in tensor.type.tensor_type.shape.dim + ] + for tensor in model.graph.value_info + if tensor.name == name + ), + None, + ) + or next( + ( + [ + (d.dim_value if d.dim_value > 0 else 1) + for d in tensor.type.tensor_type.shape.dim + ] + for tensor in model.graph.input + if tensor.name 
== name + ), + None, + ) + or next( + [int(d) for d in tensor.dims] + for tensor in model.graph.initializer + if tensor.name == name + ) + ) + return ans + def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]: for attr in node.attribute: @@ -598,15 +883,8 @@ def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[st return attrs -def _parse_data(tensor: TensorProto) -> List[Union[int, float]]: - if tensor.data_type == TensorProto.INT32: - return [int(i) for i in tensor.int32_data] - elif tensor.data_type == TensorProto.INT64: - return [int(i) for i in tensor.int64_data] - elif tensor.data_type == TensorProto.FLOAT: - return [float(i) for i in tensor.float_data] - else: - assert False, "Unsupported Tensor Type: {}".format(tensor.data_type) +def _parse_data(tensor: TensorProto) -> List[Any]: + return to_array(tensor).flatten().tolist() def _take_shape_dim(shape: TensorShapeProto) -> List[int]: diff --git a/pyinfinitensor/tests/test_onnx.py b/pyinfinitensor/tests/test_onnx.py index 99ad3b9d..fd589eeb 100644 --- a/pyinfinitensor/tests/test_onnx.py +++ b/pyinfinitensor/tests/test_onnx.py @@ -7,18 +7,20 @@ from onnx.helper import ( make_graph, make_tensor_value_info, ) -from onnx.checker import check_model -from pyinfinitensor.onnx import from_onnx, backend, runtime, run_onnx +from onnx.checker import check_model, check_graph +from onnx.shape_inference import infer_shapes +from pyinfinitensor.onnx import from_onnx, OnnxStub, backend def make_and_import_model(graph: onnx.GraphProto): + check_graph(graph) model = make_model(graph) check_model(model) - from_onnx(model, runtime) + from_onnx(model, backend.cpu_runtime()) class TestStringMethods(unittest.TestCase): - #def test_run(self): + # def test_run(self): # model_file = next( # (name for name in os.listdir() if name.endswith(".onnx")), None # ) @@ -31,16 +33,17 @@ class TestStringMethods(unittest.TestCase): # run_onnx(onnx.load(model_file), runtime) def 
test_load(self): - model_file = next( - (name for name in os.listdir() if name.endswith(".onnx")), None - ) - if model_file != None: - print( - "model: {file}({size:.2f} MiB)".format( - file=model_file, size=os.path.getsize(model_file) / 1024 / 1024 + for model_file in os.listdir(): + if model_file.endswith(".onnx"): + print( + "model: {file}({size:.2f} MiB)".format( + file=model_file, size=os.path.getsize(model_file) / 1024 / 1024 + ) ) - ) - from_onnx(onnx.load(model_file), runtime) + model = OnnxStub(onnx.load(model_file), backend.cpu_runtime()).to_onnx( + "new" + ) + model = infer_shapes(model) def test_tensor(self): x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 2, 3]) @@ -55,7 +58,7 @@ class TestStringMethods(unittest.TestCase): ["i", "w"], ["o"], "conv", - pads=[1, 1], + pads=[1, 1, 1, 1], strides=[2, 1], dilations=[1, 2], ) @@ -102,7 +105,7 @@ class TestStringMethods(unittest.TestCase): ["y"], kernel_shape=[3, 3], dilations=[1, 1], - pads=[0, 0], + pads=[0, 0, 0, 0], strides=[2, 2], name="maxPool", ) @@ -116,7 +119,7 @@ class TestStringMethods(unittest.TestCase): ["x"], ["y"], kernel_shape=[3, 3], - pads=[0, 0], + pads=[0, 0, 0, 0], strides=[2, 2], name="avgPool", ) @@ -206,7 +209,7 @@ class TestStringMethods(unittest.TestCase): def test_flatten(self): x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7]) - y = make_tensor_value_info("y", TensorProto.FLOAT, [1*3, 5 * 7]) + y = make_tensor_value_info("y", TensorProto.FLOAT, [1 * 3, 5 * 7]) flatten = make_node("Flatten", ["x"], ["y"], axis=2, name="flatten") # make_and_import_model( make_graph([flatten], "flatten", [x], [y]) @@ -254,22 +257,19 @@ class TestStringMethods(unittest.TestCase): def test_slice(self): data = make_tensor_value_info("data", TensorProto.UINT32, [10, 64, 162, 162]) - output = make_tensor_value_info("output", TensorProto.UINT32, [1, 0, 99, 95]) - starts = make_tensor_value_info("starts", TensorProto.INT64, [4]) - starts_data = make_tensor("starts", TensorProto.INT64, 
[4], [2, 10, 1, 5]) - ends = make_tensor_value_info("ends", TensorProto.INT64, [4]) - ends_data = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100]) + output = make_tensor_value_info("output", TensorProto.UINT32, [1, 1, 99, 95]) + starts = make_tensor("starts", TensorProto.INT64, [4], [2, 9, 1, 5]) + ends = make_tensor("ends", TensorProto.INT64, [4], [3, 10, 100, 100]) slice = make_node("Slice", ["data", "starts", "ends"], ["output"], name="slice") - # FIXME 后端的实现是 axis:[start,end],onnx 的实现是 axis:[start,end) - # make_and_import_model( - make_graph( - [slice], - "slice", - [data, starts, ends], - [output], - [starts_data, ends_data], + make_and_import_model( + make_graph( + [slice], + "slice", + [data], + [output], + [starts, ends], + ) ) - # ) def test_pad(self): data = make_tensor_value_info("data", TensorProto.UINT32, [1, 64, 162, 162]) @@ -300,10 +300,10 @@ class TestStringMethods(unittest.TestCase): graph = make_graph([matmul, add], "lr", [x, a, b], [y]) model = make_model(graph) check_model(model) - from_onnx(model, runtime) + from_onnx(model, backend.cpu_runtime()) def test_frontend(self): - handler = backend.GraphHandler(runtime) + handler = backend.GraphHandler(backend.cpu_runtime()) a = handler.tensor([1, 2, 3], 12) b = handler.tensor([1, 2, 3], 12) c = handler.tensor([1, 2, 3], 12) diff --git a/src/core/graph.cc b/src/core/graph.cc index 90147320..f52f8af7 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -114,6 +114,15 @@ bool GraphObj::topo_sort() { return this->sorted = true; } +void GraphObj::optimize() { + for (auto &op : ops) { + switch (op->getOpType()) { + default: + break; + } + } +} + void GraphObj::dataMalloc() { for (auto &tensor : tensors) { tensor->dataMalloc(); diff --git a/src/core/graph_handler.cc b/src/core/graph_handler.cc index 424ca276..61db36ac 100644 --- a/src/core/graph_handler.cc +++ b/src/core/graph_handler.cc @@ -11,6 +11,8 @@ #include "operators/reshape.h" #include "operators/slice.h" #include 
"operators/softmax.h" +#include "operators/split.h" +#include "operators/transpose.h" #include "operators/unary.h" namespace infini { @@ -35,6 +37,24 @@ Tensor GraphHandlerObj::conv(Tensor input, Tensor weight, Tensor output, int ph, } } +Tensor GraphHandlerObj::convTransposed2d(Tensor input, Tensor weight, + Tensor output, int ph, int pw, int sh, + int sw, int dh, int dw, int oph, + int opw) { + if (output) { + g->addOpWithOutputs(std::move(input), + std::move(weight), output, ph, + pw, sh, sw, dh, dw, oph, opw); + return output; + } else { + return g + ->addOp(std::move(input), std::move(weight), + output, ph, pw, sh, sw, dh, dw, oph, + opw) + ->getOutput(); + } +} + Tensor GraphHandlerObj::matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB, Tensor bias, ActType act) { if (y) { @@ -128,9 +148,31 @@ DEFINE_UNARY_METHOD(relu, Relu) DEFINE_UNARY_METHOD(sigmoid, Sigmoid) DEFINE_UNARY_METHOD(tanh, Tanh) DEFINE_UNARY_METHOD(abs, Abs) +DEFINE_UNARY_METHOD(shape, Shape) + // see operators/reshape.h DEFINE_UNARY_METHOD(identity, Identity) +Tensor GraphHandlerObj::pRelu(Tensor x, Tensor slope, Tensor y) { + if (y) { + g->addOpWithOutputs(std::move(x), std::move(slope), y); + return y; + } else { + return g->addOp(std::move(x), std::move(slope), y) + ->getOutput(); + } +} + +Tensor GraphHandlerObj::clip(Tensor x, Tensor y, std::optional min, + std::optional max) { + if (y) { + g->addOpWithOutputs(std::move(x), y, min, max); + return y; + } else { + return g->addOp(std::move(x), y, min, max)->getOutput(); + } +} + Tensor GraphHandlerObj::softmax(Tensor input, Tensor output, int axis) { if (output) { g->addOpWithOutputs(std::move(input), output, axis); @@ -151,6 +193,16 @@ Tensor GraphHandlerObj::flatten(Tensor input, Tensor output, int axis) { } } +Tensor GraphHandlerObj::transpose(Tensor data, Tensor transposed, Shape perm) { + if (transposed) { + g->addOpWithOutputs(std::move(data), transposed, perm); + return transposed; + } else { + return 
g->addOp(std::move(data), transposed, perm) + ->getOutput(); + } +} + Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) { if (reshaped) { g->addOpWithOutputs(std::move(data), reshaped, @@ -171,6 +223,18 @@ Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) { } } +TensorVec GraphHandlerObj::split(Tensor input, std::optional outputs, + int axis, int num_outputs) { + if (outputs) { + g->addOpWithOutputs(std::move(input), outputs, axis, + num_outputs); + return *outputs; + } else { + return g->addOp(std::move(input), outputs, axis, num_outputs) + ->getOutputs(); + } +} + Tensor GraphHandlerObj::gather(Tensor data, Tensor indices, Tensor output, int axis) { if (output) { diff --git a/src/core/tensor.cc b/src/core/tensor.cc index fd5ddde4..e63039d5 100644 --- a/src/core/tensor.cc +++ b/src/core/tensor.cc @@ -64,79 +64,24 @@ vector TensorObj::getStride() const { void TensorObj::printData() const { IT_ASSERT(data != nullptr); - void *ptr = nullptr; - Blob buffer; - if (!runtime->isCpu()) { - buffer = NativeCpuRuntimeObj::getInstance()->allocBlob(getBytes()); - runtime->copyBlobToCPU(buffer->getPtr(), - getRawDataPtr(), getBytes()); - ptr = buffer->getPtr(); - } else - ptr = data->getPtr(); - if (dtype == DataType::Float32) - printDataFloat(static_cast(ptr)); - else if (dtype == DataType::UInt32) - printDataUint32_t(static_cast(ptr)); - else + if (!runtime->isCpu()) IT_TODO_HALT(); -} -void TensorObj::printDataFloat(float *ptr) const { - std::cout << "Tensor: " << guid << std::endl; - auto numDims = shape.size(); - auto dimSzVec = std::vector(numDims, 1); - dimSzVec[numDims - 1] = shape[numDims - 1]; - for (int i = numDims - 1; i != 0; --i) - dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; - for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { - if (iEnd > 1000 && i > 20 && i < iEnd - 20) { - printf("... 
, "); - i = iEnd - 20; - continue; - } - for (size_t j = 0; j < numDims; ++j) { - if (i % dimSzVec[j] == 0) { - std::cout << "["; - } - } - printf("%.1f", ptr[i]); - for (size_t j = 0; j < numDims; ++j) { - if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { - std::cout << "]"; - } - } - if (i != size() - 1) - std::cout << ", "; - if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1) - std::cout << std::endl; - } -} +#define TRY_PRINT(N) \ + if (dtype == DataType(N)) \ + std::cout << dataToString::t>() << std::endl; -void TensorObj::printDataUint32_t(uint32_t *ptr) const { - IT_ASSERT(data != nullptr); - std::cout << "Tensor: " << guid << std::endl; - auto numDims = shape.size(); - auto dimSzVec = std::vector(numDims, 1); - dimSzVec[numDims - 1] = shape[numDims - 1]; - for (int i = numDims - 1; i != 0; --i) - dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1]; - for (size_t i = 0, iEnd = size(); i < iEnd; ++i) { - for (size_t j = 0; j < numDims; ++j) { - if (i % dimSzVec[j] == 0) { - std::cout << "["; - } - } - std::cout << ptr[i]; - for (size_t j = 0; j < numDims; ++j) { - if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) { - std::cout << "]"; - } - } - if (i != size() - 1) - std::cout << ", "; - if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1) - std::cout << std::endl; - } + TRY_PRINT(0) // fmt: new line + else TRY_PRINT(1) // + else TRY_PRINT(2) // + else TRY_PRINT(3) // + else TRY_PRINT(4) // + else TRY_PRINT(5) // + else TRY_PRINT(6) // + else TRY_PRINT(7) // + else IT_TODO_HALT(); + +#undef TRY_PRINT } bool TensorObj::equalData(const Tensor &rhs, double relativeError) const { @@ -147,19 +92,27 @@ bool TensorObj::equalData(const Tensor &rhs, double relativeError) const { IT_ASSERT(rhs->getRuntime()->isCpu()); if (size() != rhs->size()) return false; - if (getDType() == DataType::UInt32) - return equalDataImpl(getRawDataPtr(), - rhs->getRawDataPtr(), size(), 0); - else if (getDType() == DataType::Float32) - return equalDataImpl(getRawDataPtr(), - 
rhs->getRawDataPtr(), size(), - relativeError); - else - IT_TODO_HALT(); + +#define TEST_EQUAL(N) \ + if (dtype == DataType(N)) \ + return equalDataImpl(getRawDataPtr::t *>(), \ + rhs->getRawDataPtr::t *>(), size()); + + TEST_EQUAL(0) // fmt: new line + else TEST_EQUAL(1) // + else TEST_EQUAL(2) // + else TEST_EQUAL(3) // + else TEST_EQUAL(4) // + else TEST_EQUAL(5) // + else TEST_EQUAL(6) // + else TEST_EQUAL(7) // + else IT_TODO_HALT(); + +#undef TEST_EQUAL } void TensorObj::dataMalloc() { - if (data == nullptr) + if (!data) data = runtime->allocBlob(getBytes()); } @@ -201,9 +154,9 @@ Shape TensorObj::getPosByOffset(size_t offset, Shape dim) const { size_t TensorObj::getOffsetByPos(Shape pos, Shape dim) const { int n = dim.size(); size_t offset = pos.at(0); - for (auto i = 1; i < n; i++) { + for (auto i = 1; i < n; i++) offset = offset * dim.at(i) + pos.at(i); - } + return offset; } @@ -213,10 +166,10 @@ size_t TensorObj::getOffsetByBroadcastOffset(size_t bcOffset, Shape pos = bcPos; int n = shape.size(); - for (auto i = 0; i < n; i++) { + for (auto i = 0; i < n; i++) if (shape.at(i) == 1) pos[i] = 0; - } + return getOffsetByPos(pos, shape); } }; // namespace infini diff --git a/src/cuda/cuda_runtime.cc b/src/cuda/cuda_runtime.cc index 94c7744a..37b5e7cf 100644 --- a/src/cuda/cuda_runtime.cc +++ b/src/cuda/cuda_runtime.cc @@ -1,6 +1,7 @@ #include "cuda/cuda_runtime.h" #include "core/kernel.h" #include "core/perf_engine.h" +#include "core/runtime.h" #include "operators/conv.h" #include "operators/matmul.h" namespace infini { @@ -16,10 +17,11 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const { auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()}; auto perfData = perfEngine.getPerfData(perfKey); // IT_ASSERT(perfData, "No perf data for OP " + op->toString()); - if (perfData) + if (perfData) { kernel->compute(op, perfData, this); - else + } else { kernel->compute(op, this); + } } } @@ -73,4 +75,4 @@ void CudaRuntimeObj::sync() const { 
checkCudaError(cudaDeviceSynchronize()); } string CudaRuntimeObj::toString() const { return "CUDA Runtime"; } -} // namespace infini \ No newline at end of file +} // namespace infini diff --git a/src/ffi/ffi_infinitensor.cc b/src/ffi/ffi_infinitensor.cc index 1b58abd5..6bb77256 100644 --- a/src/ffi/ffi_infinitensor.cc +++ b/src/ffi/ffi_infinitensor.cc @@ -3,15 +3,24 @@ #include "operators/concat.h" #include "operators/conv.h" #include "operators/gather.h" +#include "operators/matmul.h" +#include "operators/pad.h" #include "operators/pooling.h" #include "operators/reduce_mean.h" #include "operators/reshape.h" +#include "operators/split.h" +#include "operators/transpose.h" +#include "operators/unary.h" +#include #include #ifdef USE_CUDA #include "cuda/cuda_runtime.h" #include "cuda/operator_timer.h" #endif +#ifdef USE_BANG +#include "bang/bang_runtime.h" +#endif #ifdef USE_INTELCPU #include "intelcpu/mkl_runtime.h" #include "intelcpu/operator_timer.h" @@ -57,6 +66,7 @@ void export_values(py::module &m) { .VALUE(OpType, G2BMM) .VALUE(OpType, GBMM) .VALUE(OpType, Pad) + .VALUE(OpType, Clip) .VALUE(OpType, Slice) .VALUE(OpType, Concat) .VALUE(OpType, Split) @@ -78,11 +88,12 @@ void export_values(py::module &m) { .VALUE(OpType, Softmax) .VALUE(OpType, Activation) .VALUE(OpType, Relu) + .VALUE(OpType, PRelu) .VALUE(OpType, Sigmoid) .VALUE(OpType, Tanh) .VALUE(OpType, Abs) .VALUE(OpType, Resize) - .VALUE(OpType, MemBound) + .VALUE(OpType, Dropout) .export_values(); #undef VALUE @@ -112,6 +123,10 @@ static int tensor_dtype(Tensor t) { static Ref cuda_runtime() { return make_ref(); } #endif +#ifdef USE_BANG +static Ref bang_runtime() { return make_ref(); } +#endif + #ifdef USE_INTELCPU static Ref intelcpu_runtime() { return make_ref(); } #endif @@ -123,11 +138,27 @@ static std::tuple conv_attrs_of(Operator op) { conv->getDw(), conv->getSh(), conv->getSw()); } +static std::tuple +conv_trans_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::ConvTrans); + auto 
conv = dynamic_cast(op.get()); + auto [oph, opw] = conv->getOutputPadding(); + return std::make_tuple(conv->getPh(), conv->getPw(), conv->getDh(), + conv->getDw(), conv->getSh(), conv->getSw(), oph, + opw); +} + +static std::tuple matmul_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Matmul); + auto matmul = dynamic_cast(op.get()); + return std::make_tuple(matmul->getTransA(), matmul->getTransB()); +} + static std::tuple batch_norm_attrs_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::BatchNorm); auto batchnorm = dynamic_cast(op.get()); return std::make_tuple(batchnorm->getMomentum(), batchnorm->getEps(), - batchnorm->getTraining()); + batchnorm->getTrainingMode()); } static std::tuple @@ -140,45 +171,88 @@ pool_attrs_of(Operator op) { pool->getSh(), pool->getSw()); } +static std::tuple, std::optional> +clip_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Clip); + auto clip = dynamic_cast(op.get()); + return std::make_tuple(clip->getMin(), clip->getMax()); +} + +static std::tuple, bool> reduce_mean_attrs_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::ReduceMean); + auto reduce_mean = dynamic_cast(op.get()); + auto &set = reduce_mean->getAxes(); + return std::make_tuple(vector(set.begin(), set.end()), + reduce_mean->getKeepDims()); +} + static int concat_axis_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::Concat); return dynamic_cast(op.get())->getDim(); } +static int split_axis_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Split); + return dynamic_cast(op.get())->getDim(); +} + static int gather_axis_of(Operator op) { IT_ASSERT(op->getOpType() == OpType::Gather); return dynamic_cast(op.get())->getAxis(); } -static vector reduce_mean_axes_of(Operator op) { - IT_ASSERT(op->getOpType() == OpType::ReduceMean); - auto &set = dynamic_cast(op.get())->getAxes(); - return vector(set.begin(), set.end()); +static vector reshape_shape_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Reshape); + auto 
shape = dynamic_cast(op.get())->getShape(); + vector ans(shape.size()); + std::transform(shape.begin(), shape.end(), ans.begin(), + [](auto x) { return static_cast(x); }); + return ans; } -static Shape reshape_shape_of(Operator op) { - IT_ASSERT(op->getOpType() == OpType::Reshape); - return dynamic_cast(op.get())->getShape(); +static vector pad_pads_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Pad); + auto shape = dynamic_cast(op.get())->getPads(); + vector ans(shape.size()); + std::transform(shape.begin(), shape.end(), ans.begin(), + [](auto x) { return static_cast(x); }); + return ans; +} + +static vector transpose_permute_of(Operator op) { + IT_ASSERT(op->getOpType() == OpType::Transpose); + return dynamic_cast(op.get())->getPermute(); } void export_functions(py::module &m) { #define FUNCTION(NAME) def(#NAME, &NAME) + m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance) #ifdef USE_CUDA - m.def("runtime", cuda_runtime) -#elif USE_INTELCPU - m.def("runtime", intelcpu_runtime) -#else - m.def("runtime", &NativeCpuRuntimeObj::getInstance) + .def("cuda_runtime", cuda_runtime) +#endif +#ifdef USE_INTELCPU + .def("intelcpu_runtime", intelcpu_runtime) +#endif +#ifdef USE_CUDA + .FUNCTION(cuda_runtime) +#endif +#ifdef USE_BANG + .FUNCTION(bang_runtime) #endif - .FUNCTION(conv_attrs_of) + .FUNCTION(conv_trans_attrs_of) + .FUNCTION(matmul_attrs_of) .FUNCTION(batch_norm_attrs_of) .FUNCTION(pool_attrs_of) + .FUNCTION(clip_attrs_of) + .FUNCTION(reduce_mean_attrs_of) .FUNCTION(tensor_dtype) .FUNCTION(reshape_shape_of) + .FUNCTION(pad_pads_of) + .FUNCTION(transpose_permute_of) .FUNCTION(concat_axis_of) - .FUNCTION(gather_axis_of) - .FUNCTION(reduce_mean_axes_of); + .FUNCTION(split_axis_of) + .FUNCTION(gather_axis_of); #undef FUNCTION } @@ -191,6 +265,10 @@ void init_graph_builder(py::module &m) { #ifdef USE_CUDA py::class_, RuntimeObj>( m, "CudaRuntime"); +#endif +#ifdef USE_BANG + py::class_, RuntimeObj>( + m, "BangRuntime"); #endif py::class_>(m, "Tensor") 
.def("fuid", &TensorObj::getFuid, policy::automatic) @@ -215,6 +293,7 @@ void init_graph_builder(py::module &m) { .def(py::init()) .def("tensor", &Handler::tensor, policy::move) .def("conv", &Handler::conv, policy::move) + .def("convTransposed2d", &Handler::convTransposed2d, policy::move) .def("matmul", &Handler::matmul, policy::move) .def("batchNorm", &Handler::batchNorm, policy::move) .def("maxPool", &Handler::maxPool, policy::move) @@ -229,15 +308,21 @@ void init_graph_builder(py::module &m) { .def("tanh", &Handler::tanh, policy::move) .def("softmax", &Handler::softmax, policy::move) .def("abs", &Handler::abs, policy::move) + .def("shape", &Handler::shape, policy::move) .def("identity", &Handler::identity, policy::move) .def("flatten", &Handler::flatten, policy::move) + .def("pRelu", &Handler::pRelu, policy::move) + .def("clip", &Handler::clip, policy::move) + .def("transpose", &Handler::transpose, policy::move) .def("reshape", &Handler::reshape, policy::move) .def("concat", &Handler::concat, policy::move) + .def("split", &Handler::split, policy::move) .def("gather", &Handler::gather, policy::move) .def("reduce_mean", &Handler::reduceMean, policy::move) .def("slice", &Handler::slice, policy::move) .def("pad", &Handler::pad, policy::move) .def("topo_sort", &Handler::topo_sort, policy::automatic) + .def("optimize", &Handler::optimize, policy::automatic) .def("operators", &Handler::operators, policy::move) .def("data_malloc", &Handler::data_malloc, policy::automatic) .def("run", &Handler::run, policy::automatic); diff --git a/src/kernels/bang/activation.cc b/src/kernels/bang/activation.cc new file mode 100644 index 00000000..935e2746 --- /dev/null +++ b/src/kernels/bang/activation.cc @@ -0,0 +1,208 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class UnaryCnnl : public BangKernelWithoutConfig { + virtual cnnlActivationMode_t getOpType() const = 0; + virtual float getCoef() const 
= 0; + virtual tuple getAlphBeta() const { return {1.f, 0.f}; } + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + int len = dim.size(); + int size = 1; + for (int i = 0; i < len; ++i) { + size *= dim[i]; + } + + int dim_array[1] = {size}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY, + CNNL_DTYPE_FLOAT, 1, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, + CNNL_DTYPE_FLOAT, 1, dim_array)); + + // get op descriptor + cnnlActivationDescriptor_t opDesc; + checkCnnlError(cnnlCreateActivationDescriptor(&opDesc)); + checkCnnlError(cnnlSetActivationDescriptor( + opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef())); + + auto [alpha, beta] = getAlphBeta(); + cnnlStatus_t stat = + cnnlActivationForward(context->cnnlHandle(), opDesc, &alpha, aDesc, + aData, &beta, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyActivationDescriptor(opDesc)); + } +}; + +class RoundCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlRound(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class SquareCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlSquare(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class PReluCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + int alpha_array[4] = {1, 1, 1, 1}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, alpha_array)); + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = cnnlPrelu(context->cnnlHandle(), aDesc, aData, + bDesc, bData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class ReluCnnl : public UnaryCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_RELU; + } + float getCoef() const override { return 0.0; } +}; + +class SigmoidCnnl : public UnaryCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_SIGMOID; + } + float getCoef() const override { return 0.0; } +}; + +class TanhCnnl : public UnaryCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_TANH; + } + float getCoef() const override { return 0.0; } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Relu, DataType::Float32, ReluCnnl, + "Relu_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::PRelu, DataType::Float32, PReluCnnl, + "PRelu_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Sigmoid, DataType::Float32, SigmoidCnnl, + "Sigmoid_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Tanh, DataType::Float32, TanhCnnl, + "Tanh_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Round, DataType::Float32, RoundCnnl, + "Round_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Square, DataType::Float32, SquareCnnl, + "Square_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/activation_backward.cc b/src/kernels/bang/activation_backward.cc new file mode 100644 index 00000000..78be8c0d --- /dev/null +++ b/src/kernels/bang/activation_backward.cc @@ -0,0 +1,94 @@ +#include "operators/activation_backward.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class ActivationBackwardCnnl : public BangKernelWithoutConfig { + virtual cnnlActivationMode_t getOpType() const = 0; + virtual float getCoef() const = 0; + virtual tuple getAlphBeta() const { return {1.f, 0.f}; } + void compute(const Operator &_op, + 
const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const yData = (op->getInputs(0)->getRawDataPtr()); + void *const diffYData = (op->getInputs(1)->getRawDataPtr()); + void *const xData = (op->getInputs(2)->getRawDataPtr()); + void *const diffXData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t yDesc, diffYDesc, xDesc, diffXDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&yDesc)); + checkCnnlError(cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&diffYDesc)); + checkCnnlError(cnnlSetTensorDescriptor(diffYDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&xDesc)); + checkCnnlError(cnnlSetTensorDescriptor(xDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&diffXDesc)); + checkCnnlError(cnnlSetTensorDescriptor(diffXDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get op descriptor + cnnlActivationDescriptor_t opDesc; + checkCnnlError(cnnlCreateActivationDescriptor(&opDesc)); + checkCnnlError(cnnlSetActivationDescriptor( + opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef())); + + auto [alpha, beta] = getAlphBeta(); + cnnlStatus_t stat = cnnlActivationBackward( + context->cnnlHandle(), opDesc, &alpha, yDesc, yData, diffYDesc, + diffYData, xDesc, xData, &beta, diffXDesc, diffXData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(yDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(diffYDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(xDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(diffXDesc)); + checkCnnlError(cnnlDestroyActivationDescriptor(opDesc)); + } +}; + +class ReluBackwardCnnl : public ActivationBackwardCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_RELU; + } + float getCoef() const override { return 0.0; } +}; + +class SigmoidBackwardCnnl : public ActivationBackwardCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_SIGMOID; + } + float getCoef() const override { return 0.0; } +}; + +class TanhBackwardCnnl : public ActivationBackwardCnnl { + cnnlActivationMode_t getOpType() const override { + return CNNL_ACTIVATION_TANH; + } + float getCoef() const override { return 0.0; } +}; + +REGISTER_KERNEL(Device::BANG, OpType::ReluBackward, DataType::Float32, + ReluBackwardCnnl, "ReluBackward_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::SigmoidBackward, DataType::Float32, + SigmoidBackwardCnnl, "SigmoidBackward_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::TanhBackward, DataType::Float32, + TanhBackwardCnnl, "TanhBackward_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/cast.cc b/src/kernels/bang/cast.cc new file mode 100644 index 00000000..35da0190 --- /dev/null +++ b/src/kernels/bang/cast.cc @@ -0,0 +1,185 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class CastCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = 
op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + cnnlCastDataType_t NlCastType; + CastObj::CastType type = op->getType(); + switch (type) { + case CastObj::Float2Int64: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + NlCastType = CNNL_CAST_FLOAT_TO_INT64; + break; + case CastObj::Float2Int32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + NlCastType = CNNL_CAST_FLOAT_TO_INT32; + break; + case CastObj::Float2Int16: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array)); + NlCastType = CNNL_CAST_FLOAT_TO_INT16; + break; + case CastObj::Float2Int8: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array)); + NlCastType = CNNL_CAST_FLOAT_TO_INT8; + break; + case CastObj::Int322Float: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + NlCastType = CNNL_CAST_INT32_TO_FLOAT; + break; + case CastObj::Int322Int8: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array)); 
+ NlCastType = CNNL_CAST_INT32_TO_INT8; + break; + case CastObj::Int322Int16: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array)); + NlCastType = CNNL_CAST_INT32_TO_INT16; + break; + case CastObj::Int162Float: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + NlCastType = CNNL_CAST_INT16_TO_FLOAT; + break; + case CastObj::Int162Int32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + NlCastType = CNNL_CAST_INT16_TO_INT32; + break; + case CastObj::Int82Float: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + NlCastType = CNNL_CAST_INT8_TO_FLOAT; + break; + case CastObj::Int82Int16: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array)); + NlCastType = CNNL_CAST_INT8_TO_INT16; + break; + case CastObj::Int82Int32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + NlCastType = CNNL_CAST_INT8_TO_INT32; + break; + case CastObj::Uint82Float: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + NlCastType = 
CNNL_CAST_UINT8_TO_FLOAT; + break; + case CastObj::Uint82Int32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + NlCastType = CNNL_CAST_UINT8_TO_INT32; + break; + case CastObj::Uint82Int64: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + NlCastType = CNNL_CAST_UINT8_TO_INT64; + break; + case CastObj::Int322Int64: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + NlCastType = CNNL_CAST_INT32_TO_INT64; + break; + case CastObj::Int642Int32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array)); + NlCastType = CNNL_CAST_INT64_TO_INT32; + break; + case CastObj::Int642Uint32: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array)); + NlCastType = CNNL_CAST_INT64_TO_UINT32; + break; + case CastObj::Int642Float: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + NlCastType = CNNL_CAST_INT64_TO_FLOAT; + break; + case CastObj::Uint322Int64: + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array)); + NlCastType = 
CNNL_CAST_UINT32_TO_INT64; + break; + default: + IT_TODO_HALT(); + } + cnnlStatus_t stat = cnnlCastDataType(context->cnnlHandle(), aDesc, + aData, NlCastType, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl, + "Cast_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/ceil.cc b/src/kernels/bang/ceil.cc new file mode 100644 index 00000000..5770f412 --- /dev/null +++ b/src/kernels/bang/ceil.cc @@ -0,0 +1,46 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class CeilCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlCeil(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Ceil, DataType::Float32, CeilCnnl, + "Ceil_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/clip.cc b/src/kernels/bang/clip.cc new file mode 100644 index 00000000..bdfb473b --- /dev/null +++ b/src/kernels/bang/clip.cc @@ -0,0 +1,42 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class ClipCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + float min = op->getMin().value(); + float max = op->getMax().value(); + + cnnlTensorDescriptor_t aDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlClip(context->cnnlHandle(), aDesc, aData, &min, &max, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Clip, DataType::Float32, ClipCnnl, + "Clip_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/concat.cc b/src/kernels/bang/concat.cc new file mode 100644 index 00000000..1bfc1d33 --- /dev/null +++ b/src/kernels/bang/concat.cc @@ -0,0 +1,68 @@ +#include "operators/concat.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class ConcatCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + int num = op->numInputs(); + int axis = op->getDim(); + void *argv[num]; + for (int i = 0; i < num; ++i) { + argv[i] = op->getInputs(i)->getRawDataPtr(); + } + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t desc; + + int dim_array[num][4]; + for (int i = 0; i < num; ++i) { + auto dim = op->getInputs(i)->getDims(); + if (dim.size() != 4) { + IT_TODO_HALT(); + } + dim_array[i][0] = dim[0]; + dim_array[i][1] = dim[1]; + dim_array[i][2] = dim[2]; + dim_array[i][3] = dim[3]; + } + + auto dim = op->getOutput()->getDims(); + int dimout_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + + checkCnnlError(cnnlCreateTensorDescriptor(&desc)); + checkCnnlError(cnnlSetTensorDescriptor( + desc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dimout_array)); + cnnlTensorDescriptor_t descArray[num]; + for (int i = 0; i < num; ++i) { + checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i])); + checkCnnlError( + cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array[i])); + } + + size_t wsSize; + cnnlGetConcatWorkspaceSize(context->cnnlHandle(), num, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlConcat(context->cnnlHandle(), num, axis, descArray, argv, + wsData, wsSize, desc, cData); + if 
(stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + for (int i = 0; i < num; ++i) { + checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i])); + } + checkCnnlError(cnnlDestroyTensorDescriptor(desc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Concat, DataType::Float32, ConcatCnnl, + "Concat_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/conv_trans.cc b/src/kernels/bang/conv_trans.cc new file mode 100644 index 00000000..3e22c03d --- /dev/null +++ b/src/kernels/bang/conv_trans.cc @@ -0,0 +1,88 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/conv.h" + +namespace infini { +class ConvTransCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); + const auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); + const int cpg = op->getChannelPerGroup(); + const int g = c / cpg; + + int pad[4] = {ph, ph, pw, pw}; + int stride[2] = {sh, sw}; + int dilation[2] = {dh, dw}; + + cnnlConvolutionDescriptor_t convDesc; + checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc)); + checkCnnlError(cnnlSetConvolutionDescriptor( + convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT)); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dimInputs0 = op->getInputs(0)->getDims(); + auto dimInputs1 = op->getInputs(1)->getDims(); + auto dimOutput = op->getOutput()->getDims(); + + if (dimInputs0.size() != 4) + IT_TODO_HALT(); + if (dimInputs1.size() != 4) + IT_TODO_HALT(); + if (dimOutput.size() != 4) + IT_TODO_HALT(); + + int 
inputs0[4] = {dimInputs0[0], dimInputs0[1], dimInputs0[2], + dimInputs0[3]}; + int inputs1[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2], + dimInputs1[3]}; + int output[4] = {dimOutput[0], dimOutput[1], dimOutput[2], + dimOutput[3]}; + + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, inputs0)); + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, inputs1)); + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, output)); + + cnnlConvolutionBwdDataAlgo_t algo; + cnnlGetConvolutionBackwardDataAlgorithm( + context->cnnlHandle(), aDesc, bDesc, convDesc, cDesc, + CNNL_CONVOLUTION_BWD_DATA_FASTEST, &algo); + size_t wsSize; + cnnlGetConvolutionBackwardDataWorkspaceSize(context->cnnlHandle(), + aDesc, bDesc, convDesc, + cDesc, algo, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = cnnlConvolutionBackwardData( + context->cnnlHandle(), NULL, aDesc, aData, bDesc, bData, convDesc, + algo, wsData, wsSize, NULL, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::ConvTrans, DataType::Float32, + ConvTransCnnl, "ConvTrans_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/convbpfilter.cc b/src/kernels/bang/convbpfilter.cc new file mode 100644 index 00000000..b360cedb --- /dev/null +++ b/src/kernels/bang/convbpfilter.cc @@ -0,0 +1,159 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/conv.h" + +namespace infini { +class ConvBackwardFilterCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); + const auto [n, c, h, w, f, r, s] = op->getNCHWFRS(); + const int cpg = op->getChannelPerGroup(); + const int g = c / cpg; + + int pad[4] = {ph, ph, pw, pw}; + int stride[2] = {sh, sw}; + int dilation[2] = {dh, dw}; + + cnnlConvolutionDescriptor_t convDesc; + checkCnnlError(cnnlCreateConvolutionDescriptor(&convDesc)); + checkCnnlError(cnnlSetConvolutionDescriptor( + convDesc, 4, pad, stride, dilation, g, CNNL_DTYPE_FLOAT)); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc, aDescTrans, bDescTrans, + cDescTrans; + auto dimInputs0 = op->getInputs(0)->getDims(); + auto dimInputs1 = op->getInputs(1)->getDims(); + auto dimOutput = op->getOutput()->getDims(); + + if (dimInputs0.size() != 4) + IT_TODO_HALT(); + if (dimInputs1.size() != 4) + IT_TODO_HALT(); + if (dimOutput.size() != 4) + IT_TODO_HALT(); + + int inputs0Array[4] = 
{dimInputs0[0], dimInputs0[1], dimInputs0[2], + dimInputs0[3]}; + int inputs1Array[4] = {dimInputs1[0], dimInputs1[1], dimInputs1[2], + dimInputs1[3]}; + int outputArray[4] = {dimOutput[0], dimOutput[1], dimOutput[2], + dimOutput[3]}; + + int inputs0ArrayTrans[4] = {dimInputs0[0], dimInputs0[2], dimInputs0[3], + dimInputs0[1]}; + int inputs1ArrayTrans[4] = {dimInputs1[0], dimInputs1[2], dimInputs1[3], + dimInputs1[1]}; + int outputArrayTrans[4] = {dimOutput[0], dimOutput[2], dimOutput[3], + dimOutput[1]}; + + int transMode[4] = {0, 2, 3, 1}; + cnnlTransposeDescriptor_t transDesc; + checkCnnlError(cnnlCreateTransposeDescriptor(&transDesc)); + checkCnnlError(cnnlSetTransposeDescriptor(transDesc, 4, transMode)); + + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs0Array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&aDescTrans)); + checkCnnlError(cnnlSetTensorDescriptor(aDescTrans, CNNL_LAYOUT_NHWC, + CNNL_DTYPE_FLOAT, 4, + inputs0ArrayTrans)); + + size_t wsTrans1Size = dimInputs0[0] * dimInputs0[1] * dimInputs0[2] * + dimInputs0[3] * sizeof(float); + BangPtr wsTrans1Data = context->getWorkspace(wsTrans1Size); + + cnnlStatus_t stat = + cnnlTranspose(context->cnnlHandle(), transDesc, aDesc, aData, + aDescTrans, wsTrans1Data); + if (stat != CNNL_STATUS_SUCCESS) + return; + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + bDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, inputs1Array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDescTrans)); + checkCnnlError(cnnlSetTensorDescriptor(bDescTrans, CNNL_LAYOUT_NHWC, + CNNL_DTYPE_FLOAT, 4, + inputs1ArrayTrans)); + + size_t wsTrans2Size = dimInputs1[0] * dimInputs1[1] * dimInputs1[2] * + dimInputs1[3] * sizeof(float); + BangPtr wsTrans2Data = context->getWorkspace(wsTrans2Size); + + stat = cnnlTranspose(context->cnnlHandle(), transDesc, bDesc, bData, + 
bDescTrans, wsTrans2Data); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, outputArray)); + + checkCnnlError(cnnlCreateTensorDescriptor(&cDescTrans)); + checkCnnlError(cnnlSetTensorDescriptor(cDescTrans, CNNL_LAYOUT_NHWC, + CNNL_DTYPE_FLOAT, 4, + outputArrayTrans)); + + size_t wsTrans3Size = dimOutput[0] * dimOutput[1] * dimOutput[2] * + dimOutput[3] * sizeof(float); + BangPtr wsTrans3Data = context->getWorkspace(wsTrans3Size); + + cnnlConvolutionBwdFilterAlgo_t algo; + cnnlGetConvolutionBackwardFilterAlgorithm( + context->cnnlHandle(), convDesc, aDescTrans, bDescTrans, cDescTrans, + CNNL_CONVOLUTION_BWD_FILTER_FASTEST, &algo); + + size_t wsSize; + cnnlGetConvolutionBackwardFilterWorkspaceSize( + context->cnnlHandle(), aDescTrans, bDescTrans, cDescTrans, convDesc, + algo, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + stat = cnnlConvolutionBackwardFilter( + context->cnnlHandle(), NULL, aDescTrans, wsTrans1Data, bDescTrans, + wsTrans2Data, convDesc, algo, wsData, wsSize, NULL, cDescTrans, + wsTrans3Data); + if (stat != CNNL_STATUS_SUCCESS) + return; + + int transMode2[4] = {0, 3, 1, 2}; + cnnlTransposeDescriptor_t transOutputDesc; + checkCnnlError(cnnlCreateTransposeDescriptor(&transOutputDesc)); + checkCnnlError( + cnnlSetTransposeDescriptor(transOutputDesc, 4, transMode2)); + + stat = cnnlTranspose(context->cnnlHandle(), transOutputDesc, cDescTrans, + wsTrans3Data, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(aDescTrans)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDescTrans)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDescTrans)); + checkCnnlError(cnnlDestroyTransposeDescriptor(transDesc)); + checkCnnlError(cnnlDestroyTransposeDescriptor(transOutputDesc)); + checkCnnlError(cnnlDestroyConvolutionDescriptor(convDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::ConvBackwardFilter, DataType::Float32, + ConvBackwardFilterCnnl, "ConvBackwardFilter_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/copy.cc b/src/kernels/bang/copy.cc new file mode 100644 index 00000000..37987729 --- /dev/null +++ b/src/kernels/bang/copy.cc @@ -0,0 +1,46 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class CopyCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlCopy(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // 
Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Copy, DataType::Float32, CopyCnnl, + "Copy_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/det.cc b/src/kernels/bang/det.cc new file mode 100644 index 00000000..02726e69 --- /dev/null +++ b/src/kernels/bang/det.cc @@ -0,0 +1,53 @@ +#include "operators/det.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class DetCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + DetObj::Mode mode = op->getMode(); + cnnlDetMode_t nlMode; + if (mode == DetObj::LogDet) { + nlMode = CNNL_DET_MODE_LOGDET; + } else { + nlMode = CNNL_DET_MODE_DET; + } + cnnlTensorDescriptor_t aDesc, cDesc; + auto dimin = op->getInputs(0)->getDims(); + auto dimout = op->getOutput()->getDims(); + if (dimin.size() != 4 || dimout.size() != 2) + IT_TODO_HALT(); + + int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]}; + int dimout_array[2] = {dimout[0], dimout[1]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, dimout_array)); + + cnnlStatus_t stat = + cnnlDet(context->cnnlHandle(), nlMode, aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. 
But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Det, DataType::Float32, DetCnnl, + "Det_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/element_wise.cc b/src/kernels/bang/element_wise.cc index 74505e9b..bca41e6f 100644 --- a/src/kernels/bang/element_wise.cc +++ b/src/kernels/bang/element_wise.cc @@ -1,5 +1,4 @@ #include "operators/element_wise.h" -#include "bang/bang_element_wise.h" #include "bang/bang_kernel_without_config.h" #include "bang/bang_runtime.h" @@ -66,6 +65,514 @@ class ElementWiseCnnl : public BangKernelWithoutConfig { } }; +class LogicOpCnnl : public BangKernelWithoutConfig { + virtual cnnlLogicOp_t getOpType() const = 0; + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + size_t wsSize; + cnnlGetLogicOpWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc, + &wsSize); + + BangPtr wsData = 
context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlLogicOp(context->cnnlHandle(), getOpType(), aDesc, aData, bDesc, + bData, wsData, wsSize, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class BitComputeCnnl : public BangKernelWithoutConfig { + virtual cnnlBitComputeOp_t getOpType() const = 0; + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_INT32, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_INT32, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_INT32, 4, dim_array)); + + size_t wsSize; + cnnlGetBitComputeWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, + cDesc, &wsSize); + + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlBitCompute_v2(context->cnnlHandle(), getOpType(), aDesc, aData, + bDesc, bData, cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG 
does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class DivCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + size_t wsSize; + cnnlGetDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc, + &wsSize); + + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = cnnlDiv_v2( + context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc, + aData, bDesc, bData, wsData, wsSize, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class MaximumCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get op descriptor + size_t wsSize; + cnnlGetMaximumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlMaximum(context->cnnlHandle(), aDesc, aData, bDesc, bData, + cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class MinimumCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get op descriptor + size_t wsSize; + cnnlGetMinimumWorkspaceSize(context->cnnlHandle(), cDesc, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlMinimum(context->cnnlHandle(), aDesc, aData, bDesc, bData, + cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class MSELossCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + MSELossObj::Reduction reduction = op->getReduction(); + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + int dim_out[4] = {1, 1, 1, 1}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + if (reduction == MSELossObj::None) { + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array)); + } else { + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_out)); + } + cnnlStatus_t stat; + if (reduction == MSELossObj::None) { + stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_NONE, aDesc, + aData, bDesc, bData, cDesc, cData); + } else if (reduction == MSELossObj::Sum) { + stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_SUM, aDesc, + aData, bDesc, bData, cDesc, cData); + } else { + stat = cnnlMSELoss(context->cnnlHandle(), CNNL_MSE_LOSS_MEAN, aDesc, + aData, bDesc, bData, cDesc, cData); + } + + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in 
BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class PowerCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get op descriptor + size_t wsSize; + cnnlGetPowWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc, + &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlPow(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + aDesc, aData, bDesc, bData, wsData, wsSize, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class FloorDivCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + size_t wsSize; + cnnlGetFloorDivWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc, + &wsSize); + + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = cnnlFloorDiv_v2( + context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, aDesc, + aData, bDesc, bData, cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class FloorModCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + size_t wsSize; + cnnlGetFloorModWorkspaceSize(context->cnnlHandle(), aDesc, bDesc, cDesc, + &wsSize); + + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlFloorMod(context->cnnlHandle(), aDesc, aData, bDesc, bData, + cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +class SquaredDifferenceCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const bData = (op->getInputs(1)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, bDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + checkCnnlError(cnnlCreateTensorDescriptor(&bDesc)); + checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + size_t wsSize; + cnnlGetSquaredDifferenceWorkspaceSize(context->cnnlHandle(), aDesc, + bDesc, cDesc, &wsSize); + + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlSquaredDifference(context->cnnlHandle(), aDesc, aData, bDesc, + bData, cDesc, cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(bDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + class AddCnnl : public ElementWiseCnnl { cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_ADD; } }; @@ -81,12 +588,57 @@ class MulCnnl : public ElementWiseCnnl { cnnlOpTensorDesc_t getOpType() const override { return CNNL_OP_TENSOR_MUL; } }; -class ElementWiseBang : public BangKernelWithoutConfig { - void compute(const Operator &_op, - const RuntimeObj *_context) const override { - element_wise_kernel(_context, _op); - } +class EqualCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_EQ; } }; +class NotEqualCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NE; } +}; +class GreaterThanCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GT; } +}; +class GreaterEqualCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_GE; } +}; +class LessThanCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LT; } +}; +class LessEqualCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_LE; } +}; +class AndCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_AND; } +}; +class OrCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_OR; } +}; +class XorCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_XOR; } +}; +class NotCnnl : public LogicOpCnnl { + cnnlLogicOp_t getOpType() const override { return CNNL_LOGIC_OP_NOT; } +}; + +class BitAndCnnl : public BitComputeCnnl { + cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BAND_OP; } +}; +class BitOrCnnl : public BitComputeCnnl { + cnnlBitComputeOp_t getOpType() 
const override { return CNNL_CYCLE_BOR_OP; } +}; +class BitXorCnnl : public BitComputeCnnl { + cnnlBitComputeOp_t getOpType() const override { return CNNL_CYCLE_BXOR_OP; } +}; +class BitNotCnnl : public BitComputeCnnl { + cnnlBitComputeOp_t getOpType() const override { return CNNL_BNOT_OP; } +}; +// class BitLeftShiftCnnl : public BitComputeCnnl { +// cnnlBitComputeOp_t getOpType() const override { return +// CNNL_BLEFT_SHIFT_OP_V2; } +// }; +// class BitRightShiftCnnl : public BitComputeCnnl { +// cnnlBitComputeOp_t getOpType() const override { return +// CNNL_BLEFT_SHIFT_OP_V2; } +// }; REGISTER_KERNEL(Device::BANG, OpType::Add, DataType::Float32, AddCnnl, "Add_cnnl_BANG_Float32"); @@ -95,8 +647,56 @@ REGISTER_KERNEL(Device::BANG, OpType::Sub, DataType::Float32, SubCnnl, REGISTER_KERNEL(Device::BANG, OpType::Mul, DataType::Float32, MulCnnl, "Mul_cnnl_BANG_Float32"); -REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, ElementWiseBang, - "Div_Bang_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Div, DataType::Float32, DivCnnl, + "Div_cnnl_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Maximum, DataType::Float32, MaximumCnnl, + "Maximum_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Minimum, DataType::Float32, MinimumCnnl, + "Minimum_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::MSELoss, DataType::Float32, MSELossCnnl, + "MSELoss_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Power, DataType::Float32, PowerCnnl, + "Power_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::FloorDiv, DataType::Float32, FloorDivCnnl, + "FloorDiv_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::FloorMod, DataType::Float32, FloorModCnnl, + "FloorMod_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::SquaredDifference, DataType::Float32, + SquaredDifferenceCnnl, "SquaredDifference_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Equal, DataType::Float32, EqualCnnl, + "Equal_cnnl_BANG_Float32"); 
+REGISTER_KERNEL(Device::BANG, OpType::NotEqual, DataType::Float32, NotEqualCnnl, + "NotEqual_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::GreaterThan, DataType::Float32, + GreaterThanCnnl, "GreaterThan_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::GreaterEqual, DataType::Float32, + GreaterEqualCnnl, "GreaterEqual_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::LessThan, DataType::Float32, LessThanCnnl, + "LessThan_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::LessEqual, DataType::Float32, + LessEqualCnnl, "LessEqual_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::And, DataType::Float32, AndCnnl, + "And_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Or, DataType::Float32, OrCnnl, + "Or_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Xor, DataType::Float32, XorCnnl, + "Xor_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Not, DataType::Float32, NotCnnl, + "Not_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::BitAnd, DataType::Float32, BitAndCnnl, + "BitAnd_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::BitOr, DataType::Float32, BitOrCnnl, + "BitOr_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::BitXor, DataType::Float32, BitXorCnnl, + "BitXor_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::BitNot, DataType::Float32, BitNotCnnl, + "BitNot_cnnl_BANG_Float32"); +// REGISTER_KERNEL(Device::BANG, OpType::BitLeftShift, DataType::Float32, +// BitLeftShiftCnnl, +// "BitLeftShift_cnnl_BANG_Float32"); +// REGISTER_KERNEL(Device::BANG, OpType::BitRightShift, DataType::Float32, +// BitRightShiftCnnl, +// "BitRightShift_cnnl_BANG_Float32"); // REGISTER_KERNEL(Device::BANG, OpType::Pow, DataType::Float32, // ElementWiseBang, // "Pow_Bang_Float32"); diff --git a/src/kernels/bang/erf.cc b/src/kernels/bang/erf.cc new file mode 100644 index 00000000..86c1e3a3 --- /dev/null +++ b/src/kernels/bang/erf.cc @@ -0,0 +1,47 @@ +#include 
"bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class ErfCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlErf_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Erf, DataType::Float32, ErfCnnl, + "Erf_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/exp.cc b/src/kernels/bang/exp.cc new file mode 100644 index 00000000..9d7d31f4 --- /dev/null +++ b/src/kernels/bang/exp.cc @@ -0,0 +1,47 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class ExpCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlExp_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Exp, DataType::Float32, ExpCnnl, + "Exp_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/fill.cc b/src/kernels/bang/fill.cc new file mode 100644 index 00000000..0f8fb846 --- /dev/null +++ b/src/kernels/bang/fill.cc @@ -0,0 +1,40 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class FillCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const cData = (op->getOutput()->getRawDataPtr()); + float value = op->getValue(); + + cnnlTensorDescriptor_t cDesc; + auto dim = op->getOutput()->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlFill(context->cnnlHandle(), value, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Fill, DataType::Float32, FillCnnl, + "Fill_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/floor.cc b/src/kernels/bang/floor.cc new file mode 100644 index 00000000..a0f2a082 --- /dev/null +++ b/src/kernels/bang/floor.cc @@ -0,0 +1,46 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class FloorCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlFloor(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Floor, DataType::Float32, FloorCnnl, + "Floor_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/hardtanh.cc b/src/kernels/bang/hardtanh.cc new file mode 100644 index 00000000..1c4ad697 --- /dev/null +++ b/src/kernels/bang/hardtanh.cc @@ -0,0 +1,42 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class HardtanhCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + float min = op->getMin(); + float max = op->getMax(); + + cnnlTensorDescriptor_t aDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = cnnlHardtanh(context->cnnlHandle(), aDesc, aData, + max, min, aDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Hardtanh, DataType::Float32, HardtanhCnnl, + "Hardtanh_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/l2loss.cc b/src/kernels/bang/l2loss.cc new file mode 100644 index 00000000..d7c66859 --- /dev/null +++ b/src/kernels/bang/l2loss.cc @@ -0,0 +1,40 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class L2LossCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlL2Loss(context->cnnlHandle(), aDesc, aData, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::L2Loss, DataType::Float32, L2LossCnnl, + "L2Loss_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/log.cc b/src/kernels/bang/log.cc new file mode 100644 index 00000000..4976b1ca --- /dev/null +++ b/src/kernels/bang/log.cc @@ -0,0 +1,62 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class LogCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + auto type = op->getType(); + cnnlLogBase_t base; + switch (type) { + case LogObj::Log2: + base = CNNL_LOG_2; + break; + case LogObj::LogE: + base = CNNL_LOG_E; + break; + case LogObj::Log10: + base = CNNL_LOG_10; + break; + default: + IT_TODO_HALT(); + } + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlLog_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + base, aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Log, DataType::Float32, LogCnnl, + "Log_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/matmul.cc b/src/kernels/bang/matmul.cc index b30ecb87..98f30947 100644 --- a/src/kernels/bang/matmul.cc +++ b/src/kernels/bang/matmul.cc @@ -18,19 +18,27 @@ class MatmulCnnl : public BangKernelWithoutConfig { auto dimInputs0 = op->getInputs(0)->getDims(); auto dimInputs1 = op->getInputs(1)->getDims(); auto dimOutput = op->getOutput()->getDims(); - if (dimInputs0.size() != 3) - IT_TODO_HALT(); - if (dimInputs1.size() != 3) - IT_TODO_HALT(); - if (dimOutput.size() != 3) - IT_TODO_HALT(); + int input0_batch_size = 1; + int input1_batch_size = 1; + int output_batch_size = 1; + for (size_t i = 0; i < dimInputs0.size() - 2; ++i) { + input0_batch_size *= dimInputs0[i]; + input1_batch_size *= dimInputs1[i]; + output_batch_size *= dimOutput[i]; + } bool transA = op->getTransA(); bool transB = op->getTransB(); - int inputs0Array[3] = {dimInputs0[0], dimInputs0[1], dimInputs0[2]}; - int inputs1Array[3] = {dimInputs1[0], dimInputs1[1], dimInputs1[2]}; - int outputArray[3] = {dimOutput[0], dimOutput[1], dimOutput[2]}; + int inputs0Array[3] = {input0_batch_size, + dimInputs0[dimInputs0.size() - 2], + dimInputs0[dimInputs0.size() - 1]}; + int inputs1Array[3] = {input1_batch_size, + dimInputs1[dimInputs1.size() - 2], + dimInputs1[dimInputs1.size() - 1]}; + int outputArray[3] = {output_batch_size, + dimOutput[dimOutput.size() - 2], + dimOutput[dimOutput.size() - 1]}; // get inputs checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); diff --git a/src/kernels/bang/negtensor.cc b/src/kernels/bang/negtensor.cc new file mode 100644 index 00000000..b0171120 --- /dev/null +++ b/src/kernels/bang/negtensor.cc @@ -0,0 +1,46 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include 
"operators/unary.h" + +namespace infini { +class NegTensorCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlNegTensor(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. 
+ checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Neg, DataType::Float32, NegTensorCnnl, + "Neg_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/pad.cc b/src/kernels/bang/pad.cc new file mode 100644 index 00000000..e211ee93 --- /dev/null +++ b/src/kernels/bang/pad.cc @@ -0,0 +1,65 @@ +#include "operators/pad.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class PadCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getOutput()->getDims(); + int dim_size = dim.size(); + int dim_array[dim_size]; + for (int i = 0; i < dim_size; ++i) { + dim_array[i] = dim[i]; + } + int paddings[dim_size * 2]; + std::vector pads = op->getPads(); + if (pads.size() == 2 && dim_size != 1) { + for (int i = 0; i < dim_size * 2; i += 2) { + paddings[i] = pads[0]; + paddings[i + 1] = pads[1]; + } + } else { + for (int i = 0; i < dim_size * 2; i += 2) { + paddings[i] = pads[i / 2]; + paddings[i + 1] = pads[i / 2 + dim_size]; + } + } + int dimout_array[dim_size]; + for (int i = 0; i < dim_size; ++i) { + dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1]; + } + float paddingValue = 0.0; + // input + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array)); + // output + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, + CNNL_DTYPE_FLOAT, dim_size, + dimout_array)); + + cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), 
aDesc, aData, + paddings, &paddingValue, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Pad, DataType::Float32, PadCnnl, + "Pad_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/pooling.cc b/src/kernels/bang/pooling.cc new file mode 100644 index 00000000..5abbbf56 --- /dev/null +++ b/src/kernels/bang/pooling.cc @@ -0,0 +1,73 @@ +#include "operators/pooling.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class PoolingCnnl : public BangKernelWithoutConfig { + virtual cnnlPoolingMode_t getPoolingMode() const = 0; + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + void *const inData = (op->getInputs(0)->getRawDataPtr()); + void *const outData = (op->getOutput()->getRawDataPtr()); + + const auto [n, c, h, w, kh, kw] = op->getNCHWRS(); + const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation(); + + // get inputs + int inArray[4] = {n, c, h, w}; + cnnlTensorDescriptor_t inDesc; + checkCnnlError(cnnlCreateTensorDescriptor(&inDesc)); + checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, inArray)); + + // get maxpool descriptor + cnnlPoolingDescriptor_t poolingDesc; + checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc)); + checkCnnlError(cnnlSetPooling2dDescriptor_v2( + poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph, + ph, pw, pw, sh, sw, dh, dw, false)); + + // get outputs + auto outVec = op->getOutput()->getDims(); + int outArray[4] = {outVec[0], outVec[1], outVec[2], outVec[3]}; + cnnlTensorDescriptor_t outDesc; + 
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc)); + checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, outArray)); + size_t wsSize; + cnnlGetPoolingWorkspaceSize(context->cnnlHandle(), getPoolingMode(), + outVec[3], outVec[2], &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + float alpha = 1.f, beta = 0.f; + checkCnnlError(cnnlPoolingForward(context->cnnlHandle(), poolingDesc, + &alpha, inDesc, inData, &beta, + outDesc, outData, wsData, wsSize)); + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(inDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(outDesc)); + checkCnnlError(cnnlDestroyPoolingDescriptor(poolingDesc)); + } +}; + +class maxPoolCnnl : public PoolingCnnl { + cnnlPoolingMode_t getPoolingMode() const override { + return CNNL_POOLING_MAX; + } +}; + +class avgPoolCnnl : public PoolingCnnl { + cnnlPoolingMode_t getPoolingMode() const override { + return CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::MaxPool, DataType::Float32, maxPoolCnnl, + "MaxPool_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::AvgPool, DataType::Float32, avgPoolCnnl, + "AvgPool_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/reciprocal.cc b/src/kernels/bang/reciprocal.cc new file mode 100644 index 00000000..38a22fd6 --- /dev/null +++ b/src/kernels/bang/reciprocal.cc @@ -0,0 +1,46 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class ReciprocalCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + 
cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlReciprocal(context->cnnlHandle(), aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Reciprocal, DataType::Float32, + ReciprocalCnnl, "Reciprocal_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/reshape.cc b/src/kernels/bang/reshape.cc new file mode 100644 index 00000000..5b4e8613 --- /dev/null +++ b/src/kernels/bang/reshape.cc @@ -0,0 +1,42 @@ +#include "operators/reshape.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class CopyBang : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + auto inData = op->getInputs(0)->getRawDataPtr(); + auto outData = op->getOutputs()[0]->getRawDataPtr(); + cnnlTensorDescriptor_t aDesc; + auto dim = op->getInputs(0)->getDims(); + int len = dim.size(); + int size = 1; + for (int i = 0; i < len; ++i) { + size *= dim[i]; + } + + int dim_array[1] = {size}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, 
CNNL_LAYOUT_ARRAY, + CNNL_DTYPE_FLOAT, 1, dim_array)); + cnnlStatus_t stat = + cnnlCopy(context->cnnlHandle(), aDesc, inData, aDesc, outData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + } +}; +// reshape/flatten/identity all act as copying from input to output. +REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Float32, CopyBang, + "Reshape_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Flatten, DataType::Float32, CopyBang, + "Flatten_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Identity, DataType::Float32, CopyBang, + "Identity_BANG_Float32"); + +} // namespace infini diff --git a/src/kernels/bang/rsqrt.cc b/src/kernels/bang/rsqrt.cc new file mode 100644 index 00000000..fea06e13 --- /dev/null +++ b/src/kernels/bang/rsqrt.cc @@ -0,0 +1,47 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class RsqrtCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlRsqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories 
in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Rsqrt, DataType::Float32, RsqrtCnnl, + "Rsqrt_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/split.cc b/src/kernels/bang/split.cc new file mode 100644 index 00000000..bfa842bc --- /dev/null +++ b/src/kernels/bang/split.cc @@ -0,0 +1,69 @@ +#include "operators/split.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class SplitCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + int num = op->numOutputs(); + int axis = op->getDim(); + void *argv[num]; + for (int i = 0; i < num; ++i) { + argv[i] = op->getOutput(i)->getRawDataPtr(); + } + void *const inputData = (op->getInputs(0)->getRawDataPtr()); + + cnnlTensorDescriptor_t desc; + + int dimout_array[num][4]; + for (int i = 0; i < num; ++i) { + auto dim = op->getOutput(i)->getDims(); + if (dim.size() != 4) { + IT_TODO_HALT(); + } + dimout_array[i][0] = dim[0]; + dimout_array[i][1] = dim[1]; + dimout_array[i][2] = dim[2]; + dimout_array[i][3] = dim[3]; + } + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) { + IT_TODO_HALT(); + } + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + checkCnnlError(cnnlCreateTensorDescriptor(&desc)); + checkCnnlError(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + cnnlTensorDescriptor_t descArray[num]; + for (int i = 0; i < num; ++i) { + checkCnnlError(cnnlCreateTensorDescriptor(&descArray[i])); + checkCnnlError( + cnnlSetTensorDescriptor(descArray[i], CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dimout_array[i])); + } + + size_t wsSize; + 
cnnlGetSplitWorkspaceSize(context->cnnlHandle(), num, &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlSplit(context->cnnlHandle(), num, axis, desc, inputData, wsData, + wsSize, descArray, argv); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + for (int i = 0; i < num; ++i) { + checkCnnlError(cnnlDestroyTensorDescriptor(descArray[i])); + } + checkCnnlError(cnnlDestroyTensorDescriptor(desc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Split, DataType::Float32, SplitCnnl, + "Split_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/sqrt.cc b/src/kernels/bang/sqrt.cc new file mode 100644 index 00000000..68715912 --- /dev/null +++ b/src/kernels/bang/sqrt.cc @@ -0,0 +1,47 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class SqrtCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + cnnlStatus_t stat = + cnnlSqrt_v2(context->cnnlHandle(), CNNL_COMPUTATION_HIGH_PRECISION, + aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) 
+ return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Sqrt, DataType::Float32, SqrtCnnl, + "Sqrt_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/bang/transpose.cc b/src/kernels/bang/transpose.cc new file mode 100644 index 00000000..c484824c --- /dev/null +++ b/src/kernels/bang/transpose.cc @@ -0,0 +1,60 @@ +#include "operators/transpose.h" +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" + +namespace infini { +class TransposeCnnl : public BangKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dimin = op->getInputs(0)->getDims(); + auto dimout = op->getOutput()->getDims(); + if (dimin.size() != 4 || dimout.size() != 4) + IT_TODO_HALT(); + + int dimin_array[4] = {dimin[0], dimin[1], dimin[2], dimin[3]}; + int dimout_array[4] = {dimout[0], dimout[1], dimout[2], dimout[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimin_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor( + cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 4, dimout_array)); + + // get op descriptor + auto permute = op->getPermute(); + cnnlTransposeDescriptor_t opDesc; + checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc)); + checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute.data())); + + size_t wsSize; + 
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), aDesc, opDesc, + &wsSize); + BangPtr wsData = context->getWorkspace(wsSize); + + cnnlStatus_t stat = + cnnlTranspose_v2(context->cnnlHandle(), opDesc, aDesc, aData, cDesc, + cData, wsData, wsSize); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc)); + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Transpose, DataType::Float32, + TransposeCnnl, "Transpose_cnnl_BANG_Float32"); +}; // namespace infini diff --git a/src/kernels/bang/trigon.cc b/src/kernels/bang/trigon.cc new file mode 100644 index 00000000..4378aa6e --- /dev/null +++ b/src/kernels/bang/trigon.cc @@ -0,0 +1,184 @@ +#include "bang/bang_kernel_without_config.h" +#include "bang/bang_runtime.h" +#include "operators/unary.h" + +namespace infini { +class TrigonCnnl : public BangKernelWithoutConfig { + virtual cnnlTrigonFunctionMode_t getOpType() const = 0; + virtual cnnlComputationPreference_t getPrefer() const = 0; + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + auto context = dynamic_cast(_context); + + void *const aData = (op->getInputs(0)->getRawDataPtr()); + void *const cData = (op->getOutput()->getRawDataPtr()); + + cnnlTensorDescriptor_t aDesc, cDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() != 4) + IT_TODO_HALT(); + + int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]}; + // get inputs + checkCnnlError(cnnlCreateTensorDescriptor(&aDesc)); + checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, dim_array)); + + // get outputs + checkCnnlError(cnnlCreateTensorDescriptor(&cDesc)); + checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, + CNNL_DTYPE_FLOAT, 4, 
dim_array)); + + // get op descriptor + cnnlTrigonDescriptor_t opDesc; + checkCnnlError(cnnlCreateTrigonDescriptor(&opDesc)); + checkCnnlError(cnnlSetTrigonDescriptor(opDesc, getOpType())); + + cnnlStatus_t stat = cnnlTrigonForward(context->cnnlHandle(), opDesc, + aDesc, aData, cDesc, cData); + if (stat != CNNL_STATUS_SUCCESS) + return; + + // Destories in BANG does not require sync. But cnnl does not state + // whether sync is required before destories. + checkCnnlError(cnnlDestroyTensorDescriptor(aDesc)); + checkCnnlError(cnnlDestroyTensorDescriptor(cDesc)); + checkCnnlError(cnnlDestroyTrigonDescriptor(opDesc)); + } +}; + +class SinCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_SIN; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class CosCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_COS; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class TanCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_TAN; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ASinCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ASIN; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ACosCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ACOS; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ATanCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ATAN; + } + cnnlComputationPreference_t getPrefer() const override { + return 
CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class SinHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_SINH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class CosHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_COSH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class TanHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_TANH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ASinHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ASINH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ACosHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ACOSH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +class ATanHCnnl : public TrigonCnnl { + cnnlTrigonFunctionMode_t getOpType() const override { + return CNNL_TRIGON_ATANH; + } + cnnlComputationPreference_t getPrefer() const override { + return CNNL_COMPUTATION_HIGH_PRECISION; + } +}; + +REGISTER_KERNEL(Device::BANG, OpType::Sin, DataType::Float32, SinCnnl, + "Sin_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Cos, DataType::Float32, CosCnnl, + "Cos_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::Tan, DataType::Float32, TanCnnl, + "Tan_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::ASin, DataType::Float32, ASinCnnl, + "ASin_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::ACos, DataType::Float32, ACosCnnl, + "ACos_cnnl_BANG_Float32"); 
+REGISTER_KERNEL(Device::BANG, OpType::ATan, DataType::Float32, ATanCnnl, + "ATan_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::SinH, DataType::Float32, SinHCnnl, + "SinH_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::CosH, DataType::Float32, CosHCnnl, + "CosH_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::TanH, DataType::Float32, TanHCnnl, + "TanH_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::ASinH, DataType::Float32, ASinHCnnl, + "ASinH_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::ACosH, DataType::Float32, ACosHCnnl, + "ACosH_cnnl_BANG_Float32"); +REGISTER_KERNEL(Device::BANG, OpType::ATanH, DataType::Float32, ATanHCnnl, + "ATanH_cnnl_BANG_Float32"); + +}; // namespace infini diff --git a/src/kernels/cpu/element_wise.cc b/src/kernels/cpu/element_wise.cc index 81fed9e4..8657a1fe 100644 --- a/src/kernels/cpu/element_wise.cc +++ b/src/kernels/cpu/element_wise.cc @@ -11,17 +11,37 @@ template class NativeElementWise : public CpuKernelWithoutConfig { T *inptr1 = op->getInputs(1)->getRawDataPtr(); T *outptr = op->getOutput()->getRawDataPtr(); - auto outDim = op->getOutput()->getDims(); + int a[4] = {1, 1, 1, 1}; + int b[4] = {1, 1, 1, 1}; + int c[4] = {1, 1, 1, 1}; + auto a_input = op->getInputs(0)->getDims(); + auto b_input = op->getInputs(1)->getDims(); + auto c_output = op->getOutput()->getDims(); + std::copy(a_input.begin(), a_input.end(), a + (4 - a_input.size())); + std::copy(b_input.begin(), b_input.end(), b + (4 - b_input.size())); + std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size())); + auto n = op->getOutput()->size(); - for (size_t offset = 0; offset < n; offset++) { - // For now,we only process the same dims here, broardcast will be - // considered in the opt layer. 
- /*auto offset0 = - op->getInputs(0)->getOffsetByBroadcastOffset(offset, outDim); - auto offset1 = - op->getInputs(1)->getOffsetByBroadcastOffset(offset, outDim); - outptr[offset] = doCompute(inptr0[offset0], inptr1[offset1]);*/ - outptr[offset] = doCompute(inptr0[offset], inptr1[offset]); + for (size_t i = 0; i < n; ++i) { + int c0_index = i / (c[1] * c[2] * c[3]); + int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]); + int c2_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) / c[3]; + int c3_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) % c[3]; + + int a0_index = c0_index % a[0]; + int a1_index = c1_index % a[1]; + int a2_index = c2_index % a[2]; + int a3_index = c3_index % a[3]; + + int b0_index = c0_index % b[0]; + int b1_index = c1_index % b[1]; + int b2_index = c2_index % b[2]; + int b3_index = c3_index % b[3]; + outptr[i] = doCompute( + inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] + + a2_index * a[3] + a3_index], + inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] + + b2_index * b[3] + b3_index]); } } }; @@ -55,4 +75,4 @@ REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv, "divNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv, "divNaive_CPU_float32"); -}; // namespace infini \ No newline at end of file +}; // namespace infini diff --git a/src/kernels/cpu/unary.cc b/src/kernels/cpu/unary.cc index 8464a0d9..c32e5652 100644 --- a/src/kernels/cpu/unary.cc +++ b/src/kernels/cpu/unary.cc @@ -56,6 +56,25 @@ template class NaiveAbs : public NativeUnary { T doCompute(T val) const override { return val < 0 ? 
-val : val; } }; +template class Clip : public CpuKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *context) const override { + auto op = as(_op); + T *inptr = op->getInputs(0)->getRawDataPtr(); + T *outptr = op->getOutput()->getRawDataPtr(); + auto minValue = op->getMin(); + auto maxValue = op->getMax(); + + auto n = op->getOutput()->size(); + for (size_t offset = 0; offset < n; offset++) { + auto val = *inptr++; + *outptr++ = (minValue && val < *minValue) ? *minValue + : (maxValue && val > *maxValue) ? *maxValue + : val; + } + } +}; + REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32, NaiveRelu, "reluNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu, @@ -76,4 +95,6 @@ REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32, NaiveSoftmax, "softmaxNaive_CPU_uint32"); REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32, NaiveSoftmax, "softmaxNaive_CPU_float32"); +REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip, + "Clip_CPU_float32"); }; // namespace infini diff --git a/src/kernels/cuda/batch_norm.cc b/src/kernels/cuda/batch_norm.cc index ce1aaf27..111137ff 100644 --- a/src/kernels/cuda/batch_norm.cc +++ b/src/kernels/cuda/batch_norm.cc @@ -20,18 +20,17 @@ class BatchNormCudnn : public CudaKernelWithoutConfig { auto dims = op->getInputs(0)->getDims(); // Only 4D and 5D tensors are supported by // cudnnBatchNormalizationForwardInference - IT_ASSERT(dims.size() == 4 || dims.size() == 5); + IT_ASSERT(dims.size() == 4); - int dimArray[CUDNN_DIM_MAX], strideArray[CUDNN_DIM_MAX], - dimPArray[CUDNN_DIM_MAX], stridePArray[CUDNN_DIM_MAX]; + int dimArray[4], strideArray[4], dimPArray[4], stridePArray[4]; for (size_t i = 0; i < dims.size(); ++i) { dimArray[i] = dims[i]; strideArray[i] = op->getInputs(0)->getStride()[i]; dimPArray[i] = 1; stridePArray[i] = 1; } - dimPArray[1] = op->getInputs(0)->getDims()[1]; - stridePArray[1] = op->getInputs(0)->getStride()[1]; 
+ dimPArray[1] = op->getInputs(1)->getDims()[0]; + stridePArray[0] = op->getInputs(1)->getDims()[0]; // get inputs cudnnTensorDescriptor_t inDesc; checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); diff --git a/src/kernels/cuda/clip.cc b/src/kernels/cuda/clip.cc new file mode 100644 index 00000000..b4865504 --- /dev/null +++ b/src/kernels/cuda/clip.cc @@ -0,0 +1,27 @@ +#include "cuda/cuda_clip.h" +#include "cuda/cuda_kernel_wihtout_config.h" +#include "cuda/cuda_runtime.h" +#include "operators/unary.h" + +namespace infini { + +class ClipCuda : public CudaKernelWithoutConfig { + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = as(_op); + + void *const inputData = (op->getInputs(0)->getRawDataPtr()); + void *const outputData = (op->getOutput()->getRawDataPtr()); + auto min = op->getMin(); + auto max = op->getMax(); + auto dim = op->getInputs(0)->getDims(); + int num = dim[0] * dim[1] * dim[2] * dim[3]; + clip_kernel((float *)inputData, (float *)outputData, num, + min ? *min : NAN, max ? *max : NAN); + } +}; + +REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda, + "Clip_CUDA_Float32"); + +}; // namespace infini diff --git a/src/kernels/cuda/clip.cu b/src/kernels/cuda/clip.cu new file mode 100644 index 00000000..eabc4926 --- /dev/null +++ b/src/kernels/cuda/clip.cu @@ -0,0 +1,32 @@ +#include "core/common.h" +#include "core/constants.h" +#include "cuda/cuda_common.h" +#include + +using infini::E_CONSTANT; +constexpr unsigned int num_threads() { return 32 * 4; } +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } + +__global__ void _clip_kernel(float *input, float *output, int n, float minValue, + float maxValue) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = index; i < n; i += stride) { + output[i] = (!isnan(minValue) && input[i] < minValue) + ? 
/**
 * Host-side launcher for _clip_kernel.
 *
 * @param input    source buffer of `num` floats
 * @param output   destination buffer of `num` floats
 * @param num      number of elements to process
 * @param minValue lower bound, or NaN to disable it
 * @param maxValue upper bound, or NaN to disable it
 */
void clip_kernel(float *input, float *output, int num, float minValue,
                 float maxValue) {
    // With nothing to process the grid size below would be 0, which is not a
    // valid launch configuration, so return before launching.
    if (num <= 0)
        return;

    int blocksize = block_work_size();
    // Ceiling division: enough blocks to give every element a thread.
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
                                          maxValue);
}
checkCudnnError(cudnnCreateTensorDescriptor(&cDesc)); - checkCudnnError(cudnnSetTensor4dDescriptor( - cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); + checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, c[0], c[1], + c[2], c[3])); // get op descriptor cudnnOpTensorDescriptor_t opDesc; @@ -81,13 +94,27 @@ class ElementWiseCuda : public CudaKernelWithoutConfig { float *const aData = (op->getInputs(0)->getRawDataPtr()); float *const bData = (op->getInputs(1)->getRawDataPtr()); float *const cData = (op->getOutput()->getRawDataPtr()); + auto a_dim = op->getInputs(0)->getDims(); + auto b_dim = op->getInputs(1)->getDims(); + auto c_dim = op->getOutput()->getDims(); + + if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4) + IT_TODO_HALT(); + + int a[4] = {1, 1, 1, 1}; + int b[4] = {1, 1, 1, 1}; + int c[4] = {1, 1, 1, 1}; + + std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size())); + std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size())); + std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size())); - auto dim = op->getInputs(0)->getDims(); - int n = dim[0], c = dim[1], h = dim[2], w = dim[3]; if (op->getOpType() == OpType::Div) - div_kernel(aData, bData, cData, n * c * h * w); + div_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1], + b[2], b[3], c[0], c[1], c[2], c[3]); else if (op->getOpType() == OpType::Pow) - pow_kernel(aData, bData, cData, n * c * h * w); + pow_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1], + b[2], b[3], c[0], c[1], c[2], c[3]); else IT_TODO_HALT(); } diff --git a/src/kernels/cuda/element_wise.cu b/src/kernels/cuda/element_wise.cu index be7d4495..b28f0144 100644 --- a/src/kernels/cuda/element_wise.cu +++ b/src/kernels/cuda/element_wise.cu @@ -5,34 +5,75 @@ constexpr unsigned int num_threads() { return 32 * 4; } constexpr int thread_work_size() { return 4; } constexpr int block_work_size() { return thread_work_size() * num_threads(); } -__global__ 
/**
 * Element-wise division z = x / y over 4-D extents with modulo indexing.
 *
 * (a0..a3), (b0..b3) and (c0..c3) are the extents of x, y and z. Each output
 * coordinate is reduced modulo the matching operand extent, so an operand
 * extent of 1 always selects index 0 along that axis. Compatibility of the
 * extents is not checked here. Threads walk the output in a grid-stride loop.
 */
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
                            int b0, int b1, int b2, int b3,
                            int c0, int c1, int c2, int c3) {
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
    const int step = blockDim.x * gridDim.x;
    const int total = c0 * c1 * c2 * c3;
    const int c_plane = c2 * c3;        // elements per (c0, c1) slice
    const int c_volume = c1 * c_plane;  // elements per c0 slice

    for (int i = first; i < total; i += step) {
        // Decompose the flat output index into 4-D coordinates.
        const int in_volume = i % c_volume;
        const int in_plane = in_volume % c_plane;
        const int c0_index = i / c_volume;
        const int c1_index = in_volume / c_plane;
        const int c2_index = in_plane / c3;
        const int c3_index = in_plane % c3;

        // Flat operand offsets from the wrapped coordinates.
        const int x_off =
            (((c0_index % a0) * a1 + c1_index % a1) * a2 + c2_index % a2) * a3 +
            c3_index % a3;
        const int y_off =
            (((c0_index % b0) * b1 + c1_index % b1) * b2 + c2_index % b2) * b3 +
            c3_index % b3;

        z[i] = x[x_off] / y[y_off];
    }
}
b2_index*b3 + b3_index]); } } namespace infini { -void div_kernel(float *a, float *b, float *c, int num) { +void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, + int b0, int b1, int b2, int b3, + int c0, int c1, int c2, int c3) { int blocksize = block_work_size(); + int num = c0*c1*c2*c3; int gridsize = (num + block_work_size() - 1) / block_work_size(); - _div_kernel<<>>(a, b, c, num); + _div_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); } -void pow_kernel(float *a, float *b, float *c, int num) { - +void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3, + int b0, int b1, int b2, int b3, + int c0, int c1, int c2, int c3) { int blocksize = block_work_size(); + int num = c0*c1*c2*c3; int gridsize = (num + block_work_size() - 1) / block_work_size(); - _pow_kernel<<>>(a, b, c, num); + _pow_kernel<<>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3); } }; // namespace infini diff --git a/src/kernels/cuda/pad_slice.cc b/src/kernels/cuda/pad_slice.cc index 04982a41..561a0923 100644 --- a/src/kernels/cuda/pad_slice.cc +++ b/src/kernels/cuda/pad_slice.cc @@ -34,7 +34,7 @@ class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig { void compute(const Operator &op, const RuntimeObj *_context) const override { do_compute(op->getOutput(), op->getInputs(0), - as(op)->getStart(), false); + as(op)->getStarts(), false); } }; diff --git a/src/kernels/cuda/unary.cc b/src/kernels/cuda/unary.cc index a944b8af..b4ac496f 100644 --- a/src/kernels/cuda/unary.cc +++ b/src/kernels/cuda/unary.cc @@ -60,6 +60,52 @@ class ActivationCudnn : public CudaKernelWithoutConfig { } }; +class SoftmaxCudnn : public CudaKernelWithoutConfig { + virtual cudnnSoftmaxAlgorithm_t getAlgorithmType() const = 0; + virtual cudnnSoftmaxMode_t getModeType() const = 0; + virtual tuple getAlphBeta() const { return {1.f, 0.f}; } + void compute(const Operator &_op, + const RuntimeObj *_context) const override { + auto op = 
as(_op); + auto context = dynamic_cast(_context); + + void *const inputData = (op->getInputs(0)->getRawDataPtr()); + void *const outputData = (op->getOutput()->getRawDataPtr()); + + cudnnTensorDescriptor_t inputDesc, outputDesc; + auto dim = op->getInputs(0)->getDims(); + if (dim.size() > 4) + IT_TODO_HALT(); + int dim_array[4] = {1, 1, 1, 1}; + memcpy(dim_array + (4 - dim.size()), dim.data(), + dim.size() * sizeof(int)); + + // get inputs + checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc)); + checkCudnnError(cudnnSetTensor4dDescriptor( + inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0], + dim_array[1], dim_array[2], dim_array[3])); + + // get outputs + checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc)); + checkCudnnError(cudnnSetTensor4dDescriptor( + outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0], + dim_array[1], dim_array[2], dim_array[3])); + + auto [alpha, beta] = getAlphBeta(); + cudnnStatus_t stat = cudnnSoftmaxForward( + context->cudnnHandle(), getAlgorithmType(), getModeType(), &alpha, + inputDesc, inputData, &beta, outputDesc, outputData); + if (stat != CUDNN_STATUS_SUCCESS) + return; + + // Destories in CUDA does not require sync. But cuDNN does not state + // whether sync is required before destories. 
+ checkCudnnError(cudnnDestroyTensorDescriptor(inputDesc)); + checkCudnnError(cudnnDestroyTensorDescriptor(outputDesc)); + } +}; + class ReluCudnn : public ActivationCudnn { cudnnActivationMode_t getOpType() const override { return CUDNN_ACTIVATION_RELU; diff --git a/src/kernels/intelcpu/slice.cc b/src/kernels/intelcpu/slice.cc index 8a5a489f..663897cc 100644 --- a/src/kernels/intelcpu/slice.cc +++ b/src/kernels/intelcpu/slice.cc @@ -23,7 +23,7 @@ class MklSlice : public MklKernelWithoutConfig { std::vector sDims, offsets; for (int i = 0; i < ndim; ++i) { sDims.push_back(oDims.at(i)); - offsets.push_back(op->getStart().at(i)); + offsets.push_back(op->getStarts().at(i)); } auto sliceMd = srcMd.submemory_desc(sDims, offsets); auto sliceMemory = diff --git a/src/kernels/mlu/CMakeLists.txt b/src/kernels/mlu/CMakeLists.txt deleted file mode 100644 index b8cd41b0..00000000 --- a/src/kernels/mlu/CMakeLists.txt +++ /dev/null @@ -1,46 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(bangops) -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include") -set(LIBRARY_OUTPUT_PATH "${CMAKE_BINARY_DIR}/lib") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fPIC -std=c++11 -pthread -pipe") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS} -O3") -set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -Wl,--gc-sections -fPIC") - -# check `NEUWARE_HOME` env -message(${NEUWARE_HOME}) -if(EXISTS ${NEUWARE_HOME}) - include_directories("${NEUWARE_HOME}/include") - link_directories("${NEUWARE_HOME}/lib64") - link_directories("${NEUWARE_HOME}/lib") - set(NEUWARE_ROOT_DIR "${NEUWARE_HOME}") -else() - message(FATAL_ERROR "NEUWARE directory cannot be found, refer README.md to prepare NEUWARE_HOME environment.") -endif() - -# setup cmake search path -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} - "${CMAKE_SOURCE_DIR}/cmake" - "${NEUWARE_HOME}/cmake" - "${NEUWARE_HOME}/cmake/modules" -) - -# include FindBANG.cmake and check cncc -find_package(BANG) 
-if(NOT BANG_FOUND) - message(FATAL_ERROR "BANG cannot be found.") -elseif (NOT BANG_CNCC_EXECUTABLE) - message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake") -endif() - -# setup cncc flags -set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -fPIC -Wall -Werror -std=c++11 -pthread") -set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -O3") -set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS}" "--bang-mlu-arch=mtp_220" - "--bang-mlu-arch=mtp_270" - "--bang-mlu-arch=mtp_290" - "--bang-mlu-arch=mtp_372" -) - -file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/src/*.mlu") -bang_add_library(bangops SHARED ${src_files}) - diff --git a/src/kernels/mlu/include/bang_div.h b/src/kernels/mlu/include/bang_div.h deleted file mode 100644 index 0f81c721..00000000 --- a/src/kernels/mlu/include/bang_div.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once -#include "cnnl.h" -namespace infini { -void div_kernel(cnnlHandle_t handle, const float *input1, const float *input2, - float *output, const uint32_t num); - -}; // namespace infini diff --git a/src/kernels/mlu/include/div.h b/src/kernels/mlu/include/div.h deleted file mode 100644 index 83520d1f..00000000 --- a/src/kernels/mlu/include/div.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef BANG_KERNELS_DIVOPERATION_DIV_H_ -#define BANG_KERNELS_DIVOPERATION_DIV_H_ - -__mlu_global__ void MLUDivKernelUnion1(float *output, float *input1, - float *input2, uint32_t num); - -#endif // BANG_KERNELS_DIVOPERATION_DIV_H_ diff --git a/src/kernels/mlu/src/div.mlu b/src/kernels/mlu/src/div.mlu deleted file mode 100644 index 63456f96..00000000 --- a/src/kernels/mlu/src/div.mlu +++ /dev/null @@ -1,24 +0,0 @@ -#include "bang_div.h" -#include "div.h" -namespace infini { -void div_kernel(cnnlHandle_t handle, - const float *input1, - const float *input2, - float *output, - const uint32_t num) { - // 任务类型和调度方法 - 
cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - cnrtQueue_t queue; - cnnlGetQueue(handle, &queue); - k_dim.x = 4; - k_dim.y = 8; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - // launch 任务 - MLUDivKernelUnion1<<>>((float*)output, - (float*)input1, - (float*)input2, - num); -} -}; diff --git a/src/kernels/mlu/src/div_device.mlu b/src/kernels/mlu/src/div_device.mlu deleted file mode 100644 index bee83eb8..00000000 --- a/src/kernels/mlu/src/div_device.mlu +++ /dev/null @@ -1,50 +0,0 @@ -#include "div.h" - -#define NRAM_USE_SIZE 102400 - -__nram__ char left[NRAM_USE_SIZE]; -__nram__ char right[NRAM_USE_SIZE]; -__nram__ char output[NRAM_USE_SIZE]; - -template -__mlu_device__ void DivFunction(T* output1, T* input1, T* input2, size_t num) { - int use_nram_size = NRAM_USE_SIZE; - int deal_align = use_nram_size / sizeof(T); - int num_per_core = num / taskDim; - int num_rem = num % taskDim; - int easy = num_per_core; - int hard = num_per_core + (num_rem != 0 ? 1 : 0); - int my = taskId < num_rem ? hard : easy; - int start = (taskId < num_rem) ? 
(hard * taskId) : (hard * num_rem + (taskId - num_rem) * easy); - char* input1_start = (char*)input1 + start * sizeof(T); - char* input2_start = (char*)input2 + start * sizeof(T); - char* output_start = (char*)output1 + start * sizeof(T); - - int my_repeat = my / deal_align; - int my_rem = my % deal_align; - for(int i = 0; i < my_repeat; ++i) { - __memcpy(left, input1_start, use_nram_size, GDRAM2NRAM); - __memcpy(right, input2_start, use_nram_size, GDRAM2NRAM); - __bang_active_recip((T*)right, (T*)right, deal_align); - __bang_mul((T*)output, (T*)left, (T*)right, deal_align); - __memcpy(output_start, output, use_nram_size, NRAM2GDRAM); - input1_start += use_nram_size; - input2_start += use_nram_size; - output_start += use_nram_size; - } - if(my_rem) { - __memcpy(left, input1_start, my_rem * sizeof(T), GDRAM2NRAM); - __memcpy(right, input2_start, my_rem * sizeof(T), GDRAM2NRAM); - __bang_active_recip((T*)right, (T*)right, deal_align); - __bang_mul((T*)output, (T*)left, (T*)right, deal_align); - __memcpy(output_start, output, my_rem * sizeof(T), NRAM2GDRAM); - } -} - -__mlu_global__ void MLUDivKernelUnion1(float *output, - float *input1, - float *input2, - uint32_t num) { - DivFunction((float*)output, (float*)input1, (float*)input2, num); -} - diff --git a/src/operators/activation_backward.cc b/src/operators/activation_backward.cc new file mode 100644 index 00000000..47a42086 --- /dev/null +++ b/src/operators/activation_backward.cc @@ -0,0 +1,37 @@ +#include "operators/activation_backward.h" + +namespace infini { +ActivationBackwardObj::ActivationBackwardObj(OpType type, GraphObj *graph, + Tensor y, Tensor diff_y, Tensor x, + Tensor diff_x) + : OperatorObj(type, {y, diff_y, x}, {diff_x}) { + IT_ASSERT(checkValid(graph)); +} + +optional> +ActivationBackwardObj::inferShape(const TensorVec &inputs) const { + return {{inputs[0]->getDims()}}; +} + +std::string ActivationBackwardObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << 
getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector ActivationBackwardObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector ActivationBackwardObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +}; // namespace infini diff --git a/src/operators/batch_norm.cc b/src/operators/batch_norm.cc index b744df5a..f85b72f1 100644 --- a/src/operators/batch_norm.cc +++ b/src/operators/batch_norm.cc @@ -3,10 +3,10 @@ namespace infini { BatchNormObj::BatchNormObj(GraphObj *graph, Tensor input, Tensor output, Tensor mean, Tensor var, Tensor scale, Tensor bias, - float momentum, float eps, bool training) + float momentum, float eps, bool trainingMode) : OperatorObj(OpType::BatchNorm, {input, mean, var, scale, bias}, {output}), - momentum(momentum), eps(eps), training(training) { - if (training) + momentum(momentum), eps(eps), trainingMode(trainingMode) { + if (trainingMode) IT_TODO_HALT(); IT_ASSERT(checkValid(graph)); diff --git a/src/operators/conv.cc b/src/operators/conv.cc index 78f0a891..36d97081 100644 --- a/src/operators/conv.cc +++ b/src/operators/conv.cc @@ -178,6 +178,81 @@ void ConvTransposed2dObj::setAuxilaryAttributes(PaddingMode mode) { } } +void ConvBackwardFilterObj::setAuxilaryAttributes(PaddingMode mode) { + const Tensor &inputX = inputs[0]; + const Tensor &diffY = inputs[1]; + n = inputX->getDims()[0], c = inputX->getDims()[1], + h = inputX->getDims()[2], w = inputX->getDims()[3], f = diffY->getDims()[0], + r = diffY->getDims()[2], s = diffY->getDims()[3]; + if (mode == PaddingMode::Same) { + int oh = h / sh; + int ow = w / sw; + ph = (h - oh * sh + (r - sh) * dh) / 2; + pw = (w - ow * sw + (s - sw) * dw) / 2; + } else if (mode == 
PaddingMode::Valid) { + ph = pw = 0; + } +} + +ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, + Tensor diffY, Tensor diffW, int ph, + int pw, int sh, int sw, int dh, + int dw, Tensor bias, ActType act) + : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, ph, pw, sh, sw, dh, dw, + inputX, diffY), + act(act) { + if (bias) + IT_TODO_HALT(); + setAuxilaryAttributes(PaddingMode::Other); + IT_ASSERT(checkValid(graph)); +} + +ConvBackwardFilterObj::ConvBackwardFilterObj(GraphObj *graph, Tensor inputX, + Tensor diffY, Tensor diffW, + PaddingMode mode, int sh, int sw, + int dh, int dw, Tensor bias, + ActType act) + : ConvBaseObj(OpType::Conv, {inputX, diffY}, diffW, mode, sh, sw, dh, dw, + inputX, diffY), + act(act) { + if (bias) + IT_TODO_HALT(); + setAuxilaryAttributes(mode); + IT_ASSERT(checkValid(graph)); +} + +optional> +ConvBackwardFilterObj::inferShape(const TensorVec &inputs) const { + const auto &inputX = inputs[0], &diffY = inputs[1]; + auto n = inputX->getDims()[0]; + auto h = inputX->getDims()[2]; + auto w = inputX->getDims()[3]; + auto f = diffY->getDims()[0]; + auto r = diffY->getDims()[2]; + auto s = diffY->getDims()[3]; + int on = n, oc = f; + int oh = 0, ow = 0; + // For NCHW+FCRS layout, C of input is divisable by C of weight + if (inputX->getDims()[1] % diffY->getDims()[1] != 0) + return {}; + // Set padding size + if (padding == PaddingMode::Other) { + oh = (h - (r - sh) * dh + ph * 2) / sh; + ow = (w - (s - sw) * dw + pw * 2) / sw; + } else if (padding == PaddingMode::Same) { + oh = h / sh; + ow = w / sw; + // ph = (h - oh * sh + (r - sh) * dh) / 2; + // pw = (w - ow * sw + (s - sw) * dw) / 2; + } else if (padding == PaddingMode::Valid) { + int ph = 0; + int pw = 0; + oh = (h - (r - sh) * dh + ph * 2) / sh; + ow = (w - (s - sw) * dw + pw * 2) / sw; + } + return {{{on, oc, oh, ow}}}; +} + ConvTransposed2dNHWCObj::ConvTransposed2dNHWCObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph, int pw, int sh, int 
sw, diff --git a/src/operators/det.cc b/src/operators/det.cc new file mode 100644 index 00000000..e83f67ed --- /dev/null +++ b/src/operators/det.cc @@ -0,0 +1,43 @@ +#include "operators/det.h" + +namespace infini { +DetObj::DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode) + : OperatorObj(OpType::Det, {input}, {output}), modeValue(mode) { + IT_ASSERT(checkValid(graph)); +} + +optional> DetObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + auto input = A->getDims(); + int length = input.size(); + if (length == 2) { + std::vector output = {1}; + return {{output}}; + } else { + std::vector output(input.begin(), input.end() - 2); + return {{output}}; + } +} + +std::string DetObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector DetObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector DetObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +}; // namespace infini diff --git a/src/operators/dropout.cc b/src/operators/dropout.cc new file mode 100644 index 00000000..9a59942c --- /dev/null +++ b/src/operators/dropout.cc @@ -0,0 +1,40 @@ +#include "operators/dropout.h" + +namespace infini { + +DropoutObj::DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask, + float ratio, bool training_mode) + : OperatorObj(OpType::Dropout, {data}, {output, mask}), ratio(ratio) { + IT_ASSERT(0 <= ratio && ratio < 1); + IT_ASSERT(!training_mode); + IT_ASSERT(checkValid(graph)); +} + +optional> DropoutObj::inferShape(const TensorVec &inputs) const { + auto shape = inputs[0]->getDims(); + return {{shape, shape}}; +} 
+std::string DropoutObj::toString() const { + std::ostringstream os; + os << "Dropout[" << getGuid() << "](" << vecToString(inputs[0]->getDims()) + << ", " + << "ratio=" << ratio << ", " + << "training_mode=false, " + << "input=" << inputs[0]->getGuid() << ", " + << "outputs=" << outputs[0]->getGuid() << ", " << outputs[1]->getGuid() + << ")"; + return os.str(); +} + +vector DropoutObj::getWorkloadVector() const { + vector ret = inputs[0]->getDims(); + ret.emplace_back(static_cast(ratio)); + ret.emplace(ret.begin(), enum_to_underlying(type)); + return ret; +} + +vector DropoutObj::getOpAttrVector() const { + return {enum_to_underlying(type), static_cast(ratio), false}; +} + +} // namespace infini diff --git a/src/operators/element_wise.cc b/src/operators/element_wise.cc index bb13586a..b2f9e0cc 100644 --- a/src/operators/element_wise.cc +++ b/src/operators/element_wise.cc @@ -12,23 +12,29 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const { // For now,we only process the same dims here, broardcast will be considered // in the opt layer. 
const auto A = inputs[0], B = inputs[1]; - if (A->getDims().size() != B->getDims().size() || - A->getDims() != B->getDims()) - return {}; + int max_len = std::max(A->getDims().size(), B->getDims().size()); + std::vector A_(max_len, 1); + std::vector B_(max_len, 1); + std::vector res(max_len, 1); + memcpy(A_.data() + max_len - A->getDims().size(), A->getDims().data(), + A->getDims().size() * sizeof(int)); + memcpy(B_.data() + max_len - B->getDims().size(), B->getDims().data(), + B->getDims().size() * sizeof(int)); + // std::copy(A->getDims().begin(), A->getDims().end(), A_.begin() + (max_len + // - A->getDims().size())); std::copy(B->getDims().begin(), + // B->getDims().end(), B_.begin() + (max_len - B->getDims().size())); + // std::copy(A->getDims().rbegin(), A->getDims().rend(), A_.rbegin()); + // std::copy(B->getDims().rbegin(), B->getDims().rend(), B_.rbegin()); - return {{A->getDims()}}; - /* - int n = A->getDims().size(); - Shape shape; - for (int i = 0; i < n; i++) { - auto dimA = A->getDims().at(i); - auto dimB = B->getDims().at(i); - if (!(dimA == dimB || dimA == 1 || dimB == 1)) + for (int i = 0; i < max_len; ++i) { + if (A_[i] == B_[i] || (A_[i] == 1 || B_[i] == 1)) { + res[i] = std::max(A_[i], B_[i]); + } else { return {}; - auto dimI = dimA > dimB ? 
dimA : dimB; - shape.emplace_back(dimI); + } } - return {{shape}};*/ + + return {{res}}; } std::string ElementWiseObj::toString() const { @@ -54,4 +60,48 @@ vector ElementWiseObj::getOpAttrVector() const { return {enum_to_underlying(type)}; } +MSELossObj::MSELossObj(GraphObj *graph, Tensor input0, Tensor input1, + Reduction reduction, Tensor output) + : OperatorObj(OpType::MSELoss, {input0, input1}, {output}), + reductionMode(reduction) { + IT_ASSERT(checkValid(graph)); +} + +optional> MSELossObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0], B = inputs[1]; + if (A->getDims().size() != B->getDims().size() || + A->getDims() != B->getDims()) + return {}; + + if (reductionMode == None) { + return {{A->getDims()}}; + } else { + Shape temp = {1}; + return {{temp}}; + } +} + +std::string MSELossObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << vecToString(inputs[1]->getDims()) << ","; + os << "input0=" << inputs[0]->getGuid() << ","; + os << "input1=" << inputs[1]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +// use output dim or inputs dim? +vector MSELossObj::getWorkloadVector() const { + vector ret = outputs[0]->getDims(); + ret.emplace(ret.begin(), enum_to_underlying(type)); + return ret; +} + +vector MSELossObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + }; // namespace infini diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc index 4587efa8..3fb75371 100644 --- a/src/operators/matmul.cc +++ b/src/operators/matmul.cc @@ -4,8 +4,9 @@ namespace infini { MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA, bool transB, [[maybe_unused]] Tensor bias, ActType act) - : OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB), - act(act) { + : OperatorObj(OpType::Matmul, + bias ? 
TensorVec{A, B, bias} : TensorVec{A, B}, {C}), + transA(transA), transB(transB), act(act), b(1) { auto shape_a = A->getDims(); auto shape_b = B->getDims(); int dimA = shape_a.size(), dimB = shape_b.size(); diff --git a/src/operators/pad.cc b/src/operators/pad.cc index 1624236e..2b853769 100644 --- a/src/operators/pad.cc +++ b/src/operators/pad.cc @@ -13,7 +13,8 @@ PadObj::PadObj(GraphObj *graph, Tensor input, Tensor output, pads = vector(nDims * 2, 0); for (size_t i = 0; i < nAxis; ++i) { - auto j = (*axes)[i]; + auto k = (*axes)[i]; + auto j = k < 0 ? nDims + k : k; pads[j] = _pads[i]; pads[j + nDims] = _pads[i + nAxis]; } diff --git a/src/operators/reduce_mean.cc b/src/operators/reduce_mean.cc index 95e09dc5..d55377df 100644 --- a/src/operators/reduce_mean.cc +++ b/src/operators/reduce_mean.cc @@ -37,9 +37,10 @@ ReduceMeanObj::inferShape(const TensorVec &inputs) const { if (!isReduced(i)) ret.emplace_back(dims[i]); } - if (ret.size() == (size_t)0) - ret.emplace_back(1); - return {{ret}}; + if (ret.empty()) + return {{{1}}}; + else + return {{ret}}; } } diff --git a/src/operators/slice.cc b/src/operators/slice.cc index e5a5ec8d..a61d9d85 100644 --- a/src/operators/slice.cc +++ b/src/operators/slice.cc @@ -3,77 +3,103 @@ namespace infini { SliceObj::SliceObj(GraphObj *graph, Tensor input, Tensor output, const vector &starts, const vector &ends, - const optional> &axes, - const optional> &steps) + const optional> &_axes, + const optional> &_steps) : OperatorObj(OpType::Slice, {input}, {output}) { - if (steps) - IT_TODO_HALT(); - IT_ASSERT(starts.size() == ends.size()); + auto shape = input->getDims(); // shape of input + map axes; + vector steps; + { + auto size = starts.size(); // size of starts + IT_ASSERT(size == ends.size()); // size of ends - if (!axes) { - this->starts = starts; - this->ends = ends; - } else { - auto nAxis = (*axes).size(); - IT_ASSERT(starts.size() == nAxis); + if (_axes) { + IT_ASSERT(size == _axes->size()); + // onnx doc: "Behavior is 
undefined if an axis is repeated." + IT_ASSERT(size == std::set(_axes->begin(), _axes->end()).size()); - auto dims = input->getDims(); - this->starts = vector(dims.size(), 0); - this->ends.resize(dims.size()); - std::transform(dims.begin(), dims.end(), this->ends.begin(), - [](auto x) { return x - 1; }); + for (size_t i = 0; i < size; ++i) { + auto index = _axes->at(i); + if (index < 0) + index += shape.size(); + axes[index] = i; + } + } else + for (size_t i = 0; i < size; ++i) + axes[i] = i; - for (size_t j = 0; j < nAxis; ++j) { - auto i = (*axes)[j]; - if (i < 0) - IT_TODO_HALT(); - this->starts[i] = starts[j]; - this->ends[i] = ends[j]; + if (_steps) { + IT_ASSERT(size == _steps->size()); + // onnx doc: "‘steps’ cannot be 0." + IT_ASSERT(std::find(_steps->begin(), _steps->end(), 0) == + _steps->end()); + steps = *_steps; + } else { + steps.reserve(size); + for (size_t i = 0; i < size; ++i) + steps.push_back(1); } } + + auto size = shape.size(); + this->axes.reserve(size); + for (size_t i = 0; i < size; ++i) + if (auto _i = axes.find(i); _i != axes.end()) { + auto __i = _i->second; + auto start = starts[__i]; + auto end = ends[__i]; + this->axes.push_back({start >= 0 ? start : start + shape[__i], + end >= 0 ? 
end : end + shape[__i], + steps[__i]}); + } else { + this->axes.push_back({0, shape[i], 1}); + } IT_ASSERT(checkValid(graph)); } optional> SliceObj::inferShape(const TensorVec &inputs) const { - auto dims = inputs[0]->getDims(); - int nDims = dims.size(); - if (nDims != (int)starts.size()) - return {}; - for (int i = 0; i < nDims; ++i) { - if (starts[i] < 0 || ends[i] >= dims[i] || starts[i] > ends[i]) - return {}; - dims[i] = ends[i] - starts[i] + 1; + Shape ans; + ans.reserve(axes.size()); + for (const auto &range : axes) { + auto step = std::abs(range.step); + ans.push_back((range.end - range.start + step - 1) / step); } - - return {{dims}}; + return {{ans}}; } std::string SliceObj::toString() const { std::ostringstream os; - os << "Slice" - << "[" << getGuid() << "]"; - os << "("; - os << vecToString(inputs[0]->getDims()) << ","; - os << "starts=" << vecToString(starts) << ","; - os << "ends=" << vecToString(ends) << ","; - os << "input=" << inputs[0]->getGuid() << ","; - os << "output=" << outputs[0]->getGuid() << ")"; + os << "Slice[" << getGuid() << "]["; + for (const auto &range : axes) { + os << range.start << ':' << range.step << ':' << range.end << ", "; + } + os << "](" + << "input=" << inputs[0]->getGuid() << ", " + << "output=" << outputs[0]->getGuid() << ")"; return os.str(); } vector SliceObj::getWorkloadVector() const { - vector ret = inputs[0]->getDims(); - ret.insert(ret.end(), starts.begin(), starts.end()); - ret.insert(ret.end(), ends.begin(), ends.end()); - ret.emplace(ret.begin(), enum_to_underlying(type)); - return ret; + auto ans = getOpAttrVector(); + { + auto i = inputs[0]->getDims(); + ans.insert(ans.end(), i.begin(), i.end()); + } + if (!outputs.empty()) { + auto o = outputs[0]->getDims(); + ans.insert(ans.end(), o.begin(), o.end()); + } + return ans; } vector SliceObj::getOpAttrVector() const { - vector ret = starts; - ret.insert(ret.end(), ends.begin(), ends.end()); - ret.emplace(ret.begin(), enum_to_underlying(type)); - return ret; + 
vector ans{enum_to_underlying(type)}; + for (const auto &range : axes) { + ans.push_back(range.start); + ans.push_back(range.end); + ans.push_back(range.step); + } + return ans; } } // namespace infini diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc new file mode 100644 index 00000000..616eeb14 --- /dev/null +++ b/src/operators/transpose.cc @@ -0,0 +1,50 @@ +#include "operators/transpose.h" + +namespace infini { +TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output, + vector permute) + : OperatorObj(OpType::Transpose, {input}, {output}) { + if (permute.size() != 4) { + IT_TODO_HALT(); + } + transposePermute[0] = permute[0]; + transposePermute[1] = permute[1]; + transposePermute[2] = permute[2]; + transposePermute[3] = permute[3]; + IT_ASSERT(checkValid(graph)); +} + +optional> +TransposeObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + auto input = A->getDims(); + auto output = input; + + for (int i = 0; i < 4; ++i) { + output[i] = input[transposePermute[i]]; + } + return {{output}}; +} + +std::string TransposeObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector TransposeObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector TransposeObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +}; // namespace infini diff --git a/src/operators/unary.cc b/src/operators/unary.cc index cecf0e33..6f85cecf 100644 --- a/src/operators/unary.cc +++ b/src/operators/unary.cc @@ -32,4 +32,288 @@ vector UnaryObj::getOpAttrVector() const { return {enum_to_underlying(type)}; } +ClipObj::ClipObj(GraphObj 
*graph, Tensor input, Tensor output, + std::optional min, std::optional max) + : OperatorObj(OpType::Clip, {input}, {output}), minValue(min), + maxValue(max) { + IT_ASSERT(checkValid(graph)); +} + +optional> ClipObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string ClipObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector ClipObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector ClipObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +HardtanhObj::HardtanhObj(GraphObj *graph, Tensor input, Tensor output, + float min, float max) + : OperatorObj(OpType::Hardtanh, {input}, {output}), minValue(min), + maxValue(max) { + IT_ASSERT(checkValid(graph)); +} + +optional> HardtanhObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string HardtanhObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector HardtanhObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector HardtanhObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +FillObj::FillObj(GraphObj *graph, Tensor input, Tensor output, float value) + : OperatorObj(OpType::Fill, 
{input}, {output}), setValue(value) { + IT_ASSERT(checkValid(graph)); +} + +optional> FillObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string FillObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector FillObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector FillObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +L2LossObj::L2LossObj(GraphObj *graph, Tensor input, Tensor output) + : OperatorObj(OpType::L2Loss, {input}, {output}) { + IT_ASSERT(checkValid(graph)); +} + +optional> L2LossObj::inferShape(const TensorVec &inputs) const { + Shape temp = {1}; + return {{temp}}; +} + +std::string L2LossObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector L2LossObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector L2LossObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type) + : OperatorObj(OpType::Cast, {input}, {output}), castType(type) { + IT_ASSERT(checkValid(graph)); +} + +vector CastObj::inferDataType(const TensorVec &inputs) const { + auto input_dataType = inputs[0]->getDType(); + auto output_dataType = getOutputDataType(); + for (const auto &tensor : inputs) + IT_ASSERT(input_dataType == tensor->getDType()); + return vector(numOutputs(), output_dataType); +} + +optional> CastObj::inferShape(const 
TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string CastObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector CastObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector CastObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +DataType CastObj::getOutputDataType() const { + switch (castType) { + case CastObj::Float2Int64: + return DataType::Int64; + case CastObj::Float2Int32: + return DataType::Int32; + case CastObj::Float2Int16: + return DataType::Int16; + case CastObj::Float2Int8: + return DataType::Int8; + case CastObj::Int322Float: + return DataType::Float32; + case CastObj::Int322Int8: + return DataType::Int8; + case CastObj::Int322Int16: + return DataType::Int16; + case CastObj::Int162Float: + return DataType::Float32; + case CastObj::Int162Int32: + return DataType::Int32; + case CastObj::Int82Float: + return DataType::Float32; + case CastObj::Int82Int16: + return DataType::Int16; + case CastObj::Int82Int32: + return DataType::Int32; + case CastObj::Uint82Float: + return DataType::Float32; + case CastObj::Uint82Int32: + return DataType::Int32; + case CastObj::Uint82Int64: + return DataType::Int64; + case CastObj::Int322Int64: + return DataType::Int64; + case CastObj::Int642Int32: + return DataType::Int32; + case CastObj::Int642Uint32: + return DataType::UInt32; + case CastObj::Int642Float: + return DataType::Float32; + case CastObj::Uint322Int64: + return DataType::Int64; + default: + IT_TODO_HALT(); + } +} + +ShapeObj::ShapeObj(GraphObj *graph, Tensor input, Tensor output) + : OperatorObj(OpType::Shape, {input}, {output}) { + IT_ASSERT(checkValid(graph)); +} + +optional> ShapeObj::inferShape(const 
TensorVec &inputs) const { + return {{{static_cast(inputs[0]->getDims().size())}}}; +} + +std::string ShapeObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "](" + << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +PReluObj::PReluObj(GraphObj *graph, Tensor input, Tensor alpha, Tensor output) + : OperatorObj(OpType::PRelu, {input, alpha}, {output}) { + IT_ASSERT(checkValid(graph)); +} + +optional> PReluObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string PReluObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << vecToString(inputs[0]->getDims()) << ","; + os << "input=" << inputs[0]->getGuid() << ","; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector PReluObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector PReluObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; +} + +LogObj::LogObj(GraphObj *graph, Tensor input, Tensor output, LogType type) + : OperatorObj(OpType::Log, {input}, {output}), logType(type) { + IT_ASSERT(checkValid(graph)); +} + +optional> LogObj::inferShape(const TensorVec &inputs) const { + const auto A = inputs[0]; + return {{A->getDims()}}; +} + +std::string LogObj::toString() const { + std::ostringstream os; + os << OpRegistry::getOpName(type) << "[" << getGuid() << "]"; + os << "("; + os << "output=" << outputs[0]->getGuid() << ")"; + return os.str(); +} + +vector LogObj::getWorkloadVector() const { + vector ret{enum_to_underlying(type)}; + const Shape shape = outputs[0]->getDims(); + ret.insert(ret.end(), shape.begin(), shape.end()); + return ret; +} + +vector LogObj::getOpAttrVector() const { + return {enum_to_underlying(type)}; 
+} + }; // namespace infini diff --git a/test/core/test_graph_replace.cc b/test/core/test_graph_replace.cc index 5959bc7d..a77624ec 100644 --- a/test/core/test_graph_replace.cc +++ b/test/core/test_graph_replace.cc @@ -281,7 +281,7 @@ TEST(MatchGraph, multi_input_output) { SubGraph subg0 = make_ref(runtime, TensorVec{i0, i1}); { auto slice = subg0->addOp(i0, nullptr, vector{0, 0}, - vector{55, 55}, + vector{56, 56}, vector{2, 3}, std::nullopt); auto relu0 = subg0->addOp(slice->getOutput(0), nullptr); Tensor w0 = subg0->addTensor(Shape{256, 64, 1, 1}, DataType::UInt32); @@ -303,7 +303,7 @@ TEST(MatchGraph, multi_input_output) { SubGraph subg1 = make_ref(runtime, TensorVec{i1, i0}); { auto slice = subg1->addOp(i0, nullptr, vector{0, 0}, - vector{55, 55}, + vector{56, 56}, vector{2, 3}, std::nullopt); auto relu0 = subg1->addOp(slice->getOutput(0), nullptr); Tensor w0 = subg1->addTensor(Shape{256, 64, 1, 1}, DataType::UInt32); @@ -328,7 +328,7 @@ TEST(MatchGraph, multi_input_output) { auto slice = subg2->addOp( extend->getOutput(0), nullptr, vector{0, 0}, - vector{55, 55}, vector{2, 3}, std::nullopt); + vector{56, 56}, vector{2, 3}, std::nullopt); auto extend1 = subg2->addOp(i1, nullptr, 1, 3); auto add = subg2->addOp(extend1->getOutput(0), diff --git a/test/core/test_tensor_save.cc b/test/core/test_tensor_save.cc index f5a84618..87dfda24 100644 --- a/test/core/test_tensor_save.cc +++ b/test/core/test_tensor_save.cc @@ -7,6 +7,7 @@ namespace infini { TEST(Prtotbuf, save_and_load) { +#ifdef TENSOR_PROTOBUF Runtime runtime = NativeCpuRuntimeObj::getInstance(); Graph g = make_ref(runtime); Tensor i0 = g->addTensor({1, 3, 4}, DataType::Float32); @@ -28,6 +29,7 @@ TEST(Prtotbuf, save_and_load) { u1->load("u.pb"); u1->printData(); EXPECT_TRUE(u1->equalData(u0)); +#endif } } // namespace infini diff --git a/test/kernels/bang/test_bang_activation_backward.cc b/test/kernels/bang/test_bang_activation_backward.cc new file mode 100644 index 00000000..3c90dc4e --- /dev/null +++ 
b/test/kernels/bang/test_bang_activation_backward.cc @@ -0,0 +1,56 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/activation_backward.h" +#include "operators/element_wise.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testActivationBackward( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor yCpu = make_ref(shape, DataType::Float32, cpuRuntime); + Tensor diffYCpu = make_ref(shape, DataType::Float32, cpuRuntime); + Tensor xCpu = make_ref(shape, DataType::Float32, cpuRuntime); + + yCpu->dataMalloc(); + diffYCpu->dataMalloc(); + xCpu->dataMalloc(); + + yCpu->setData(generator); + diffYCpu->setData(generator); + xCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto yGpu = bangGraph->cloneTensor(yCpu); + auto diffYGpu = bangGraph->cloneTensor(diffYCpu); + auto xGpu = bangGraph->cloneTensor(xCpu); + auto gpuOp = bangGraph->addOp(yGpu, diffYGpu, xGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto diffXGpu = gpuOp->getOutput(); + + EXPECT_TRUE(1); +} + +TEST(cnnl_ActivationBackward, run) { + testActivationBackward(IncrementalGenerator(), + Shape{1, 2, 2, 3}); + testActivationBackward( + IncrementalGenerator(), Shape{1, 2, 2, 3}); + testActivationBackward(IncrementalGenerator(), + Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_bitcompute.cc b/test/kernels/bang/test_bang_bitcompute.cc new file mode 100644 index 00000000..ebe1710c --- /dev/null +++ b/test/kernels/bang/test_bang_bitcompute.cc @@ -0,0 +1,51 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini 
{ + +template +void testBitCompute( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu1->printData(); + inputCpu2->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_BitCompute, run) { + testBitCompute(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testBitCompute(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testBitCompute(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testBitCompute(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_cast.cc b/test/kernels/bang/test_bang_cast.cc new file mode 100644 index 00000000..7bcc44ea --- /dev/null +++ b/test/kernels/bang/test_bang_cast.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testCast(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + 
inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, CastObj::Float2Int32); + auto outputGpu = gpuOp->getOutput(); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Cast, run) { + testCast(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_ceil.cc b/test/kernels/bang/test_bang_ceil.cc new file mode 100644 index 00000000..696fb5ba --- /dev/null +++ b/test/kernels/bang/test_bang_ceil.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testCeil(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Ceil, run) { + testCeil(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_clip.cc b/test/kernels/bang/test_bang_clip.cc new file mode 100644 index 00000000..3ae9bbf7 --- /dev/null +++ b/test/kernels/bang/test_bang_clip.cc @@ -0,0 +1,42 @@ +#include 
"bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testClip(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + float min = 1.0; + float max = 4.0; + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, min, max); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Clip, run) { + testClip(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_concat.cc b/test/kernels/bang/test_bang_concat.cc new file mode 100644 index 00000000..4cf130e3 --- /dev/null +++ b/test/kernels/bang/test_bang_concat.cc @@ -0,0 +1,52 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/concat.h" + +#include "test.h" + +namespace infini { + +template +void testConcat(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto 
inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = + bangGraph->addOp(TensorVec{inputGpu1, inputGpu2}, nullptr, 2); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu1->print(); + inputCpu1->printData(); + inputCpu2->print(); + inputCpu2->printData(); + outputGpu2Cpu->print(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Concat, run) { + testConcat(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_copy.cc b/test/kernels/bang/test_bang_copy.cc new file mode 100644 index 00000000..7b5ca90f --- /dev/null +++ b/test/kernels/bang/test_bang_copy.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testCopy(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(outputGpu2Cpu->equalData(inputCpu)); +} + +TEST(cnnl_Copy, run) { + testCopy(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_det.cc b/test/kernels/bang/test_bang_det.cc new file mode 100644 index 
00000000..a65747ba --- /dev/null +++ b/test/kernels/bang/test_bang_det.cc @@ -0,0 +1,41 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/det.h" + +#include "test.h" + +namespace infini { + +template +void testDet(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, DetObj::NormalDet); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Det, run) { + testDet(IncrementalGenerator(), Shape{1, 1, 2, 2}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_erf.cc b/test/kernels/bang/test_bang_erf.cc new file mode 100644 index 00000000..13d4b471 --- /dev/null +++ b/test/kernels/bang/test_bang_erf.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testErf(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = 
bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Erf, run) { + testErf(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_exp.cc b/test/kernels/bang/test_bang_exp.cc new file mode 100644 index 00000000..08b6ef0b --- /dev/null +++ b/test/kernels/bang/test_bang_exp.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testExp(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Exp, run) { + testExp(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_fill.cc b/test/kernels/bang/test_bang_fill.cc new file mode 100644 index 00000000..f7c8f463 --- /dev/null +++ b/test/kernels/bang/test_bang_fill.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void 
testFill(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + float value = 1.0; + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, value); + auto outputGpu = gpuOp->getOutput(); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Fill, run) { + testFill(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_floor.cc b/test/kernels/bang/test_bang_floor.cc new file mode 100644 index 00000000..aeb3306a --- /dev/null +++ b/test/kernels/bang/test_bang_floor.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testFloor(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Floor, run) { + 
testFloor(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_floordiv.cc b/test/kernels/bang/test_bang_floordiv.cc new file mode 100644 index 00000000..cf4539a9 --- /dev/null +++ b/test/kernels/bang/test_bang_floordiv.cc @@ -0,0 +1,49 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testFloorDiv( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu1->printData(); + inputCpu2->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_FloorDiv, run) { + testFloorDiv(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_floormod.cc b/test/kernels/bang/test_bang_floormod.cc new file mode 100644 index 00000000..2d1eaa1b --- /dev/null +++ b/test/kernels/bang/test_bang_floormod.cc @@ -0,0 +1,49 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + 
+template +void testFloorMod( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu1->printData(); + inputCpu2->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_FloorMod, run) { + testFloorMod(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_hardtanh.cc b/test/kernels/bang/test_bang_hardtanh.cc new file mode 100644 index 00000000..0381d195 --- /dev/null +++ b/test/kernels/bang/test_bang_hardtanh.cc @@ -0,0 +1,43 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testHardtanh( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + float min = 1.0; + float max = 4.0; + auto 
gpuOp = bangGraph->addOp(inputGpu, nullptr, min, max); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Hardtanh, run) { + testHardtanh(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_l2loss.cc b/test/kernels/bang/test_bang_l2loss.cc new file mode 100644 index 00000000..3cde0874 --- /dev/null +++ b/test/kernels/bang/test_bang_l2loss.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testL2Loss(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_L2Loss, run) { + testL2Loss(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_log.cc b/test/kernels/bang/test_bang_log.cc new file mode 100644 index 00000000..2ba6df9b --- /dev/null +++ b/test/kernels/bang/test_bang_log.cc @@ -0,0 +1,42 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include 
"test.h" + +namespace infini { + +template +void testLog(const std::function &generator, + const Shape &shape, LogObj::LogType type) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, type); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Log, run) { + testLog(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::Log2); + testLog(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::LogE); + testLog(IncrementalGenerator(), Shape{1, 2, 2, 3}, LogObj::Log10); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_logic.cc b/test/kernels/bang/test_bang_logic.cc new file mode 100644 index 00000000..b9bf73d5 --- /dev/null +++ b/test/kernels/bang/test_bang_logic.cc @@ -0,0 +1,56 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testLogicOp(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 
= bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu1->printData(); + inputCpu2->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_LogicOp, run) { + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testLogicOp(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_maximum.cc b/test/kernels/bang/test_bang_maximum.cc new file mode 100644 index 00000000..e147ef69 --- /dev/null +++ b/test/kernels/bang/test_bang_maximum.cc @@ -0,0 +1,46 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testMaximum(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto 
inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Maximum, run) { + testMaximum(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_minimum.cc b/test/kernels/bang/test_bang_minimum.cc new file mode 100644 index 00000000..6575e10b --- /dev/null +++ b/test/kernels/bang/test_bang_minimum.cc @@ -0,0 +1,46 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testMinimum(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Minimum, run) { + testMinimum(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_mseloss.cc 
b/test/kernels/bang/test_bang_mseloss.cc new file mode 100644 index 00000000..b2c25f7c --- /dev/null +++ b/test/kernels/bang/test_bang_mseloss.cc @@ -0,0 +1,57 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testMSELoss(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp1 = + bangGraph->addOp(inputGpu1, inputGpu2, MSELossObj::None, nullptr); + auto gpuOp2 = + bangGraph->addOp(inputGpu1, inputGpu2, MSELossObj::Sum, nullptr); + auto gpuOp3 = + bangGraph->addOp(inputGpu1, inputGpu2, MSELossObj::Mean, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu1 = gpuOp1->getOutput(); + auto outputGpu2 = gpuOp2->getOutput(); + auto outputGpu3 = gpuOp3->getOutput(); + auto outputGpu2Cpu1 = outputGpu1->clone(cpuRuntime); + auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime); + auto outputGpu2Cpu3 = outputGpu3->clone(cpuRuntime); + // Check + outputGpu2Cpu1->printData(); + outputGpu2Cpu2->printData(); + outputGpu2Cpu3->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_MSELoss, run) { + testMSELoss(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_neg.cc b/test/kernels/bang/test_bang_neg.cc new file mode 100644 index 00000000..2d93ca2f --- /dev/null +++ 
b/test/kernels/bang/test_bang_neg.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testNeg(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Neg, run) { + testNeg(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_net.cc b/test/kernels/bang/test_bang_net.cc new file mode 100644 index 00000000..b0fe11d4 --- /dev/null +++ b/test/kernels/bang/test_bang_net.cc @@ -0,0 +1,47 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +void testNet(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = 
make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + auto outputGpu = gpuOp->getOutput(); + auto gpuOp2 = bangGraph->addOp(outputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu2 = gpuOp2->getOutput(); + auto outputGpu2Cpu2 = outputGpu2->clone(cpuRuntime); + // Check + inputCpu2->printData(); + outputGpu2Cpu2->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Net, run) { testNet(IncrementalGenerator(), Shape{1, 2, 2, 3}); } + +} // namespace infini diff --git a/test/kernels/bang/test_bang_optensor.cc b/test/kernels/bang/test_bang_optensor.cc index 436ab6dc..c46f80d3 100644 --- a/test/kernels/bang/test_bang_optensor.cc +++ b/test/kernels/bang/test_bang_optensor.cc @@ -42,6 +42,8 @@ void testOptensor( cpuRuntime->run(cpuGraph); auto outputCpu = cpuOp->getOutput(); // Check + outputCpu->printData(); + outputGpu2Cpu->printData(); EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu)); } @@ -49,6 +51,7 @@ TEST(cuDNN_OpTensor, run) { testOptensor(IncrementalGenerator(), Shape{1, 2, 2, 3}); testOptensor(IncrementalGenerator(), Shape{1, 2, 2, 3}); testOptensor(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testOptensor(IncrementalGenerator(), Shape{1, 2, 2, 3}); } } // namespace infini diff --git a/test/kernels/bang/test_bang_pad.cc b/test/kernels/bang/test_bang_pad.cc new file mode 100644 index 00000000..2abf1eec --- /dev/null +++ b/test/kernels/bang/test_bang_pad.cc @@ -0,0 +1,44 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/pad.h" + +#include "test.h" + +namespace infini { + +template +void testPad(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = 
make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, vector{1, 1, 1, 1}, + vector{0, 3}); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu->print(); + inputCpu->printData(); + outputGpu2Cpu->print(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Pad, run) { + testPad(IncrementalGenerator(), Shape{1, 1, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_pooling.cc b/test/kernels/bang/test_bang_pooling.cc new file mode 100644 index 00000000..20347ae9 --- /dev/null +++ b/test/kernels/bang/test_bang_pooling.cc @@ -0,0 +1,41 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/pooling.h" + +#include "test.h" + +namespace infini { + +template +void testPooling(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Pooling, run) { + testPooling(IncrementalGenerator(), Shape{1, 1, 5, 5}); + testPooling(IncrementalGenerator(), Shape{1, 1, 5, 5}); +} + +} 
// namespace infini diff --git a/test/kernels/bang/test_bang_pow.cc b/test/kernels/bang/test_bang_pow.cc new file mode 100644 index 00000000..06dfc6dd --- /dev/null +++ b/test/kernels/bang/test_bang_pow.cc @@ -0,0 +1,46 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testPow(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Pow, run) { + testPow(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_prelu.cc b/test/kernels/bang/test_bang_prelu.cc new file mode 100644 index 00000000..bb85e8b1 --- /dev/null +++ b/test/kernels/bang/test_bang_prelu.cc @@ -0,0 +1,46 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testPRelu(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto 
bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + Tensor alphaCpu = + make_ref(Shape{1, 1, 1, 1}, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + alphaCpu->dataMalloc(); + alphaCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto alphaGpu = bangGraph->cloneTensor(alphaCpu); + auto gpuOp = bangGraph->addOp(inputGpu, alphaGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + alphaCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_PRelu, run) { + testPRelu(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_reciprocal.cc b/test/kernels/bang/test_bang_reciprocal.cc new file mode 100644 index 00000000..a2a12723 --- /dev/null +++ b/test/kernels/bang/test_bang_reciprocal.cc @@ -0,0 +1,41 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testReciprocal( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); 
+ outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Reciprocal, run) { + testReciprocal(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_round.cc b/test/kernels/bang/test_bang_round.cc new file mode 100644 index 00000000..26d0e56e --- /dev/null +++ b/test/kernels/bang/test_bang_round.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testRound(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Round, run) { + testRound(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_rsqrt.cc b/test/kernels/bang/test_bang_rsqrt.cc new file mode 100644 index 00000000..a420a638 --- /dev/null +++ b/test/kernels/bang/test_bang_rsqrt.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testRsqrt(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build 
input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Rsqrt, run) { + testRsqrt(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_split.cc b/test/kernels/bang/test_bang_split.cc new file mode 100644 index 00000000..09bcac0a --- /dev/null +++ b/test/kernels/bang/test_bang_split.cc @@ -0,0 +1,48 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/split.h" + +#include "test.h" + +namespace infini { + +template +void testSplit(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto gpuOp = bangGraph->addOp(inputGpu1, std::nullopt, 3, 3); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto o0Cpu = gpuOp->getOutput(0)->clone(cpuRuntime); + auto o1Cpu = gpuOp->getOutput(1)->clone(cpuRuntime); + auto o2Cpu = gpuOp->getOutput(2)->clone(cpuRuntime); + // Check + inputCpu1->print(); + inputCpu1->printData(); + o0Cpu->print(); + o0Cpu->printData(); + o1Cpu->print(); + o1Cpu->printData(); + o2Cpu->print(); + o2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Split, run) 
{ + testSplit(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_sqrt.cc b/test/kernels/bang/test_bang_sqrt.cc new file mode 100644 index 00000000..da9e8338 --- /dev/null +++ b/test/kernels/bang/test_bang_sqrt.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testSqrt(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Sqrt, run) { + testSqrt(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_square.cc b/test/kernels/bang/test_bang_square.cc new file mode 100644 index 00000000..f759f790 --- /dev/null +++ b/test/kernels/bang/test_bang_square.cc @@ -0,0 +1,40 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testSquare(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + 
inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Square, run) { + testSquare(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_squaredDifference.cc b/test/kernels/bang/test_bang_squaredDifference.cc new file mode 100644 index 00000000..33efa755 --- /dev/null +++ b/test/kernels/bang/test_bang_squaredDifference.cc @@ -0,0 +1,48 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/element_wise.h" + +#include "test.h" + +namespace infini { + +template +void testSquaredDifference( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu1 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu1->dataMalloc(); + inputCpu1->setData(generator); + Tensor inputCpu2 = + make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu2->dataMalloc(); + inputCpu2->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu1 = bangGraph->cloneTensor(inputCpu1); + auto inputGpu2 = bangGraph->cloneTensor(inputCpu2); + auto gpuOp = bangGraph->addOp(inputGpu1, inputGpu2, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_SquaredDifference, run) { + 
testSquaredDifference(IncrementalGenerator(), + Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_transpose.cc b/test/kernels/bang/test_bang_transpose.cc new file mode 100644 index 00000000..b9dd6d4b --- /dev/null +++ b/test/kernels/bang/test_bang_transpose.cc @@ -0,0 +1,43 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/transpose.h" + +#include "test.h" + +namespace infini { + +template +void testTranspose( + const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + vector permute = {0, 1, 3, 2}; + auto gpuOp = bangGraph->addOp(inputGpu, nullptr, permute); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // Check + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Transpose, run) { + testTranspose(IncrementalGenerator(), Shape{1, 1, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_trigon.cc b/test/kernels/bang/test_bang_trigon.cc new file mode 100644 index 00000000..59fec809 --- /dev/null +++ b/test/kernels/bang/test_bang_trigon.cc @@ -0,0 +1,52 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testTrigon(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + 
+ // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // CPU + inputCpu->printData(); + outputGpu2Cpu->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Trigon, run) { + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); + testTrigon(IncrementalGenerator(), Shape{1, 1, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/bang/test_bang_unary.cc b/test/kernels/bang/test_bang_unary.cc new file mode 100644 index 00000000..68534a3f --- /dev/null +++ b/test/kernels/bang/test_bang_unary.cc @@ -0,0 +1,47 @@ +#include "bang/bang_runtime.h" +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testUnary(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto bangRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + 
inputCpu->setData(generator); + + // GPU + Graph bangGraph = make_ref(bangRuntime); + auto inputGpu = bangGraph->cloneTensor(inputCpu); + auto gpuOp = bangGraph->addOp(inputGpu, nullptr); + bangGraph->dataMalloc(); + bangRuntime->run(bangGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // CPU + Graph cpuGraph = make_ref(cpuRuntime); + auto cpuOp = cpuGraph->addOp(inputCpu, nullptr); + cpuGraph->dataMalloc(); + cpuRuntime->run(cpuGraph); + auto outputCpu = cpuOp->getOutput(); + // Check + EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu)); +} + +TEST(cnnl_Unary, run) { + testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); + testUnary(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/cuda/test_cuda_clip.cc b/test/kernels/cuda/test_cuda_clip.cc new file mode 100644 index 00000000..2c6abaf7 --- /dev/null +++ b/test/kernels/cuda/test_cuda_clip.cc @@ -0,0 +1,48 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "cuda/cuda_runtime.h" +#include "cuda/cuda_utility.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testClip(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + auto cudaRuntime = make_ref(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // GPU + Graph cudaGraph = make_ref(cudaRuntime); + auto inputGpu = cudaGraph->cloneTensor(inputCpu); + float min = 2.0; + float max = 4.0; + auto gpuOp = cudaGraph->addOp(inputGpu, nullptr, min, max); + cudaGraph->dataMalloc(); + cudaRuntime->run(cudaGraph); + auto outputGpu = gpuOp->getOutput(); + auto outputGpu2Cpu = outputGpu->clone(cpuRuntime); + // CPU + Graph cpuGraph = make_ref(cpuRuntime); + 
auto cpuOp = cpuGraph->addOp(inputCpu, nullptr, min, max); + cpuGraph->dataMalloc(); + cpuRuntime->run(cpuGraph); + auto outputCpu = cpuOp->getOutput(); + // Check + EXPECT_TRUE(outputCpu->equalData(outputGpu2Cpu)); +} + +TEST(cuDNN_Unary, run) { + testClip(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/kernels/cuda/test_cuda_slice.cc b/test/kernels/cuda/test_cuda_slice.cc index 14926ea3..850fc2a8 100644 --- a/test/kernels/cuda/test_cuda_slice.cc +++ b/test/kernels/cuda/test_cuda_slice.cc @@ -20,7 +20,7 @@ TEST(CUDA_Slice, run) { Graph g = make_ref(cudaRuntime); auto i = g->cloneTensor(icpu); auto op = - g->addOp(i, nullptr, vector{1, 1}, vector{1, 4}, + g->addOp(i, nullptr, vector{1, 1}, vector{2, 5}, vector{0, 3}, std::nullopt); // allocate CUDA memory diff --git a/test/kernels/intelcpu/test_mkl_slice.cc b/test/kernels/intelcpu/test_mkl_slice.cc index 04a5ae86..b840adbf 100644 --- a/test/kernels/intelcpu/test_mkl_slice.cc +++ b/test/kernels/intelcpu/test_mkl_slice.cc @@ -12,7 +12,7 @@ TEST(MKL_Slice, run) { // Build input data Tensor i = g->addTensor(Shape{3, 2, 1, 5}, DataType::Float32); auto op = - g->addOp(i, nullptr, vector{1, 1}, vector{1, 4}, + g->addOp(i, nullptr, vector{1, 1}, vector{2, 5}, vector{0, 3}, std::nullopt); g->dataMalloc(); i->setData(IncrementalGenerator()); diff --git a/test/operators/test_clip.cc b/test/operators/test_clip.cc new file mode 100644 index 00000000..424f0c60 --- /dev/null +++ b/test/operators/test_clip.cc @@ -0,0 +1,38 @@ +#include "core/graph.h" +#include "core/kernel.h" +#include "core/runtime.h" +#include "operators/unary.h" + +#include "test.h" + +namespace infini { + +template +void testClip(const std::function &generator, + const Shape &shape) { + // Runtime + Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance(); + + // Build input data on CPU + Tensor inputCpu = make_ref(shape, DataType::Float32, cpuRuntime); + inputCpu->dataMalloc(); + inputCpu->setData(generator); + + // 
GPU + Graph Graph = make_ref(cpuRuntime); + float min = 1.0; + float max = 4.0; + auto Op = Graph->addOp(inputCpu, nullptr, min, max); + Graph->dataMalloc(); + cpuRuntime->run(Graph); + auto output = Op->getOutput(); + inputCpu->printData(); + output->printData(); + EXPECT_TRUE(1); +} + +TEST(cnnl_Clip, run) { + testClip(IncrementalGenerator(), Shape{1, 2, 2, 3}); +} + +} // namespace infini diff --git a/test/operators/test_slice.cc b/test/operators/test_slice.cc index 436deada..02bd8bb7 100644 --- a/test/operators/test_slice.cc +++ b/test/operators/test_slice.cc @@ -9,10 +9,10 @@ TEST(Slice, ShapeInference) { { Graph g = make_ref(cpuRuntime); Tensor i = g->addTensor({10, 64, 162, 162}, DataType::UInt32); - auto op = g->addOp(i, nullptr, vector{2, 10, 1, 5}, + auto op = g->addOp(i, nullptr, vector{2, 9, 1, 5}, vector{3, 10, 100, 100}, std::nullopt, std::nullopt); - EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 100, 96})); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 1, 99, 95})); } { Graph g = make_ref(cpuRuntime); @@ -20,7 +20,7 @@ TEST(Slice, ShapeInference) { auto op = g->addOp(i, nullptr, vector{2, 5}, vector{3, 100}, vector{1, 3}, std::nullopt); - EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 2, 162, 96})); + EXPECT_EQ(op->getOutput()->getDims(), (Shape{10, 1, 162, 95})); } }