forked from jiuyuan/InfiniTensor
Compare commits
94 Commits
benchmark_
...
master
Author | SHA1 | Date |
---|---|---|
zhangyue | 5559536470 | |
Bolun Zhang | fac28c25f6 | |
zhangyue | 985d0dee5f | |
PanZezhong1725 | d1de3ab5c2 | |
Hardy | eafbff6cf9 | |
PanZezhong1725 | 7f6aec6c17 | |
xiaonans | a98573990b | |
Chenjie Duan | 54a35772fb | |
zhangyue | 00e6cc2587 | |
baominghelly | b51ccae3b2 | |
xiaonans | 1c08ba200c | |
xiaonans | 900d8e58e3 | |
xiaonans | b0876a13ce | |
xiaonans | ae9f61de5a | |
xiaonans | 9a3c0f11f6 | |
zhangyunze | 67b2bcb7d5 | |
xiaonans | 956ce37458 | |
zhangyunze | 4813204a36 | |
xiaonans | 030e5ca9c1 | |
xiaonans | e8d111ef5d | |
xiaonans | d1a90ba3e2 | |
xiaonans | afed5d3c3d | |
Haojie Wang | a5062f3f89 | |
Hardy | 09b2ecf98a | |
xiaonans | 6a1bfd6c45 | |
Chenjie Duan | 51086d2b8d | |
zhangyunze | 58993d4339 | |
PanZezhong1725 | 46e61a5bd4 | |
zhangyunze | b15c4979fa | |
Hardy | 42032356fb | |
Chenjie Duan | 83f1de93d0 | |
zhangyunze | 3967b437c8 | |
Chenjie Duan | 6e7bd6ca0c | |
Hardy | 5ac0ab442f | |
Chenjie Duan | 3f34372012 | |
learner2468 | 9a9587556c | |
xgqdut2016 | a3929c25f8 | |
Derui Yang | c143eebdf7 | |
Hardy | 67974aee8a | |
Hardy | 3ead20a23a | |
xgqdut2016 | a7293c12ba | |
PanZezhong1725 | 6ece3f4a77 | |
xgqdut2016 | 595a9906d2 | |
zhangyunze | 331f7ab2b8 | |
xiaonans | 965df4e294 | |
Hardy | f22fa2766e | |
Hardy | 50862df765 | |
Hardy | 1ea450882b | |
xgqdut2016 | d3e7543291 | |
Derui Yang | 1a6fccccbe | |
xgqdut2016 | ec3adf6fa7 | |
Bolun Zhang | 23b825efc4 | |
constroy Li | feccd4f318 | |
Haojie Wang | 7f5188bedd | |
baominghelly | 07ef587c65 | |
Derui Yang | d0f9792613 | |
Hardy | 1184fa131f | |
Haojie Wang | 8e4d88fb9f | |
PanZezhong1725 | 36ae7b7fb6 | |
PanZezhong1725 | ed3034f878 | |
kilinchange | 1151101fb9 | |
Haojie Wang | 90b9a80f72 | |
ChengXiang Qi | 7f16fa353e | |
PanZezhong1725 | 7600fe688c | |
Haojie Wang | 7a9fcd93b2 | |
PanZezhong1725 | 785853b0a3 | |
Haojie Wang | c0ff584e04 | |
Haojie Wang | f25bcca076 | |
kilinchange | 877db21021 | |
PanZezhong1725 | 62be816f53 | |
Haojie Wang | 8f2597a508 | |
kilinchange | 48ec730579 | |
PanZezhong1725 | c6b82cfda0 | |
constroy Li | 4c321c8a91 | |
xgqdut2016 | dda668fd16 | |
constroy Li | f60767a770 | |
Hardy | b4eda85e67 | |
PanZezhong1725 | 2412c25e67 | |
zhangyunze | 3e6ef305f1 | |
ChengXiang Qi | d8ffd8a4b7 | |
kuangjux | a8a5c037ca | |
PanZezhong1725 | 69fd251e5d | |
panzezhong | 0ce7e7651f | |
constroy Li | 1e91979c76 | |
zhangyunze | 1438f14a25 | |
PanZezhong1725 | 9cf6c30e1c | |
constroy Li | 384407421b | |
constroy Li | 48847958d0 | |
zhangyunze | ef672894d0 | |
kilinchange | 0dc5347089 | |
zhangyunze | bd9e1aeb3f | |
Derui Yang | 57ac94d893 | |
zhangyunze | 9b10a74788 | |
Derui Yang | 1dc65e2788 |
|
@ -1,12 +1,11 @@
|
||||||
name: Build and test cpu
|
name: Build and test cpu
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branch: 'master'
|
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'LICENSE'
|
- 'LICENSE'
|
||||||
pull_request:
|
pull_request:
|
||||||
paths-ignore:
|
paths:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'LICENSE'
|
- 'LICENSE'
|
||||||
|
|
||||||
|
@ -15,10 +14,10 @@ env:
|
||||||
protobuf-version: "3.21.12"
|
protobuf-version: "3.21.12"
|
||||||
python-version: "3.10"
|
python-version: "3.10"
|
||||||
|
|
||||||
resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
|
resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
|
||||||
inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
|
inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
|
||||||
densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
|
densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
|
||||||
efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
|
efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
name: clang-format Check
|
name: clang-format Check
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branch: 'master'
|
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'LICENSE'
|
- 'LICENSE'
|
||||||
pull_request:
|
pull_request:
|
||||||
paths-ignore:
|
paths:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'LICENSE'
|
- 'LICENSE'
|
||||||
|
|
||||||
|
|
|
@ -42,3 +42,5 @@ build_debug/
|
||||||
|
|
||||||
# onnx model
|
# onnx model
|
||||||
*.onnx
|
*.onnx
|
||||||
|
*.pb
|
||||||
|
*.npy
|
||||||
|
|
|
@ -11,5 +11,8 @@
|
||||||
path = 3rd-party/backward-cpp
|
path = 3rd-party/backward-cpp
|
||||||
url = git@github.com:bombela/backward-cpp.git
|
url = git@github.com:bombela/backward-cpp.git
|
||||||
[submodule "example"]
|
[submodule "example"]
|
||||||
path = example
|
path = examples/NNmodel
|
||||||
url = git@github.com:wanghailu0717/NNmodel.git
|
url = git@github.com:wanghailu0717/NNmodel.git
|
||||||
|
[submodule "examples/distributed/onnxsim_large_model"]
|
||||||
|
path = examples/distributed/onnxsim_large_model
|
||||||
|
url = git@github.com:luchangli03/onnxsim_large_model.git
|
||||||
|
|
131
CMakeLists.txt
131
CMakeLists.txt
|
@ -1,20 +1,45 @@
|
||||||
cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
|
|
||||||
include(CMakeDependentOption)
|
|
||||||
project(InfiniTensor C CXX)
|
|
||||||
|
|
||||||
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
|
# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
|
||||||
option(USE_CUDA "Support CUDA GPU" OFF)
|
option(USE_CUDA "Support CUDA GPU" OFF)
|
||||||
option(USE_BANG "Support BANG MLU" OFF)
|
option(USE_BANG "Support BANG MLU" OFF)
|
||||||
|
option(USE_KUNLUN "Support KUNLUN XPU" OFF)
|
||||||
option(USE_INTELCPU "Support INTELCPU" OFF)
|
option(USE_INTELCPU "Support INTELCPU" OFF)
|
||||||
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
|
option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
|
||||||
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
|
option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
|
||||||
|
option(BUILD_NNET "Build nnet" OFF)
|
||||||
|
option(BUILD_DIST "Build project for distributed running" OFF)
|
||||||
option(BUILD_TEST "Build tests" OFF)
|
option(BUILD_TEST "Build tests" OFF)
|
||||||
|
|
||||||
|
if(USE_CUDA)
|
||||||
|
message("CMake 3.18 or higher is required for setting CUDAToolkit")
|
||||||
|
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
|
||||||
|
else()
|
||||||
|
cmake_minimum_required(VERSION 3.17)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
include(CMakeDependentOption)
|
||||||
|
project(InfiniTensor C CXX)
|
||||||
|
|
||||||
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
|
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
|
||||||
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
|
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
|
||||||
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
|
|
||||||
|
|
||||||
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
|
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
|
||||||
|
# Build Type
|
||||||
|
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||||
|
message("Configuring for Debug build.")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
|
||||||
|
add_compile_definitions(DEBUG_MODE)
|
||||||
|
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||||
|
message("Configuring for Release build.")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
|
||||||
|
add_compile_definitions(NDEBUG)
|
||||||
|
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
||||||
|
message("Configuring for RelWithDebInfo build.")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
|
||||||
|
else()
|
||||||
|
message("Build type not specified. Configuring for RelWithDebInfo build.")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
|
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
|
||||||
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
|
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
|
||||||
|
@ -28,11 +53,13 @@ endif()
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
|
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
|
||||||
|
add_compile_options(-Wno-error=unused-variable)
|
||||||
|
|
||||||
find_package(
|
find_package(
|
||||||
Python
|
Python
|
||||||
COMPONENTS Interpreter Development
|
COMPONENTS Interpreter Development
|
||||||
REQUIRED)
|
REQUIRED)
|
||||||
|
|
||||||
# OpenMP
|
# OpenMP
|
||||||
find_package(OpenMP)
|
find_package(OpenMP)
|
||||||
if(OpenMP_C_FOUND)
|
if(OpenMP_C_FOUND)
|
||||||
|
@ -69,16 +96,17 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
|
||||||
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
|
include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
|
||||||
|
|
||||||
# TVM backend
|
# TVM backend
|
||||||
if(BUILD_TEST_EINNET)
|
if(BUILD_NNET AND BUILD_TEST)
|
||||||
if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
|
|
||||||
message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
|
|
||||||
endif()
|
|
||||||
# TVM and DMLC for invoking TVM packed functions
|
# TVM and DMLC for invoking TVM packed functions
|
||||||
include_directories(${TVM_INCLUDE_DIR})
|
include_directories(${TVM_INCLUDE_DIR})
|
||||||
include_directories(${DMLC_INCLUDE_DIR})
|
include_directories(${DMLC_INCLUDE_DIR})
|
||||||
include_directories(${DLPACK_INCLUDE_DIR})
|
include_directories(${DLPACK_INCLUDE_DIR})
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
|
if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND DLPACK_INCLUDE_DIR)
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
|
||||||
|
else()
|
||||||
|
# message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(BUILD_TEST)
|
if(BUILD_TEST)
|
||||||
|
@ -92,13 +120,21 @@ if(BUILD_TEST)
|
||||||
include_directories(3rd-party/googletest/googletest/include)
|
include_directories(3rd-party/googletest/googletest/include)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
|
||||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
|
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
|
||||||
|
|
||||||
|
|
||||||
# Source files
|
# Source files
|
||||||
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
|
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
|
||||||
|
|
||||||
|
if(BUILD_NNET)
|
||||||
|
add_compile_definitions(BUILD_NNET=1)
|
||||||
|
file(GLOB_RECURSE SRC_NNET src/nnet/*.cc)
|
||||||
|
list (APPEND SRC ${SRC_NNET})
|
||||||
|
# For locating resource files
|
||||||
|
set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}")
|
||||||
|
endif()
|
||||||
|
|
||||||
if(USE_CUDA)
|
if(USE_CUDA)
|
||||||
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
|
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
|
||||||
|
@ -110,6 +146,11 @@ if(USE_BANG)
|
||||||
list (APPEND SRC ${SRC_BANG})
|
list (APPEND SRC ${SRC_BANG})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(USE_KUNLUN)
|
||||||
|
file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
|
||||||
|
list (APPEND SRC ${SRC_KUNLUN})
|
||||||
|
endif()
|
||||||
|
|
||||||
if(USE_INTELCPU)
|
if(USE_INTELCPU)
|
||||||
file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
|
file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
|
||||||
list (APPEND SRC ${SRC_INTELCPU})
|
list (APPEND SRC ${SRC_INTELCPU})
|
||||||
|
@ -124,7 +165,7 @@ endif()
|
||||||
target_link_libraries(InfiniTensor pybind11::embed)
|
target_link_libraries(InfiniTensor pybind11::embed)
|
||||||
|
|
||||||
# TVM backend
|
# TVM backend
|
||||||
if(BUILD_TEST_EINNET)
|
if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR)
|
||||||
target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
|
target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -177,6 +218,13 @@ if(USE_CUDA)
|
||||||
enable_language(CUDA)
|
enable_language(CUDA)
|
||||||
find_package(CUDAToolkit) # For nvrtc and cuda driver
|
find_package(CUDAToolkit) # For nvrtc and cuda driver
|
||||||
target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
|
target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
|
||||||
|
if (BUILD_DIST)
|
||||||
|
message(STATUS "Add BUILD_DIST, use NCCL with CUDA")
|
||||||
|
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||||
|
find_package(NCCL REQUIRED)
|
||||||
|
add_compile_definitions(INFINI_USE_NCCL=1)
|
||||||
|
target_link_libraries(InfiniTensor nccl)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(USE_BANG)
|
if(USE_BANG)
|
||||||
|
@ -215,7 +263,50 @@ if(USE_BANG)
|
||||||
# BangC Kernels
|
# BangC Kernels
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
if (BUILD_DIST)
|
||||||
|
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
|
||||||
|
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||||
|
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
|
||||||
|
add_compile_definitions(INFINI_USE_CNCL=1)
|
||||||
|
else()
|
||||||
|
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(USE_KUNLUN)
|
||||||
|
add_compile_definitions(USE_KUNLUN=1)
|
||||||
|
if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME}))
|
||||||
|
message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
|
||||||
|
elseif (DEFINED KUNLUN_HOME)
|
||||||
|
set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
|
||||||
|
else()
|
||||||
|
set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
|
||||||
|
endif()
|
||||||
|
message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
|
||||||
|
|
||||||
|
include_directories("${KUNLUN_HOME}/include/")
|
||||||
|
find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
|
||||||
|
find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||||
|
|
||||||
|
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||||
|
execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
|
||||||
|
elseif(DEFINED TARGET_CPU_ARCH)
|
||||||
|
set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||||
|
else()
|
||||||
|
set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
|
||||||
|
endif()
|
||||||
|
message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
|
||||||
|
|
||||||
|
if (BUILD_DIST)
|
||||||
|
message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
|
||||||
|
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||||
|
find_package(XCCL REQUIRED)
|
||||||
|
add_compile_definitions(INFINI_USE_XCCL=1)
|
||||||
|
target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
|
||||||
|
endif()
|
||||||
|
target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# # Python bindings
|
# # Python bindings
|
||||||
|
@ -234,6 +325,7 @@ function(build_test files)
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
if(BUILD_TEST)
|
if(BUILD_TEST)
|
||||||
|
add_compile_definitions(BUILD_TEST=1)
|
||||||
enable_testing()
|
enable_testing()
|
||||||
if(USE_TRACE)
|
if(USE_TRACE)
|
||||||
build_test(test/trace/*.cc)
|
build_test(test/trace/*.cc)
|
||||||
|
@ -241,11 +333,18 @@ if(BUILD_TEST)
|
||||||
if(BUILD_TEST_CORE)
|
if(BUILD_TEST_CORE)
|
||||||
build_test(test/core/*.cc)
|
build_test(test/core/*.cc)
|
||||||
build_test(test/operators/*.cc)
|
build_test(test/operators/*.cc)
|
||||||
|
build_test(test/kernels/nativecpu/*.cc)
|
||||||
if (USE_CUDA)
|
if (USE_CUDA)
|
||||||
build_test(test/kernels/cuda/*.cc)
|
build_test(test/kernels/cuda/*.cc)
|
||||||
|
build_test(test/cuda/*.cc)
|
||||||
endif()
|
endif()
|
||||||
if (USE_BANG)
|
if (USE_BANG)
|
||||||
build_test(test/kernels/bang/*.cc)
|
build_test(test/kernels/bang/*.cc)
|
||||||
|
build_test(test/bang/*.cc)
|
||||||
|
endif()
|
||||||
|
if (USE_KUNLUN)
|
||||||
|
build_test(test/kernels/kunlun/*.cc)
|
||||||
|
build_test(test/kunlun/*.cc)
|
||||||
endif()
|
endif()
|
||||||
if (USE_INTELCPU)
|
if (USE_INTELCPU)
|
||||||
build_test(test/kernels/intelcpu/*.cc)
|
build_test(test/kernels/intelcpu/*.cc)
|
||||||
|
@ -254,7 +353,7 @@ if(BUILD_TEST)
|
||||||
if(BUILD_TEST_PET)
|
if(BUILD_TEST_PET)
|
||||||
build_test(test/pet/*.cc)
|
build_test(test/pet/*.cc)
|
||||||
endif()
|
endif()
|
||||||
if(BUILD_TEST_EINNET)
|
if(BUILD_NNET AND BUILD_TEST)
|
||||||
build_test(test/nnet/test_*.cc)
|
build_test(test/nnet/test_*.cc)
|
||||||
|
|
||||||
# Build expression reader
|
# Build expression reader
|
||||||
|
|
46
Makefile
46
Makefile
|
@ -1,17 +1,38 @@
|
||||||
.PHONY : build clean install-python test-cpp test-onnx
|
.PHONY : build clean format install-python test-cpp test-onnx
|
||||||
|
|
||||||
TYPE ?= release
|
TYPE ?= Release
|
||||||
CUDA ?= OFF
|
CUDA ?= OFF
|
||||||
BANG ?= OFF
|
BANG ?= OFF
|
||||||
|
KUNLUN ?= OFF
|
||||||
INTELCPU ?= off
|
INTELCPU ?= off
|
||||||
BACKTRACE ?= ON
|
BACKTRACE ?= ON
|
||||||
TEST ?= ON
|
TEST ?= ON
|
||||||
|
DIST ?= OFF
|
||||||
|
NNET ?= OFF
|
||||||
|
DIST ?= OFF
|
||||||
|
FORMAT_ORIGIN ?=
|
||||||
|
# Docker build options
|
||||||
|
DOCKER_NAME ?= infinitensor
|
||||||
|
DOCKER_IMAGE_NAME ?= infinitensor
|
||||||
|
DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
|
||||||
|
DOCKER_RUN_OPTION ?=
|
||||||
|
|
||||||
|
# CUDA option.
|
||||||
|
ifeq ($(CUDA), ON)
|
||||||
|
DOCKER_IMAGE_NAME = infinitensor_cuda
|
||||||
|
DOCKER_NAME = infinitensor_cuda
|
||||||
|
DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
|
||||||
|
DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
|
||||||
|
endif
|
||||||
|
|
||||||
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
|
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
|
||||||
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
|
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
|
||||||
CMAKE_OPT += -DUSE_BANG=$(BANG)
|
CMAKE_OPT += -DUSE_BANG=$(BANG)
|
||||||
|
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
||||||
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
||||||
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
||||||
|
CMAKE_OPT += -DBUILD_DIST=$(DIST)
|
||||||
|
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
||||||
|
|
||||||
ifeq ($(INTELCPU), ON)
|
ifeq ($(INTELCPU), ON)
|
||||||
CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
|
CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
|
||||||
|
@ -24,9 +45,12 @@ build:
|
||||||
clean:
|
clean:
|
||||||
rm -rf build
|
rm -rf build
|
||||||
|
|
||||||
|
format:
|
||||||
|
@python3 scripts/format.py $(FORMAT_ORIGIN)
|
||||||
|
|
||||||
install-python: build
|
install-python: build
|
||||||
cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
|
cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
|
||||||
pip install pyinfinitensor/
|
pip install -e pyinfinitensor/
|
||||||
|
|
||||||
test-cpp:
|
test-cpp:
|
||||||
@echo
|
@echo
|
||||||
|
@ -35,3 +59,19 @@ test-cpp:
|
||||||
test-onnx:
|
test-onnx:
|
||||||
@echo
|
@echo
|
||||||
python3 pyinfinitensor/tests/test_onnx.py
|
python3 pyinfinitensor/tests/test_onnx.py
|
||||||
|
|
||||||
|
test-api:
|
||||||
|
@echo
|
||||||
|
python3 pyinfinitensor/tests/test_api.py
|
||||||
|
|
||||||
|
docker-build:
|
||||||
|
docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .
|
||||||
|
|
||||||
|
docker-run:
|
||||||
|
docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)
|
||||||
|
|
||||||
|
docker-start:
|
||||||
|
docker start $(DOCKER_IMAGE_NAME)
|
||||||
|
|
||||||
|
docker-exec:
|
||||||
|
docker exec -it $(DOCKER_IMAGE_NAME) bash
|
||||||
|
|
|
@ -33,13 +33,14 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
|
- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
|
||||||
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
|
- [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
|
||||||
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
|
- Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
|
||||||
- Supported hardware
|
- Supported hardware
|
||||||
- ✔ NVIDIA GPU
|
- ✔ NVIDIA GPU
|
||||||
- ✔ Cambricon MLU
|
- ✔ Cambricon MLU
|
||||||
|
- ✔ Kunlunxin XPU
|
||||||
- ⬜ Ascend NPU
|
- ⬜ Ascend NPU
|
||||||
- ⬜ Kunlunxin XPU
|
|
||||||
|
|
||||||
## Contributor Guide
|
## Contributor Guide
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,76 @@
|
||||||
|
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
|
||||||
|
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
|
||||||
|
|
||||||
|
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
|
||||||
|
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
|
||||||
|
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
|
||||||
|
|
||||||
|
if ($ENV{CNCL_ROOT_DIR})
|
||||||
|
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
|
||||||
|
endif()
|
||||||
|
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
|
||||||
|
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
|
||||||
|
|
||||||
|
find_path(CNCL_INCLUDE_DIRS
|
||||||
|
NAMES cncl.h
|
||||||
|
HINTS ${CNCL_INCLUDE_DIR})
|
||||||
|
|
||||||
|
if (USE_STATIC_CNCL)
|
||||||
|
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
|
||||||
|
SET(CNCL_LIBNAME "CNCL_static")
|
||||||
|
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
SET(CNCL_LIBNAME "cncl")
|
||||||
|
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
find_library(CNCL_LIBRARIES
|
||||||
|
NAMES ${CNCL_LIBNAME}
|
||||||
|
HINTS ${CNCL_LIB_DIR})
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||||
|
|
||||||
|
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
|
||||||
|
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
|
||||||
|
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
|
||||||
|
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||||
|
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
|
||||||
|
include(CheckCXXSymbolExists)
|
||||||
|
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
|
||||||
|
|
||||||
|
if (CNCL_VERSION_DEFINED)
|
||||||
|
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
|
||||||
|
file(WRITE ${file} "
|
||||||
|
#include <iostream>
|
||||||
|
#include <cncl.h>
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
|
||||||
|
int x;
|
||||||
|
CNCLGetVersion(&x);
|
||||||
|
return x == CNCL_VERSION_CODE;
|
||||||
|
}
|
||||||
|
")
|
||||||
|
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
|
||||||
|
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
|
||||||
|
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
|
||||||
|
LINK_LIBRARIES ${CNCL_LIBRARIES})
|
||||||
|
if (NOT CNCL_VERSION_MATCHED)
|
||||||
|
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
|
||||||
|
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
|
||||||
|
endif()
|
||||||
|
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
|
||||||
|
else()
|
||||||
|
# message(STATUS "CNCL version < 2.3.5-5")
|
||||||
|
endif ()
|
||||||
|
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
|
||||||
|
|
||||||
|
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
|
||||||
|
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||||
|
endif()
|
|
@ -0,0 +1,165 @@
|
||||||
|
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# From PyTorch:
|
||||||
|
#
|
||||||
|
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||||
|
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||||
|
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||||
|
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||||
|
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||||
|
# Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||||
|
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||||
|
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||||
|
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||||
|
#
|
||||||
|
# From Caffe2:
|
||||||
|
#
|
||||||
|
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
||||||
|
#
|
||||||
|
# All contributions by Facebook:
|
||||||
|
# Copyright (c) 2016 Facebook Inc.
|
||||||
|
#
|
||||||
|
# All contributions by Google:
|
||||||
|
# Copyright (c) 2015 Google Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# All contributions by Yangqing Jia:
|
||||||
|
# Copyright (c) 2015 Yangqing Jia
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# All contributions by Kakao Brain:
|
||||||
|
# Copyright 2019-2020 Kakao Brain
|
||||||
|
#
|
||||||
|
# All contributions from Caffe:
|
||||||
|
# Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# All other contributions:
|
||||||
|
# Copyright(c) 2015, 2016 the respective contributors
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
||||||
|
# copyright over their contributions to Caffe2. The project versioning records
|
||||||
|
# all such contribution and copyright details. If a contributor wants to further
|
||||||
|
# mark their specific copyright on a particular contribution, they should
|
||||||
|
# indicate their copyright solely in the commit message of the change when it is
|
||||||
|
# committed.
|
||||||
|
#
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
||||||
|
# and IDIAP Research Institute nor the names of its contributors may be
|
||||||
|
# used to endorse or promote products derived from this software without
|
||||||
|
# specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
# Find the nccl libraries
|
||||||
|
#
|
||||||
|
# The following variables are optionally searched for defaults
|
||||||
|
# NCCL_ROOT: Base directory where all NCCL components are foundHong Xu, 1 year ago: • Let CMake handle NCCL detection instead of ou…
|
||||||
|
# NCCL_INCLUDE_DIR: Directory where NCCL header is foundPieter Noordhuis, 3 years ago: • Bump gloo
|
||||||
|
# NCCL_LIB_DIR: Directory where NCCL library is found
|
||||||
|
#
|
||||||
|
# The following are set after configuration is done:
|
||||||
|
# NCCL_FOUND
|
||||||
|
# NCCL_INCLUDE_DIRS
|
||||||
|
# NCCL_LIBRARIES
|
||||||
|
#
|
||||||
|
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
|
||||||
|
# install NCCL in the same location as the CUDA toolkit.
|
||||||
|
# See https://github.com/caffe2/caffe2/issues/1601
|
||||||
|
|
||||||
|
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
|
||||||
|
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
|
||||||
|
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
|
||||||
|
|
||||||
|
if ($ENV{NCCL_ROOT_DIR})
|
||||||
|
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
|
||||||
|
endif()
|
||||||
|
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
|
||||||
|
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
|
||||||
|
|
||||||
|
find_path(NCCL_INCLUDE_DIRS
|
||||||
|
NAMES nccl.h
|
||||||
|
HINTS ${NCCL_INCLUDE_DIR})
|
||||||
|
|
||||||
|
if (USE_STATIC_NCCL)
|
||||||
|
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
|
||||||
|
SET(NCCL_LIBNAME "nccl_static")
|
||||||
|
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
SET(NCCL_LIBNAME "nccl")
|
||||||
|
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
find_library(NCCL_LIBRARIES
|
||||||
|
NAMES ${NCCL_LIBNAME}
|
||||||
|
HINTS ${NCCL_LIB_DIR})
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
|
||||||
|
|
||||||
|
if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
|
||||||
|
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
|
||||||
|
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
|
||||||
|
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||||
|
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
|
||||||
|
include(CheckCXXSymbolExists)
|
||||||
|
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
|
||||||
|
|
||||||
|
if (NCCL_VERSION_DEFINED)
|
||||||
|
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
|
||||||
|
file(WRITE ${file} "
|
||||||
|
#include <iostream>
|
||||||
|
#include <nccl.h>
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
|
||||||
|
int x;
|
||||||
|
ncclGetVersion(&x);
|
||||||
|
return x == NCCL_VERSION_CODE;
|
||||||
|
}
|
||||||
|
")
|
||||||
|
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
|
||||||
|
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
|
||||||
|
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
|
||||||
|
LINK_LIBRARIES ${NCCL_LIBRARIES})
|
||||||
|
if (NOT NCCL_VERSION_MATCHED)
|
||||||
|
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
|
||||||
|
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
|
||||||
|
endif()
|
||||||
|
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
|
||||||
|
else()
|
||||||
|
# message(STATUS "NCCL version < 2.3.5-5")
|
||||||
|
endif ()
|
||||||
|
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
|
||||||
|
|
||||||
|
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
|
||||||
|
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
|
||||||
|
endif()
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Find the xccl libraries
|
||||||
|
set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
|
||||||
|
set(XCCL_LIB_DIR $ENV{KUNLUN_HOME} CACHE PATH "Folder contains KUNLUN XCCL libraries")
|
||||||
|
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
|
||||||
|
|
||||||
|
find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
|
||||||
|
NAMES xpu/bkcl.h
|
||||||
|
HINTS XCCL_INCLUDE_DIR)
|
||||||
|
|
||||||
|
find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
|
||||||
|
NAMES lib64/libbkcl.so
|
||||||
|
HINTS XCCL_LIB_DIR)
|
||||||
|
|
||||||
|
message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
|
||||||
|
message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||||
|
|
||||||
|
if (XCCL_FOUND)
|
||||||
|
set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
|
||||||
|
message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
|
||||||
|
list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
|
||||||
|
message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
|
||||||
|
mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
|
||||||
|
endif()
|
|
@ -133,10 +133,40 @@
|
||||||
make install-python BANG=ON
|
make install-python BANG=ON
|
||||||
```
|
```
|
||||||
|
|
||||||
|
编译 CPU 部分,同时编译昆仑 XPU 部分:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export KUNLUN_HOME=/path/to/your/kunlun_home
|
||||||
|
make install-python KUNLUN=ON
|
||||||
|
```
|
||||||
|
|
||||||
3. 使用方法
|
3. 使用方法
|
||||||
|
|
||||||
安装成功后,您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
|
安装成功后,您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
本项目也提供了 Docker 的环境,您可以使用 `make docker-build` 或 `make docker-build CUDA=ON` 命令启动并编译 Dockerfile,您可以通过添加编译选项或者修改 Makefile 变量修改 docker image 名称或者所选的 Dockerfile 文件。
|
||||||
|
|
||||||
|
由于在拉取 github repo 时需要将 ssh key 加入到 github profile 中,因此暂时注释掉拉取 repo 并编译项目的过程,由用户在进入 docker 后自己维护 ssh key(将 host 中的 ssh key 复制到 docker 中可能会遇到环境不一致的问题)。
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# Build docker container.
|
||||||
|
make docker-build
|
||||||
|
# Run docker image.
|
||||||
|
make docker-run
|
||||||
|
# Execute docker image.
|
||||||
|
make docker-exec
|
||||||
|
```
|
||||||
|
|
||||||
|
如果需要编译 CUDA 版,请使用如下命令:
|
||||||
|
```shell
|
||||||
|
# Build docker container.
|
||||||
|
make docker-build CUDA=ON
|
||||||
|
# Run docker image.
|
||||||
|
make docker-run CUDA=ON
|
||||||
|
```
|
||||||
|
|
||||||
## 技术支持
|
## 技术支持
|
||||||
|
|
||||||
如遇到问题,请联系我们技术支持团队
|
如遇到问题,请联系我们技术支持团队
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
## 目录
|
## 目录
|
||||||
|
|
||||||
|
|
||||||
- [环境支持](#环境支持)
|
- [环境支持](#环境支持)
|
||||||
- [神经网络支持](#神经网络支持)
|
- [神经网络支持](#神经网络支持)
|
||||||
- [技术支持](#技术支持)
|
- [技术支持](#技术支持)
|
||||||
|
@ -19,10 +20,10 @@
|
||||||
|
|
||||||
目前已经验证过的神经网络模型有
|
目前已经验证过的神经网络模型有
|
||||||
|
|
||||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||||
|
|
||||||
## 技术支持
|
## 技术支持
|
||||||
|
|
||||||
|
|
|
@ -3,9 +3,10 @@
|
||||||
## 目录
|
## 目录
|
||||||
|
|
||||||
- [使用方法](#使用方法)
|
- [使用方法](#使用方法)
|
||||||
- [python-前端应用指南](#python-前端应用指南)
|
- [python 前端应用指南](#python-前端应用指南)
|
||||||
- [导入-onnx-模型](#导入-onnx-模型)
|
- [导入 onnx 模型](#导入-onnx-模型)
|
||||||
- [导出-onnx-模型](#导出-onnx-模型)
|
- [优化](#优化)
|
||||||
|
- [导出 onnx 模型](#导出-onnx-模型)
|
||||||
- [执行推理](#执行推理)
|
- [执行推理](#执行推理)
|
||||||
- [样例代码](#样例代码)
|
- [样例代码](#样例代码)
|
||||||
- [技术支持](#技术支持)
|
- [技术支持](#技术支持)
|
||||||
|
@ -13,7 +14,7 @@
|
||||||
|
|
||||||
## 使用方法
|
## 使用方法
|
||||||
|
|
||||||
项目管理功能已写到 [Makefile](Makefile),支持下列功能:
|
项目管理功能已写到 [Makefile](../Makefile),支持下列功能:
|
||||||
|
|
||||||
- 编译项目:`make`/`make build`
|
- 编译项目:`make`/`make build`
|
||||||
- 清理生成文件:`make clean`
|
- 清理生成文件:`make clean`
|
||||||
|
@ -26,6 +27,7 @@
|
||||||
- `TYPE`:编译模式(`debug`/`release`),默认值为 `release`
|
- `TYPE`:编译模式(`debug`/`release`),默认值为 `release`
|
||||||
- `CUDA`:是否编译 CUDA 后端,默认为 `OFF`,`ON` 打开
|
- `CUDA`:是否编译 CUDA 后端,默认为 `OFF`,`ON` 打开
|
||||||
- `BANG`:是否编译寒武纪后端,默认为 `OFF`,`ON` 打开
|
- `BANG`:是否编译寒武纪后端,默认为 `OFF`,`ON` 打开
|
||||||
|
- `KUNLUN`:是否编译昆仑后端,默认为 `OFF`,`ON` 打开
|
||||||
- `BACKTRACE`:是否启用栈回溯,默认为 `ON`,`OFF` 关闭,建议调试时打开
|
- `BACKTRACE`:是否启用栈回溯,默认为 `ON`,`OFF` 关闭,建议调试时打开
|
||||||
- `TEST`:是否编译 `googletest`,默认为 `ON`,`OFF` 关闭,只有 `test-cpp` 时必要
|
- `TEST`:是否编译 `googletest`,默认为 `ON`,`OFF` 关闭,只有 `test-cpp` 时必要
|
||||||
|
|
||||||
|
@ -37,10 +39,10 @@
|
||||||
|
|
||||||
支持的模型:
|
支持的模型:
|
||||||
|
|
||||||
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
|
||||||
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
|
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
|
||||||
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
|
||||||
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import onnx
|
import onnx
|
||||||
|
@ -95,7 +97,7 @@ for name, tensor in stub.inputs.items():
|
||||||
print(name, tensor.shape(), tensor)
|
print(name, tensor.shape(), tensor)
|
||||||
```
|
```
|
||||||
|
|
||||||
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx),会打印出:
|
||||||
|
|
||||||
```plaintext
|
```plaintext
|
||||||
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
|
data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
|
||||||
|
@ -136,7 +138,7 @@ for name, tensor in stub.outputs.items():
|
||||||
|
|
||||||
### 样例代码
|
### 样例代码
|
||||||
|
|
||||||
您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解,并尝试运行。在这个文件中,我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
python resnet.py -h
|
python resnet.py -h
|
||||||
|
|
2
env.sh
2
env.sh
|
@ -35,4 +35,4 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
|
||||||
# ├── tools
|
# ├── tools
|
||||||
# ├── version
|
# ├── version
|
||||||
# └── XTDK
|
# └── XTDK
|
||||||
export XPU_HOME=/usr/local/xpu
|
export KUNLUN_HOME=/usr/local/xpu
|
||||||
|
|
1
example
1
example
|
@ -1 +0,0 @@
|
||||||
Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
|
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
|
|
@ -0,0 +1,39 @@
|
||||||
|
# 分布式脚本
|
||||||
|
|
||||||
|
## 英伟达平台运行方式
|
||||||
|
|
||||||
|
#### 1. 运行pytorch模型并生成输入和标准输出,可选择导出onnx
|
||||||
|
|
||||||
|
使用 `--export_onnx` 设置导出onnx的目录,默认为当前路径 `./`,不使用这个flag则只进行计算和生成输入输出。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
|
||||||
|
```
|
||||||
|
|
||||||
|
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
|
||||||
|
|
||||||
|
#### 2. 运行InfiniTensor分布式脚本
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
|
||||||
|
```
|
||||||
|
|
||||||
|
## 寒武纪平台运行方式
|
||||||
|
|
||||||
|
**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配,具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
|
||||||
|
|
||||||
|
#### 1. 运行pytorch模型并生成输入和标准输出,可选择导出onnx
|
||||||
|
|
||||||
|
使用 `--export_onnx` 设置导出onnx的目录,默认为当前路径 `./`,不使用这个flag则只进行计算和生成输入输出。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
|
||||||
|
```
|
||||||
|
|
||||||
|
会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`,目前只支持单一输入输出。
|
||||||
|
|
||||||
|
#### 2. 运行InfiniTensor分布式脚本
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
|
||||||
|
```
|
|
@ -0,0 +1,187 @@
|
||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import multiprocessing as mp
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnx.shape_inference import infer_shapes_path
|
||||||
|
import numpy as np
|
||||||
|
from parallel_opt import parallel_model
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||||
|
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nproc_per_node", type=int, default=1, help="number of processes per node"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--name", type=str, default="test", help="name of this instance."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, required=True, help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--gen_std",
|
||||||
|
action="store_true",
|
||||||
|
help="whether to generate the standard results.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.num_nodes,
|
||||||
|
args.nproc_per_node,
|
||||||
|
args.name,
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.gen_std,
|
||||||
|
args.type,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
|
||||||
|
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
|
||||||
|
load_inputs(stub, world_size, rank)
|
||||||
|
# stub.tune()
|
||||||
|
stub.run()
|
||||||
|
# get outputs
|
||||||
|
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
|
||||||
|
# bench
|
||||||
|
for _ in range(n):
|
||||||
|
stub.run()
|
||||||
|
begin = time.time()
|
||||||
|
for _ in range(n * 2):
|
||||||
|
stub.run()
|
||||||
|
end = time.time()
|
||||||
|
avg_time = (end - begin) / (n * 2)
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def load_inputs(stub, world_size=1, rank=0):
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input = np.load(f"./data/input_{i}.npy")
|
||||||
|
if all(x == y for x,y in zip(input.shape,tensor.shape())):
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
else:
|
||||||
|
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
|
||||||
|
results = np.load(f"./data/output.npy")
|
||||||
|
outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
|
||||||
|
print("outputs abs mean:", abs(outputs).mean())
|
||||||
|
print("max abs diff:", abs(outputs - results).max())
|
||||||
|
|
||||||
|
def start_worker(
|
||||||
|
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
|
||||||
|
):
|
||||||
|
dist_name = name + "_dist"
|
||||||
|
model = parallel_model(model, world_size, rank)
|
||||||
|
extern_path = f"./{dist_name}_rank{rank}.pb"
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
onnx.save_model(
|
||||||
|
model,
|
||||||
|
f"./{dist_name}_rank{rank}.onnx",
|
||||||
|
save_as_external_data=True,
|
||||||
|
location=extern_path,
|
||||||
|
)
|
||||||
|
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||||
|
runtime = backend.BangRuntime(local_rank)
|
||||||
|
# print("init comm")
|
||||||
|
runtime.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
run_and_compare(name, model, runtime, world_size, rank, data_type)
|
||||||
|
|
||||||
|
|
||||||
|
def start_single(name, model, data_type):
|
||||||
|
runtime = backend.BangRuntime(0)
|
||||||
|
run_and_compare(name, model, runtime, data_type=data_type)
|
||||||
|
|
||||||
|
def generate_input_output(model):
|
||||||
|
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
|
||||||
|
runtime = backend.BangRuntime(0)
|
||||||
|
stub = OnnxStub(model, runtime)
|
||||||
|
position_id = 0
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input = tensor.copyout_numpy()
|
||||||
|
if np.issubdtype(input.dtype, np.integer):
|
||||||
|
if input.size == 1:
|
||||||
|
# input = np.array([position_id])
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
else:
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
elif input.dtype == np.bool_:
|
||||||
|
input = np.random.randint(0,2,size=input.shape) > 0
|
||||||
|
else:
|
||||||
|
if i == 0:
|
||||||
|
input = np.ones(input.shape).astype(input.dtype)
|
||||||
|
position_id = input.shape[-1] - 1
|
||||||
|
else:
|
||||||
|
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
np.save(f"./data/input_{i}", input)
|
||||||
|
stub.run()
|
||||||
|
time.sleep(0.01)
|
||||||
|
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
if np.isnan(output).any():
|
||||||
|
print("Nan in output")
|
||||||
|
np.save(f"./data/output", output)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
|
||||||
|
data_type = "default" if data_type == "fp32" else data_type
|
||||||
|
|
||||||
|
model = onnx.load(model_path)
|
||||||
|
|
||||||
|
# generate standart output
|
||||||
|
if gen_std:
|
||||||
|
print(f"generate standard data for {name}.")
|
||||||
|
# a small vocabulary size to fit all LLM.
|
||||||
|
generate_input_output(model)
|
||||||
|
return
|
||||||
|
|
||||||
|
if nproc_per_node == 1:
|
||||||
|
# run single process.
|
||||||
|
# use standalone process to isolate bang.
|
||||||
|
print("run model by single MLU.")
|
||||||
|
# p = mp.Process(target=start_single, args=(name, model, data_type))
|
||||||
|
# p.start()
|
||||||
|
# p.join()
|
||||||
|
start_single(name, model, data_type)
|
||||||
|
return
|
||||||
|
|
||||||
|
# run distributed parallel.
|
||||||
|
world_size = nnodes * nproc_per_node
|
||||||
|
print(f"run model by {world_size} MLU in parallel.")
|
||||||
|
workers = [
|
||||||
|
mp.Process(
|
||||||
|
target=start_worker,
|
||||||
|
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
|
||||||
|
)
|
||||||
|
for rank in range(world_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.start()
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.join()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,249 @@
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
import torch_mlu
|
||||||
|
from transformers import BertModel, BertConfig
|
||||||
|
from transformers import GPT2Model, GPT2Config
|
||||||
|
from transformers import OPTModel, OPTConfig
|
||||||
|
from transformers import AlbertModel, AlbertConfig
|
||||||
|
from transformers import LlamaModel, LlamaConfig
|
||||||
|
import time
|
||||||
|
import numpy as np
|
||||||
|
import onnx
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnxsim import simplify
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--export_onnx",
|
||||||
|
type=str,
|
||||||
|
nargs="?",
|
||||||
|
default=None,
|
||||||
|
const="./",
|
||||||
|
help="whether and where to export onnx file",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.export_onnx,
|
||||||
|
args.type
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_model(modelname):
|
||||||
|
match modelname:
|
||||||
|
case "albert":
|
||||||
|
model = AlbertModel.from_pretrained("albert/albert-base-v2")
|
||||||
|
voc_size = AlbertConfig().vocab_size
|
||||||
|
case "bert":
|
||||||
|
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||||
|
voc_size = BertConfig().vocab_size
|
||||||
|
case "gpt2":
|
||||||
|
model = GPT2Model.from_pretrained("GPT2")
|
||||||
|
voc_size = GPT2Config().vocab_size
|
||||||
|
case "opt":
|
||||||
|
model = OPTModel.from_pretrained("facebook/opt-125m")
|
||||||
|
voc_size = OPTConfig().vocab_size
|
||||||
|
case "llama":
|
||||||
|
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||||
|
voc_size = LlamaConfig().vocab_size
|
||||||
|
case _:
|
||||||
|
raise KeyError(modelname)
|
||||||
|
|
||||||
|
model = model.eval()
|
||||||
|
return model, voc_size
|
||||||
|
|
||||||
|
def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
|
||||||
|
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||||
|
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
|
||||||
|
np.save("./data/input_0", data)
|
||||||
|
inputs = torch.from_numpy(data).to("mlu")
|
||||||
|
torch_model = torch_model.to("mlu")
|
||||||
|
if dtype == "fp16":
|
||||||
|
torch_model = torch_model.half()
|
||||||
|
|
||||||
|
n_iter = 20
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(10):
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
torch.mlu.synchronize()
|
||||||
|
begin = time.time()
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(n_iter):
|
||||||
|
torch.mlu.synchronize()
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
torch.mlu.synchronize()
|
||||||
|
torch.mlu.synchronize()
|
||||||
|
end = time.time()
|
||||||
|
|
||||||
|
avg_time = (end - begin) / n_iter
|
||||||
|
outputs = outputs.last_hidden_state.to("cpu")
|
||||||
|
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
# torch.mlu.memory.empty_cache()
|
||||||
|
np.save("./data/output", np.array(outputs))
|
||||||
|
print("Save input & output into ./data.")
|
||||||
|
|
||||||
|
|
||||||
|
def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
|
||||||
|
data = data.to("mlu")
|
||||||
|
model = model.to("mlu")
|
||||||
|
if dtype == "fp16":
|
||||||
|
model = model.half()
|
||||||
|
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||||
|
if modelname != "llama":
|
||||||
|
# use onnxsim to simplify
|
||||||
|
onnx_model = onnx.load(path)
|
||||||
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
|
||||||
|
# onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||||
|
assert check
|
||||||
|
add_value_info_for_constants(onnx_model)
|
||||||
|
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||||
|
if extern:
|
||||||
|
extern_path = path.replace('.onnx', '.pb')
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
extern_path = extern_path.split("/")[-1]
|
||||||
|
convert_model_to_external_data(
|
||||||
|
onnx_model,
|
||||||
|
all_tensors_to_one_file=True,
|
||||||
|
location=extern_path,
|
||||||
|
size_threshold=1024,
|
||||||
|
convert_attribute=False,
|
||||||
|
)
|
||||||
|
onnx.save(onnx_model, path)
|
||||||
|
else:
|
||||||
|
# use third party tool to simplify llama
|
||||||
|
# reference: https://github.com/luchangli03/onnxsim_large_model/
|
||||||
|
sys.path.append("onnxsim_large_model")
|
||||||
|
from onnx_utils import set_onnx_input_shape
|
||||||
|
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
|
||||||
|
|
||||||
|
in_model_path = path
|
||||||
|
out_model_path = path
|
||||||
|
if not out_model_path:
|
||||||
|
out_model_path = in_model_path[:-5] + ".sim.onnx"
|
||||||
|
if os.path.isdir(out_model_path):
|
||||||
|
out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
|
||||||
|
|
||||||
|
onnx_model = onnx.load(in_model_path)
|
||||||
|
print(f"load model from {in_model_path} success")
|
||||||
|
|
||||||
|
size_th_bytes = 1024 * 1024
|
||||||
|
|
||||||
|
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
|
||||||
|
print(f"compress model success")
|
||||||
|
|
||||||
|
onnx_model = set_onnx_input_shape(onnx_model, "")
|
||||||
|
|
||||||
|
tensor_size_threshold = f"1024KB"
|
||||||
|
skipped_optimizers = []
|
||||||
|
skipped_optimizers.append("eliminate_duplicate_initializer")
|
||||||
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
|
||||||
|
tensor_size_threshold=tensor_size_threshold)
|
||||||
|
if not check:
|
||||||
|
raise ValueError(f"simplify compressed model {in_model_path} failed")
|
||||||
|
|
||||||
|
print(f"simplify model success")
|
||||||
|
|
||||||
|
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
|
||||||
|
print(f"uncompress model success")
|
||||||
|
|
||||||
|
add_value_info_for_constants(onnx_model)
|
||||||
|
|
||||||
|
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
|
||||||
|
|
||||||
|
|
||||||
|
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||||
|
"""
|
||||||
|
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||||
|
that info explicitly as ValueInfoProtos.
|
||||||
|
Mutates the model.
|
||||||
|
Args:
|
||||||
|
model: The ModelProto to update.
|
||||||
|
"""
|
||||||
|
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||||
|
if model.ir_version < 4:
|
||||||
|
return
|
||||||
|
|
||||||
|
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||||
|
inputs = {i.name for i in graph.input}
|
||||||
|
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||||
|
for init in graph.initializer:
|
||||||
|
# Check it really is a constant, not an input
|
||||||
|
if init.name in inputs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# The details we want to add
|
||||||
|
elem_type = init.data_type
|
||||||
|
shape = init.dims
|
||||||
|
|
||||||
|
# Get existing or create new value info for this constant
|
||||||
|
vi = existing_info.get(init.name)
|
||||||
|
if vi is None:
|
||||||
|
vi = graph.value_info.add()
|
||||||
|
vi.name = init.name
|
||||||
|
|
||||||
|
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||||
|
tt = vi.type.tensor_type
|
||||||
|
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||||
|
tt.elem_type = elem_type
|
||||||
|
if not tt.HasField("shape"):
|
||||||
|
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||||
|
tt.shape.dim.extend([])
|
||||||
|
for dim in shape:
|
||||||
|
tt.shape.dim.add().dim_value = dim
|
||||||
|
|
||||||
|
# Handle subgraphs
|
||||||
|
for node in graph.node:
|
||||||
|
for attr in node.attribute:
|
||||||
|
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||||
|
if attr.ref_attr_name != "":
|
||||||
|
continue
|
||||||
|
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPH:
|
||||||
|
add_const_value_infos_to_graph(attr.g)
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||||
|
for g in attr.graphs:
|
||||||
|
add_const_value_infos_to_graph(g)
|
||||||
|
|
||||||
|
|
||||||
|
return add_const_value_infos_to_graph(model.graph)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
torch.backends.mlu.matmul.allow_tf32 = False
|
||||||
|
torch.backends.cnnl.allow_tf32 = False
|
||||||
|
modelname, batchsize, seqlen, export_path, dtype = parse_args()
|
||||||
|
if dtype == "tf32":
|
||||||
|
torch.backends.mlu.matmul.allow_tf32 = True
|
||||||
|
else:
|
||||||
|
os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
|
||||||
|
|
||||||
|
model, voc_size = get_model(modelname)
|
||||||
|
if export_path is not None:
|
||||||
|
filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
|
||||||
|
path = os.path.join(export_path, filename)
|
||||||
|
if not os.path.exists(path):
|
||||||
|
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||||
|
export_onnx(modelname, model, param, path, True, dtype)
|
||||||
|
else:
|
||||||
|
print("Onnx path exists, skipping export.")
|
||||||
|
|
||||||
|
run_pytorch(model, voc_size, batchsize, seqlen, dtype)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,161 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import multiprocessing as mp
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnx.shape_inference import infer_shapes_path
|
||||||
|
import numpy as np
|
||||||
|
from parallel_opt import parallel_model
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||||
|
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nproc_per_node", type=int, default=1, help="number of processes per node"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--name", type=str, default="test", help="name of this instance."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, required=True, help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--gen_std",
|
||||||
|
action="store_true",
|
||||||
|
help="whether to generate the standard results.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.num_nodes,
|
||||||
|
args.nproc_per_node,
|
||||||
|
args.name,
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.gen_std,
|
||||||
|
args.type,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model(model, runtime, inputs, n=10, data_type = "default"):
|
||||||
|
stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
|
||||||
|
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
# stub.tune()
|
||||||
|
stub.run()
|
||||||
|
# get outputs
|
||||||
|
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
|
||||||
|
# bench
|
||||||
|
for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
begin = time.time()
|
||||||
|
for _ in range(n):
|
||||||
|
stub.run()
|
||||||
|
end = time.time()
|
||||||
|
avg_time = (end - begin) / n
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_compare(name, model, runtime, data_type):
|
||||||
|
input_ids = np.load(f"{name}_inputs.npy")
|
||||||
|
position_ids = np.arange(input_ids.shape[-1])
|
||||||
|
results = np.load(f"{name}_results.npy")
|
||||||
|
outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
|
||||||
|
print("outputs abs mean:", abs(outputs).mean())
|
||||||
|
print("max abs diff:", abs(outputs - results).max())
|
||||||
|
|
||||||
|
|
||||||
|
def start_worker(
|
||||||
|
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
|
||||||
|
):
|
||||||
|
dist_name = name + "_dist"
|
||||||
|
model = parallel_model(model, world_size, rank)
|
||||||
|
extern_path = f"./{dist_name}_rank{rank}.pb"
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
onnx.save_model(
|
||||||
|
model,
|
||||||
|
f"./{dist_name}_rank{rank}.onnx",
|
||||||
|
save_as_external_data=True,
|
||||||
|
location=extern_path,
|
||||||
|
)
|
||||||
|
#infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||||
|
runtime = backend.CudaRuntime(local_rank)
|
||||||
|
# print("init comm")
|
||||||
|
runtime.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
run_and_compare(name, model, runtime, data_type)
|
||||||
|
|
||||||
|
|
||||||
|
def start_single(name, model, data_type):
|
||||||
|
runtime = backend.CudaRuntime(0)
|
||||||
|
run_and_compare(name, model, runtime, data_type)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_standard(name, model, voc_size, bs, len):
|
||||||
|
# generate standard results
|
||||||
|
input_ids = np.random.randint(0, voc_size, (bs, len))
|
||||||
|
position_ids = np.arange(len)
|
||||||
|
np.save(f"{name}_inputs", input_ids)
|
||||||
|
runtime = backend.CudaRuntime(0)
|
||||||
|
outputs = run_model(model, runtime, (input_ids, position_ids), 1)
|
||||||
|
print("outputs abs mean:", abs(outputs).mean())
|
||||||
|
np.save(f"{name}_results", outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
|
||||||
|
data_type = "default" if data_type == "fp32" else data_type
|
||||||
|
if data_type != "tf32":
|
||||||
|
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||||
|
model = onnx.load(model_path)
|
||||||
|
|
||||||
|
# generate standart output
|
||||||
|
if gen_std:
|
||||||
|
print(f"generate standard data for {name}.")
|
||||||
|
# a small vocabulary size to fit all LLM.
|
||||||
|
voc_size = 1000
|
||||||
|
gen_standard(name, model, voc_size, bs, length)
|
||||||
|
return
|
||||||
|
|
||||||
|
# run single process.
|
||||||
|
# use standalone process to isolate cuda.
|
||||||
|
print("run model by single GPU.")
|
||||||
|
p = mp.Process(target=start_single, args=(name, model, data_type))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
|
||||||
|
# run distributed parallel.
|
||||||
|
world_size = nnodes * nproc_per_node
|
||||||
|
print(f"run model by {world_size} GPU in parallel.")
|
||||||
|
workers = [
|
||||||
|
mp.Process(
|
||||||
|
target=start_worker,
|
||||||
|
args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
|
||||||
|
)
|
||||||
|
for rank in range(world_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.start()
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.join()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,245 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import multiprocessing as mp
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
import numpy as np
|
||||||
|
from parallel_opt import parallel_model
|
||||||
|
|
||||||
|
|
||||||
|
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||||
|
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nproc_per_node", type=int, default=1, help="number of processes per node"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--name", type=str, default="test", help="name of this instance."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model1", type=str, required=True, help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model2", type=str, required=True, help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--gen_std",
|
||||||
|
action="store_true",
|
||||||
|
help="whether to generate the standard results.",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.num_nodes,
|
||||||
|
args.nproc_per_node,
|
||||||
|
args.name,
|
||||||
|
args.model1,
|
||||||
|
args.model2,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.gen_std,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model(model1, model2, runtime1, runtime2, inputs1: np.array, inputs2: np.array, n=20):
|
||||||
|
####################################
|
||||||
|
# run the first graph without kvcache
|
||||||
|
####################################
|
||||||
|
stub1 = OnnxStub(model1, runtime1)
|
||||||
|
stub1.inputs['onnx::Reshape_0'].copyin_int32(inputs1.reshape(-1).tolist())
|
||||||
|
stub1.tune()
|
||||||
|
stub1.run()
|
||||||
|
kvcache_it1 = []
|
||||||
|
count = 0
|
||||||
|
for output in stub1.outputs.items().__iter__():
|
||||||
|
if count == 0:
|
||||||
|
logits_it1 = np.array(output[1].copyout_float(), dtype=np.float32)
|
||||||
|
else:
|
||||||
|
kvcache_it1.append(np.array(output[1].copyout_float(), dtype=np.float32))
|
||||||
|
count = count + 1
|
||||||
|
|
||||||
|
# bench for stub1
|
||||||
|
next(stub1.inputs.items().__iter__())[1].copyin_int32(inputs1.reshape(-1).tolist())
|
||||||
|
begin = time.time()
|
||||||
|
for _ in range(n):
|
||||||
|
stub1.run()
|
||||||
|
end = time.time()
|
||||||
|
avg_time = (end - begin) / n
|
||||||
|
print(f"stub1 average time: {avg_time}")
|
||||||
|
|
||||||
|
####################################
|
||||||
|
# run the second graph with kvcache
|
||||||
|
####################################
|
||||||
|
i = 0
|
||||||
|
batchsize = 1
|
||||||
|
stub2 = OnnxStub(model2, runtime2)
|
||||||
|
past_kvcache_length = (i+2)*np.ones((batchsize, 1), dtype=np.int32)
|
||||||
|
# copyin input
|
||||||
|
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
|
||||||
|
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
|
||||||
|
count = -1
|
||||||
|
for input in stub2.inputs.items().__iter__():
|
||||||
|
if count in range(24):
|
||||||
|
# print(count, input[0])
|
||||||
|
# print(np.dtype(kvcache_it1[count][0]), kvcache_it1[count].shape)
|
||||||
|
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
|
||||||
|
count = count + 1
|
||||||
|
stub2.tune()
|
||||||
|
stub2.run()
|
||||||
|
|
||||||
|
# copyout output
|
||||||
|
count = 0
|
||||||
|
kvcache_it2 = []
|
||||||
|
for output in stub2.outputs.items().__iter__():
|
||||||
|
if count == 0:
|
||||||
|
logits_it2 = np.array(output[1].copyout_float(), dtype=np.float32)
|
||||||
|
else:
|
||||||
|
kvcache_it2.append(np.array(output[1].copyout_float(), dtype=np.float32))
|
||||||
|
count = count + 1
|
||||||
|
|
||||||
|
# bench for stub2
|
||||||
|
# copyin input
|
||||||
|
stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
|
||||||
|
stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
|
||||||
|
count = -1
|
||||||
|
for input in stub2.inputs.items().__iter__():
|
||||||
|
if count in range(24):
|
||||||
|
input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
|
||||||
|
count = count + 1
|
||||||
|
begin = time.time()
|
||||||
|
for _ in range(n):
|
||||||
|
stub2.run()
|
||||||
|
end = time.time()
|
||||||
|
avg_time = (end - begin) / n
|
||||||
|
print(f"stub2 average time: {avg_time}")
|
||||||
|
return logits_it2
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_compare(name, model1, model2, runtime1, runtime2):
|
||||||
|
data1 = np.load(f"{name}_inputs1.npy")
|
||||||
|
data2 = np.load(f"{name}_inputs2.npy")
|
||||||
|
results = np.load(f"{name}_results.npy")
|
||||||
|
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2)
|
||||||
|
print("outputs sum:", outputs.sum())
|
||||||
|
print("max abs diff:", abs(outputs - results).max())
|
||||||
|
print("max rel diff:", abs((outputs - results) / results).max())
|
||||||
|
# assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)
|
||||||
|
|
||||||
|
|
||||||
|
def start_worker(
|
||||||
|
name: str, world_size: int, rank: int, local_rank: int, model1: onnx.ModelProto, model2: onnx.ModelProto
|
||||||
|
):
|
||||||
|
dist_name = name + "_dist"
|
||||||
|
####################################
|
||||||
|
# shard the first graph
|
||||||
|
####################################
|
||||||
|
model1 = parallel_model(model1, world_size, rank)
|
||||||
|
extern_path = f"./{dist_name}_stub1_rank{rank}.pb"
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
convert_model_to_external_data(
|
||||||
|
model1,
|
||||||
|
all_tensors_to_one_file=True,
|
||||||
|
location=extern_path,
|
||||||
|
size_threshold=1024,
|
||||||
|
convert_attribute=False,
|
||||||
|
)
|
||||||
|
onnx.save(model1, f"./{dist_name}_stub1_rank{rank}.onnx")
|
||||||
|
runtime1 = backend.CudaRuntime(local_rank)
|
||||||
|
runtime1.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
|
||||||
|
####################################
|
||||||
|
# shard the second graph
|
||||||
|
####################################
|
||||||
|
model2 = parallel_model(model2, world_size, rank)
|
||||||
|
extern_path = f"./{dist_name}_stub2_rank{rank}.pb"
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
convert_model_to_external_data(
|
||||||
|
model2,
|
||||||
|
all_tensors_to_one_file=True,
|
||||||
|
location=extern_path,
|
||||||
|
size_threshold=1024,
|
||||||
|
convert_attribute=False,
|
||||||
|
)
|
||||||
|
onnx.save(model2, f"./{dist_name}_stub2_rank{rank}.onnx")
|
||||||
|
runtime2 = backend.CudaRuntime(local_rank)
|
||||||
|
# print("init comm")
|
||||||
|
runtime2.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
|
||||||
|
# run the two graphs
|
||||||
|
run_and_compare(name, model1, model2, runtime1, runtime2)
|
||||||
|
|
||||||
|
|
||||||
|
def start_single(name, model1, model2):
|
||||||
|
runtime1 = backend.CudaRuntime(0)
|
||||||
|
runtime2 = backend.CudaRuntime(0)
|
||||||
|
run_and_compare(name, model1, model2, runtime1, runtime2)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_standard(name, model1, model2, voc_size, bs, len):
|
||||||
|
# generate standard results
|
||||||
|
data1 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
|
||||||
|
data2 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
|
||||||
|
np.save(f"{name}_inputs1", data1)
|
||||||
|
np.save(f"{name}_inputs2", data2)
|
||||||
|
runtime1 = backend.CudaRuntime(0)
|
||||||
|
runtime2 = backend.CudaRuntime(0)
|
||||||
|
outputs = run_model(model1, model2, runtime1, runtime2, data1, data2, 1)
|
||||||
|
np.save(f"{name}_results", outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nnodes, nproc_per_node, name, model1_path, model2_path, bs, length, gen_std = parse_args()
|
||||||
|
|
||||||
|
model1 = onnx.load(model1_path)
|
||||||
|
model2 = onnx.load(model2_path)
|
||||||
|
|
||||||
|
# generate standart output
|
||||||
|
if gen_std:
|
||||||
|
print(f"generate standard data for {name}.")
|
||||||
|
# a small vocabulary size to fit all LLM.
|
||||||
|
voc_size = 1000
|
||||||
|
gen_standard(name, model1, model2, voc_size, bs, length)
|
||||||
|
return
|
||||||
|
|
||||||
|
# run single process.
|
||||||
|
# use standalone process to isolate cuda.
|
||||||
|
p = mp.Process(target=start_single, args=(name, model1, model2))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
|
||||||
|
# run distributed parallel.
|
||||||
|
world_size = nnodes * nproc_per_node
|
||||||
|
workers = [
|
||||||
|
mp.Process(
|
||||||
|
target=start_worker,
|
||||||
|
args=(name, world_size, rank, rank % nproc_per_node, model1, model2),
|
||||||
|
)
|
||||||
|
for rank in range(world_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.start()
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.join()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,188 @@
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
from transformers import BertModel, BertConfig
|
||||||
|
from transformers import GPT2Model, GPT2Config
|
||||||
|
from transformers import OPTModel, OPTConfig
|
||||||
|
import time
|
||||||
|
import numpy as np
|
||||||
|
import onnx
|
||||||
|
import os
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnxsim import simplify
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--export_onnx",
|
||||||
|
type=str,
|
||||||
|
nargs="?",
|
||||||
|
default=None,
|
||||||
|
const="./",
|
||||||
|
help="whether and where to export onnx file",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.export_onnx,
|
||||||
|
args.type,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_model(modelname):
|
||||||
|
match modelname:
|
||||||
|
case "bert":
|
||||||
|
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||||
|
voc_size = BertConfig().vocab_size
|
||||||
|
case "gpt2":
|
||||||
|
model = GPT2Model.from_pretrained("gpt2")
|
||||||
|
voc_size = GPT2Config().vocab_size
|
||||||
|
case "opt":
|
||||||
|
model = model = OPTModel.from_pretrained("./opt-125m")
|
||||||
|
voc_size = OPTConfig().vocab_size
|
||||||
|
case _:
|
||||||
|
raise KeyError(modelname)
|
||||||
|
|
||||||
|
model = model.eval()
|
||||||
|
return model, voc_size
|
||||||
|
|
||||||
|
def run_pytorch(torch_model, voc_size, batchsize, len):
|
||||||
|
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||||
|
np.save("test_inputs", data)
|
||||||
|
inputs = torch.from_numpy(data).to("cuda")
|
||||||
|
torch_model = torch_model.to("cuda")
|
||||||
|
|
||||||
|
n_iter = 20
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(10):
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
begin = time.time()
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(n_iter):
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
#
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
end = time.time()
|
||||||
|
|
||||||
|
avg_time = (end - begin) / n_iter
|
||||||
|
outputs = outputs.last_hidden_state.to("cpu")
|
||||||
|
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
torch.cuda.memory.empty_cache()
|
||||||
|
np.save("test_results", np.array(outputs, dtype=np.float32))
|
||||||
|
print("Save input & output as test_inputs.npy and test_results.npy")
|
||||||
|
|
||||||
|
|
||||||
|
def export_onnx(model, data, path, extern=False):
|
||||||
|
torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||||
|
onnx_model = onnx.load(path)
|
||||||
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
|
||||||
|
#onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||||
|
assert check
|
||||||
|
add_value_info_for_constants(onnx_model)
|
||||||
|
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||||
|
if extern:
|
||||||
|
extern_path = path.replace('.onnx', '.pb')
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
convert_model_to_external_data(
|
||||||
|
onnx_model,
|
||||||
|
all_tensors_to_one_file=True,
|
||||||
|
location=extern_path,
|
||||||
|
size_threshold=1024,
|
||||||
|
convert_attribute=False,
|
||||||
|
)
|
||||||
|
onnx.save(onnx_model, path)
|
||||||
|
|
||||||
|
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||||
|
"""
|
||||||
|
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||||
|
that info explicitly as ValueInfoProtos.
|
||||||
|
Mutates the model.
|
||||||
|
Args:
|
||||||
|
model: The ModelProto to update.
|
||||||
|
"""
|
||||||
|
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||||
|
if model.ir_version < 4:
|
||||||
|
return
|
||||||
|
|
||||||
|
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||||
|
inputs = {i.name for i in graph.input}
|
||||||
|
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||||
|
for init in graph.initializer:
|
||||||
|
# Check it really is a constant, not an input
|
||||||
|
if init.name in inputs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# The details we want to add
|
||||||
|
elem_type = init.data_type
|
||||||
|
shape = init.dims
|
||||||
|
|
||||||
|
# Get existing or create new value info for this constant
|
||||||
|
vi = existing_info.get(init.name)
|
||||||
|
if vi is None:
|
||||||
|
vi = graph.value_info.add()
|
||||||
|
vi.name = init.name
|
||||||
|
|
||||||
|
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||||
|
tt = vi.type.tensor_type
|
||||||
|
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||||
|
tt.elem_type = elem_type
|
||||||
|
if not tt.HasField("shape"):
|
||||||
|
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||||
|
tt.shape.dim.extend([])
|
||||||
|
for dim in shape:
|
||||||
|
tt.shape.dim.add().dim_value = dim
|
||||||
|
|
||||||
|
# Handle subgraphs
|
||||||
|
for node in graph.node:
|
||||||
|
for attr in node.attribute:
|
||||||
|
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||||
|
if attr.ref_attr_name != "":
|
||||||
|
continue
|
||||||
|
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPH:
|
||||||
|
add_const_value_infos_to_graph(attr.g)
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||||
|
for g in attr.graphs:
|
||||||
|
add_const_value_infos_to_graph(g)
|
||||||
|
|
||||||
|
|
||||||
|
return add_const_value_infos_to_graph(model.graph)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
torch.backends.cuda.matmul.allow_tf32 = False
|
||||||
|
torch.backends.cudnn.allow_tf32 = False
|
||||||
|
modelname, batchsize, seqlen, export_path, data_type = parse_args()
|
||||||
|
if data_type == "tf32":
|
||||||
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
else:
|
||||||
|
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
|
||||||
|
|
||||||
|
model, voc_size = get_model(modelname)
|
||||||
|
if export_path is not None:
|
||||||
|
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
||||||
|
path = os.path.join(export_path, filename)
|
||||||
|
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||||
|
export_onnx(model, param, path, True)
|
||||||
|
|
||||||
|
if data_type == "fp16":
|
||||||
|
model = model.half()
|
||||||
|
run_pytorch(model, voc_size, batchsize, seqlen)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,14 @@
|
||||||
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
|
models=("bert" "gpt2" "llama")
|
||||||
|
batch_size=(1 32)
|
||||||
|
seq_len=(100 500)
|
||||||
|
nproc=(1 2 4)
|
||||||
|
|
||||||
|
for model in "${models[@]}"; do
|
||||||
|
for bs in "${batch_size[@]}"; do
|
||||||
|
for len in "${seq_len[@]}"; do
|
||||||
|
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
|
@ -0,0 +1,280 @@
|
||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import multiprocessing as mp
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnx.shape_inference import infer_shapes_path
|
||||||
|
import numpy as np
|
||||||
|
from parallel_opt import parallel_model
|
||||||
|
from functools import wraps
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||||
|
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nproc_per_node", type=int, default=2, help="number of processes per node"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, default="", help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--gen_std",
|
||||||
|
default=False,
|
||||||
|
action="store_true",
|
||||||
|
help="whether to generate the standard results.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--run_single",
|
||||||
|
default=False,
|
||||||
|
action="store_true",
|
||||||
|
help="whether run model with single process with standard inputs"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input_dir",
|
||||||
|
default="./",
|
||||||
|
help="path to save model input data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--result_dir",
|
||||||
|
default="./",
|
||||||
|
help="path to save model standard output"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--internal_model_dir",
|
||||||
|
default="./",
|
||||||
|
help="path to save internal onnx model for parallel run"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# check path, mkdir if not exist
|
||||||
|
check_exists(args.input_dir)
|
||||||
|
check_exists(args.result_dir)
|
||||||
|
check_exists(args.internal_model_dir)
|
||||||
|
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.num_nodes,
|
||||||
|
args.nproc_per_node,
|
||||||
|
args.name,
|
||||||
|
args.model,
|
||||||
|
args.gen_std,
|
||||||
|
args.run_single,
|
||||||
|
args.input_dir,
|
||||||
|
args.result_dir,
|
||||||
|
args.internal_model_dir
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
utils function for this scripts
|
||||||
|
"""
|
||||||
|
def check_exists(path: str):
|
||||||
|
if not os.path.exists(path):
|
||||||
|
os.makedirs(path)
|
||||||
|
|
||||||
|
def np_assert(base, test, rtol=1e-2, atol=1e-1):
|
||||||
|
# np.testing.assert_allclose(test, base, rtol, atol)
|
||||||
|
print("max abs diff:", abs(base - test).max())
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Perf wrapper, run function n times
|
||||||
|
then average
|
||||||
|
"""
|
||||||
|
def perf_it(n):
|
||||||
|
def decorator(func):
|
||||||
|
@wraps(func)
|
||||||
|
def wrapper(*args, **kwargs):
|
||||||
|
# warmup
|
||||||
|
for _ in range(n):
|
||||||
|
func(*args, **kwargs)
|
||||||
|
|
||||||
|
t_total = 0
|
||||||
|
for _ in range(n):
|
||||||
|
t0 = time.time()
|
||||||
|
func(*args, **kwargs)
|
||||||
|
t1 = time.time()
|
||||||
|
t_total += t1 - t0
|
||||||
|
avg_time = (t_total) / n
|
||||||
|
print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
|
||||||
|
return avg_time
|
||||||
|
return wrapper
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Run InfiniTensor model with Standard input
|
||||||
|
check=True: check with standard output gen by pytorch
|
||||||
|
perf=True: run n times to get avg time
|
||||||
|
"""
|
||||||
|
def run_model(task_name,
|
||||||
|
model,
|
||||||
|
runtime,
|
||||||
|
world_size=1,
|
||||||
|
rank=0,
|
||||||
|
n=10,
|
||||||
|
check=True,
|
||||||
|
perf=True):
|
||||||
|
|
||||||
|
stub = OnnxStub(model, runtime,
|
||||||
|
use_naive_allocator=True \
|
||||||
|
if task_name == "llama" else False)
|
||||||
|
|
||||||
|
# load in Onnx model inputs
|
||||||
|
def load_inputs(stub: OnnxStub):
|
||||||
|
# check exists
|
||||||
|
inputs = []
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input_path = os.path.join(input_dir, \
|
||||||
|
f"{task_name}_input_{i}.npy")
|
||||||
|
print(input_path)
|
||||||
|
if os.path.exists(input_path):
|
||||||
|
input = np.load(input_path)
|
||||||
|
else :
|
||||||
|
raise KeyError(f"{i} th input of model not exists")
|
||||||
|
# check shape
|
||||||
|
if all(x == y for x,y in zip(input.shape, tensor.shape())):
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
else:
|
||||||
|
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||||
|
|
||||||
|
load_inputs(stub)
|
||||||
|
# stub.tune()
|
||||||
|
stub.run()
|
||||||
|
time.sleep(0.01)
|
||||||
|
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
|
||||||
|
# check output results with standard output
|
||||||
|
if check:
|
||||||
|
st_output_path = os.path.join(result_dir, \
|
||||||
|
f"{task_name}_output.npy")
|
||||||
|
assert os.path.exists(st_output_path) , \
|
||||||
|
"standard output not exists"
|
||||||
|
st_output = np.load(st_output_path)
|
||||||
|
if np.isnan(output).any():
|
||||||
|
print("Nan in output")
|
||||||
|
exit()
|
||||||
|
np_assert(st_output, output)
|
||||||
|
|
||||||
|
# perf
|
||||||
|
if perf:
|
||||||
|
@perf_it(n)
|
||||||
|
def perf_infinitensor(stub: OnnxStub):
|
||||||
|
stub.run()
|
||||||
|
perf_infinitensor(stub)
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Start a worker in Parallel
|
||||||
|
"""
|
||||||
|
def start_worker(name: str,
|
||||||
|
world_size: int,
|
||||||
|
rank: int,
|
||||||
|
local_rank: int,
|
||||||
|
model: onnx.ModelProto):
|
||||||
|
|
||||||
|
dist_name = name + "_dist"
|
||||||
|
# partial a onnx model to world_size part
|
||||||
|
model = parallel_model(model, world_size, rank)
|
||||||
|
onnx.save(model, os.path.join(internal_model_dir, \
|
||||||
|
f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
|
||||||
|
runtime = backend.KUNLUNRuntime(local_rank)
|
||||||
|
# print("init comm")
|
||||||
|
runtime.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
run_model(name, model, runtime, world_size, rank)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
generate standard input/output with
|
||||||
|
sigle card run
|
||||||
|
"""
|
||||||
|
def gen_standard(task_name: str, model: onnx.ModelProto):
|
||||||
|
runtime = backend.KUNLUNRuntime(0)
|
||||||
|
stub = OnnxStub(model, runtime)
|
||||||
|
position_id = 0
|
||||||
|
# generate random input for model
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input = tensor.copyout_numpy()
|
||||||
|
if np.issubdtype(input.dtype, np.integer):
|
||||||
|
if input.size == 1:
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
else:
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
elif input.dtype == np.bool_:
|
||||||
|
input = np.random.randint(0,2,size=input.shape) > 0
|
||||||
|
else:
|
||||||
|
if i == 0:
|
||||||
|
input = np.ones(input.shape).astype(input.dtype)
|
||||||
|
position_id = input.shape[-1] - 1
|
||||||
|
else:
|
||||||
|
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
np.save(os.path.join(input_dir, \
|
||||||
|
f"{task_name}_input_{i}.npy"), input)
|
||||||
|
stub.run()
|
||||||
|
# print(stub.outputs)
|
||||||
|
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
if np.isnan(output).any():
|
||||||
|
print("Nan in output")
|
||||||
|
exit()
|
||||||
|
np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
|
||||||
|
global input_dir, result_dir, internal_model_dir
|
||||||
|
|
||||||
|
nnodes, nproc_per_node, task_name, \
|
||||||
|
model_path, gen_std, run_single, \
|
||||||
|
input_dir, result_dir, internal_model_dir = parse_args()
|
||||||
|
|
||||||
|
# load input onnx model
|
||||||
|
model = onnx.load(model_path)
|
||||||
|
|
||||||
|
# generate standart output
|
||||||
|
if gen_std:
|
||||||
|
print("Generate inputs and outputs.")
|
||||||
|
gen_standard(task_name, model)
|
||||||
|
return
|
||||||
|
|
||||||
|
if run_single:
|
||||||
|
print("Run model by one GPU card.")
|
||||||
|
runtime = backend.KUNLUNRuntime(0)
|
||||||
|
run_model(task_name, model, runtime)
|
||||||
|
return
|
||||||
|
|
||||||
|
# run distributed parallel.
|
||||||
|
world_size = nnodes * nproc_per_node
|
||||||
|
print(f"Run model by {world_size} GPU in parallel.")
|
||||||
|
workers = [
|
||||||
|
mp.Process(
|
||||||
|
target=start_worker,
|
||||||
|
args=(task_name, world_size, rank, rank % nproc_per_node, model),
|
||||||
|
)
|
||||||
|
for rank in range(world_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.start()
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.join()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,36 @@
|
||||||
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
|
# models=("bert" "gpt2" "llama")
|
||||||
|
models=("bert" "gpt2")
|
||||||
|
batch_size=(1 32)
|
||||||
|
seq_len=(100 500)
|
||||||
|
nproc=(1 2 4)
|
||||||
|
|
||||||
|
results_dir="results"
|
||||||
|
|
||||||
|
if [ -d "$results_dir" ]; then
|
||||||
|
echo "directory ./$results_dir exists"
|
||||||
|
else
|
||||||
|
mkdir -p "$results_dir"
|
||||||
|
echo "mkdir $results_dir, logs saved there"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
for model in "${models[@]}"; do
|
||||||
|
for bs in "${batch_size[@]}"; do
|
||||||
|
for len in "${seq_len[@]}"; do
|
||||||
|
# run pytorch model
|
||||||
|
echo "Run pytorch $model with batch_size=$bs length=$len ."
|
||||||
|
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
|
||||||
|
for n in "${nproc[@]}"; do
|
||||||
|
# run infinitensor
|
||||||
|
echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
|
||||||
|
python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
|
||||||
|
# delete internal files
|
||||||
|
find ./ -type f -name "*.onnx" -delete
|
||||||
|
find ./ -type f -name "*.pb" -delete
|
||||||
|
done
|
||||||
|
find ./ -type f -name "*.npy" -delete
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
|
@ -0,0 +1,35 @@
|
||||||
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
|
# models=("bert" "gpt2" "llama")
|
||||||
|
models=("llama")
|
||||||
|
batch_size=(1 )
|
||||||
|
seq_len=(100 500)
|
||||||
|
nproc=(1 2 4)
|
||||||
|
|
||||||
|
results_dir="results"
|
||||||
|
|
||||||
|
if [ -d "$results_dir" ]; then
|
||||||
|
echo "directory ./$results_dir exists"
|
||||||
|
else
|
||||||
|
mkdir -p "$results_dir"
|
||||||
|
echo "mkdir $results_dir, logs saved there"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
for model in "${models[@]}"; do
|
||||||
|
for bs in "${batch_size[@]}"; do
|
||||||
|
for len in "${seq_len[@]}"; do
|
||||||
|
echo "Run pytorch llama with batch_size="$bs" and length="$len""
|
||||||
|
python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
|
||||||
|
for n in "${nproc[@]}"; do
|
||||||
|
# run pytorch model
|
||||||
|
echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
|
||||||
|
python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
|
||||||
|
# delete internal files
|
||||||
|
find ./ -type f -name "*.onnx" -delete
|
||||||
|
find ./ -type f -name "*0c" -delete
|
||||||
|
done
|
||||||
|
find ./ -type f -name "*.npy" -delete
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
|
@ -0,0 +1,245 @@
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
from transformers import BertModel, BertConfig
|
||||||
|
from transformers import GPT2Model, GPT2Config
|
||||||
|
from transformers import OPTModel, OPTConfig
|
||||||
|
from transformers import LlamaModel, LlamaConfig
|
||||||
|
import time
|
||||||
|
import numpy as np
|
||||||
|
import onnx
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from onnx.external_data_helper import convert_model_to_external_data
|
||||||
|
from onnxsim import simplify
|
||||||
|
|
||||||
|
torch.backends.cuda.matmul.allow_tf32 = False
|
||||||
|
torch.backends.cudnn.allow_tf32 = False
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--export_onnx",
|
||||||
|
type=str,
|
||||||
|
nargs="?",
|
||||||
|
default=None,
|
||||||
|
const="./",
|
||||||
|
help="whether and where to export onnx file",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input_dir",
|
||||||
|
type=str,
|
||||||
|
default="./",
|
||||||
|
help="path to save pytorch model input data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--result_dir",
|
||||||
|
type=str,
|
||||||
|
default="./",
|
||||||
|
help="path to save pytorch model output data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--export_only",
|
||||||
|
action="store_true"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.export_onnx,
|
||||||
|
args.input_dir,
|
||||||
|
args.result_dir,
|
||||||
|
args.export_only
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_model(modelname):
|
||||||
|
if modelname == "bert":
|
||||||
|
model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
|
||||||
|
voc_size = BertConfig().vocab_size
|
||||||
|
elif modelname == "gpt2":
|
||||||
|
model = GPT2Model.from_pretrained("gpt2")
|
||||||
|
voc_size = GPT2Config().vocab_size
|
||||||
|
elif modelname == "opt":
|
||||||
|
model = OPTModel.from_pretrained("./opt-125m")
|
||||||
|
voc_size = OPTConfig().vocab_size
|
||||||
|
elif modelname == "llama":
|
||||||
|
model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||||
|
voc_size = LlamaConfig().vocab_size
|
||||||
|
else :
|
||||||
|
raise KeyError(modelname)
|
||||||
|
|
||||||
|
model = model.eval()
|
||||||
|
return model, voc_size
|
||||||
|
|
||||||
|
def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
|
||||||
|
data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
|
||||||
|
np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
|
||||||
|
inputs = torch.from_numpy(data).to("cuda")
|
||||||
|
torch_model = torch_model.to("cuda")
|
||||||
|
|
||||||
|
n_iter = 10
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(10):
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
begin = time.time()
|
||||||
|
with torch.no_grad():
|
||||||
|
for _ in range(n_iter):
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
outputs = torch_model(inputs)
|
||||||
|
#
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
end = time.time()
|
||||||
|
|
||||||
|
avg_time = (end - begin) / n_iter
|
||||||
|
outputs = outputs.last_hidden_state.to("cpu")
|
||||||
|
print("outputs abs mean:", abs(np.array(outputs)).mean())
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
torch.cuda.memory.empty_cache()
|
||||||
|
np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
|
||||||
|
np.array(outputs))
|
||||||
|
print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
|
||||||
|
|
||||||
|
|
||||||
|
def export_onnx(model_name, model, data, path, extern=False):
|
||||||
|
# torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
|
||||||
|
|
||||||
|
if model_name != "llama":
|
||||||
|
onnx_model = onnx.load(path)
|
||||||
|
onnx_model, check = simplify(onnx_model,
|
||||||
|
skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
|
||||||
|
# skipped_optimizers=['fuse_qkv'])
|
||||||
|
assert check
|
||||||
|
add_value_info_for_constants(onnx_model)
|
||||||
|
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
|
||||||
|
if extern:
|
||||||
|
extern_path = path.replace('.onnx', '.pb')
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
convert_model_to_external_data(
|
||||||
|
onnx_model,
|
||||||
|
all_tensors_to_one_file=True,
|
||||||
|
location=extern_path.split("/")[-1],
|
||||||
|
size_threshold=1024,
|
||||||
|
convert_attribute=False,
|
||||||
|
)
|
||||||
|
onnx.save(onnx_model, path)
|
||||||
|
else:
|
||||||
|
sys.path.append("onnxsim_large_model")
|
||||||
|
from onnx_utils import set_onnx_input_shape
|
||||||
|
from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
|
||||||
|
|
||||||
|
in_model_path = path
|
||||||
|
out_model_path = in_model_path[:-5] + ".sim.onnx"
|
||||||
|
|
||||||
|
onnx_model = onnx.load(in_model_path)
|
||||||
|
print(f"load model from {in_model_path} success")
|
||||||
|
|
||||||
|
size_th_bytes = 1024 * 1024
|
||||||
|
onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
|
||||||
|
print("compress model success")
|
||||||
|
|
||||||
|
onnx_model = set_onnx_input_shape(onnx_model, "")
|
||||||
|
tensor_size_threshold = f"1024KB"
|
||||||
|
skipped_optimizers = []
|
||||||
|
skipped_optimizers.append("eliminate_duplicate_initializer")
|
||||||
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
|
||||||
|
tensor_size_threshold=tensor_size_threshold)
|
||||||
|
if not check:
|
||||||
|
raise ValueError(f"simplify compressed model {in_model_path} failed")
|
||||||
|
|
||||||
|
print(f"simplify model success")
|
||||||
|
|
||||||
|
onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
|
||||||
|
print(f"uncompress model success")
|
||||||
|
|
||||||
|
add_value_info_for_constants(onnx_model)
|
||||||
|
|
||||||
|
onnx.save(onnx_model, out_model_path, save_as_external_data=True)
|
||||||
|
|
||||||
|
|
||||||
|
def add_value_info_for_constants(model : onnx.ModelProto):
|
||||||
|
"""
|
||||||
|
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||||
|
that info explicitly as ValueInfoProtos.
|
||||||
|
Mutates the model.
|
||||||
|
Args:
|
||||||
|
model: The ModelProto to update.
|
||||||
|
"""
|
||||||
|
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||||
|
if model.ir_version < 4:
|
||||||
|
return
|
||||||
|
|
||||||
|
def add_const_value_infos_to_graph(graph : onnx.GraphProto):
|
||||||
|
inputs = {i.name for i in graph.input}
|
||||||
|
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||||
|
for init in graph.initializer:
|
||||||
|
# Check it really is a constant, not an input
|
||||||
|
if init.name in inputs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# The details we want to add
|
||||||
|
elem_type = init.data_type
|
||||||
|
shape = init.dims
|
||||||
|
|
||||||
|
# Get existing or create new value info for this constant
|
||||||
|
vi = existing_info.get(init.name)
|
||||||
|
if vi is None:
|
||||||
|
vi = graph.value_info.add()
|
||||||
|
vi.name = init.name
|
||||||
|
|
||||||
|
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||||
|
tt = vi.type.tensor_type
|
||||||
|
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||||
|
tt.elem_type = elem_type
|
||||||
|
if not tt.HasField("shape"):
|
||||||
|
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||||
|
tt.shape.dim.extend([])
|
||||||
|
for dim in shape:
|
||||||
|
tt.shape.dim.add().dim_value = dim
|
||||||
|
|
||||||
|
# Handle subgraphs
|
||||||
|
for node in graph.node:
|
||||||
|
for attr in node.attribute:
|
||||||
|
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||||
|
if attr.ref_attr_name != "":
|
||||||
|
continue
|
||||||
|
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPH:
|
||||||
|
add_const_value_infos_to_graph(attr.g)
|
||||||
|
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||||
|
for g in attr.graphs:
|
||||||
|
add_const_value_infos_to_graph(g)
|
||||||
|
|
||||||
|
|
||||||
|
return add_const_value_infos_to_graph(model.graph)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
global input_dir, result_dir
|
||||||
|
|
||||||
|
modelname, batchsize, seqlen, \
|
||||||
|
export_path, input_dir, result_dir, export_only = parse_args()
|
||||||
|
|
||||||
|
model, voc_size = get_model(modelname) # pytorch model
|
||||||
|
|
||||||
|
if export_path is not None:
|
||||||
|
os.makedirs(export_path, exist_ok=True)
|
||||||
|
filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
|
||||||
|
path = os.path.join(export_path, filename)
|
||||||
|
param = torch.zeros((batchsize, seqlen), dtype=torch.int)
|
||||||
|
export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
|
||||||
|
if export_only:
|
||||||
|
return
|
||||||
|
|
||||||
|
run_pytorch(model, voc_size, batchsize, seqlen, modelname)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
|
|
@ -0,0 +1,103 @@
|
||||||
|
import onnx
|
||||||
|
from onnx import (
|
||||||
|
ModelProto,
|
||||||
|
TensorProto,
|
||||||
|
NodeProto,
|
||||||
|
AttributeProto,
|
||||||
|
)
|
||||||
|
from onnx import helper, numpy_helper
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
def parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
|
||||||
|
for attr in node.attribute:
|
||||||
|
if attr.name in attrs:
|
||||||
|
if attr.type == AttributeProto.INT:
|
||||||
|
attrs[attr.name] = attr.i
|
||||||
|
elif attr.type == AttributeProto.INTS:
|
||||||
|
attrs[attr.name] = attr.ints
|
||||||
|
elif attr.type == AttributeProto.FLOAT:
|
||||||
|
attrs[attr.name] = attr.f
|
||||||
|
elif attr.type == AttributeProto.STRING:
|
||||||
|
attrs[attr.name] = attr.s
|
||||||
|
elif attr.type == AttributeProto.TENSOR:
|
||||||
|
attrs[attr.name] = attr.t
|
||||||
|
else:
|
||||||
|
assert False, "Unsupported Attribute Type: {}".format(attr.type)
|
||||||
|
return attrs
|
||||||
|
|
||||||
|
|
||||||
|
def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
||||||
|
data = {init.name: init for init in model.graph.initializer}
|
||||||
|
nodes = list(model.graph.node)
|
||||||
|
|
||||||
|
def shard_tensor(tensor: TensorProto, dim: int):
|
||||||
|
array = numpy_helper.to_array(tensor)
|
||||||
|
if dim >= array.ndim:
|
||||||
|
dim = array.ndim - 1
|
||||||
|
assert array.shape[dim] % tp_world_size == 0
|
||||||
|
seg = array.shape[dim] // tp_world_size
|
||||||
|
array = array[tp_rank * seg : (tp_rank + 1) * seg]
|
||||||
|
return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")
|
||||||
|
|
||||||
|
def shard_gemm(node: NodeProto):
|
||||||
|
attrs = parse_attribute(
|
||||||
|
node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
|
||||||
|
)
|
||||||
|
trans = [attrs["transA"], attrs["transB"]]
|
||||||
|
dim = 0
|
||||||
|
for i, (input, t) in enumerate(zip(node.input, trans)):
|
||||||
|
if input in data:
|
||||||
|
dim = i
|
||||||
|
sharded = shard_tensor(data[input], dim ^ t)
|
||||||
|
node.input[i] = sharded.name
|
||||||
|
data[input] = sharded
|
||||||
|
if len(node.input) > 2:
|
||||||
|
input = node.input[2]
|
||||||
|
sharded = shard_tensor(data[input], dim)
|
||||||
|
node.input[2] = sharded.name
|
||||||
|
data[input] = sharded
|
||||||
|
|
||||||
|
node.output[0] += f":sharded({dim})"
|
||||||
|
return dim
|
||||||
|
|
||||||
|
for i, node in enumerate(nodes):
|
||||||
|
if node.op_type == "Gemm":
|
||||||
|
output = node.output[0]
|
||||||
|
dim = shard_gemm(node)
|
||||||
|
gathered = [node.output[0] + f".{i}" for i in range(tp_world_size)]
|
||||||
|
# all_gather
|
||||||
|
nodes.insert(
|
||||||
|
i + 1,
|
||||||
|
helper.make_node(
|
||||||
|
op_type="AllGather",
|
||||||
|
inputs=[node.output[0]],
|
||||||
|
outputs=gathered,
|
||||||
|
name=node.name + "/allgather",
|
||||||
|
# domain="infini", # shape inference fails for custom domain
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# concat
|
||||||
|
nodes.insert(
|
||||||
|
i + 2,
|
||||||
|
helper.make_node(
|
||||||
|
op_type="Concat",
|
||||||
|
inputs=gathered,
|
||||||
|
outputs=[output],
|
||||||
|
name=node.name + "/concat",
|
||||||
|
axis=dim,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
graph = helper.make_graph(
|
||||||
|
nodes,
|
||||||
|
model.graph.name + f"_{tp_rank}",
|
||||||
|
model.graph.input,
|
||||||
|
model.graph.output,
|
||||||
|
data.values(),
|
||||||
|
doc_string=model.graph.doc_string,
|
||||||
|
value_info=model.graph.value_info,
|
||||||
|
)
|
||||||
|
model = helper.make_model(graph)
|
||||||
|
|
||||||
|
onnx.shape_inference.infer_shapes(model)
|
||||||
|
return model
|
|
@ -0,0 +1,247 @@
|
||||||
|
import onnx
|
||||||
|
from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto
|
||||||
|
from onnx import helper, numpy_helper
|
||||||
|
from typing import Dict, List
|
||||||
|
from placement import Placement, Replicate, Shard, _Partial
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
||||||
|
data = {init.name: init for init in model.graph.initializer}
|
||||||
|
vinfo = {info.name: info for info in model.graph.value_info}
|
||||||
|
vinfo.update({info.name: info for info in model.graph.input})
|
||||||
|
vinfo.update({info.name: info for info in model.graph.output})
|
||||||
|
output = {info.name: info for info in model.graph.output}
|
||||||
|
place: Dict[str, Placement] = {}
|
||||||
|
nodes: List[NodeProto] = []
|
||||||
|
|
||||||
|
def is_sharded(name: str):
|
||||||
|
return place[name].is_shard()
|
||||||
|
|
||||||
|
def shard_tensor(tensor: TensorProto, plc: Shard, groups: int = 1):
|
||||||
|
# print(f"shard {tensor.name} at dim {dim}")
|
||||||
|
assert plc.is_shard(), plc
|
||||||
|
ndim = len(tensor.dims)
|
||||||
|
if plc.dim < 0:
|
||||||
|
plc.dim += ndim
|
||||||
|
if tensor.dims[plc.dim] == 1: # broadcast dim, no need to shard.
|
||||||
|
return tensor
|
||||||
|
array = numpy_helper.to_array(tensor)
|
||||||
|
assert array.shape[plc.dim] % tp_world_size == 0, array.shape[plc.dim]
|
||||||
|
dims = list(tensor.dims)
|
||||||
|
dims.insert(plc.dim, groups)
|
||||||
|
dims[plc.dim + 1] //= groups
|
||||||
|
array = array.reshape(dims)
|
||||||
|
seg = array.shape[plc.dim + 1] // tp_world_size
|
||||||
|
array = array.take(
|
||||||
|
indices=range(tp_rank * seg, (tp_rank + 1) * seg), axis=plc.dim + 1
|
||||||
|
)
|
||||||
|
dims = list(tensor.dims)
|
||||||
|
dims[plc.dim] //= tp_world_size
|
||||||
|
array = array.reshape(dims)
|
||||||
|
tensor = numpy_helper.from_array(array, name=tensor.name)
|
||||||
|
place[tensor.name] = plc
|
||||||
|
return tensor
|
||||||
|
|
||||||
|
def shard_gemm(node: NodeProto, groups: int = 1):
|
||||||
|
# print("gemm", node.name)
|
||||||
|
in_plc = place[node.input[0]]
|
||||||
|
w_plc = Shard(-1) if in_plc.is_replicate() else Shard(0)
|
||||||
|
transB = next((attr.i for attr in node.attribute if attr.name == "transB"), 0)
|
||||||
|
if transB:
|
||||||
|
w_plc.dim = ~w_plc.dim
|
||||||
|
input = node.input[1]
|
||||||
|
data[input] = shard_tensor(data[input], w_plc, groups)
|
||||||
|
|
||||||
|
output = node.output[0]
|
||||||
|
ndim = len(vinfo[output].type.tensor_type.shape.dim)
|
||||||
|
out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
|
||||||
|
place[node.output[0]] = out_plc
|
||||||
|
|
||||||
|
def shard_concat(node: NodeProto):
|
||||||
|
# hack for kvcache
|
||||||
|
in_plc = place[node.input[1]]
|
||||||
|
if in_plc.is_shard():
|
||||||
|
seq_len_dim = vinfo[node.input[0]].type.tensor_type.shape.dim.pop(1)
|
||||||
|
seq_len_dim.dim_value //= tp_world_size
|
||||||
|
vinfo[node.input[0]].type.tensor_type.shape.dim.insert(1, seq_len_dim)
|
||||||
|
place[node.input[0]] = in_plc
|
||||||
|
place[node.output[0]] = in_plc
|
||||||
|
|
||||||
|
def shard_binary(node: NodeProto, groups: int = 1):
|
||||||
|
# print("binary", node.name, node.input[0], place[node.input[0]])
|
||||||
|
a = node.input[0]
|
||||||
|
b = node.input[1]
|
||||||
|
if a in data:
|
||||||
|
a, b = b, a
|
||||||
|
place[node.output[0]] = place[a]
|
||||||
|
if is_sharded(a) and b in data and len(data[b].dims) == 1: # broadcast
|
||||||
|
data[b] = shard_tensor(data[b], Shard(0), groups)
|
||||||
|
|
||||||
|
def shard_reshape(node: NodeProto):
|
||||||
|
# print("reshape", node.name, node.input[0], place[node.input[0]])
|
||||||
|
if not is_sharded(node.input[0]):
|
||||||
|
return
|
||||||
|
in_plc = place[node.input[0]]
|
||||||
|
s_dim = -1
|
||||||
|
in_dims = [d.dim_value for d in vinfo[node.input[0]].type.tensor_type.shape.dim]
|
||||||
|
tensor = data[node.input[1]]
|
||||||
|
out_dims = numpy_helper.to_array(tensor).copy()
|
||||||
|
if len(in_dims) == 3 and len(out_dims) == 4:
|
||||||
|
if in_plc.dim == 0:
|
||||||
|
s_dim = 1
|
||||||
|
elif in_plc.dim == 2:
|
||||||
|
s_dim = 2
|
||||||
|
if len(in_dims) == 4 and len(out_dims) == 3:
|
||||||
|
if in_plc.dim == 1:
|
||||||
|
s_dim = 0
|
||||||
|
elif in_plc.dim == 2:
|
||||||
|
s_dim = 2
|
||||||
|
if len(in_dims) == 2 and len(out_dims) == 3:
|
||||||
|
if in_plc.dim == 1:
|
||||||
|
s_dim = 2
|
||||||
|
if len(in_dims) == 4 and len(out_dims) == 2:
|
||||||
|
if in_plc.dim == 1:
|
||||||
|
s_dim = 0
|
||||||
|
elif in_plc.dim == 2:
|
||||||
|
s_dim = 1
|
||||||
|
if len(in_dims) == 3 and len(out_dims) == 2:
|
||||||
|
if in_plc.dim == 1:
|
||||||
|
s_dim = 0
|
||||||
|
elif in_plc.dim == 2:
|
||||||
|
s_dim = 1
|
||||||
|
assert s_dim != -1
|
||||||
|
assert out_dims[s_dim] % tp_world_size == 0, out_dims
|
||||||
|
out_dims[s_dim] //= tp_world_size
|
||||||
|
# if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
|
||||||
|
node.input[1] = node.output[0] + "_shape"
|
||||||
|
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
|
||||||
|
place[node.output[0]] = Shard(s_dim)
|
||||||
|
|
||||||
|
def shard_split(node: NodeProto):
|
||||||
|
if not is_sharded(node.input[0]):
|
||||||
|
return
|
||||||
|
in_plc = place[node.input[0]]
|
||||||
|
split_tensor = data[node.input[1]]
|
||||||
|
split = numpy_helper.to_array(split_tensor).copy()
|
||||||
|
split //= tp_world_size
|
||||||
|
data[node.input[1]] = numpy_helper.from_array(split, name=node.input[1])
|
||||||
|
for output in node.output:
|
||||||
|
place[output] = in_plc
|
||||||
|
|
||||||
|
def shard_transpose(node: NodeProto):
|
||||||
|
plc = place[node.input[0]]
|
||||||
|
if plc.is_shard():
|
||||||
|
perm = next(attr.ints for attr in node.attribute if attr.name == "perm")
|
||||||
|
place[node.output[0]] = Shard(list(perm).index(plc.dim))
|
||||||
|
|
||||||
|
def shard_node(node: NodeProto):
|
||||||
|
if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
|
||||||
|
place[node.output[0]] = place[node.input[0]]
|
||||||
|
elif node.op_type in ["Where"]:
|
||||||
|
place[node.output[0]] = place[node.input[1]]
|
||||||
|
if node.op_type in {"Add", "Mul", "Div", "Max"}:
|
||||||
|
shard_binary(node)
|
||||||
|
elif node.op_type == "Reshape":
|
||||||
|
shard_reshape(node)
|
||||||
|
elif node.op_type == "Transpose":
|
||||||
|
shard_transpose(node)
|
||||||
|
elif node.op_type == "Split":
|
||||||
|
shard_split(node)
|
||||||
|
elif node.op_type == "MatMul":
|
||||||
|
assert (
|
||||||
|
place[node.input[0]] == place[node.input[1]]
|
||||||
|
), f"{place[node.input[0]]} != {place[node.input[1]]}"
|
||||||
|
place[node.output[0]] = place[node.input[0]]
|
||||||
|
elif node.op_type == "Concat":
|
||||||
|
shard_concat(node)
|
||||||
|
|
||||||
|
def find_successor(op_type: str, idx: int, search_limit: int = 1):
|
||||||
|
for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
|
||||||
|
if node.op_type == op_type:
|
||||||
|
return node
|
||||||
|
return None
|
||||||
|
|
||||||
|
# all tensors are initially replicated.
|
||||||
|
for v in vinfo:
|
||||||
|
place[v] = Replicate()
|
||||||
|
|
||||||
|
for t in data:
|
||||||
|
place[t] = Replicate()
|
||||||
|
|
||||||
|
for index, node in enumerate(model.graph.node):
|
||||||
|
nodes.append(node)
|
||||||
|
# linear
|
||||||
|
if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
|
||||||
|
input in data for input in node.input
|
||||||
|
):
|
||||||
|
# FIXME(constroy): the last MatMul should not be sharded as TP.
|
||||||
|
if (
|
||||||
|
node.output[0] in output
|
||||||
|
or (
|
||||||
|
index + 1 < len(model.graph.node)
|
||||||
|
and model.graph.node[index + 1].output[0]
|
||||||
|
)
|
||||||
|
in output
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
groups = 1
|
||||||
|
# If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
|
||||||
|
split_node = find_successor("Split", index, search_limit=2)
|
||||||
|
if split_node is not None:
|
||||||
|
groups = len(split_node.output)
|
||||||
|
shard_gemm(node, groups)
|
||||||
|
plc = place[node.output[0]]
|
||||||
|
if plc.is_partial():
|
||||||
|
new_name = node.output[0] + f":{plc}"
|
||||||
|
place[new_name] = place[node.output[0]]
|
||||||
|
# insert all_reduce
|
||||||
|
nodes.append(
|
||||||
|
helper.make_node(
|
||||||
|
op_type="ReduceSum",
|
||||||
|
inputs=[new_name],
|
||||||
|
outputs=[node.output[0]],
|
||||||
|
name=node.name + "/all_reduce",
|
||||||
|
noop_with_empty_axes=1,
|
||||||
|
communicator=0, # hack to treat ReduceSum as AllReduceSum
|
||||||
|
)
|
||||||
|
)
|
||||||
|
place[node.output[0]] = Replicate()
|
||||||
|
node.output[0] = new_name
|
||||||
|
if len(node.input) > 2: # split bias to add
|
||||||
|
prev = nodes[-1]
|
||||||
|
new_name = prev.output[0] + "_no_bias"
|
||||||
|
place[new_name] = place[node.output[0]]
|
||||||
|
bias = helper.make_node(
|
||||||
|
op_type="Add",
|
||||||
|
inputs=[new_name, node.input[2]],
|
||||||
|
outputs=[prev.output[0]],
|
||||||
|
name=node.name + "/bias",
|
||||||
|
)
|
||||||
|
node.input.pop()
|
||||||
|
prev.output[0] = new_name
|
||||||
|
shard_binary(bias, groups)
|
||||||
|
nodes.append(bias)
|
||||||
|
continue
|
||||||
|
shard_node(node)
|
||||||
|
|
||||||
|
new_input = []
|
||||||
|
for info in model.graph.input:
|
||||||
|
new_input.append(vinfo[info.name])
|
||||||
|
|
||||||
|
graph = helper.make_graph(
|
||||||
|
nodes,
|
||||||
|
model.graph.name + f"_{tp_rank}",
|
||||||
|
new_input,
|
||||||
|
model.graph.output,
|
||||||
|
data.values(),
|
||||||
|
doc_string=model.graph.doc_string,
|
||||||
|
# value_info=vinfo.values(),
|
||||||
|
)
|
||||||
|
for output in graph.output:
|
||||||
|
tt = output.type.tensor_type
|
||||||
|
if tt.HasField("shape"):
|
||||||
|
tt.ClearField("shape")
|
||||||
|
model = helper.make_model(graph)
|
||||||
|
#model = onnx.shape_inference.infer_shapes(model)
|
||||||
|
return model
|
|
@ -0,0 +1,64 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
class Placement:
|
||||||
|
# base class Placement type
|
||||||
|
|
||||||
|
# convenient utils to check for placement types
|
||||||
|
def is_shard(self, dim: Optional[int] = None) -> bool:
|
||||||
|
if dim is not None and isinstance(self, Shard):
|
||||||
|
return self.dim == dim
|
||||||
|
else:
|
||||||
|
return isinstance(self, Shard)
|
||||||
|
|
||||||
|
def is_replicate(self) -> bool:
|
||||||
|
return isinstance(self, Replicate)
|
||||||
|
|
||||||
|
def is_partial(self) -> bool:
|
||||||
|
return isinstance(self, _Partial)
|
||||||
|
|
||||||
|
|
||||||
|
class Replicate(Placement):
|
||||||
|
def __eq__(self, other: object) -> bool:
|
||||||
|
if not isinstance(other, Replicate):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
"""
|
||||||
|
machine readable representation of the Replicate placement
|
||||||
|
"""
|
||||||
|
return "Replicate()"
|
||||||
|
|
||||||
|
|
||||||
|
class Shard(Placement):
|
||||||
|
# shard placement, shard on a dim
|
||||||
|
def __init__(self, dim):
|
||||||
|
self.dim = dim
|
||||||
|
|
||||||
|
def __eq__(self, other: object) -> bool:
|
||||||
|
if not isinstance(other, Shard):
|
||||||
|
return False
|
||||||
|
return self.dim == other.dim
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
"""
|
||||||
|
machine readable representation of the Shard placement
|
||||||
|
"""
|
||||||
|
return f"Shard(dim={self.dim})"
|
||||||
|
|
||||||
|
|
||||||
|
class _Partial(Placement):
|
||||||
|
def __init__(self, reduce_op: str = "sum"):
|
||||||
|
self.reduce_op: str = reduce_op
|
||||||
|
|
||||||
|
def __eq__(self, other: object) -> bool:
|
||||||
|
if not isinstance(other, _Partial):
|
||||||
|
return False
|
||||||
|
return self.reduce_op == other.reduce_op
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
"""
|
||||||
|
machine readable representation of the Partial placement
|
||||||
|
"""
|
||||||
|
return f"_Partial(reduce_op={self.reduce_op})"
|
|
@ -0,0 +1,145 @@
|
||||||
|
import os
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import numpy as np
|
||||||
|
import onnx
|
||||||
|
import torch
|
||||||
|
from transformers import LlamaModel, LlamaForCausalLM
|
||||||
|
from tqdm import tqdm
|
||||||
|
import onnx_graphsurgeon as gs
|
||||||
|
from onnxsim import simplify
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='')
|
||||||
|
parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
|
||||||
|
parser.add_argument('--layer', dest='n_layers', type=int, default=2)
|
||||||
|
parser.add_argument('--iter', dest='n_iter', type=int, default=1)
|
||||||
|
parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
|
||||||
|
parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str,
|
||||||
|
default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
|
||||||
|
parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str,
|
||||||
|
default="/data1/shared/llama")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
|
||||||
|
ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
|
||||||
|
|
||||||
|
def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
|
||||||
|
param = torch.zeros(
|
||||||
|
(args.batchsize, 1024), dtype=torch.long)
|
||||||
|
logits = model(param, past_key_values=None)
|
||||||
|
param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
|
||||||
|
|
||||||
|
torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
|
||||||
|
"position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
|
||||||
|
do_constant_folding=True,)
|
||||||
|
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||||
|
print("simplifing onnx model")
|
||||||
|
onnx_model, check = simplify(onnx_model, skipped_optimizers=[
|
||||||
|
'eliminate_duplicate_initializer'])
|
||||||
|
assert check
|
||||||
|
|
||||||
|
onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
|
||||||
|
print("simlifing finished.")
|
||||||
|
|
||||||
|
|
||||||
|
@gs.Graph.register()
|
||||||
|
def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
|
||||||
|
for inp in inputs:
|
||||||
|
inp.outputs.clear()
|
||||||
|
for out in outputs:
|
||||||
|
out.inputs.clear()
|
||||||
|
for inp in inputs_added:
|
||||||
|
inputs.append(inp)
|
||||||
|
for out in outputs_removed:
|
||||||
|
out.inputs.clear()
|
||||||
|
return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_onnx_with_attention_op():
|
||||||
|
graph = gs.import_onnx(
|
||||||
|
onnx.load(ONNX_MODEL_PATH))
|
||||||
|
tmap = graph.tensors()
|
||||||
|
for i in range(args.n_layers):
|
||||||
|
inputs = [
|
||||||
|
tmap["onnx::Concat_" + str((i+1)*2)],
|
||||||
|
tmap["onnx::Concat_" + str((i+1)*2+1)],
|
||||||
|
tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
|
||||||
|
tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
|
||||||
|
tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
|
||||||
|
outputs = [
|
||||||
|
tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
|
||||||
|
|
||||||
|
inputs_added = [graph.inputs[1]]
|
||||||
|
outputs_removed = []
|
||||||
|
|
||||||
|
graph.replace_with_attention(
|
||||||
|
inputs, outputs, inputs_added, outputs_removed)
|
||||||
|
|
||||||
|
graph.outputs = [tmap[graph.outputs[0].name]]
|
||||||
|
graph.cleanup(True).toposort()
|
||||||
|
onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
kvcache_torch = None
|
||||||
|
torch_model = LlamaForCausalLM.from_pretrained(
|
||||||
|
args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
|
||||||
|
|
||||||
|
n_heads = torch_model.config.num_attention_heads
|
||||||
|
n_dims = torch_model.config.hidden_size // n_heads
|
||||||
|
|
||||||
|
if not os.path.exists(ONNX_MODEL_PATH):
|
||||||
|
print("exporting onnx graph")
|
||||||
|
export_onnx(torch_model, ONNX_MODEL_PATH)
|
||||||
|
replace_onnx_with_attention_op()
|
||||||
|
else:
|
||||||
|
print("will use exsiting onnx graph")
|
||||||
|
|
||||||
|
onnx_model = onnx.load(ONNX_MODEL_PATH)
|
||||||
|
stub = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
|
||||||
|
count_wrong = 0
|
||||||
|
for i in tqdm(range(0, args.n_max_length)):
|
||||||
|
query = np.random.randint(
|
||||||
|
torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
|
||||||
|
position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
|
||||||
|
|
||||||
|
####################################
|
||||||
|
# pytorch
|
||||||
|
####################################
|
||||||
|
outputs_torch = torch_model(
|
||||||
|
torch.tensor(query), past_key_values=kvcache_torch)
|
||||||
|
logit_torch = outputs_torch['logits']
|
||||||
|
kvcache_torch = outputs_torch['past_key_values']
|
||||||
|
|
||||||
|
####################################
|
||||||
|
# infinitensor
|
||||||
|
####################################
|
||||||
|
# copyin input
|
||||||
|
(list(stub.inputs.items()))[0][1].copyin_int64(
|
||||||
|
query.reshape(-1).tolist())
|
||||||
|
(list(stub.inputs.items()))[1][1].copyin_int64(
|
||||||
|
position_id.reshape(-1).tolist())
|
||||||
|
|
||||||
|
stub.run()
|
||||||
|
|
||||||
|
####################################
|
||||||
|
# validation
|
||||||
|
####################################
|
||||||
|
# copyout output
|
||||||
|
logits_it = np.array((list(stub.outputs.items()))
|
||||||
|
[0][1].copyout_float())
|
||||||
|
|
||||||
|
try:
|
||||||
|
np.testing.assert_allclose(
|
||||||
|
logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
np.testing.assert_allclose(
|
||||||
|
np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
|
||||||
|
except:
|
||||||
|
count_wrong = count_wrong + 1
|
||||||
|
|
||||||
|
result = "{}/{} failed.".format(count_wrong, args.n_max_length)
|
||||||
|
print(result)
|
||||||
|
del stub
|
|
@ -0,0 +1,29 @@
|
||||||
|
import sys
|
||||||
|
import onnx
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
args = sys.argv
|
||||||
|
if len(sys.argv) != 2:
|
||||||
|
print("Usage: python onnx_inference.py model_name.onnx")
|
||||||
|
exit()
|
||||||
|
model_path = sys.argv[1]
|
||||||
|
# print(model_path)
|
||||||
|
|
||||||
|
onnx_model = onnx.load(model_path)
|
||||||
|
onnx_input = onnx_model.graph.input[0]
|
||||||
|
input_shape = [[d.dim_value for d in _input.type.tensor_type.shape.dim]
|
||||||
|
for _input in onnx_model.graph.input]
|
||||||
|
# Assume that there is only one input tensor
|
||||||
|
input_shape = input_shape[0]
|
||||||
|
# print(input_shape)
|
||||||
|
input_data = np.random.random(input_shape).astype(np.float32)
|
||||||
|
|
||||||
|
model = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
next(iter(model.inputs.values())).copyin_numpy(input_data)
|
||||||
|
model.run()
|
||||||
|
outputs = next(iter(model.outputs.values())).copyout_numpy()
|
||||||
|
outputs = torch.tensor(outputs)
|
||||||
|
print(outputs.shape)
|
|
@ -0,0 +1,80 @@
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.vision.transforms as T
|
||||||
|
from paddle.vision.datasets import Cifar10
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
def run_cifar_train_and_infer():
|
||||||
|
|
||||||
|
paddle.device.set_device("gpu")
|
||||||
|
|
||||||
|
transform = T.Compose(
|
||||||
|
[
|
||||||
|
T.Resize(224),
|
||||||
|
T.ToTensor(),
|
||||||
|
T.Normalize(
|
||||||
|
mean=[0.5, 0.5, 0.5],
|
||||||
|
std=[0.5, 0.5, 0.5],
|
||||||
|
to_rgb=True,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# 下载数据集并初始化 DataSet
|
||||||
|
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
|
||||||
|
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
|
||||||
|
|
||||||
|
# 模型组网并初始化网络
|
||||||
|
densenet = paddle.vision.models.DenseNet(num_classes=10)
|
||||||
|
model = paddle.Model(densenet)
|
||||||
|
|
||||||
|
# 模型训练的配置准备,准备损失函数,优化器和评价指标
|
||||||
|
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
|
||||||
|
paddle.nn.CrossEntropyLoss(),
|
||||||
|
paddle.metric.Accuracy())
|
||||||
|
|
||||||
|
# 模型训练
|
||||||
|
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
|
||||||
|
# 模型评估
|
||||||
|
model.evaluate(test_dataset, batch_size=64, verbose=1)
|
||||||
|
|
||||||
|
# export to ONNX
|
||||||
|
save_path = 'onnx.save/densenet' # 需要保存的路径
|
||||||
|
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型,支持持 Tensor 或 InputSpec ,InputSpec 支持动态的 shape。
|
||||||
|
paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
|
||||||
|
|
||||||
|
# 加载onnx模型并放到Infinitensor中
|
||||||
|
model_path = save_path + ".onnx"
|
||||||
|
onnx_model = onnx.load(model_path)
|
||||||
|
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
model = gofusion_model
|
||||||
|
model.init()
|
||||||
|
|
||||||
|
# 启动推理
|
||||||
|
cifar10_test = Cifar10(
|
||||||
|
mode="test",
|
||||||
|
transform=transform, # apply transform to every image
|
||||||
|
backend="cv2", # use OpenCV as image transform backend
|
||||||
|
)
|
||||||
|
batch_size = 1
|
||||||
|
total_size = 0
|
||||||
|
total_acc = 0.0
|
||||||
|
for data in itertools.islice(iter(cifar10_test), 10000):
|
||||||
|
images, labels = data
|
||||||
|
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
|
||||||
|
model.run()
|
||||||
|
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
|
||||||
|
outputs = paddle.to_tensor(outputs)
|
||||||
|
outputs = paddle.reshape(outputs, (1, 10))
|
||||||
|
labels = paddle.to_tensor(labels)
|
||||||
|
labels = paddle.reshape(labels, (1,1))
|
||||||
|
acc = paddle.metric.accuracy(outputs, labels)
|
||||||
|
total_acc += acc
|
||||||
|
total_size += batch_size
|
||||||
|
print("test acc: {}".format(total_acc.numpy() / total_size))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_cifar_train_and_infer()
|
|
@ -0,0 +1,80 @@
|
||||||
|
import paddle
|
||||||
|
import paddle.vision.transforms as T
|
||||||
|
from paddle.vision.datasets import Cifar10
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
def run_cifar_train_and_infer():
|
||||||
|
|
||||||
|
paddle.device.set_device("gpu")
|
||||||
|
|
||||||
|
transform = T.Compose(
|
||||||
|
[
|
||||||
|
T.Resize(224),
|
||||||
|
T.ToTensor(),
|
||||||
|
T.Normalize(
|
||||||
|
mean=[0.5, 0.5, 0.5],
|
||||||
|
std=[0.5, 0.5, 0.5],
|
||||||
|
to_rgb=True,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# 下载数据集并初始化 DataSet
|
||||||
|
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
|
||||||
|
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
|
||||||
|
|
||||||
|
# 模型组网并初始化网络
|
||||||
|
inception = paddle.vision.models.InceptionV3(num_classes=10)
|
||||||
|
model = paddle.Model(inception)
|
||||||
|
|
||||||
|
# 模型训练的配置准备,准备损失函数,优化器和评价指标
|
||||||
|
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
|
||||||
|
paddle.nn.CrossEntropyLoss(),
|
||||||
|
paddle.metric.Accuracy())
|
||||||
|
|
||||||
|
# 模型训练
|
||||||
|
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
|
||||||
|
# 模型评估
|
||||||
|
model.evaluate(test_dataset, batch_size=64, verbose=1)
|
||||||
|
|
||||||
|
# export to ONNX
|
||||||
|
save_path = 'onnx.save/inception' # 需要保存的路径
|
||||||
|
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型,支持持 Tensor 或 InputSpec ,InputSpec 支持动态的 shape。
|
||||||
|
paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
|
||||||
|
|
||||||
|
# 加载onnx模型并放到Infinitensor中
|
||||||
|
model_path = save_path + ".onnx"
|
||||||
|
onnx_model = onnx.load(model_path)
|
||||||
|
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
model = gofusion_model
|
||||||
|
model.init()
|
||||||
|
|
||||||
|
# 启动推理
|
||||||
|
cifar10_test = Cifar10(
|
||||||
|
mode="test",
|
||||||
|
transform=transform, # apply transform to every image
|
||||||
|
backend="cv2", # use OpenCV as image transform backend
|
||||||
|
)
|
||||||
|
batch_size = 1
|
||||||
|
total_size = 0
|
||||||
|
total_acc = 0.0
|
||||||
|
for data in itertools.islice(iter(cifar10_test), 10000):
|
||||||
|
images, labels = data
|
||||||
|
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
|
||||||
|
model.run()
|
||||||
|
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
|
||||||
|
outputs = paddle.to_tensor(outputs)
|
||||||
|
outputs = paddle.reshape(outputs, (1, 10))
|
||||||
|
labels = paddle.to_tensor(labels)
|
||||||
|
labels = paddle.reshape(labels, (1,1))
|
||||||
|
acc = paddle.metric.accuracy(outputs, labels)
|
||||||
|
total_acc += acc
|
||||||
|
total_size += batch_size
|
||||||
|
print("test acc: {}".format(total_acc.numpy() / total_size))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_cifar_train_and_infer()
|
|
@ -0,0 +1,31 @@
|
||||||
|
## Description
|
||||||
|
|
||||||
|
This is a doc to tell you how to run paddle*.py in your machine. If your model run on other machines except Nvidia, you may need to make some change.
|
||||||
|
|
||||||
|
## What do we do in paddle*.py files?
|
||||||
|
|
||||||
|
1. Train model and evalute model with Cifar10 dataset
|
||||||
|
|
||||||
|
2. Export paddle model to onnx model
|
||||||
|
|
||||||
|
3. Load onnx model, infer with InfiniTensor and calculate the inference accuracy
|
||||||
|
|
||||||
|
## Command
|
||||||
|
|
||||||
|
1. Go to `/examples/python` folder
|
||||||
|
|
||||||
|
2. Run the following command
|
||||||
|
|
||||||
|
1. ```
|
||||||
|
python paddle_resnet.py
|
||||||
|
python paddle_densenet.py
|
||||||
|
python paddle_inception.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## What should I do if I use other device(MLU, XPU, NPU)?
|
||||||
|
|
||||||
|
You need to change this code:
|
||||||
|
|
||||||
|
```
|
||||||
|
paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
|
||||||
|
```
|
|
@ -0,0 +1,81 @@
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.vision.transforms as T
|
||||||
|
from paddle.vision.datasets import Cifar10
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
import itertools
|
||||||
|
from paddle.vision.models.resnet import BasicBlock
|
||||||
|
|
||||||
|
def run_cifar_train_and_infer():
|
||||||
|
|
||||||
|
paddle.device.set_device("gpu")
|
||||||
|
|
||||||
|
transform = T.Compose(
|
||||||
|
[
|
||||||
|
T.Resize(224),
|
||||||
|
T.ToTensor(),
|
||||||
|
T.Normalize(
|
||||||
|
mean=[0.5, 0.5, 0.5],
|
||||||
|
std=[0.5, 0.5, 0.5],
|
||||||
|
to_rgb=True,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# 下载数据集并初始化 DataSet
|
||||||
|
train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
|
||||||
|
test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
|
||||||
|
|
||||||
|
# 模型组网并初始化网络
|
||||||
|
resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
|
||||||
|
model = paddle.Model(resnet)
|
||||||
|
|
||||||
|
# 模型训练的配置准备,准备损失函数,优化器和评价指标
|
||||||
|
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
|
||||||
|
paddle.nn.CrossEntropyLoss(),
|
||||||
|
paddle.metric.Accuracy())
|
||||||
|
|
||||||
|
# 模型训练
|
||||||
|
model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
|
||||||
|
# 模型评估
|
||||||
|
model.evaluate(test_dataset, batch_size=64, verbose=1)
|
||||||
|
|
||||||
|
# export to ONNX
|
||||||
|
save_path = 'onnx.save/resnet' # 需要保存的路径
|
||||||
|
x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型,支持持 Tensor 或 InputSpec ,InputSpec 支持动态的 shape。
|
||||||
|
paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
|
||||||
|
|
||||||
|
# 加载onnx模型并放到Infinitensor中
|
||||||
|
model_path = save_path + ".onnx"
|
||||||
|
onnx_model = onnx.load(model_path)
|
||||||
|
gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
model = gofusion_model
|
||||||
|
model.init()
|
||||||
|
|
||||||
|
# 启动推理
|
||||||
|
cifar10_test = Cifar10(
|
||||||
|
mode="test",
|
||||||
|
transform=transform, # apply transform to every image
|
||||||
|
backend="cv2", # use OpenCV as image transform backend
|
||||||
|
)
|
||||||
|
batch_size = 1
|
||||||
|
total_size = 0
|
||||||
|
total_acc = 0.0
|
||||||
|
for data in itertools.islice(iter(cifar10_test), 10000):
|
||||||
|
images, labels = data
|
||||||
|
next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
|
||||||
|
model.run()
|
||||||
|
outputs = next(model.outputs.items().__iter__())[1].copyout_float()
|
||||||
|
outputs = paddle.to_tensor(outputs)
|
||||||
|
outputs = paddle.reshape(outputs, (1, 10))
|
||||||
|
labels = paddle.to_tensor(labels)
|
||||||
|
labels = paddle.reshape(labels, (1,1))
|
||||||
|
acc = paddle.metric.accuracy(outputs, labels)
|
||||||
|
total_acc += acc
|
||||||
|
total_size += batch_size
|
||||||
|
print("test acc: {}".format(total_acc.numpy() / total_size))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_cifar_train_and_infer()
|
|
@ -0,0 +1,24 @@
|
||||||
|
import sys
|
||||||
|
import onnx
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import torchvision.models as models
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
model_path = './resnet18.onnx'
|
||||||
|
tv_model = models.resnet50(weights=None)
|
||||||
|
input_shape = (1, 3, 224, 224)
|
||||||
|
param = torch.rand(input_shape)
|
||||||
|
torch.onnx.export(tv_model, param, model_path, verbose=False)
|
||||||
|
|
||||||
|
onnx_model = onnx.load(model_path)
|
||||||
|
model = OnnxStub(onnx_model, backend.cuda_runtime())
|
||||||
|
images = np.random.random(input_shape).astype(np.float32)
|
||||||
|
next(iter(model.inputs.values())).copyin_numpy(images)
|
||||||
|
model.run()
|
||||||
|
outputs = next(iter(model.outputs.values())).copyout_numpy()
|
||||||
|
outputs = torch.tensor(outputs)
|
||||||
|
outputs = torch.reshape(outputs, (1, 1000))
|
||||||
|
_, predicted = torch.max(outputs, 1)
|
||||||
|
print(predicted)
|
|
@ -2,6 +2,10 @@
|
||||||
#include "cnnl.h"
|
#include "cnnl.h"
|
||||||
#include "cnrt.h"
|
#include "cnrt.h"
|
||||||
#include "core/common.h"
|
#include "core/common.h"
|
||||||
|
#include "core/data_type.h"
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "cncl.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#define checkBangError(call) \
|
#define checkBangError(call) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -27,4 +31,70 @@ namespace infini {
|
||||||
|
|
||||||
using BangPtr = void *;
|
using BangPtr = void *;
|
||||||
|
|
||||||
|
inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
|
||||||
|
if (dataType == DataType::Float32) {
|
||||||
|
return CNNL_DTYPE_FLOAT;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Float16) {
|
||||||
|
return CNNL_DTYPE_HALF;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Double) {
|
||||||
|
return CNNL_DTYPE_DOUBLE;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int8) {
|
||||||
|
return CNNL_DTYPE_INT8;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int32) {
|
||||||
|
return CNNL_DTYPE_INT32;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::UInt8) {
|
||||||
|
return CNNL_DTYPE_UINT8;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::BFloat16) {
|
||||||
|
return CNNL_DTYPE_BFLOAT16;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int64) {
|
||||||
|
return CNNL_DTYPE_INT64;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Bool) {
|
||||||
|
return CNNL_DTYPE_BOOL;
|
||||||
|
}
|
||||||
|
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||||
|
" not supported in CNNL.");
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
|
||||||
|
if (dataType == DataType::Float32) {
|
||||||
|
return cnclFloat32;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Float16) {
|
||||||
|
return cnclHalf;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int8) {
|
||||||
|
return cnclInt8;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int16) {
|
||||||
|
return cnclInt16;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::Int32) {
|
||||||
|
return cnclInt32;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::UInt8) {
|
||||||
|
return cnclUint8;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::UInt16) {
|
||||||
|
return cnclUint16;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::UInt32) {
|
||||||
|
return cnclUint32;
|
||||||
|
}
|
||||||
|
if (dataType == DataType::BFloat16) {
|
||||||
|
return cnclBfloat16;
|
||||||
|
}
|
||||||
|
IT_TODO_HALT_MSG("Data type " + dataType.toString() +
|
||||||
|
" not supported in CNCL.");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -7,16 +7,19 @@ namespace infini {
|
||||||
class BangRuntimeObj : public RuntimeObj {
|
class BangRuntimeObj : public RuntimeObj {
|
||||||
private:
|
private:
|
||||||
cnnlHandle_t cnnl;
|
cnnlHandle_t cnnl;
|
||||||
|
cnrtQueue_t queue;
|
||||||
|
std::unique_ptr<CommunicatorObj> comm;
|
||||||
BangPtr workspace;
|
BangPtr workspace;
|
||||||
size_t workspaceSize;
|
size_t workspaceSize;
|
||||||
|
mutable size_t cursor;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
BangRuntimeObj() : RuntimeObj(Device::BANG) {
|
explicit BangRuntimeObj(int deviceId = 0)
|
||||||
|
: RuntimeObj(Device::BANG, deviceId) {
|
||||||
cnInit(0);
|
cnInit(0);
|
||||||
CNdev dev;
|
CNdev dev;
|
||||||
cnDeviceGet(&dev, 0);
|
cnDeviceGet(&dev, deviceId);
|
||||||
checkBangError(cnrtSetDevice(dev));
|
checkBangError(cnrtSetDevice(dev));
|
||||||
cnrtQueue_t queue;
|
|
||||||
checkBangError(cnrtQueueCreate(&queue));
|
checkBangError(cnrtQueueCreate(&queue));
|
||||||
|
|
||||||
checkCnnlError(cnnlCreate(&cnnl));
|
checkCnnlError(cnnlCreate(&cnnl));
|
||||||
|
@ -24,10 +27,12 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
// 10GB for Longformer
|
// 10GB for Longformer
|
||||||
// size_t longformerNum = 3lu * (1 << 30);
|
// size_t longformerNum = 3lu * (1 << 30);
|
||||||
workspaceSize = 7ll << 30; // 7 GB
|
workspaceSize = 7ll << 30; // 7 GB
|
||||||
|
cursor = 0;
|
||||||
workspace = alloc(workspaceSize);
|
workspace = alloc(workspaceSize);
|
||||||
}
|
}
|
||||||
virtual ~BangRuntimeObj() {
|
virtual ~BangRuntimeObj() {
|
||||||
dealloc(workspace);
|
dealloc(workspace);
|
||||||
|
checkBangError(cnrtQueueDestroy(queue));
|
||||||
checkCnnlError(cnnlDestroy(cnnl));
|
checkCnnlError(cnnlDestroy(cnnl));
|
||||||
}
|
}
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
@ -45,10 +50,15 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
|
void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
|
||||||
cnnlHandle_t cnnlHandle() const { return cnnl; }
|
cnnlHandle_t cnnlHandle() const { return cnnl; }
|
||||||
BangPtr getWorkspace(size_t size) const {
|
BangPtr getWorkspace(size_t size) const {
|
||||||
IT_ASSERT(size <= workspaceSize);
|
IT_ASSERT((cursor + size) <= workspaceSize);
|
||||||
return workspace;
|
cursor += size;
|
||||||
|
void *temp = workspace;
|
||||||
|
temp += (cursor - size);
|
||||||
|
return temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void resetWorkspace() const { cursor = 0; }
|
||||||
|
|
||||||
void copyBlobFromCPU(void *dst, const void *src,
|
void copyBlobFromCPU(void *dst, const void *src,
|
||||||
size_t bytes) const override {
|
size_t bytes) const override {
|
||||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
@ -66,6 +76,9 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||||
}
|
}
|
||||||
|
void initComm(const string &name, int worldSize, int rank) final;
|
||||||
|
CommunicatorObj &getCommunicator() const override { return *comm; }
|
||||||
|
cnrtQueue_t getBangQueue() const { return queue; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
#pragma once
|
||||||
|
#include "bang_common.h"
|
||||||
|
#include "core/communicator.h"
|
||||||
|
#include <chrono>
|
||||||
|
#include <cncl.h>
|
||||||
|
#include <cnrt.h>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <mutex>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class CnclCommunicatorObj final : public CommunicatorObj {
|
||||||
|
private:
|
||||||
|
cnclComm_t *comms;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CnclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||||
|
: CommunicatorObj(worldSize, rank) {
|
||||||
|
const std::string filePath("./" + name + "_cncl_id.bin");
|
||||||
|
cnclCliqueId clique_id;
|
||||||
|
if (rank == 0) {
|
||||||
|
CNCL_CHECK(cnclGetCliqueId(&clique_id));
|
||||||
|
std::ofstream ofs(filePath, std::ios::binary);
|
||||||
|
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
auto begin = std::chrono::steady_clock::now();
|
||||||
|
while (!std::filesystem::exists(filePath)) {
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
|
||||||
|
"time limit (10s) exceeded.");
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
}
|
||||||
|
std::ifstream ifs(filePath, std::ios::binary);
|
||||||
|
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_comms = 1;
|
||||||
|
int *dev_list = new int[num_comms];
|
||||||
|
int *rank_list = new int[num_comms];
|
||||||
|
comms = new cnclComm_t[num_comms];
|
||||||
|
uint32_t num_dev = 0;
|
||||||
|
checkBangError(cnrtGetDeviceCount(&num_dev));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_comms; i++) {
|
||||||
|
rank_list[i] = rank;
|
||||||
|
dev_list[i] = rank_list[i] % num_dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
|
||||||
|
worldSize, &clique_id));
|
||||||
|
|
||||||
|
if (rank == 0) {
|
||||||
|
std::filesystem::remove(filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] dev_list;
|
||||||
|
delete[] rank_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
~CnclCommunicatorObj() {
|
||||||
|
CNCL_CHECK(cnclDestroyComms(comms, 1));
|
||||||
|
delete[] comms;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the actual cnclComm_t
|
||||||
|
cnclComm_t getCnclComm() { return comms[0]; }
|
||||||
|
|
||||||
|
virtual string toString() const final {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "CNCL communicator";
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -40,12 +40,12 @@ using HashType = uint64_t; // compatible with std::hash
|
||||||
|
|
||||||
// Assert: conditions should have no side effect
|
// Assert: conditions should have no side effect
|
||||||
#define _IT_ASSERT_2(condition, info) \
|
#define _IT_ASSERT_2(condition, info) \
|
||||||
(static_cast<bool>(condition) \
|
static_cast<bool>(condition) \
|
||||||
? void(0) \
|
? void(0) \
|
||||||
: throw ::infini::Exception( \
|
: throw ::infini::Exception( \
|
||||||
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
|
std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) + \
|
||||||
"] Assertion failed (" + #condition + "): " + info))
|
"] Assertion failed (" + #condition + "): " + info)
|
||||||
#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "");
|
#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
|
||||||
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
|
#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
|
||||||
|
|
||||||
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
|
#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
|
||||||
|
@ -61,21 +61,35 @@ template <typename T> auto enum_to_underlying(T e) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
||||||
std::string ret;
|
std::stringstream ss;
|
||||||
ret.append("[");
|
ss << "[";
|
||||||
for (auto d : vec) {
|
for (size_t i = 0; i < vec.size(); ++i) {
|
||||||
ret.append(std::to_string(d));
|
ss << vec.at(i);
|
||||||
ret.append(",");
|
if (i < vec.size() - 1) {
|
||||||
|
ss << ",";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!vec.empty())
|
ss << "]";
|
||||||
ret.pop_back();
|
return ss.str();
|
||||||
ret.append("]");
|
}
|
||||||
return ret;
|
|
||||||
|
template <typename T> std::string vecToString(const T *st, size_t length) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[";
|
||||||
|
size_t i = 0;
|
||||||
|
for (i = 0; i < length; i++) {
|
||||||
|
ss << *(st + i);
|
||||||
|
if (i < length - 1) {
|
||||||
|
ss << ",";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ss << "]";
|
||||||
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
double timeit(
|
double timeit(
|
||||||
const std::function<void()> &func,
|
const std::function<void()> &func,
|
||||||
const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
|
const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
|
||||||
int timingRounds = 200);
|
int timingRounds = 10);
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
#pragma once
|
||||||
|
#include "object.h"
|
||||||
|
#include "ref.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
// base class
|
||||||
|
class CommunicatorObj : public Object {
|
||||||
|
protected:
|
||||||
|
int worldSize;
|
||||||
|
int rank;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CommunicatorObj(int worldSize, int rank)
|
||||||
|
: worldSize(worldSize), rank(rank) {}
|
||||||
|
|
||||||
|
virtual ~CommunicatorObj() = default;
|
||||||
|
virtual int getWorldSize() const { return worldSize; }
|
||||||
|
virtual int getRank() const { return rank; }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -1,22 +1,54 @@
|
||||||
|
#pragma once
|
||||||
#include "core/common.h"
|
#include "core/common.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
class DataType {
|
class DataType {
|
||||||
public:
|
public:
|
||||||
// legacy
|
|
||||||
static const DataType Float32;
|
|
||||||
static const DataType UInt32;
|
|
||||||
// These are just aligned with the type and index of onnx:
|
|
||||||
// <https://onnx.ai/onnx/intro/concepts.html#element-type>
|
// <https://onnx.ai/onnx/intro/concepts.html#element-type>
|
||||||
static const DataType UInt8, Int8, UInt16, Int16, Int32, Int64;
|
static const DataType Undefine;
|
||||||
static constexpr size_t sizePerElement[]{
|
static const DataType Float32;
|
||||||
sizeof(float), sizeof(uint32_t), sizeof(uint8_t), sizeof(int8_t),
|
static const DataType UInt8;
|
||||||
sizeof(uint16_t), sizeof(int16_t), sizeof(int32_t), sizeof(int64_t)};
|
static const DataType Int8;
|
||||||
|
static const DataType UInt16;
|
||||||
|
static const DataType Int16;
|
||||||
|
static const DataType Int32;
|
||||||
|
static const DataType Int64;
|
||||||
|
static const DataType String;
|
||||||
|
static const DataType Bool;
|
||||||
|
static const DataType Float16;
|
||||||
|
static const DataType Double;
|
||||||
|
static const DataType UInt32;
|
||||||
|
static const DataType UInt64;
|
||||||
|
static const DataType BFloat16;
|
||||||
|
// "sizePerElement" show the DType to cpu_type
|
||||||
|
// DataType::Bool -> int8_t DataType::Float16 -> uint16_t
|
||||||
|
static constexpr size_t sizePerElement[]{0,
|
||||||
|
sizeof(float),
|
||||||
|
sizeof(uint8_t),
|
||||||
|
sizeof(int8_t),
|
||||||
|
sizeof(uint16_t),
|
||||||
|
sizeof(int16_t),
|
||||||
|
sizeof(int32_t),
|
||||||
|
sizeof(int64_t),
|
||||||
|
sizeof(std::string),
|
||||||
|
sizeof(int8_t),
|
||||||
|
sizeof(uint16_t),
|
||||||
|
sizeof(double),
|
||||||
|
sizeof(uint32_t),
|
||||||
|
sizeof(uint64_t),
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
sizeof(uint16_t)};
|
||||||
|
|
||||||
static constexpr std::string_view names[]{"Float32", "UInt32", "UInt8",
|
static constexpr std::string_view names[]{
|
||||||
"Int8", "UInt16", "Int16",
|
"Undefine", "Float32", "UInt8", "Int8", "UInt16",
|
||||||
"Int32", "Int64"};
|
"Int16", "Int32", "Int64", "String", "Bool",
|
||||||
|
"Float16", "Double", "UInt32", "UInt64", "PlaceHolder",
|
||||||
|
"PlaceHolder", "BFloat16"};
|
||||||
|
|
||||||
|
static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5, 6, 7, -1,
|
||||||
|
3, 4, 9, 1, 8, -1, -1, 4};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int index;
|
int index;
|
||||||
|
@ -29,37 +61,43 @@ class DataType {
|
||||||
bool operator==(const DataType &rhs) const { return index == rhs.index; }
|
bool operator==(const DataType &rhs) const { return index == rhs.index; }
|
||||||
bool operator<(const DataType &rhs) const { return index < rhs.index; }
|
bool operator<(const DataType &rhs) const { return index < rhs.index; }
|
||||||
|
|
||||||
template <typename T> static DataType get() {
|
template <typename T> static int get() {
|
||||||
IT_TODO_HALT_MSG("Unsupported data type");
|
IT_TODO_HALT_MSG("Unsupported data type");
|
||||||
}
|
}
|
||||||
size_t getSize() const { return sizePerElement[index]; }
|
size_t getSize() const { return sizePerElement[index]; }
|
||||||
string toString() const { return string(names[index]); }
|
string toString() const { return string(names[index]); }
|
||||||
|
int cpuTypeInt() const { return cpuType[index]; }
|
||||||
|
int getIndex() const { return index; }
|
||||||
};
|
};
|
||||||
|
|
||||||
inline const DataType DataType::Float32(0);
|
|
||||||
inline const DataType DataType::UInt32(1);
|
|
||||||
inline const DataType DataType::UInt8(2), DataType::Int8(3),
|
|
||||||
DataType::UInt16(4), DataType::Int16(5), DataType::Int32(6),
|
|
||||||
DataType::Int64(7);
|
|
||||||
// Method definitions are out of the declaration due to GCC bug:
|
// Method definitions are out of the declaration due to GCC bug:
|
||||||
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
|
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
|
||||||
template <> inline DataType DataType::get<float>() { return Float32; }
|
template <> inline int DataType::get<float>() { return 0; }
|
||||||
template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
|
template <> inline int DataType::get<uint32_t>() { return 1; }
|
||||||
template <> inline DataType DataType::get<uint8_t>() { return UInt8; }
|
template <> inline int DataType::get<uint8_t>() { return 2; }
|
||||||
template <> inline DataType DataType::get<int8_t>() { return Int8; }
|
template <> inline int DataType::get<int8_t>() { return 3; }
|
||||||
template <> inline DataType DataType::get<uint16_t>() { return UInt16; }
|
template <> inline int DataType::get<uint16_t>() { return 4; }
|
||||||
template <> inline DataType DataType::get<int16_t>() { return Int16; }
|
template <> inline int DataType::get<int16_t>() { return 5; }
|
||||||
template <> inline DataType DataType::get<int32_t>() { return Int32; }
|
template <> inline int DataType::get<int32_t>() { return 6; }
|
||||||
template <> inline DataType DataType::get<int64_t>() { return Int64; }
|
template <> inline int DataType::get<int64_t>() { return 7; }
|
||||||
|
template <> inline int DataType::get<uint64_t>() { return 8; }
|
||||||
|
template <> inline int DataType::get<double>() { return 9; }
|
||||||
|
|
||||||
template <int index> struct DT {};
|
template <int index> struct DT {};
|
||||||
template <> struct DT<0> { using t = float; };
|
template <> struct DT<0> { using t = bool; };
|
||||||
template <> struct DT<1> { using t = uint32_t; };
|
template <> struct DT<1> { using t = float; };
|
||||||
template <> struct DT<2> { using t = uint8_t; };
|
template <> struct DT<2> { using t = uint8_t; };
|
||||||
template <> struct DT<3> { using t = int8_t; };
|
template <> struct DT<3> { using t = int8_t; };
|
||||||
template <> struct DT<4> { using t = uint16_t; };
|
template <> struct DT<4> { using t = uint16_t; };
|
||||||
template <> struct DT<5> { using t = int16_t; };
|
template <> struct DT<5> { using t = int16_t; };
|
||||||
template <> struct DT<6> { using t = int32_t; };
|
template <> struct DT<6> { using t = int32_t; };
|
||||||
template <> struct DT<7> { using t = int64_t; };
|
template <> struct DT<7> { using t = int64_t; };
|
||||||
|
template <> struct DT<8> { using t = char; };
|
||||||
|
template <> struct DT<9> { using t = int8_t; };
|
||||||
|
template <> struct DT<10> { using t = uint16_t; };
|
||||||
|
template <> struct DT<11> { using t = double; };
|
||||||
|
template <> struct DT<12> { using t = uint32_t; };
|
||||||
|
template <> struct DT<13> { using t = uint64_t; };
|
||||||
|
template <> struct DT<16> { using t = uint16_t; };
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "core/lazy_allocator.h"
|
||||||
#include "core/operator.h"
|
#include "core/operator.h"
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
|
||||||
|
@ -9,9 +10,11 @@ class GraphObj : public Object {
|
||||||
Runtime runtime;
|
Runtime runtime;
|
||||||
TensorVec tensors;
|
TensorVec tensors;
|
||||||
OpVec ops;
|
OpVec ops;
|
||||||
|
LazyAllocator allocator;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
|
explicit GraphObj(Runtime runtime)
|
||||||
|
: runtime(runtime), allocator(runtime), sorted(false){};
|
||||||
GraphObj(Runtime runtime, OpVec ops_in);
|
GraphObj(Runtime runtime, OpVec ops_in);
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
Runtime getRuntime() const { return runtime; }
|
Runtime getRuntime() const { return runtime; }
|
||||||
|
@ -50,6 +53,7 @@ class GraphObj : public Object {
|
||||||
const TensorVec &getTensors() const { return tensors; }
|
const TensorVec &getTensors() const { return tensors; }
|
||||||
const OpVec &getOperators() const { return ops; }
|
const OpVec &getOperators() const { return ops; }
|
||||||
OpVec getComputeOps() const;
|
OpVec getComputeOps() const;
|
||||||
|
Tensor getTensor(int) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sort the nodes in topological order.
|
* Sort the nodes in topological order.
|
||||||
|
@ -61,7 +65,13 @@ class GraphObj : public Object {
|
||||||
|
|
||||||
void optimize();
|
void optimize();
|
||||||
|
|
||||||
void dataMalloc();
|
void shape_infer();
|
||||||
|
|
||||||
|
void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
|
||||||
|
|
||||||
|
Tensor cloneKV(Tensor &tensor);
|
||||||
|
|
||||||
|
void freeHeap();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Add an operator and create its outputs. Output tensor arguments
|
* @brief Add an operator and create its outputs. Output tensor arguments
|
||||||
|
@ -117,6 +127,11 @@ class GraphObj : public Object {
|
||||||
* @brief If the nodes is sorted in topological order.
|
* @brief If the nodes is sorted in topological order.
|
||||||
*/
|
*/
|
||||||
bool sorted;
|
bool sorted;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief If the weight tensors are allocated.
|
||||||
|
*/
|
||||||
|
bool weightAllocated = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -5,31 +5,11 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace infini {
|
#ifdef USE_CUDA
|
||||||
|
#include "cuda/cuda_runtime.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
// Use the indices from onnx to reduce delivery overhead,
|
namespace infini {
|
||||||
// which comes from onnx but may be not only used for onnx.
|
|
||||||
//
|
|
||||||
// see https://onnx.ai/onnx/intro/concepts.html#element-type
|
|
||||||
enum OnnxDType : int {
|
|
||||||
UNDEFINED = 0,
|
|
||||||
FLOAT,
|
|
||||||
UINT8,
|
|
||||||
INT8,
|
|
||||||
UINT16,
|
|
||||||
INT16,
|
|
||||||
INT32,
|
|
||||||
INT64,
|
|
||||||
STRING,
|
|
||||||
BOOL,
|
|
||||||
FLOAT16,
|
|
||||||
DOUBLE,
|
|
||||||
UINT32,
|
|
||||||
UINT64,
|
|
||||||
COMPLEX64,
|
|
||||||
COMPLEX128,
|
|
||||||
BFLOAT16,
|
|
||||||
};
|
|
||||||
|
|
||||||
class GraphHandlerObj {
|
class GraphHandlerObj {
|
||||||
Graph g;
|
Graph g;
|
||||||
|
@ -50,27 +30,40 @@ class GraphHandlerObj {
|
||||||
int pw, int sh, int sw, int dh, int dw, int oph,
|
int pw, int sh, int sw, int dh, int dw, int oph,
|
||||||
int opw);
|
int opw);
|
||||||
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
|
Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
|
||||||
Tensor bias, ActType act);
|
Tensor bias, ActType act,
|
||||||
Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
|
std::string matmul_compute_type = "default");
|
||||||
Tensor scale, Tensor bias, float momentum, float eps,
|
Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
|
||||||
bool training);
|
Tensor var, Tensor scale, Tensor bias,
|
||||||
|
float momentum, float eps, bool training);
|
||||||
|
Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
|
||||||
|
Tensor bias, float eps, int axis, int stash_type);
|
||||||
|
Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
|
||||||
|
|
||||||
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
||||||
int ph, int pw, int sh, int sw);
|
int ph, int pw, int sh, int sw, int ceilMode);
|
||||||
Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
|
||||||
int ph, int pw, int sh, int sw);
|
int ph, int pw, int sh, int sw, int ceilMode);
|
||||||
|
|
||||||
Tensor add(Tensor a, Tensor b, Tensor c);
|
Tensor add(Tensor a, Tensor b, Tensor c);
|
||||||
Tensor sub(Tensor a, Tensor b, Tensor c);
|
Tensor sub(Tensor a, Tensor b, Tensor c);
|
||||||
Tensor mul(Tensor a, Tensor b, Tensor c);
|
Tensor mul(Tensor a, Tensor b, Tensor c);
|
||||||
Tensor div(Tensor a, Tensor b, Tensor c);
|
Tensor div(Tensor a, Tensor b, Tensor c);
|
||||||
Tensor pow(Tensor a, Tensor b, Tensor c);
|
Tensor pow(Tensor a, Tensor b, Tensor c);
|
||||||
|
Tensor min(Tensor a, Tensor b, Tensor c);
|
||||||
|
Tensor max(Tensor a, Tensor b, Tensor c);
|
||||||
|
|
||||||
Tensor relu(Tensor x, Tensor y);
|
Tensor relu(Tensor x, Tensor y);
|
||||||
|
Tensor silu(Tensor x, Tensor y);
|
||||||
|
Tensor gelu(Tensor x, Tensor y);
|
||||||
Tensor sigmoid(Tensor x, Tensor y);
|
Tensor sigmoid(Tensor x, Tensor y);
|
||||||
|
Tensor hardSigmoid(Tensor x, Tensor y);
|
||||||
|
Tensor hardSwish(Tensor x, Tensor y);
|
||||||
Tensor tanh(Tensor x, Tensor y);
|
Tensor tanh(Tensor x, Tensor y);
|
||||||
|
Tensor erf(Tensor x, Tensor y);
|
||||||
Tensor softmax(Tensor x, Tensor y, int axis);
|
Tensor softmax(Tensor x, Tensor y, int axis);
|
||||||
Tensor abs(Tensor x, Tensor y);
|
Tensor abs(Tensor x, Tensor y);
|
||||||
|
Tensor sqrt(Tensor x, Tensor y);
|
||||||
|
Tensor neg(Tensor x, Tensor y);
|
||||||
Tensor shape(Tensor x, Tensor y);
|
Tensor shape(Tensor x, Tensor y);
|
||||||
Tensor identity(Tensor x, Tensor y);
|
Tensor identity(Tensor x, Tensor y);
|
||||||
Tensor flatten(Tensor s, Tensor y, int axis);
|
Tensor flatten(Tensor s, Tensor y, int axis);
|
||||||
|
@ -79,17 +72,51 @@ class GraphHandlerObj {
|
||||||
std::optional<float> max);
|
std::optional<float> max);
|
||||||
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
|
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
|
||||||
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
|
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
|
||||||
|
Tensor resize(Tensor input, Tensor output,
|
||||||
|
const std::optional<vector<int>> &axes, Tensor sizes,
|
||||||
|
Tensor scales, Tensor roi, vector<uint32_t> sizes_,
|
||||||
|
vector<float> scales_, vector<float> roi_, string mode,
|
||||||
|
string ratioPolicy, string nearestMode,
|
||||||
|
string coordTransMode);
|
||||||
|
Tensor squeeze(Tensor input, Tensor output, Shape axes);
|
||||||
|
Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
|
||||||
Tensor concat(TensorVec inputs, Tensor output, int dim);
|
Tensor concat(TensorVec inputs, Tensor output, int dim);
|
||||||
|
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
|
||||||
|
Tensor input_q, Tensor input_k, Tensor input_v,
|
||||||
|
Tensor position_id, Tensor output_matmul);
|
||||||
|
Tensor RoPE(Tensor pos, Tensor input, Tensor output);
|
||||||
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
||||||
int num_outputs);
|
std::variant<int, vector<int>> numOrRatio);
|
||||||
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
||||||
|
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
|
||||||
Tensor reduceMean(Tensor data, Tensor reduced,
|
Tensor reduceMean(Tensor data, Tensor reduced,
|
||||||
const optional<vector<int>> &axes, bool keepdims);
|
const optional<vector<int>> &axes, bool keepdims);
|
||||||
|
Tensor reduceSum(Tensor data, Tensor reduced,
|
||||||
|
const optional<vector<int>> &axes, bool keepdims);
|
||||||
Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
|
Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
|
||||||
const vector<int> &ends, const optional<vector<int>> &axes,
|
const vector<int> &ends, const optional<vector<int>> &axes,
|
||||||
const optional<vector<int>> &steps);
|
const optional<vector<int>> &steps);
|
||||||
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
|
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
|
||||||
const optional<vector<int>> &axes);
|
const optional<vector<int>> &axes);
|
||||||
|
Tensor cast(Tensor input, Tensor output, int to);
|
||||||
|
Tensor expand(Tensor input, Tensor output, Shape dims);
|
||||||
|
Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
|
||||||
|
std::vector<int> getDims(Tensor x) { return x->getDims(); }
|
||||||
|
|
||||||
|
Tensor allReduceSum(Tensor input, Tensor output);
|
||||||
|
Tensor allReduceProd(Tensor input, Tensor output);
|
||||||
|
Tensor allReduceMin(Tensor input, Tensor output);
|
||||||
|
Tensor allReduceMax(Tensor input, Tensor output);
|
||||||
|
Tensor allReduceAvg(Tensor input, Tensor output);
|
||||||
|
TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
|
||||||
|
Tensor broadcast(Tensor input, Tensor output, int root);
|
||||||
|
Tensor send(Tensor input, int source, int destination, Tensor output);
|
||||||
|
Tensor recv(Tensor output, int source, int destination, Shape dims,
|
||||||
|
int outputType, Tensor input);
|
||||||
|
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
|
||||||
|
std::string mode);
|
||||||
|
Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
|
||||||
|
int size);
|
||||||
|
|
||||||
//------ modifiers
|
//------ modifiers
|
||||||
|
|
||||||
|
@ -97,15 +124,31 @@ class GraphHandlerObj {
|
||||||
|
|
||||||
inline void optimize() { g->optimize(); }
|
inline void optimize() { g->optimize(); }
|
||||||
|
|
||||||
|
inline void shape_infer() { g->shape_infer(); }
|
||||||
|
|
||||||
|
void change_shape(const vector<int> &shape, int tensorId);
|
||||||
//------ runtime
|
//------ runtime
|
||||||
|
|
||||||
inline void data_malloc() { g->dataMalloc(); }
|
inline void data_malloc(bool useNaiveAllocator = false,
|
||||||
|
size_t memPoolSize = 0) {
|
||||||
|
g->dataMalloc(useNaiveAllocator, memPoolSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
|
||||||
|
|
||||||
|
inline void free_heap() { g->freeHeap(); }
|
||||||
|
|
||||||
inline void tune() { g->getRuntime()->run(g, true); }
|
inline void tune() { g->getRuntime()->run(g, true); }
|
||||||
|
|
||||||
inline void run() { g->getRuntime()->run(g); }
|
inline void run() { g->getRuntime()->run(g); }
|
||||||
|
|
||||||
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
|
inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
|
||||||
|
|
||||||
|
#ifdef USE_CUDA
|
||||||
|
inline void run_with_cudagraph() {
|
||||||
|
(as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -2,10 +2,11 @@
|
||||||
#include "core/common.h"
|
#include "core/common.h"
|
||||||
#include "core/operator.h"
|
#include "core/operator.h"
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
#include "utils/operator_utils.h"
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
using json = nlohmann::json;
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
class RuntimeObj; // Forward declaration for Kernel::compute
|
class RuntimeObj; // Forward declaration for Kernel::compute
|
||||||
|
|
||||||
|
@ -29,7 +30,6 @@ class Kernel {
|
||||||
public:
|
public:
|
||||||
Kernel() {}
|
Kernel() {}
|
||||||
virtual ~Kernel() {}
|
virtual ~Kernel() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param op The operator to be executed.
|
* @param op The operator to be executed.
|
||||||
* @param record The parameters for kernel execution. If extra parameters
|
* @param record The parameters for kernel execution. If extra parameters
|
||||||
|
@ -102,11 +102,9 @@ class KernelRegistry {
|
||||||
}
|
}
|
||||||
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
|
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
|
||||||
auto it = kernels.find(kernelAttrs);
|
auto it = kernels.find(kernelAttrs);
|
||||||
IT_ASSERT(it != kernels.end(),
|
IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
|
||||||
"Kernel not found for key {" +
|
get_kernel_attrs_str(kernelAttrs) +
|
||||||
to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
|
"}");
|
||||||
", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
|
|
||||||
", " + std::get<2>(kernelAttrs).toString() + "}");
|
|
||||||
return std::get<0>(it->second);
|
return std::get<0>(it->second);
|
||||||
}
|
}
|
||||||
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
|
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
|
||||||
|
@ -131,15 +129,16 @@ class CpuKernelWithoutConfig : public Kernel {
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
||||||
#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
|
#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
|
||||||
namespace infini { \
|
namespace infini { \
|
||||||
static const bool _CAT(_register_kernel_, cnt) = \
|
static const bool _CAT(_register_kernel_, cnt) = \
|
||||||
KernelRegistry::getInstance().registerKernel( \
|
KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
|
||||||
KernelAttrs{device, opType, dataType}, new kernel(), name); \
|
opType}, \
|
||||||
|
new kernel(), name); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
|
#define REGISTER_KERNEL(device, opType, kernel, name) \
|
||||||
_REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
|
_REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
|
||||||
|
|
||||||
#define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
|
#define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
|
||||||
namespace infini { \
|
namespace infini { \
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "core/tensor.h"
|
||||||
|
#ifdef BUILD_TEST
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#endif
|
||||||
|
#include <cstddef>
|
||||||
|
#include <map>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class LazyAllocator {
|
||||||
|
private:
|
||||||
|
#ifdef BUILD_TEST
|
||||||
|
FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
|
||||||
|
|
||||||
|
FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Runtime runtime;
|
||||||
|
|
||||||
|
size_t used = 0;
|
||||||
|
|
||||||
|
size_t peak = 0;
|
||||||
|
|
||||||
|
size_t weightPeak = 0;
|
||||||
|
|
||||||
|
size_t heapPeak = 0;
|
||||||
|
|
||||||
|
size_t alignment;
|
||||||
|
|
||||||
|
bool hasMemPool = false;
|
||||||
|
|
||||||
|
size_t memPoolSize = 0;
|
||||||
|
|
||||||
|
// pointer to the memory actually allocated
|
||||||
|
void *ptr = nullptr;
|
||||||
|
|
||||||
|
// pointer to the weight memory space
|
||||||
|
void *weightPtr = nullptr;
|
||||||
|
|
||||||
|
// memory pool ptr
|
||||||
|
void *memPoolPtr = nullptr;
|
||||||
|
|
||||||
|
// // a cache designed for a batch size that has already occurred
|
||||||
|
// std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
|
||||||
|
// batchsizeToTensorOffset;
|
||||||
|
|
||||||
|
struct freeBlockInfo {
|
||||||
|
size_t addr;
|
||||||
|
size_t blockSize;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct cmpFreeBlockInfo {
|
||||||
|
bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
|
||||||
|
return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
|
||||||
|
: (a.addr < b.addr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// free balanced tree, maintains all free memory blocks
|
||||||
|
std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
|
||||||
|
|
||||||
|
// key: head address offset of the free memory block
|
||||||
|
// value: blockSize of the block
|
||||||
|
std::unordered_map<size_t, size_t> headAddrToBlockSize;
|
||||||
|
|
||||||
|
// key: tail address offset of the free memory block
|
||||||
|
// value: blockSize of the block
|
||||||
|
std::unordered_map<size_t, size_t> tailAddrToBlockSize;
|
||||||
|
|
||||||
|
public:
|
||||||
|
LazyAllocator(Runtime runtime);
|
||||||
|
|
||||||
|
virtual ~LazyAllocator();
|
||||||
|
|
||||||
|
void init();
|
||||||
|
|
||||||
|
void setMemPool(size_t memPoolSize);
|
||||||
|
|
||||||
|
bool getMemPoolStatus();
|
||||||
|
|
||||||
|
// function: simulate memory allocation
|
||||||
|
// arguments:
|
||||||
|
// size: size of memory block to be allocated
|
||||||
|
// return: head address offset of the allocated memory block
|
||||||
|
size_t alloc(size_t size);
|
||||||
|
|
||||||
|
size_t allocWeight(size_t size);
|
||||||
|
|
||||||
|
size_t heapAlloc(size_t size);
|
||||||
|
|
||||||
|
void freeHeap();
|
||||||
|
|
||||||
|
// function: simulate memory free
|
||||||
|
// arguments:
|
||||||
|
// addr: head address offset of memory block to be free
|
||||||
|
// size: size of memory block to be freed
|
||||||
|
void free(size_t addr, size_t size);
|
||||||
|
|
||||||
|
// function: perform actual memory allocation
|
||||||
|
// return: pointer to the head address of the allocated memory
|
||||||
|
void *getPtr();
|
||||||
|
|
||||||
|
// void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
|
||||||
|
|
||||||
|
// std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
|
||||||
|
|
||||||
|
void *getWeightPtr();
|
||||||
|
|
||||||
|
void *getHeapPtr();
|
||||||
|
|
||||||
|
void info();
|
||||||
|
|
||||||
|
private:
|
||||||
|
// function: memory alignment, rouned up
|
||||||
|
// return: size of the aligned memory block
|
||||||
|
size_t getAlignedSize(size_t size);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,269 @@
|
||||||
|
#pragma once
|
||||||
|
#ifndef OP_TYPE_H
|
||||||
|
#define OP_TYPE_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
struct OpType {
|
||||||
|
using underlying_t = uint16_t;
|
||||||
|
|
||||||
|
// Clang-format is ambiguous in formating of comment alignment.
|
||||||
|
// In order to disambiguate, it is necessary to comment all enum
|
||||||
|
// elements.
|
||||||
|
enum : underlying_t {
|
||||||
|
Unknown,
|
||||||
|
Abs, // Unary
|
||||||
|
Acos, // Unary
|
||||||
|
Acosh, // Unary
|
||||||
|
Add, // Binary
|
||||||
|
And, // Binary
|
||||||
|
ArgMax, //
|
||||||
|
Asin, // Unary
|
||||||
|
Asinh, // Unary
|
||||||
|
Atan, // Unary
|
||||||
|
Atanh, // Unary
|
||||||
|
AttentionKVCache, // Fusion
|
||||||
|
AveragePool, // Pool
|
||||||
|
BatchNormalization, //
|
||||||
|
Bernoulli, //
|
||||||
|
BitShift, // Binary
|
||||||
|
BitwiseAnd, // Binary
|
||||||
|
BitwiseNot, // Binary
|
||||||
|
BitwiseOr, // Binary
|
||||||
|
BitwiseXor, // Binary
|
||||||
|
BlackmanWindow, //
|
||||||
|
Cast, // Unary
|
||||||
|
CastLike, //
|
||||||
|
Ceil, // Unary
|
||||||
|
Celu, //
|
||||||
|
CenterCropPad, //
|
||||||
|
Clip, // Unary
|
||||||
|
Col2lm,
|
||||||
|
Compress,
|
||||||
|
Concat,
|
||||||
|
ConcatFromSequence,
|
||||||
|
ConstantOfShape,
|
||||||
|
Conv, // ComputationIntensive
|
||||||
|
ConvInteger, // ComputationIntensive
|
||||||
|
ConvTranspose, // ComputationIntensive
|
||||||
|
Cos, // Unary
|
||||||
|
Cosh, // Unary
|
||||||
|
CumSum,
|
||||||
|
DFT,
|
||||||
|
DeformConv, // ComputationIntensive
|
||||||
|
DepthToSpace,
|
||||||
|
DequantizeLinear,
|
||||||
|
Det,
|
||||||
|
Div, // Binary
|
||||||
|
Dropout,
|
||||||
|
DynamicQuantizeLinear,
|
||||||
|
Einsum,
|
||||||
|
Elu,
|
||||||
|
Equal, // Compair
|
||||||
|
Erf, // Unary
|
||||||
|
Exp, // Unary
|
||||||
|
Expand,
|
||||||
|
EyeLike,
|
||||||
|
Flatten,
|
||||||
|
Floor, // Unary
|
||||||
|
GRU,
|
||||||
|
Gather,
|
||||||
|
GatherElements,
|
||||||
|
GatherND,
|
||||||
|
Gemm,
|
||||||
|
Gelu, // Unary
|
||||||
|
GlobalAveragePool, // GlobalPool
|
||||||
|
GlobalLpPool, // GlobalPool
|
||||||
|
GlobalMaxPool, // GlobalPool
|
||||||
|
Greater, // Compair
|
||||||
|
GreaterOrEqual, // Compair
|
||||||
|
GridSample,
|
||||||
|
GroupNormalization,
|
||||||
|
HammingWindow,
|
||||||
|
HannWindow,
|
||||||
|
HardSigmoid,
|
||||||
|
HardSwish,
|
||||||
|
Hardmax,
|
||||||
|
Identity,
|
||||||
|
If,
|
||||||
|
InstanceNormalization,
|
||||||
|
IsInf,
|
||||||
|
IsNaN,
|
||||||
|
LRN,
|
||||||
|
LSTM,
|
||||||
|
LayerNormalization,
|
||||||
|
LeakyRelu,
|
||||||
|
Less, // Compair
|
||||||
|
LessOrEqual, // Compair
|
||||||
|
Log, // Unary
|
||||||
|
LogSoftmax,
|
||||||
|
Loop,
|
||||||
|
LpNormalization,
|
||||||
|
LpPool,
|
||||||
|
MatMul, // ComputationIntensive
|
||||||
|
MatMulInteger, // ComputationIntensive
|
||||||
|
Max,
|
||||||
|
MaxPool,
|
||||||
|
MaxRoiPool,
|
||||||
|
MaxUnpool,
|
||||||
|
Mean,
|
||||||
|
MeanVarianceNormalization,
|
||||||
|
MelWeightMatrix,
|
||||||
|
Min,
|
||||||
|
Mish,
|
||||||
|
Mod, // Binary
|
||||||
|
Mul, // Binary
|
||||||
|
Multinomial, //
|
||||||
|
Neg, // Unary
|
||||||
|
NegativeLogLikelihoodLoss,
|
||||||
|
NonMaxSuppression,
|
||||||
|
NonZero,
|
||||||
|
Not, // Unary
|
||||||
|
OneHot,
|
||||||
|
Optional,
|
||||||
|
OptionalGetElement,
|
||||||
|
OptionalHasElement,
|
||||||
|
Or, // Binary
|
||||||
|
PRelu, //
|
||||||
|
Pad, //
|
||||||
|
Pow, // Binary
|
||||||
|
QLinearConv, // ComputationIntensive
|
||||||
|
QLinearMatMul, // ComputationIntensive
|
||||||
|
QuantizeLinear,
|
||||||
|
RNN,
|
||||||
|
RandomNormal,
|
||||||
|
RandomNormalLike,
|
||||||
|
RandomUniform,
|
||||||
|
RandomUniformLike,
|
||||||
|
Range,
|
||||||
|
Reciprocal,
|
||||||
|
ReduceL1, // Reduce
|
||||||
|
ReduceL2, // Reduce
|
||||||
|
ReduceLogSum, // Reduce
|
||||||
|
ReduceLogSumExp, // Reduce
|
||||||
|
ReduceMax, // Reduce
|
||||||
|
ReduceMean, // Reduce
|
||||||
|
ReduceMin, // Reduce
|
||||||
|
ReduceProd, // Reduce
|
||||||
|
ReduceSum, // Reduce
|
||||||
|
ReduceSumSquare, // Reduce
|
||||||
|
Relu, // Unary
|
||||||
|
Silu, // Unary
|
||||||
|
Reshape,
|
||||||
|
Resize,
|
||||||
|
ReverseSequence,
|
||||||
|
RoiAlign,
|
||||||
|
RoPE, // Fusion
|
||||||
|
Round, // Unary
|
||||||
|
RMSNorm, // Fusion
|
||||||
|
STFT,
|
||||||
|
Scan,
|
||||||
|
Scatter,
|
||||||
|
ScatterElements,
|
||||||
|
ScatterND,
|
||||||
|
Selu,
|
||||||
|
SequenceAt,
|
||||||
|
SequenceConstruct,
|
||||||
|
SequenceEmpty,
|
||||||
|
SequenceErase,
|
||||||
|
SequenceInsert,
|
||||||
|
SequenceLength,
|
||||||
|
SequenceMap,
|
||||||
|
Shape,
|
||||||
|
Shrink,
|
||||||
|
Sigmoid,
|
||||||
|
Sign,
|
||||||
|
Sin, // Unary
|
||||||
|
Sinh, // Unary
|
||||||
|
Size,
|
||||||
|
Slice,
|
||||||
|
Softmax,
|
||||||
|
SoftmaxCrossEntropyLoss,
|
||||||
|
Softplus,
|
||||||
|
Softsign,
|
||||||
|
SpaceToDepth,
|
||||||
|
Split,
|
||||||
|
SplitToSequence,
|
||||||
|
Sqrt,
|
||||||
|
Squeeze,
|
||||||
|
StringNormalizer,
|
||||||
|
Sub, // Binary
|
||||||
|
Sum, //
|
||||||
|
Tan, // Unary
|
||||||
|
Tanh, // unary
|
||||||
|
TfIdfVectorizer,
|
||||||
|
ThresholdedRelu,
|
||||||
|
Tile,
|
||||||
|
TopK,
|
||||||
|
Transpose,
|
||||||
|
Trilu,
|
||||||
|
Unique,
|
||||||
|
Unsqueeze,
|
||||||
|
Upsample,
|
||||||
|
Where,
|
||||||
|
Xor, // Binary
|
||||||
|
// CUSTOM DEFINED
|
||||||
|
G2BMM,
|
||||||
|
GBMM,
|
||||||
|
MemBound,
|
||||||
|
// TODO
|
||||||
|
ConvTransNHWC,
|
||||||
|
ConvBackwardFilter,
|
||||||
|
ReluBackward,
|
||||||
|
SigmoidBackward,
|
||||||
|
TanhBackward,
|
||||||
|
|
||||||
|
Fill,
|
||||||
|
Extend,
|
||||||
|
MSELoss,
|
||||||
|
Hardtanh,
|
||||||
|
L2Loss,
|
||||||
|
Rsqrt,
|
||||||
|
FloorDiv,
|
||||||
|
FloorMod,
|
||||||
|
Square,
|
||||||
|
SquaredDifference,
|
||||||
|
|
||||||
|
// Communication Ops
|
||||||
|
AllReduceSum,
|
||||||
|
AllReduceProd,
|
||||||
|
AllReduceMin,
|
||||||
|
AllReduceMax,
|
||||||
|
AllReduceAvg,
|
||||||
|
AllGather,
|
||||||
|
Broadcast,
|
||||||
|
Send,
|
||||||
|
Recv,
|
||||||
|
} type;
|
||||||
|
|
||||||
|
constexpr OpType(decltype(type) t) : type(t) {}
|
||||||
|
constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {}
|
||||||
|
constexpr underlying_t underlying() const { return type; }
|
||||||
|
|
||||||
|
bool operator==(OpType others) const { return type == others.type; }
|
||||||
|
bool operator!=(OpType others) const { return type != others.type; }
|
||||||
|
bool operator<(OpType others) const { return type < others.type; }
|
||||||
|
|
||||||
|
const char *toString() const;
|
||||||
|
bool isUnary() const;
|
||||||
|
bool isBinary() const;
|
||||||
|
bool isElementWise() const;
|
||||||
|
bool isCompair() const;
|
||||||
|
bool isPool() const;
|
||||||
|
bool isGlobalPool() const;
|
||||||
|
bool isMatMulOrConv() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class ActType {
|
||||||
|
None,
|
||||||
|
Relu,
|
||||||
|
Sigmoid,
|
||||||
|
Tanh,
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
||||||
|
|
||||||
|
#endif // OP_TYPE_H
|
|
@ -1,231 +1,14 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "core/op_type.h"
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
|
||||||
enum class OpType {
|
|
||||||
Unknown = 0,
|
|
||||||
// linear
|
|
||||||
Conv = 100,
|
|
||||||
ConvBackwardFilter,
|
|
||||||
ConvBackwardData,
|
|
||||||
Matmul,
|
|
||||||
ConvTrans,
|
|
||||||
ConvTransNHWC,
|
|
||||||
G2BMM,
|
|
||||||
GBMM,
|
|
||||||
Pad,
|
|
||||||
Slice,
|
|
||||||
Concat,
|
|
||||||
Split,
|
|
||||||
Transpose,
|
|
||||||
Extend,
|
|
||||||
MaxPool,
|
|
||||||
AvgPool,
|
|
||||||
Add,
|
|
||||||
Sub,
|
|
||||||
Mul,
|
|
||||||
Div,
|
|
||||||
Pow,
|
|
||||||
Gather,
|
|
||||||
ReduceMean,
|
|
||||||
Reshape,
|
|
||||||
Flatten,
|
|
||||||
Identity,
|
|
||||||
// element wise
|
|
||||||
BatchNorm = 200,
|
|
||||||
Softmax,
|
|
||||||
Activation,
|
|
||||||
Relu,
|
|
||||||
ReluBackward,
|
|
||||||
PRelu,
|
|
||||||
Sigmoid,
|
|
||||||
SigmoidBackward,
|
|
||||||
Tanh,
|
|
||||||
TanhBackward,
|
|
||||||
Abs,
|
|
||||||
Sin,
|
|
||||||
Cos,
|
|
||||||
Tan,
|
|
||||||
ASin,
|
|
||||||
ACos,
|
|
||||||
ATan,
|
|
||||||
SinH,
|
|
||||||
CosH,
|
|
||||||
TanH,
|
|
||||||
ASinH,
|
|
||||||
ACosH,
|
|
||||||
ATanH,
|
|
||||||
Resize,
|
|
||||||
Arange,
|
|
||||||
Shape,
|
|
||||||
Copy,
|
|
||||||
Ceil,
|
|
||||||
Floor,
|
|
||||||
Clip,
|
|
||||||
Erf,
|
|
||||||
Exp,
|
|
||||||
Fill,
|
|
||||||
Log,
|
|
||||||
L2Loss,
|
|
||||||
Maximum,
|
|
||||||
Minimum,
|
|
||||||
MSELoss,
|
|
||||||
Neg,
|
|
||||||
Power,
|
|
||||||
Reciprocal,
|
|
||||||
Sqrt,
|
|
||||||
Rsqrt,
|
|
||||||
Cast,
|
|
||||||
FloorDiv,
|
|
||||||
FloorMod,
|
|
||||||
Det,
|
|
||||||
Round,
|
|
||||||
Square,
|
|
||||||
SquaredDifference,
|
|
||||||
Hardtanh,
|
|
||||||
Equal,
|
|
||||||
NotEqual,
|
|
||||||
GreaterThan,
|
|
||||||
GreaterEqual,
|
|
||||||
LessThan,
|
|
||||||
LessEqual,
|
|
||||||
And,
|
|
||||||
Or,
|
|
||||||
Xor,
|
|
||||||
Not,
|
|
||||||
BitAnd,
|
|
||||||
BitOr,
|
|
||||||
BitXor,
|
|
||||||
BitNot,
|
|
||||||
BitLeftShift,
|
|
||||||
BitRightShift,
|
|
||||||
Dropout,
|
|
||||||
//
|
|
||||||
MemBound = 300,
|
|
||||||
};
|
|
||||||
|
|
||||||
using KernelAttrs = std::tuple<Device, OpType, DataType>;
|
|
||||||
|
|
||||||
class OpRegistry {
|
|
||||||
public:
|
|
||||||
static std::string getOpName(OpType opType) {
|
|
||||||
#define FOP(op) \
|
|
||||||
case OpType::op: \
|
|
||||||
return #op
|
|
||||||
|
|
||||||
switch (opType) {
|
|
||||||
FOP(Unknown);
|
|
||||||
// linear
|
|
||||||
FOP(Conv);
|
|
||||||
FOP(ConvBackwardFilter);
|
|
||||||
FOP(ConvBackwardData);
|
|
||||||
FOP(Matmul);
|
|
||||||
FOP(ConvTrans);
|
|
||||||
FOP(G2BMM);
|
|
||||||
FOP(GBMM);
|
|
||||||
FOP(Pad);
|
|
||||||
FOP(Slice);
|
|
||||||
FOP(Concat);
|
|
||||||
FOP(Split);
|
|
||||||
FOP(Transpose);
|
|
||||||
FOP(Extend);
|
|
||||||
FOP(MaxPool);
|
|
||||||
FOP(AvgPool);
|
|
||||||
FOP(Add);
|
|
||||||
FOP(Sub);
|
|
||||||
FOP(Mul);
|
|
||||||
FOP(Div);
|
|
||||||
FOP(Pow);
|
|
||||||
FOP(Gather);
|
|
||||||
FOP(ReduceMean);
|
|
||||||
FOP(Reshape);
|
|
||||||
FOP(Identity);
|
|
||||||
FOP(Shape);
|
|
||||||
// element wise
|
|
||||||
FOP(BatchNorm);
|
|
||||||
FOP(Softmax);
|
|
||||||
FOP(Activation);
|
|
||||||
FOP(Relu);
|
|
||||||
FOP(ReluBackward);
|
|
||||||
FOP(PRelu);
|
|
||||||
FOP(Sigmoid);
|
|
||||||
FOP(SigmoidBackward);
|
|
||||||
FOP(Tanh);
|
|
||||||
FOP(TanhBackward);
|
|
||||||
FOP(Abs);
|
|
||||||
FOP(Sin);
|
|
||||||
FOP(Cos);
|
|
||||||
FOP(Tan);
|
|
||||||
FOP(ASin);
|
|
||||||
FOP(ACos);
|
|
||||||
FOP(ATan);
|
|
||||||
FOP(SinH);
|
|
||||||
FOP(CosH);
|
|
||||||
FOP(TanH);
|
|
||||||
FOP(ASinH);
|
|
||||||
FOP(ACosH);
|
|
||||||
FOP(ATanH);
|
|
||||||
FOP(Copy);
|
|
||||||
FOP(Ceil);
|
|
||||||
FOP(Floor);
|
|
||||||
FOP(Clip);
|
|
||||||
FOP(Erf);
|
|
||||||
FOP(Exp);
|
|
||||||
FOP(Fill);
|
|
||||||
FOP(Log);
|
|
||||||
FOP(L2Loss);
|
|
||||||
FOP(Maximum);
|
|
||||||
FOP(Minimum);
|
|
||||||
FOP(MSELoss);
|
|
||||||
FOP(Neg);
|
|
||||||
FOP(Power);
|
|
||||||
FOP(Reciprocal);
|
|
||||||
FOP(Sqrt);
|
|
||||||
FOP(Rsqrt);
|
|
||||||
FOP(Cast);
|
|
||||||
FOP(FloorDiv);
|
|
||||||
FOP(FloorMod);
|
|
||||||
FOP(Det);
|
|
||||||
FOP(Round);
|
|
||||||
FOP(Square);
|
|
||||||
FOP(SquaredDifference);
|
|
||||||
FOP(Hardtanh);
|
|
||||||
FOP(Equal);
|
|
||||||
FOP(NotEqual);
|
|
||||||
FOP(GreaterThan);
|
|
||||||
FOP(GreaterEqual);
|
|
||||||
FOP(LessThan);
|
|
||||||
FOP(LessEqual);
|
|
||||||
FOP(And);
|
|
||||||
FOP(Or);
|
|
||||||
FOP(Xor);
|
|
||||||
FOP(Not);
|
|
||||||
FOP(BitAnd);
|
|
||||||
FOP(BitOr);
|
|
||||||
FOP(BitXor);
|
|
||||||
FOP(BitNot);
|
|
||||||
FOP(BitLeftShift);
|
|
||||||
FOP(BitRightShift);
|
|
||||||
//
|
|
||||||
FOP(MemBound);
|
|
||||||
default:
|
|
||||||
IT_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#undef FOP
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
enum class ActType {
|
|
||||||
None,
|
|
||||||
Relu,
|
|
||||||
Sigmoid,
|
|
||||||
Tanh,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct OpPerfKey {
|
struct OpPerfKey {
|
||||||
HashType hash;
|
HashType hash;
|
||||||
OpType opType;
|
OpType::underlying_t opType;
|
||||||
vector<int> attrs;
|
vector<int> attrs;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -233,7 +16,7 @@ struct OpPerfKey {
|
||||||
// https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
|
// https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
|
||||||
OpPerfKey() = default;
|
OpPerfKey() = default;
|
||||||
OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
|
OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
|
||||||
: hash(hash), opType(opType), attrs(attrs) {}
|
: hash(hash), opType(opType.underlying()), attrs(attrs) {}
|
||||||
bool operator==(const OpPerfKey &rhs) const {
|
bool operator==(const OpPerfKey &rhs) const {
|
||||||
if (hash != rhs.hash)
|
if (hash != rhs.hash)
|
||||||
return false;
|
return false;
|
||||||
|
@ -272,8 +55,7 @@ class OperatorObj : public Object {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
|
OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
|
||||||
virtual optional<vector<Shape>>
|
virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
|
||||||
inferShape(const TensorVec &inputs) const = 0;
|
|
||||||
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
|
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
|
||||||
/**
|
/**
|
||||||
* @brief Constructs outputs (if requried) and check whether the operator is
|
* @brief Constructs outputs (if requried) and check whether the operator is
|
||||||
|
@ -290,16 +72,7 @@ class OperatorObj : public Object {
|
||||||
*/
|
*/
|
||||||
HashType hash() const;
|
HashType hash() const;
|
||||||
|
|
||||||
public: // check Op type
|
public:
|
||||||
bool isLinearOp() const;
|
|
||||||
bool isElementWiseOp() const;
|
|
||||||
bool isSplitOp() const;
|
|
||||||
bool isConcatOp() const;
|
|
||||||
bool isComputeOp() const;
|
|
||||||
bool isTransposeOp() const;
|
|
||||||
bool isReshapeOp() const;
|
|
||||||
bool isMemBoundOp() const;
|
|
||||||
|
|
||||||
public: // getter and setter
|
public: // getter and setter
|
||||||
const TensorVec &getInputs() const { return inputs; }
|
const TensorVec &getInputs() const { return inputs; }
|
||||||
const TensorVec &getOutputs() const { return outputs; }
|
const TensorVec &getOutputs() const { return outputs; }
|
||||||
|
@ -317,6 +90,7 @@ class OperatorObj : public Object {
|
||||||
OpType getOpType() const { return type; }
|
OpType getOpType() const { return type; }
|
||||||
// HACK: set correct data type
|
// HACK: set correct data type
|
||||||
DataType getDType() const { return getInputs(0)->getDType(); }
|
DataType getDType() const { return getInputs(0)->getDType(); }
|
||||||
|
DataType getOutDType() const { return getOutput()->getDType(); }
|
||||||
virtual int numInputs() const = 0;
|
virtual int numInputs() const = 0;
|
||||||
virtual int numOutputs() const = 0;
|
virtual int numOutputs() const = 0;
|
||||||
|
|
||||||
|
@ -331,7 +105,7 @@ class OperatorObj : public Object {
|
||||||
const TensorVec &newOutputs) const = 0;
|
const TensorVec &newOutputs) const = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
optional<vector<Shape>> inferShape() const;
|
optional<vector<Shape>> inferShape();
|
||||||
vector<DataType> inferDataType() const;
|
vector<DataType> inferDataType() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
#include "core/graph.h"
|
#include "core/graph.h"
|
||||||
#include "core/kernel.h"
|
#include "core/kernel.h"
|
||||||
#include <nlohmann/json_fwd.hpp>
|
#include <nlohmann/json_fwd.hpp>
|
||||||
using json = nlohmann::json;
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
class PerfEngine {
|
class PerfEngine {
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/common.h"
|
#include "core/common.h"
|
||||||
|
#include "core/communicator.h"
|
||||||
|
#include "core/op_type.h"
|
||||||
#include "core/ref.h"
|
#include "core/ref.h"
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
@ -13,6 +15,7 @@ class GraphObj;
|
||||||
class GraphHandlerObj;
|
class GraphHandlerObj;
|
||||||
class RuntimeObj;
|
class RuntimeObj;
|
||||||
class BlobObj;
|
class BlobObj;
|
||||||
|
template <typename T> class WorkspaceObj;
|
||||||
|
|
||||||
using TensorBase = Ref<TensorBaseObj>;
|
using TensorBase = Ref<TensorBaseObj>;
|
||||||
using Tensor = Ref<TensorObj>;
|
using Tensor = Ref<TensorObj>;
|
||||||
|
@ -21,7 +24,7 @@ using Graph = Ref<GraphObj>;
|
||||||
using GraphHandler = Ref<GraphHandlerObj>;
|
using GraphHandler = Ref<GraphHandlerObj>;
|
||||||
using Runtime = Ref<RuntimeObj>;
|
using Runtime = Ref<RuntimeObj>;
|
||||||
using Blob = Ref<BlobObj>;
|
using Blob = Ref<BlobObj>;
|
||||||
enum class OpType;
|
template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
|
||||||
|
|
||||||
using TensorVec = vector<Tensor>;
|
using TensorVec = vector<Tensor>;
|
||||||
using OpVec = vector<Operator>;
|
using OpVec = vector<Operator>;
|
||||||
|
@ -29,15 +32,17 @@ using OpLists = list<Operator>;
|
||||||
|
|
||||||
using VType = uint32_t;
|
using VType = uint32_t;
|
||||||
|
|
||||||
enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
|
enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
|
||||||
/***************** Forward declaration end *****************/
|
/***************** Forward declaration end *****************/
|
||||||
|
|
||||||
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||||
protected:
|
protected:
|
||||||
Device device;
|
Device device;
|
||||||
|
int deviceId;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
RuntimeObj(Device device) : device(device) {}
|
explicit RuntimeObj(Device device, int deviceId = 0)
|
||||||
|
: device(device), deviceId(deviceId) {}
|
||||||
RuntimeObj(RuntimeObj &other) = delete;
|
RuntimeObj(RuntimeObj &other) = delete;
|
||||||
RuntimeObj &operator=(RuntimeObj const &) = delete;
|
RuntimeObj &operator=(RuntimeObj const &) = delete;
|
||||||
virtual ~RuntimeObj() {}
|
virtual ~RuntimeObj() {}
|
||||||
|
@ -69,6 +74,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||||
}
|
}
|
||||||
bool isCuda() const { return device == Device::CUDA; }
|
bool isCuda() const { return device == Device::CUDA; }
|
||||||
bool isBang() const { return device == Device::BANG; }
|
bool isBang() const { return device == Device::BANG; }
|
||||||
|
bool isKUNLUN() const { return device == Device::KUNLUN; }
|
||||||
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
|
void copyBlob(const TensorObj *dst, const TensorObj *src) const;
|
||||||
// TODO: unify these copy APIs
|
// TODO: unify these copy APIs
|
||||||
virtual void copyBlobFromCPU(void *dst, const void *src,
|
virtual void copyBlobFromCPU(void *dst, const void *src,
|
||||||
|
@ -77,6 +83,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
|
||||||
size_t bytes) const = 0;
|
size_t bytes) const = 0;
|
||||||
virtual string toString() const = 0;
|
virtual string toString() const = 0;
|
||||||
|
|
||||||
|
int getDeviceId() const { return deviceId; }
|
||||||
|
|
||||||
|
virtual void initComm(const string &name, int worldSize, int rank) = 0;
|
||||||
|
|
||||||
|
virtual CommunicatorObj &getCommunicator() const = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void printProfilingData(double totTime,
|
void printProfilingData(double totTime,
|
||||||
const std::map<OpType, double> &opTime,
|
const std::map<OpType, double> &opTime,
|
||||||
|
@ -97,6 +109,9 @@ class CpuRuntimeObj : public RuntimeObj {
|
||||||
void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
|
void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
|
||||||
void copyBlobInsideRuntime(void *dst, const void *src,
|
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||||
size_t bytes) const override;
|
size_t bytes) const override;
|
||||||
|
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
||||||
|
|
||||||
|
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
class NativeCpuRuntimeObj : public CpuRuntimeObj {
|
class NativeCpuRuntimeObj : public CpuRuntimeObj {
|
||||||
|
|
|
@ -1,12 +1,17 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/tensor_base.h"
|
#include "core/tensor_base.h"
|
||||||
|
#include "core/tensor_type.h"
|
||||||
|
#include "utils/data_convert.h"
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
#if USE_CUDA
|
#if USE_CUDA
|
||||||
#include "cuda/cuda_runtime.h"
|
#include "cuda/cuda_runtime.h"
|
||||||
#endif
|
#endif
|
||||||
|
#if USE_BANG
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#endif
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
// TODO: how to deal with this
|
// TODO: how to deal with this
|
||||||
|
@ -18,13 +23,7 @@ class TensorObj : public TensorBaseObj {
|
||||||
size_t _size; // Cache of Π(shape).
|
size_t _size; // Cache of Π(shape).
|
||||||
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
|
Fuid fuid; // Cloned tensors share the same id. Tensors constructed from
|
||||||
// scratch have a new id.
|
// scratch have a new id.
|
||||||
|
TensorType tensorType = TensorType::others;
|
||||||
void copyin(const void *ptr, size_t size) {
|
|
||||||
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
|
|
||||||
}
|
|
||||||
void copyout(void *ptr, size_t size) const {
|
|
||||||
runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TensorObj(Shape shape, DataType dtype, Runtime runtime);
|
TensorObj(Shape shape, DataType dtype, Runtime runtime);
|
||||||
|
@ -35,30 +34,74 @@ class TensorObj : public TensorBaseObj {
|
||||||
size_t getBytes() const { return _size * dtype.getSize(); }
|
size_t getBytes() const { return _size * dtype.getSize(); }
|
||||||
|
|
||||||
Shape getDims() const { return shape; }
|
Shape getDims() const { return shape; }
|
||||||
vector<size_t> getStride() const;
|
void setShape(Shape shape_);
|
||||||
|
size_t getRank() const { return shape.size(); }
|
||||||
|
Shape getStride() const;
|
||||||
size_t getOffset(const vector<int> &ds) const;
|
size_t getOffset(const vector<int> &ds) const;
|
||||||
void dataMalloc();
|
void dataMalloc();
|
||||||
UidBaseType getFuid() const { return fuid; }
|
UidBaseType getFuid() const { return fuid; }
|
||||||
|
bool isWeight() const { return tensorType == TensorType::weight; }
|
||||||
|
bool isInput() const { return tensorType == TensorType::input; }
|
||||||
|
bool isOutput() const { return tensorType == TensorType::output; }
|
||||||
|
bool isOthers() const { return tensorType == TensorType::others; }
|
||||||
|
void setWeight() { tensorType = TensorType::weight; }
|
||||||
|
void setInput() {
|
||||||
|
if (!this->isWeight()) {
|
||||||
|
tensorType = TensorType::input;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void setOutput() {
|
||||||
|
if (!this->isWeight()) {
|
||||||
|
tensorType = TensorType::output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
string tensorTypeToString() const {
|
||||||
|
switch (tensorType) {
|
||||||
|
case TensorType::weight:
|
||||||
|
return "weight";
|
||||||
|
break;
|
||||||
|
case TensorType::input:
|
||||||
|
return "input";
|
||||||
|
break;
|
||||||
|
case TensorType::output:
|
||||||
|
return "output";
|
||||||
|
break;
|
||||||
|
case TensorType::others:
|
||||||
|
return "others";
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return "unknown tensor type";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void load(std::string file_path);
|
void load(std::string file_path);
|
||||||
void save(std::string file_path);
|
void save(std::string file_path);
|
||||||
|
|
||||||
|
void copyin(const void *ptr, size_t size) {
|
||||||
|
runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
|
||||||
|
}
|
||||||
|
void copyout(void *ptr, size_t size) const {
|
||||||
|
runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
|
||||||
|
}
|
||||||
|
|
||||||
// Copy elements from `data`.
|
// Copy elements from `data`.
|
||||||
template <typename T> void copyin(const vector<T> &data) {
|
template <typename T> void copyin(const vector<T> &data) {
|
||||||
IT_ASSERT(DataType::get<T>() == dtype);
|
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
|
||||||
IT_ASSERT(data.size() >= _size);
|
IT_ASSERT(data.size() == _size);
|
||||||
copyin(data.data(), getBytes());
|
copyin(data.data(), getBytes());
|
||||||
}
|
}
|
||||||
// Copy all the elements to a vector.
|
// Copy all the elements to a vector.
|
||||||
template <typename T> auto copyout() const {
|
template <typename T> auto copyout() const {
|
||||||
IT_ASSERT(DataType::get<T>() == dtype);
|
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
|
||||||
std::vector<T> ans(_size);
|
std::vector<T> ans(_size);
|
||||||
copyout(ans.data(), getBytes());
|
copyout(ans.data(), getBytes());
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
// Copy the element at `pos`.
|
// Copy the element at `pos`.
|
||||||
template <typename T> auto copyOne(const vector<int> &pos) const {
|
template <typename T> auto copyOne(const vector<int> &pos) const {
|
||||||
IT_ASSERT(DataType::get<T>() == dtype);
|
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
|
||||||
auto offset = getOffset(pos);
|
auto offset = getOffset(pos);
|
||||||
auto bytes = dtype.getSize();
|
auto bytes = dtype.getSize();
|
||||||
T ans;
|
T ans;
|
||||||
|
@ -70,10 +113,16 @@ class TensorObj : public TensorBaseObj {
|
||||||
void copyData(const TensorObj *src);
|
void copyData(const TensorObj *src);
|
||||||
void copyData(const Tensor &src) { copyData(src.get()); }
|
void copyData(const Tensor &src) { copyData(src.get()); }
|
||||||
|
|
||||||
|
// TODO: Rename this function later, because it is confused that it will
|
||||||
|
// change the field data, but actually it generates data and maybe copy to
|
||||||
|
// device.
|
||||||
// FIXME: std::fucntion copies the generator instead of passing it by ref.
|
// FIXME: std::fucntion copies the generator instead of passing it by ref.
|
||||||
// Thus the internal state of generator cannot be updated.
|
// Thus the internal state of generator cannot be updated.
|
||||||
void setData(
|
void setData(
|
||||||
std::function<void(void *, size_t, DataType)> const &generator) const;
|
std::function<void(void *, size_t, DataType)> const &generator) const;
|
||||||
|
|
||||||
|
void setDataBlob(const Blob &blob);
|
||||||
|
|
||||||
Tensor clone() const {
|
Tensor clone() const {
|
||||||
auto obj = make_ref<TensorObj>(*this);
|
auto obj = make_ref<TensorObj>(*this);
|
||||||
obj->freeData();
|
obj->freeData();
|
||||||
|
@ -95,11 +144,16 @@ class TensorObj : public TensorBaseObj {
|
||||||
}
|
}
|
||||||
|
|
||||||
void printData() const;
|
void printData() const;
|
||||||
|
void dumpData(std::ofstream &ofs) const;
|
||||||
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
|
bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
|
||||||
|
|
||||||
template <typename T> bool equalData(const vector<T> &dataVector) {
|
template <typename T> bool equalData(const vector<T> &dataVector) {
|
||||||
IT_ASSERT(DataType::get<T>() == dtype);
|
|
||||||
IT_ASSERT(size() == dataVector.size());
|
IT_ASSERT(size() == dataVector.size());
|
||||||
|
if (dtype == DataType::Float16) {
|
||||||
|
return equalDataImpl_fp16(getRawDataPtr<uint16_t *>(),
|
||||||
|
(float *)dataVector.data(), size());
|
||||||
|
}
|
||||||
|
IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
|
||||||
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
|
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,19 +193,41 @@ class TensorObj : public TensorBaseObj {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool equalDataImpl(const T *a, const T *b, size_t size) const {
|
bool equalDataImpl(const T *a, const T *b, size_t size,
|
||||||
|
double relativeError = 1e-6) const {
|
||||||
for (size_t i = 0; i < size; ++i) {
|
for (size_t i = 0; i < size; ++i) {
|
||||||
if constexpr (std::is_integral_v<T>) {
|
if constexpr (std::is_integral_v<T>) {
|
||||||
if (a[i] != b[i])
|
if (a[i] != b[i])
|
||||||
return false;
|
return false;
|
||||||
} else if constexpr (std::is_floating_point_v<T>) {
|
} else if constexpr (std::is_floating_point_v<T>) {
|
||||||
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
|
if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
|
||||||
1e-6) {
|
fabs(a[i] - b[i]) > relativeError) {
|
||||||
|
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||||
|
return false;
|
||||||
|
} else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
|
||||||
|
fabs(a[i] - b[i]) /
|
||||||
|
std::max(fabs(a[i]), fabs(b[i])) >
|
||||||
|
relativeError) {
|
||||||
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else
|
} else {
|
||||||
static_assert(!sizeof(T), "Unsupported data type");
|
static_assert(!sizeof(T), "Unsupported data type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool equalDataImpl_fp16(const uint16_t *a, const float *b,
|
||||||
|
size_t size) const {
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
auto a_fp32 = fp16_to_float(a[i]);
|
||||||
|
auto b_fp32 = b[i];
|
||||||
|
if (fabs(a_fp32 - b_fp32) / std::max(fabs(a_fp32), fabs(b_fp32)) >
|
||||||
|
1e-6) {
|
||||||
|
printf("Error on %lu: %f %f\n", i, a_fp32, b_fp32);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -172,8 +248,8 @@ class TensorObj : public TensorBaseObj {
|
||||||
// // std::cerr << "Init beginned " << std::endl;
|
// // std::cerr << "Init beginned " << std::endl;
|
||||||
// #pragma omp parallel for
|
// #pragma omp parallel for
|
||||||
// for (size_t i = 0; i < iEnd; ++i)
|
// for (size_t i = 0; i < iEnd; ++i)
|
||||||
// data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
|
// data[i] = fastrand(random_seed[omp_get_thread_num() *
|
||||||
// 10000;
|
// 16]) % 10000;
|
||||||
// // std::cerr << "Init finished" << std::endl;
|
// // std::cerr << "Init finished" << std::endl;
|
||||||
// computed = ComputedFull;
|
// computed = ComputedFull;
|
||||||
// return true;
|
// return true;
|
||||||
|
@ -218,8 +294,8 @@ class TensorObj : public TensorBaseObj {
|
||||||
// auto nDim = dims.size();
|
// auto nDim = dims.size();
|
||||||
// auto nBroadcastDim = ds.size() - nDim;
|
// auto nBroadcastDim = ds.size() - nDim;
|
||||||
// for (size_t i = 0; i < nDim; ++i)
|
// for (size_t i = 0; i < nDim; ++i)
|
||||||
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
|
// if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
|
||||||
// dims[i])
|
// i] >= dims[i])
|
||||||
// return (size_t)-1;
|
// return (size_t)-1;
|
||||||
// size_t idx = 0;
|
// size_t idx = 0;
|
||||||
// for (size_t i = 0; i < nDim; ++i)
|
// for (size_t i = 0; i < nDim; ++i)
|
||||||
|
@ -278,12 +354,14 @@ class TensorObj : public TensorBaseObj {
|
||||||
// return (g_seed >> 16) & 0x7FFF;
|
// return (g_seed >> 16) & 0x7FFF;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// std::vector<std::vector<int>> const *getSplittingPoints() const {
|
// std::vector<std::vector<int>> const *getSplittingPoints()
|
||||||
|
// const {
|
||||||
// assert(!splittingPoints.empty());
|
// assert(!splittingPoints.empty());
|
||||||
// return &splittingPoints;
|
// return &splittingPoints;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// bool setSplittingPoints(std::vector<std::vector<int>> value) {
|
// bool setSplittingPoints(std::vector<std::vector<int>> value)
|
||||||
|
// {
|
||||||
// assert(!value.empty());
|
// assert(!value.empty());
|
||||||
// splittingPoints = value;
|
// splittingPoints = value;
|
||||||
// return true;
|
// return true;
|
||||||
|
@ -305,7 +383,7 @@ class TensorObj : public TensorBaseObj {
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// void initSplittingPoints() {
|
// void initSplittingPoints() {
|
||||||
// splittingPoints.resize(getDims().size()); }
|
// splittingPoints.resize(getRank()); }
|
||||||
|
|
||||||
// void printShape();
|
// void printShape();
|
||||||
};
|
};
|
||||||
|
|
|
@ -44,6 +44,7 @@ class TensorBaseObj : public Object {
|
||||||
}
|
}
|
||||||
|
|
||||||
DataType getDType() const { return dtype; }
|
DataType getDType() const { return dtype; }
|
||||||
|
int getDTypeIndex() const { return dtype.getIndex(); }
|
||||||
Runtime getRuntime() const { return runtime; }
|
Runtime getRuntime() const { return runtime; }
|
||||||
|
|
||||||
// std::pair<Operator *, int> getOutputOfWithIndex();
|
// std::pair<Operator *, int> getOutputOfWithIndex();
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
enum class TensorType { weight, input, output, others };
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,42 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <class T> class WorkspaceObj {
|
||||||
|
private:
|
||||||
|
T workspace; // workspace pointer
|
||||||
|
size_t workspaceSize; // Size of workspace
|
||||||
|
size_t workspaceAlloc; // currently use workspace size
|
||||||
|
|
||||||
|
public:
|
||||||
|
WorkspaceObj(T workspace_, size_t workspaceSize_)
|
||||||
|
: workspace(workspace_), workspaceSize(workspaceSize_) {
|
||||||
|
workspaceAlloc = 0;
|
||||||
|
}
|
||||||
|
virtual ~WorkspaceObj() {
|
||||||
|
// Dealloc workspace in RuntimeObj
|
||||||
|
// Set workspace = nullptr here
|
||||||
|
workspace = nullptr;
|
||||||
|
}
|
||||||
|
size_t getWorkspaceSize() const { return workspaceSize; }
|
||||||
|
|
||||||
|
T getWorkspace(size_t size) {
|
||||||
|
// Get unused workspace
|
||||||
|
IT_ASSERT(size + workspaceAlloc <= workspaceSize);
|
||||||
|
auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
|
||||||
|
workspaceAlloc += size;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
T getWorkspace() {
|
||||||
|
// Override getWorkspace in order to dealloc in runtime
|
||||||
|
return workspace;
|
||||||
|
}
|
||||||
|
void resetWorkspace() {
|
||||||
|
// Reset workspaceAlloc every time end kernel
|
||||||
|
workspaceAlloc = 0;
|
||||||
|
}
|
||||||
|
size_t getWorkspaceAlloc() const { return workspaceAlloc; }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,17 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/common.h"
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
struct AttentionKVCacheMetadata {
|
||||||
|
int dimSize[4];
|
||||||
|
int stride[4];
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
|
||||||
|
float *input_q, float *input_k, float *input_v,
|
||||||
|
int *position_id, float *output_matmul,
|
||||||
|
const AttentionKVCacheMetadata &compMeta,
|
||||||
|
float *output_O_temp, float *output_sum_temp);
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -5,17 +5,13 @@
|
||||||
#include <cuda_profiler_api.h>
|
#include <cuda_profiler_api.h>
|
||||||
#include <cudnn.h>
|
#include <cudnn.h>
|
||||||
#include <curand.h>
|
#include <curand.h>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
// TODO: replace with Exception (IT_ASSERT)
|
|
||||||
#define checkCudaError(call) \
|
#define checkCudaError(call) \
|
||||||
{ \
|
if (auto err = call; err != cudaSuccess) \
|
||||||
auto err = call; \
|
throw ::infini::Exception(std::string("[") + __FILE__ + ":" + \
|
||||||
if (cudaSuccess != err) { \
|
std::to_string(__LINE__) + "] CUDA error (" + \
|
||||||
fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
|
#call + "): " + cudaGetErrorString(err))
|
||||||
cudaGetErrorString(err)); \
|
|
||||||
exit(EXIT_FAILURE); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define checkCUresult(call) \
|
#define checkCUresult(call) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -39,14 +35,10 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
#define checkCudnnError(call) \
|
#define checkCudnnError(call) \
|
||||||
{ \
|
if (auto err = call; err != CUDNN_STATUS_SUCCESS) \
|
||||||
auto err = call; \
|
throw ::infini::Exception(std::string("[") + __FILE__ + ":" + \
|
||||||
if (CUDNN_STATUS_SUCCESS != err) { \
|
std::to_string(__LINE__) + "] cuDNN error (" + \
|
||||||
fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__, \
|
#call + "): " + cudnnGetErrorString(err))
|
||||||
__LINE__, cudnnGetErrorString(err)); \
|
|
||||||
exit(EXIT_FAILURE); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define checkCurandError(call) \
|
#define checkCurandError(call) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -120,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {
|
||||||
|
|
||||||
using CudaPtr = void *;
|
using CudaPtr = void *;
|
||||||
|
|
||||||
|
class CUDAStream {
|
||||||
|
public:
|
||||||
|
CUDAStream(const CUDAStream &) = delete;
|
||||||
|
CUDAStream(CUDAStream &&) = delete;
|
||||||
|
void operator=(const CUDAStream &) = delete;
|
||||||
|
void operator=(CUDAStream &&) = delete;
|
||||||
|
static cudaStream_t getCurrentStream() { return _stream; }
|
||||||
|
static void Init() { CUDAStream::_stream = 0; };
|
||||||
|
static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
|
||||||
|
static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
CUDAStream(){};
|
||||||
|
static cudaStream_t _stream;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -1,8 +1,20 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
|
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||||
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
int c2, int c3);
|
||||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
|
void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||||
|
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||||
|
int c2, int c3);
|
||||||
|
void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||||
|
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||||
|
int c2, int c3);
|
||||||
|
void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
|
||||||
|
int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
|
||||||
|
int c2, int c3);
|
||||||
|
|
||||||
|
void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||||
|
|
||||||
|
void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
|
||||||
}; // namespace infini
|
}; // namespace infini
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "operators/unary.h"
|
||||||
|
#include "utils/small_array.h"
|
||||||
|
namespace infini {
|
||||||
|
void expandKernel(int dType, void *input, void *output, int nDims,
|
||||||
|
int outputsize, SmallArray inputShape,
|
||||||
|
SmallArray outputShape);
|
||||||
|
|
||||||
|
void expandRowKernel(int dType, void *input, void *output, int n_rows,
|
||||||
|
int row_len);
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,17 @@
|
||||||
|
#pragma once
|
||||||
|
#include "operators/unary.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
void LaynormKernel(const float *input, const float *scale, const float eps,
|
||||||
|
int size, int scaleSize, const int dimsize, const int stride,
|
||||||
|
float *output, const float *bias, int biasSize);
|
||||||
|
void LaynormKernel(const float *input, const float *scale, const float eps,
|
||||||
|
int size, int scaleSize, const int dimsize, const int stride,
|
||||||
|
float *output);
|
||||||
|
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||||
|
int size, int scaleSize, const int dimsize, const int stride,
|
||||||
|
half *output, const half *bias, int biasSize);
|
||||||
|
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||||
|
int size, int scaleSize, const int dimsize, const int stride,
|
||||||
|
half *output);
|
||||||
|
}; // namespace infini
|
|
@ -10,10 +10,11 @@ typedef struct {
|
||||||
int wholeNDim[MAX_DIM]; // dim size after padding or before slicing
|
int wholeNDim[MAX_DIM]; // dim size after padding or before slicing
|
||||||
int partNDim[MAX_DIM]; // dim size before padding or after slicing
|
int partNDim[MAX_DIM]; // dim size before padding or after slicing
|
||||||
int partStride[MAX_DIM]; // stride before padding or after slicing
|
int partStride[MAX_DIM]; // stride before padding or after slicing
|
||||||
|
int DType;
|
||||||
} TransMetaData;
|
} TransMetaData;
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void pad_slice_kernel(float *partData, float *wholeData,
|
void pad_slice_kernel(void *partData, void *wholeData,
|
||||||
const TransMetaData &metadata, int nDims, int num,
|
const TransMetaData &metadata, int nDims, int num,
|
||||||
bool isPad);
|
bool isPad);
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "operators/rms_norm.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
|
||||||
|
int num_tokens, int hidden_size);
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,12 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "operators/rope.h"
|
||||||
|
#include "utils/small_array.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void rope_kernel(int dType, int *pos, void *input, void *output, int size,
|
||||||
|
int dim_model, int dim_head, int hidden_stride,
|
||||||
|
int pos_stride);
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -1,6 +1,9 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "core/runtime.h"
|
#include "core/runtime.h"
|
||||||
#include "cuda/cuda_common.h"
|
#include "cuda/cuda_common.h"
|
||||||
|
#ifdef INFINI_USE_NCCL
|
||||||
|
#include "cuda/nccl_communicator.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
@ -8,21 +11,34 @@ class CudaRuntimeObj : public RuntimeObj {
|
||||||
private:
|
private:
|
||||||
cudnnHandle_t cudnn;
|
cudnnHandle_t cudnn;
|
||||||
cublasHandle_t cublas;
|
cublasHandle_t cublas;
|
||||||
|
std::unique_ptr<CommunicatorObj> comm;
|
||||||
CudaPtr workspace;
|
CudaPtr workspace;
|
||||||
size_t workspaceSize;
|
size_t workspaceSize;
|
||||||
|
bool isCudaGraphCreated;
|
||||||
|
cudaGraph_t cudaGraph;
|
||||||
|
cudaGraphExec_t cudaGraphInstance;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
|
explicit CudaRuntimeObj(int deviceId = 0)
|
||||||
|
: RuntimeObj(Device::CUDA, deviceId) {
|
||||||
|
|
||||||
|
checkCudaError(cudaSetDevice(deviceId));
|
||||||
checkCudnnError(cudnnCreate(&cudnn));
|
checkCudnnError(cudnnCreate(&cudnn));
|
||||||
checkCublasError(cublasCreate(&cublas));
|
checkCublasError(cublasCreate(&cublas));
|
||||||
// 10GB for Longformer
|
// 10GB for Longformer
|
||||||
// size_t longformerNum = 3lu * (1 << 30);
|
// size_t longformerNum = 3lu * (1 << 30);
|
||||||
workspaceSize = 7ll << 30; // 7 GB
|
workspaceSize = 7ll << 30; // 7 GB
|
||||||
workspace = alloc(workspaceSize);
|
workspace = alloc(workspaceSize);
|
||||||
|
isCudaGraphCreated = false;
|
||||||
|
CUDAStream::Init();
|
||||||
}
|
}
|
||||||
virtual ~CudaRuntimeObj() {
|
virtual ~CudaRuntimeObj() {
|
||||||
try {
|
try {
|
||||||
|
if (isCudaGraphCreated) {
|
||||||
|
checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
|
||||||
|
checkCudaError(cudaGraphDestroy(cudaGraph));
|
||||||
|
CUDAStream::destroyStream();
|
||||||
|
}
|
||||||
dealloc(workspace);
|
dealloc(workspace);
|
||||||
checkCudnnError(cudnnDestroy(cudnn));
|
checkCudnnError(cudnnDestroy(cudnn));
|
||||||
checkCublasError(cublasDestroy(cublas));
|
checkCublasError(cublasDestroy(cublas));
|
||||||
|
@ -69,6 +85,13 @@ class CudaRuntimeObj : public RuntimeObj {
|
||||||
|
|
||||||
void runWithoutSync(const Graph &graph) const;
|
void runWithoutSync(const Graph &graph) const;
|
||||||
|
|
||||||
|
void runWithCudaGraph(const Graph &graph);
|
||||||
|
|
||||||
|
// init communicator
|
||||||
|
void initComm(const string &name, int worldSize, int rank) final;
|
||||||
|
|
||||||
|
CommunicatorObj &getCommunicator() const final { return *comm; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void tune(const Graph &graph, bool profiling) const;
|
void tune(const Graph &graph, bool profiling) const;
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
#pragma once
|
||||||
|
#include "utils/small_array.h"
|
||||||
|
namespace infini {
|
||||||
|
void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
||||||
|
int dimsize, int stride);
|
||||||
|
void softmax_kernel(int num_blocks, half *input, half *output, int size,
|
||||||
|
int dimsize, int stride);
|
||||||
|
} // namespace infini
|
|
@ -3,13 +3,13 @@
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
|
||||||
const int BATCH_SIZE = 32; // parallel tensor number.
|
const int BATCH_SIZE = 32; // parallel tensor number.
|
||||||
const int DIM_MAX_SIZE = 4;
|
const int DIM_MAX_SIZE = 8;
|
||||||
|
|
||||||
// Concat operator acts like element tensors composing to one big tensor,and
|
// Concat operator acts like element tensors composing to one big tensor,and
|
||||||
// split operator acts like one big tensor being composed by element
|
// split operator acts like one big tensor being composed by element
|
||||||
// tensors.
|
// tensors.
|
||||||
struct ElementTensorMetadata {
|
template <typename T> struct ElementTensorMetadata {
|
||||||
float *data[BATCH_SIZE];
|
T *data[BATCH_SIZE];
|
||||||
int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
|
int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
|
||||||
// the composed tensor.
|
// the composed tensor.
|
||||||
int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
|
int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
|
||||||
|
@ -20,16 +20,17 @@ struct ElementTensorMetadata {
|
||||||
data[i], dimBgNo[i], dimSize[i], nElements[i]);
|
data[i], dimBgNo[i], dimSize[i], nElements[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
template <typename T> struct ComposedTensorMetadata {
|
||||||
struct ComposedTensorMetadata {
|
|
||||||
int dimSize[DIM_MAX_SIZE];
|
int dimSize[DIM_MAX_SIZE];
|
||||||
int stride[DIM_MAX_SIZE];
|
int stride[DIM_MAX_SIZE];
|
||||||
float *data;
|
T *data;
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void split_concat_kernel(const ElementTensorMetadata &eleMeta,
|
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
|
||||||
const ComposedTensorMetadata &compMeta, int dim,
|
const ComposedTensorMetadata<float> &compMeta, int dim,
|
||||||
|
int batchSize, int nDims, bool isSplit);
|
||||||
|
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
|
||||||
|
const ComposedTensorMetadata<half> &compMeta, int dim,
|
||||||
int batchSize, int nDims, bool isSplit);
|
int batchSize, int nDims, bool isSplit);
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "operators/transpose.h"
|
||||||
|
#include "utils/small_array.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
|
||||||
|
SmallArray strides, SmallArray outputShape);
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -3,31 +3,22 @@
|
||||||
#include "operators/unary.h"
|
#include "operators/unary.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void softmax_kernel(float *input, float *output, int num);
|
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
|
||||||
void relu_kernel(float *input, float *output, int num);
|
template <typename T> void relu_kernel(T *input, T *output, size_t num);
|
||||||
void sigmoid_kernel(float *input, float *output, int num);
|
template <typename T> void silu_kernel(T *input, T *output, size_t num);
|
||||||
void tanh_kernel(float *input, float *output, int num);
|
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
|
||||||
void abs_kernel(float *input, float *output, int num);
|
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void abs_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void neg_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void gelu_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void erf_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
|
||||||
|
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
|
||||||
|
|
||||||
void unary_kernel(const Operator &_op) {
|
template <typename INPUT, typename OUTPUT>
|
||||||
auto op = as<UnaryObj>(_op);
|
void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
|
||||||
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
|
|
||||||
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
|
|
||||||
|
|
||||||
auto dim = op->getInputs(0)->getDims();
|
void unary_kernel(const Operator &_op);
|
||||||
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
|
||||||
if (op->getOpType() == OpType::Softmax)
|
|
||||||
softmax_kernel(inputData, outputData, n * c * h * w);
|
|
||||||
else if (op->getOpType() == OpType::Relu)
|
|
||||||
relu_kernel(inputData, outputData, n * c * h * w);
|
|
||||||
else if (op->getOpType() == OpType::Sigmoid)
|
|
||||||
sigmoid_kernel(inputData, outputData, n * c * h * w);
|
|
||||||
else if (op->getOpType() == OpType::Tanh)
|
|
||||||
tanh_kernel(inputData, outputData, n * c * h * w);
|
|
||||||
else if (op->getOpType() == OpType::Abs)
|
|
||||||
abs_kernel(inputData, outputData, n * c * h * w);
|
|
||||||
else
|
|
||||||
IT_TODO_HALT();
|
|
||||||
}
|
|
||||||
|
|
||||||
}; // namespace infini
|
}; // namespace infini
|
||||||
|
|
|
@ -1,11 +1,29 @@
|
||||||
|
#pragma once
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
#include "cuda/cuda_common.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
void cudaPrintFloat(float *x, int len);
|
void cudaPrintFloat(float *x, int len);
|
||||||
|
|
||||||
void cudaPrintTensor(const Tensor &tensor) {
|
void cudaPrintTensor(const Tensor &tensor);
|
||||||
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
|
||||||
|
cudaDataType cublasDataTypeConvert(DataType);
|
||||||
|
|
||||||
|
template <int index> struct DT_CUDA {};
|
||||||
|
template <> struct DT_CUDA<0> { using t = bool; };
|
||||||
|
template <> struct DT_CUDA<1> { using t = float; };
|
||||||
|
template <> struct DT_CUDA<2> { using t = unsigned char; };
|
||||||
|
template <> struct DT_CUDA<3> { using t = char; };
|
||||||
|
template <> struct DT_CUDA<4> { using t = unsigned short; };
|
||||||
|
template <> struct DT_CUDA<5> { using t = short; };
|
||||||
|
template <> struct DT_CUDA<6> { using t = int; };
|
||||||
|
template <> struct DT_CUDA<7> { using t = long long; };
|
||||||
|
template <> struct DT_CUDA<9> { using t = bool; };
|
||||||
|
template <> struct DT_CUDA<10> { using t = half; };
|
||||||
|
template <> struct DT_CUDA<11> { using t = double; };
|
||||||
|
template <> struct DT_CUDA<12> { using t = unsigned int; };
|
||||||
|
template <> struct DT_CUDA<13> { using t = unsigned long long; };
|
||||||
|
template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
|
||||||
} // namespace infini
|
} // namespace infini
|
|
@ -0,0 +1,17 @@
|
||||||
|
#pragma once
|
||||||
|
#include "operators/unary.h"
|
||||||
|
#include "utils/small_array.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void whereKernel(const float *inputX, const float *inputY,
|
||||||
|
const uint8_t *condition, float *output, int nDims,
|
||||||
|
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||||
|
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||||
|
int ySize, int cSize);
|
||||||
|
void whereKernel(const half *inputX, const half *inputY,
|
||||||
|
const uint8_t *condition, half *output, int nDims,
|
||||||
|
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||||
|
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||||
|
int ySize, int cSize);
|
||||||
|
}; // namespace infini
|
|
@ -1,17 +1,61 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "core/data_type.h"
|
||||||
typedef struct {
|
#include "core/operator.h"
|
||||||
int *indexValue;
|
#include "operators/gather.h"
|
||||||
int axis;
|
|
||||||
int inNDim;
|
|
||||||
int outNDim;
|
|
||||||
int idxNDim;
|
|
||||||
int outDim[4];
|
|
||||||
int idxDim[4];
|
|
||||||
int idxStride[4];
|
|
||||||
int inStride[4];
|
|
||||||
} GatherMetaData;
|
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
|
struct GatherMetaData {
|
||||||
|
// Pointer to indices
|
||||||
|
void *indexValue;
|
||||||
|
// Type of index values
|
||||||
|
DataType indexType;
|
||||||
|
// Type of input and output data
|
||||||
|
DataType dataType;
|
||||||
|
// Axis of the gather operation
|
||||||
|
int axis;
|
||||||
|
// Rank of input
|
||||||
|
int inNDim;
|
||||||
|
// Rank of output
|
||||||
|
int outNDim;
|
||||||
|
// Rank of indices
|
||||||
|
int idxNDim;
|
||||||
|
// Shape of output
|
||||||
|
int outDim[4];
|
||||||
|
// Shape of indices
|
||||||
|
int idxDim[4];
|
||||||
|
// Strides of indices
|
||||||
|
int idxStride[4];
|
||||||
|
// Strides of input
|
||||||
|
int inStride[4];
|
||||||
|
};
|
||||||
|
|
||||||
|
inline void initGatherMetaData(GatherMetaData &metaData,
|
||||||
|
const Ref<OperatorObj> &_op) {
|
||||||
|
memset(&metaData, 0, sizeof(metaData));
|
||||||
|
auto op = as<GatherBaseObj>(_op);
|
||||||
|
Ref<TensorObj> in = op->getInputs(0);
|
||||||
|
Ref<TensorObj> index = op->getInputs(1);
|
||||||
|
Ref<TensorObj> out = op->getOutput();
|
||||||
|
metaData.indexValue = index->getRawDataPtr<void *>();
|
||||||
|
metaData.indexType = index->getDType();
|
||||||
|
metaData.dataType = in->getDType();
|
||||||
|
metaData.axis = op->getAxis();
|
||||||
|
metaData.inNDim = in->getRank();
|
||||||
|
metaData.outNDim = out->getRank();
|
||||||
|
metaData.idxNDim = index->getRank();
|
||||||
|
for (int i = 0; i < metaData.outNDim; ++i)
|
||||||
|
metaData.outDim[i] = out->getDims()[i];
|
||||||
|
for (int i = 0; i < metaData.idxNDim; ++i) {
|
||||||
|
metaData.idxDim[i] = index->getDims()[i];
|
||||||
|
metaData.idxStride[i] = index->getStride()[i];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < metaData.inNDim; ++i) {
|
||||||
|
metaData.inStride[i] = in->getStride()[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
template <typename T>
|
||||||
|
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
|
||||||
|
|
||||||
|
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
|
||||||
|
size_t num);
|
||||||
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/communicator.h"
|
||||||
|
#include <chrono>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <nccl.h>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
#define checkNcclError(call) \
|
||||||
|
{ \
|
||||||
|
auto err = call; \
|
||||||
|
if (ncclSuccess != err) { \
|
||||||
|
fprintf(stderr, "NCCL error in %s:%i : %s.\n", __FILE__, __LINE__, \
|
||||||
|
ncclGetErrorString(err)); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class NcclCommunicatorObj final : public CommunicatorObj {
|
||||||
|
private:
|
||||||
|
ncclComm_t comm;
|
||||||
|
|
||||||
|
public:
|
||||||
|
NcclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||||
|
: CommunicatorObj(worldSize, rank) {
|
||||||
|
const std::string filePath("./" + name + "_nccl_id.bin");
|
||||||
|
ncclUniqueId commId;
|
||||||
|
if (rank == 0) {
|
||||||
|
checkNcclError(ncclGetUniqueId(&commId));
|
||||||
|
std::ofstream ofs(filePath, std::ios::binary);
|
||||||
|
ofs.write((char *)&commId, sizeof(ncclUniqueId));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
auto begin = std::chrono::steady_clock::now();
|
||||||
|
while (!std::filesystem::exists(filePath)) {
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
|
||||||
|
"time limit (10s) exceeded.");
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
}
|
||||||
|
std::ifstream ifs(filePath, std::ios::binary);
|
||||||
|
ifs.read((char *)&commId, sizeof(ncclUniqueId));
|
||||||
|
}
|
||||||
|
checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));
|
||||||
|
if (rank == 0) {
|
||||||
|
std::filesystem::remove(filePath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the actual ncclComm_t
|
||||||
|
ncclComm_t getNcclComm() { return comm; }
|
||||||
|
|
||||||
|
void finalize() { checkNcclError(ncclCommFinalize(comm)); }
|
||||||
|
|
||||||
|
~NcclCommunicatorObj() final {
|
||||||
|
finalize();
|
||||||
|
checkNcclError(ncclCommDestroy(comm));
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual string toString() const final {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "NCCL communicator";
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -1,6 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
namespace infini {
|
|
||||||
void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
|
|
||||||
float *y, int dim, int stride);
|
|
||||||
}
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
#include "core/op_type.h"
|
||||||
|
#include "kunlun/kunlun_common.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
using KunlunActType = xdnn::Activation_t;
|
||||||
|
KunlunActType parseActType(ActType act) {
|
||||||
|
switch (act) {
|
||||||
|
case ActType::None:
|
||||||
|
return KunlunActType::LINEAR;
|
||||||
|
case ActType::Tanh:
|
||||||
|
return KunlunActType::TANH;
|
||||||
|
case ActType::Sigmoid:
|
||||||
|
return KunlunActType::SIGMOID;
|
||||||
|
case ActType::Relu:
|
||||||
|
return KunlunActType::RELU6;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "Activation Type not support yet!\n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return KunlunActType::LINEAR;
|
||||||
|
}
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,22 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/common.h"
|
||||||
|
#include "xpu/runtime_ex.h"
|
||||||
|
#include "xpu/xdnn.h"
|
||||||
|
|
||||||
|
namespace xdnn = baidu::xpu::api;
|
||||||
|
|
||||||
|
#define checkKUNLUNError(call) \
|
||||||
|
{ \
|
||||||
|
auto err = call; \
|
||||||
|
if (XPU_SUCCESS != err) { \
|
||||||
|
fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__, \
|
||||||
|
__LINE__, xpu_strerror(err)); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
using KUNLUNPtr = void *;
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,24 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "kunlun/kunlun_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class KUNLUNKernelWithoutConfig : public Kernel {
|
||||||
|
public:
|
||||||
|
virtual void compute(const Operator &op, const PerfRecord &record,
|
||||||
|
const RuntimeObj *context) const {
|
||||||
|
compute(op, context);
|
||||||
|
}
|
||||||
|
virtual void compute(const Operator &op,
|
||||||
|
const RuntimeObj *context) const = 0;
|
||||||
|
// Premise: op is idempotent since it is called multiple times.
|
||||||
|
virtual PerfRecord tune(const Operator &op,
|
||||||
|
const RuntimeObj *_context) const {
|
||||||
|
auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
|
||||||
|
return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
|
||||||
|
[&]() { context->sync(); }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,81 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "core/workspace.h"
|
||||||
|
#include "kunlun/kunlun_common.h"
|
||||||
|
#ifdef INFINI_USE_XCCL
|
||||||
|
#include "kunlun/xccl_communicator.h"
|
||||||
|
#endif
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class KUNLUNRuntimeObj : public RuntimeObj {
|
||||||
|
private:
|
||||||
|
xdnn::Context *ctx;
|
||||||
|
std::unique_ptr<CommunicatorObj> comm;
|
||||||
|
// KUNLUNPtr workspace;
|
||||||
|
// size_t workspaceSize;
|
||||||
|
Workspace<KUNLUNPtr> workspace;
|
||||||
|
|
||||||
|
public:
|
||||||
|
KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
|
||||||
|
xpu_set_device(deviceId);
|
||||||
|
ctx = xdnn::create_context();
|
||||||
|
// 10GB for Longformer
|
||||||
|
// size_t longformerNum = 3lu * (1 << 30);
|
||||||
|
size_t workspaceSize = 2llu << 30; // 2 GB
|
||||||
|
KUNLUNPtr wkspacePtr = alloc(workspaceSize);
|
||||||
|
workspace =
|
||||||
|
make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
|
||||||
|
}
|
||||||
|
virtual ~KUNLUNRuntimeObj() {
|
||||||
|
KUNLUNPtr wkspacePtr = workspace->getWorkspace();
|
||||||
|
dealloc(wkspacePtr);
|
||||||
|
xdnn::destroy_context(ctx);
|
||||||
|
}
|
||||||
|
string toString() const override;
|
||||||
|
|
||||||
|
void run(const Graph &graph, bool tune = false,
|
||||||
|
bool profiling = false) const;
|
||||||
|
// double runEvaluation(const Graph &graph, int nWarmups,
|
||||||
|
// int nEvaluations) const;
|
||||||
|
void sync() const;
|
||||||
|
|
||||||
|
KUNLUNPtr alloc(size_t size) override {
|
||||||
|
void *ptr;
|
||||||
|
checkKUNLUNError(
|
||||||
|
xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void dealloc(void *ptr) override { xpu_free(ptr); }
|
||||||
|
|
||||||
|
xdnn::Context *KUNLUNHandle() const { return ctx; }
|
||||||
|
// Get $size workspace by bytes
|
||||||
|
KUNLUNPtr getWorkspace(size_t size) const {
|
||||||
|
auto ret = workspace->getWorkspace(size);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
|
||||||
|
|
||||||
|
void copyBlobFromCPU(void *dst, const void *src,
|
||||||
|
size_t bytes) const override {
|
||||||
|
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
|
||||||
|
}
|
||||||
|
void copyBlobToCPU(void *dst, const void *src,
|
||||||
|
size_t bytes) const override {
|
||||||
|
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
|
||||||
|
}
|
||||||
|
void copyBlobInsideRuntime(void *dst, const void *src,
|
||||||
|
size_t bytes) const override {
|
||||||
|
xpu_memcpy(dst, const_cast<void *>(src), bytes,
|
||||||
|
XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
|
||||||
|
}
|
||||||
|
void initComm(const string &name, int worldSize, int rank) final;
|
||||||
|
|
||||||
|
CommunicatorObj &getCommunicator() const final { return *comm; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,10 @@
|
||||||
|
#pragma once
|
||||||
|
namespace infini {
|
||||||
|
namespace opTimer {
|
||||||
|
double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
|
||||||
|
int padh, int padw, int strideh, int stridew,
|
||||||
|
int dilationh, int dilationw, int group,
|
||||||
|
const char *name);
|
||||||
|
double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
|
||||||
|
} // namespace opTimer
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,60 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/communicator.h"
|
||||||
|
#include "xpu/bkcl.h"
|
||||||
|
#include <chrono>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
#define checkXcclError(call) \
|
||||||
|
{ \
|
||||||
|
auto err = call; \
|
||||||
|
if (BKCL_SUCCESS != err) { \
|
||||||
|
fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class XcclCommunicatorObj final : public CommunicatorObj {
|
||||||
|
private:
|
||||||
|
BKCLContext_t comm;
|
||||||
|
|
||||||
|
public:
|
||||||
|
XcclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||||
|
: CommunicatorObj(worldSize, rank) {
|
||||||
|
const std::string filePath("./" + name + "_xccl_id.bin");
|
||||||
|
BKCLUniqueId commId;
|
||||||
|
if (rank == 0) {
|
||||||
|
checkXcclError(bkcl_get_unique_id(&commId));
|
||||||
|
std::ofstream ofs(filePath, std::ios::binary);
|
||||||
|
ofs.write((char *)&commId, sizeof(BKCLUniqueId));
|
||||||
|
} else {
|
||||||
|
auto begin = std::chrono::steady_clock::now();
|
||||||
|
while (!std::filesystem::exists(filePath)) {
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
_IT_ASSERT_2(now < begin + std::chrono::seconds(100),
|
||||||
|
"time limit (100s) exceeded.");
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
}
|
||||||
|
std::ifstream ifs(filePath, std::ios::binary);
|
||||||
|
ifs.read((char *)&commId, sizeof(BKCLUniqueId));
|
||||||
|
}
|
||||||
|
checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
|
||||||
|
if (rank == 0) {
|
||||||
|
std::filesystem::remove(filePath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BKCLContext_t getXcclComm() { return comm; }
|
||||||
|
|
||||||
|
~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
|
||||||
|
virtual string toString() const final {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "XCCL communicator";
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -24,7 +24,7 @@
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
namespace nnet {
|
namespace nnet {
|
||||||
int matchExprResult(Derivator &derivator, string fn);
|
int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome);
|
||||||
bool checkExprLogSame(string fnPrefix, int start, int end);
|
bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end);
|
||||||
bool checkExprsEquvivalence(VecExpr exprs);
|
bool checkExprsEquvivalence(VecExpr exprs);
|
||||||
} // namespace nnet
|
} // namespace nnet
|
||||||
|
|
|
@ -35,7 +35,7 @@ class G2BMMObj : public OperatorObj {
|
||||||
OP_CLONE(G2BMMObj);
|
OP_CLONE(G2BMMObj);
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
int numInputs() const override { return 2; }
|
int numInputs() const override { return 2; }
|
||||||
int numOutputs() const override { return 1; }
|
int numOutputs() const override { return 1; }
|
||||||
|
|
|
@ -33,7 +33,7 @@ class GBMMObj : public OperatorObj {
|
||||||
OP_CLONE(GBMMObj);
|
OP_CLONE(GBMMObj);
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
int numInputs() const override { return 2; }
|
int numInputs() const override { return 2; }
|
||||||
int numOutputs() const override { return 1; }
|
int numOutputs() const override { return 1; }
|
||||||
|
|
|
@ -7,7 +7,7 @@ class ActivationBackwardObj : public OperatorObj {
|
||||||
ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
|
ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
|
||||||
Tensor x, Tensor diff_x);
|
Tensor x, Tensor diff_x);
|
||||||
OP_CLONE(ActivationBackwardObj);
|
OP_CLONE(ActivationBackwardObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 3; }
|
int numInputs() const override { return 3; }
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
/**
|
||||||
|
* @brief The AllGather operation gathers N values from k ranks into
|
||||||
|
* an output of size k*N, and distributes that result to all ranks.
|
||||||
|
* The output is ordered by rank index.
|
||||||
|
*
|
||||||
|
* For more details:
|
||||||
|
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allgather
|
||||||
|
*/
|
||||||
|
class AllGatherObj : public OperatorObj {
|
||||||
|
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new AllGather object
|
||||||
|
*
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param input The input tensor from this rank.
|
||||||
|
* @param outputs A list of output tensors collected from all ranks.
|
||||||
|
* @param world_size Total number of ranks.
|
||||||
|
*/
|
||||||
|
AllGatherObj(GraphObj *graph, Tensor input, std::optional<TensorVec>,
|
||||||
|
int world_size);
|
||||||
|
OP_CLONE(AllGatherObj);
|
||||||
|
|
||||||
|
int numInputs() const override { return 1; }
|
||||||
|
int numOutputs() const override { return world_size; }
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
|
||||||
|
int getWorldSize() const { return world_size; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
int world_size;
|
||||||
|
};
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,75 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
/**
|
||||||
|
* @brief The AllReduce operation is performing reductions on data (sum, min,
|
||||||
|
* max, avg, or div) across devices and writing the result in the
|
||||||
|
* receive buffers of every rank. For example, in an allreduce operation between
|
||||||
|
* k ranks and performing a sum, each rank will provide an array Vk of N values,
|
||||||
|
* and receive an identical arrays S of N values, where S[i] =
|
||||||
|
* V0[i]+V1[i]+…+Vk-1[i].
|
||||||
|
*
|
||||||
|
* For more details:
|
||||||
|
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce
|
||||||
|
*/
|
||||||
|
class AllReduceBaseObj : public OperatorObj {
|
||||||
|
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new AllReduce base object. Should be called by every
|
||||||
|
* child class constructor, but not directly.
|
||||||
|
*
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param opType The operation type. This param is taken care of by child
|
||||||
|
* classes.
|
||||||
|
* @param input The input tensor from this rank.
|
||||||
|
* @param output The output tensor, same size as input.
|
||||||
|
*/
|
||||||
|
AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
|
||||||
|
Tensor output);
|
||||||
|
OP_CLONE(AllReduceBaseObj);
|
||||||
|
|
||||||
|
int numInputs() const override { return 1; }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
|
||||||
|
return {{inputs[0]->getDims()}};
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
vector<DataType> inferDataType(const TensorVec &inputs) const override {
|
||||||
|
return {inputs[0]->getDType()};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceSumObj : public AllReduceBaseObj {
|
||||||
|
public:
|
||||||
|
AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceProdObj : public AllReduceBaseObj {
|
||||||
|
public:
|
||||||
|
AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceMinObj : public AllReduceBaseObj {
|
||||||
|
public:
|
||||||
|
AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceMaxObj : public AllReduceBaseObj {
|
||||||
|
public:
|
||||||
|
AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceAvgObj : public AllReduceBaseObj {
|
||||||
|
public:
|
||||||
|
AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,43 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
/**
|
||||||
|
* @brief Fused Attention with KVCache input operator. All the input and output
|
||||||
|
* tensors should have the same rank except for the position_id.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
class AttentionKVCacheObj : public OperatorObj {
|
||||||
|
int dim;
|
||||||
|
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new AttentionKVCache object.
|
||||||
|
*
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param input_k_cache The k_cache input tensor.
|
||||||
|
* @param input_v_cache The v_cache input tensor.
|
||||||
|
* @param input_q The query input tensor.
|
||||||
|
* @param input_k The key input tensor.
|
||||||
|
* @param input_v The value input tensor.
|
||||||
|
* @param position_id The positon id of the query,
|
||||||
|
* @param output_matmul The query output tensor.
|
||||||
|
*/
|
||||||
|
AttentionKVCacheObj(GraphObj *graph, Tensor input_k_cache,
|
||||||
|
Tensor input_v_cache, Tensor input_q, Tensor input_k,
|
||||||
|
Tensor input_v, Tensor position_id,
|
||||||
|
Tensor output_matmul);
|
||||||
|
OP_CLONE(AttentionKVCacheObj);
|
||||||
|
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
int numInputs() const override { return 6; }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
int getDim() const { return dim; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
};
|
||||||
|
} // namespace infini
|
|
@ -34,7 +34,7 @@ class BatchNormObj : public OperatorObj {
|
||||||
Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
|
Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
|
||||||
float eps = 1e-5, bool trainingMode = false);
|
float eps = 1e-5, bool trainingMode = false);
|
||||||
OP_CLONE(BatchNormObj);
|
OP_CLONE(BatchNormObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
|
|
||||||
// output size will be 3 when training
|
// output size will be 3 when training
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
/**
|
||||||
|
* @brief The Broadcast operation copies an N-element buffer on the root rank to
|
||||||
|
* all ranks.
|
||||||
|
*
|
||||||
|
* For more details:
|
||||||
|
* https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#broadcast
|
||||||
|
*/
|
||||||
|
class BroadcastObj : public OperatorObj {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new Broadcast object.
|
||||||
|
*
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param input The input tensor. Only root needs to initialize it with
|
||||||
|
* data.
|
||||||
|
* @param output The output tensor, same size as input.
|
||||||
|
* @param root The root rank who performs the broadcast.
|
||||||
|
*/
|
||||||
|
BroadcastObj(GraphObj *graph, Tensor input, Tensor output, int root);
|
||||||
|
OP_CLONE(BroadcastObj);
|
||||||
|
|
||||||
|
int numInputs() const override { return 1; }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
|
||||||
|
return {{inputs[0]->getDims()}};
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
|
||||||
|
int getRoot() const { return root; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
vector<DataType> inferDataType(const TensorVec &inputs) const override {
|
||||||
|
return {inputs[0]->getDType()};
|
||||||
|
};
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// The rank who broadcasts data among this communication group
|
||||||
|
int root;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -22,7 +22,7 @@ class ConcatObj : public OperatorObj {
|
||||||
ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
|
ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
|
||||||
OP_CLONE(ConcatObj);
|
OP_CLONE(ConcatObj);
|
||||||
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return inputs.size(); }
|
int numInputs() const override { return inputs.size(); }
|
||||||
|
|
|
@ -142,7 +142,7 @@ class ConvObj : public ConvBaseObj {
|
||||||
ActType act = ActType::None);
|
ActType act = ActType::None);
|
||||||
OP_CLONE(ConvObj);
|
OP_CLONE(ConvObj);
|
||||||
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
int getNumGroups() const override { return c / getChannelPerGroup(); }
|
int getNumGroups() const override { return c / getChannelPerGroup(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -164,7 +164,7 @@ class ConvBackwardFilterObj : public ConvBaseObj {
|
||||||
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
|
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
|
||||||
Tensor bias = nullptr, ActType act = ActType::None);
|
Tensor bias = nullptr, ActType act = ActType::None);
|
||||||
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
ActType getAct() const { return act; }
|
ActType getAct() const { return act; }
|
||||||
int getNumGroups() const override { return c / getChannelPerGroup(); }
|
int getNumGroups() const override { return c / getChannelPerGroup(); }
|
||||||
|
|
||||||
|
@ -191,7 +191,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
|
||||||
Tensor bias = nullptr, ActType act = ActType::None);
|
Tensor bias = nullptr, ActType act = ActType::None);
|
||||||
OP_CLONE(ConvTransposed2dObj);
|
OP_CLONE(ConvTransposed2dObj);
|
||||||
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
int getNumGroups() const override { return group; }
|
int getNumGroups() const override { return group; }
|
||||||
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
|
std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
|
||||||
|
|
||||||
|
@ -218,7 +218,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
|
||||||
Tensor bias = nullptr, ActType act = ActType::None);
|
Tensor bias = nullptr, ActType act = ActType::None);
|
||||||
OP_CLONE(ConvTransposed2dNHWCObj);
|
OP_CLONE(ConvTransposed2dNHWCObj);
|
||||||
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
int getNumGroups() const override { return group; }
|
int getNumGroups() const override { return group; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -7,7 +7,7 @@ class DetObj : public OperatorObj {
|
||||||
enum Mode { NormalDet = 0, LogDet };
|
enum Mode { NormalDet = 0, LogDet };
|
||||||
DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
|
DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
|
||||||
OP_CLONE(DetObj);
|
OP_CLONE(DetObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 1; }
|
int numInputs() const override { return 1; }
|
||||||
|
|
|
@ -37,7 +37,7 @@ class DropoutObj : public OperatorObj {
|
||||||
DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
|
DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
|
||||||
float ratio, bool training_mode);
|
float ratio, bool training_mode);
|
||||||
OP_CLONE(DropoutObj);
|
OP_CLONE(DropoutObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 1; }
|
int numInputs() const override { return 1; }
|
||||||
|
|
|
@ -21,7 +21,7 @@ class ElementWiseObj : public OperatorObj {
|
||||||
*/
|
*/
|
||||||
ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
|
ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
|
||||||
Tensor output);
|
Tensor output);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 2; }
|
int numInputs() const override { return 2; }
|
||||||
|
@ -38,7 +38,7 @@ class MSELossObj : public OperatorObj {
|
||||||
MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
|
MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
|
||||||
Reduction reduction, Tensor output);
|
Reduction reduction, Tensor output);
|
||||||
OP_CLONE(MSELossObj);
|
OP_CLONE(MSELossObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
Reduction getReduction() const { return reductionMode; }
|
Reduction getReduction() const { return reductionMode; }
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
|
@ -65,26 +65,24 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
|
DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
|
DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
|
DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
|
DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Max)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
|
DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Min)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
|
DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Pow)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
|
DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
|
DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
|
DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
|
DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
|
DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::Greater)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
|
DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterOrEqual)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
|
DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::Less)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
|
DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessOrEqual)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
|
|
||||||
DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
|
DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
|
DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
|
DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
|
DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
|
DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitwiseAnd)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
|
DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitwiseOr)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
|
DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitwiseXor)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
|
DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitwiseNot)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
|
DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitShift)
|
||||||
DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
|
|
||||||
}; // namespace infini
|
}; // namespace infini
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
/**
|
||||||
|
* @brief Broadcast the input tensor following the given shape and the
|
||||||
|
* broadcast rule.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
class ExpandObj : public OperatorObj {
|
||||||
|
Shape dims;
|
||||||
|
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new Expand object.
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param input The input tensor.
|
||||||
|
* @param output The output tensor.
|
||||||
|
* @param dims The shape you want to expand to, following the broadcast
|
||||||
|
* rule.
|
||||||
|
*/
|
||||||
|
ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
|
||||||
|
OP_CLONE(ExpandObj);
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
int numInputs() const override { return 1; }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
Shape getShape() const { return dims; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -23,7 +23,7 @@ class ExtendObj : public OperatorObj {
|
||||||
ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
|
ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
|
||||||
int num = 1);
|
int num = 1);
|
||||||
OP_CLONE(ExtendObj);
|
OP_CLONE(ExtendObj);
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 1; }
|
int numInputs() const override { return 1; }
|
||||||
|
|
|
@ -3,14 +3,28 @@
|
||||||
#include "core/operator.h"
|
#include "core/operator.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
class GatherBaseObj : public OperatorObj {
|
||||||
|
protected:
|
||||||
|
int axis;
|
||||||
|
|
||||||
|
public:
|
||||||
|
GatherBaseObj(OpType opType, TensorVec inputs, TensorVec outputs, int axis)
|
||||||
|
: OperatorObj(opType, inputs, outputs), axis(axis) {}
|
||||||
|
|
||||||
|
virtual ~GatherBaseObj() {}
|
||||||
|
int numInputs() const override { return 2; }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
|
||||||
|
int getAxis() const { return axis; }
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Gather and concatenate given positions on a certain dimension of the
|
* @brief Gather and concatenate given positions on a certain dimension of the
|
||||||
* input tensor using an index tensor.
|
* input tensor using an index tensor.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
class GatherObj : public OperatorObj {
|
class GatherObj : public GatherBaseObj {
|
||||||
int axis;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* @brief Construct a new Gather object.
|
* @brief Construct a new Gather object.
|
||||||
|
@ -25,10 +39,7 @@ class GatherObj : public OperatorObj {
|
||||||
int axis);
|
int axis);
|
||||||
OP_CLONE(GatherObj);
|
OP_CLONE(GatherObj);
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 2; }
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
int numOutputs() const override { return 1; }
|
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
|
||||||
int getAxis() const { return axis; }
|
|
||||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -36,4 +47,33 @@ class GatherObj : public OperatorObj {
|
||||||
vector<int> getWorkloadVector() const override;
|
vector<int> getWorkloadVector() const override;
|
||||||
vector<int> getOpAttrVector() const override;
|
vector<int> getOpAttrVector() const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief GatherElements takes two inputs data and indices of the
|
||||||
|
* same rank r >= 1 and an optional attribute axis that identifies
|
||||||
|
* an axis of data.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
class GatherElementsObj : public GatherBaseObj {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a new GatherElements object.
|
||||||
|
*
|
||||||
|
* @param graph The computation graph that this operator belongs to.
|
||||||
|
* @param input The input tensor.
|
||||||
|
* @param indices The index tensor.
|
||||||
|
* @param output The output tensor. Same shape as indices.
|
||||||
|
* @param axis The axis to gather on.
|
||||||
|
*/
|
||||||
|
GatherElementsObj(GraphObj *graph, Tensor input, Tensor indices,
|
||||||
|
Tensor output, int axis);
|
||||||
|
OP_CLONE(GatherElementsObj);
|
||||||
|
std::string toString() const override;
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
};
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class LayerNormObj : public OperatorObj {
|
||||||
|
float eps;
|
||||||
|
int axis, stash_type;
|
||||||
|
|
||||||
|
public:
|
||||||
|
LayerNormObj(GraphObj *graph, Tensor input, Tensor scale, Tensor output,
|
||||||
|
Tensor bias = nullptr, float eps = 1e-5, int axis = -1,
|
||||||
|
int stash_type = 1);
|
||||||
|
OP_CLONE(LayerNormObj);
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
std::string toString() const override;
|
||||||
|
|
||||||
|
Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
|
||||||
|
int numInputs() const override { return inputs.size(); }
|
||||||
|
int numOutputs() const override { return outputs.size(); }
|
||||||
|
float getEps() const { return eps; }
|
||||||
|
int getAxis() const { return axis; }
|
||||||
|
int getStashType() const { return stash_type; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
|
||||||
|
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||||
|
};
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,29 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class LRNObj : public OperatorObj {
|
||||||
|
|
||||||
|
public:
|
||||||
|
LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
|
||||||
|
float beta, float bias, int size);
|
||||||
|
OP_CLONE(LRNObj);
|
||||||
|
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
int numInputs() const override { return inputs.size(); }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
auto getAlphaBetaBias() const {
|
||||||
|
return tuple(alpha_value, beta_value, bias_value);
|
||||||
|
}
|
||||||
|
auto getSize() const { return size_value; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
float alpha_value, beta_value, bias_value;
|
||||||
|
int size_value;
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue