forked from jiuyuan/InfiniTensor
Merge branch 'master' into update_pybind11
This commit is contained in:
commit
3b5dd7d28c
|
@ -13,7 +13,7 @@ if(USE_CUDA)
|
||||||
message("CMake 3.18 or higher is required for setting CUDAToolkit")
|
message("CMake 3.18 or higher is required for setting CUDAToolkit")
|
||||||
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
|
cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
|
||||||
else()
|
else()
|
||||||
cmake_minimum_required(VERSION 3.12)
|
cmake_minimum_required(VERSION 3.17)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
include(CMakeDependentOption)
|
include(CMakeDependentOption)
|
||||||
|
@ -245,6 +245,7 @@ if(USE_BANG)
|
||||||
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
|
find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
|
||||||
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
|
find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
|
||||||
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
|
find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
|
||||||
|
find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
|
||||||
|
|
||||||
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
|
||||||
|
@ -261,7 +262,13 @@ if(USE_BANG)
|
||||||
# BangC Kernels
|
# BangC Kernels
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
|
||||||
|
if (BUILD_DIST)
|
||||||
|
message(STATUS "Add BUILD_DIST, use CNCL with BANG")
|
||||||
|
|
||||||
|
add_compile_definitions(INFINI_USE_CNCL=1)
|
||||||
|
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(USE_KUNLUN)
|
if(USE_KUNLUN)
|
||||||
|
@ -324,6 +331,7 @@ if(BUILD_TEST)
|
||||||
endif()
|
endif()
|
||||||
if (USE_BANG)
|
if (USE_BANG)
|
||||||
build_test(test/kernels/bang/*.cc)
|
build_test(test/kernels/bang/*.cc)
|
||||||
|
build_test(test/bang/*.cc)
|
||||||
endif()
|
endif()
|
||||||
if (USE_KUNLUN)
|
if (USE_KUNLUN)
|
||||||
build_test(test/kernels/kunlun/*.cc)
|
build_test(test/kernels/kunlun/*.cc)
|
||||||
|
|
1
Makefile
1
Makefile
|
@ -29,6 +29,7 @@ CMAKE_OPT += -DUSE_BANG=$(BANG)
|
||||||
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
|
||||||
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
|
||||||
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
CMAKE_OPT += -DBUILD_TEST=$(TEST)
|
||||||
|
CMAKE_OPT += -DBUILD_DIST=ON
|
||||||
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
CMAKE_OPT += -DBUILD_NNET=$(NNET)
|
||||||
|
|
||||||
ifeq ($(INTELCPU), ON)
|
ifeq ($(INTELCPU), ON)
|
||||||
|
|
|
@ -0,0 +1,76 @@
|
||||||
|
SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
|
||||||
|
SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
|
||||||
|
|
||||||
|
set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
|
||||||
|
set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
|
||||||
|
set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
|
||||||
|
|
||||||
|
if ($ENV{CNCL_ROOT_DIR})
|
||||||
|
message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
|
||||||
|
endif()
|
||||||
|
list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
|
||||||
|
# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
|
||||||
|
|
||||||
|
find_path(CNCL_INCLUDE_DIRS
|
||||||
|
NAMES cncl.h
|
||||||
|
HINTS ${CNCL_INCLUDE_DIR})
|
||||||
|
|
||||||
|
if (USE_STATIC_CNCL)
|
||||||
|
MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
|
||||||
|
SET(CNCL_LIBNAME "CNCL_static")
|
||||||
|
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
SET(CNCL_LIBNAME "cncl")
|
||||||
|
if (CNCL_VERSION) # Prefer the versioned library if a specific CNCL version is specified
|
||||||
|
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
find_library(CNCL_LIBRARIES
|
||||||
|
NAMES ${CNCL_LIBNAME}
|
||||||
|
HINTS ${CNCL_LIB_DIR})
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||||
|
|
||||||
|
if(CNCL_FOUND) # obtaining CNCL version and some sanity checks
|
||||||
|
set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
|
||||||
|
message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
|
||||||
|
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||||
|
list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
|
||||||
|
include(CheckCXXSymbolExists)
|
||||||
|
check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
|
||||||
|
|
||||||
|
if (CNCL_VERSION_DEFINED)
|
||||||
|
set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
|
||||||
|
file(WRITE ${file} "
|
||||||
|
#include <iostream>
|
||||||
|
#include <cncl.h>
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
|
||||||
|
int x;
|
||||||
|
CNCLGetVersion(&x);
|
||||||
|
return x == CNCL_VERSION_CODE;
|
||||||
|
}
|
||||||
|
")
|
||||||
|
try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
|
||||||
|
RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
|
||||||
|
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
|
||||||
|
LINK_LIBRARIES ${CNCL_LIBRARIES})
|
||||||
|
if (NOT CNCL_VERSION_MATCHED)
|
||||||
|
message(FATAL_ERROR "Found CNCL header version and library version do not match! \
|
||||||
|
(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
|
||||||
|
endif()
|
||||||
|
message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
|
||||||
|
else()
|
||||||
|
# message(STATUS "CNCL version < 2.3.5-5")
|
||||||
|
endif ()
|
||||||
|
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
|
||||||
|
|
||||||
|
message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
|
||||||
|
mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
|
||||||
|
endif()
|
|
@ -1 +1 @@
|
||||||
Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
|
Subproject commit b896cec2dba5b8522b141ac4f89eb43074ee1b98
|
|
@ -0,0 +1,196 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import multiprocessing as mp
|
||||||
|
from pyinfinitensor.onnx import OnnxStub, backend
|
||||||
|
import onnx
|
||||||
|
from onnx.shape_inference import infer_shapes_path
|
||||||
|
import numpy as np
|
||||||
|
from parallel_opt import parallel_model
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="launch distributed infinitensor")
|
||||||
|
parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nproc_per_node", type=int, default=2, help="number of processes per node"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--name", type=str, default="test", help="name of this instance."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx",
|
||||||
|
help="path to the ONNX model file."
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
|
||||||
|
parser.add_argument("--length", type=int, default=1, help="sequence length.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--gen_std",
|
||||||
|
default=False,
|
||||||
|
action="store_true",
|
||||||
|
help="whether to generate the standard results.",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
print("arg setting: ", args)
|
||||||
|
return (
|
||||||
|
args.num_nodes,
|
||||||
|
args.nproc_per_node,
|
||||||
|
args.name,
|
||||||
|
args.model,
|
||||||
|
args.batch_size,
|
||||||
|
args.length,
|
||||||
|
args.gen_std,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model(model, runtime, world_size=1, rank=0, n=10):
|
||||||
|
stub = OnnxStub(model, runtime)
|
||||||
|
load_inputs(stub, world_size, rank)
|
||||||
|
# stub.tune()
|
||||||
|
stub.run()
|
||||||
|
# get outputs
|
||||||
|
time.sleep(0.01)
|
||||||
|
outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
|
||||||
|
# bench
|
||||||
|
begin = time.time()
|
||||||
|
for _ in range(n):
|
||||||
|
stub.run()
|
||||||
|
end = time.time()
|
||||||
|
avg_time = (end - begin) / n
|
||||||
|
print(f"average time: {avg_time}")
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_compare(name, model, runtime, world_size=1, rank = 0):
|
||||||
|
results = np.load(f"./data/output.npy")
|
||||||
|
outputs = run_model(model, runtime, world_size, rank)
|
||||||
|
print("answer argmax:", np.argmax(results))
|
||||||
|
print("output argmax:", np.argmax(outputs))
|
||||||
|
#np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
|
||||||
|
getDiff(results, outputs)
|
||||||
|
|
||||||
|
|
||||||
|
def start_worker(
|
||||||
|
name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
|
||||||
|
):
|
||||||
|
dist_name = name + "_dist"
|
||||||
|
model = parallel_model(model, world_size, rank)
|
||||||
|
extern_path = f"./{dist_name}_rank{rank}.pb"
|
||||||
|
if os.path.exists(extern_path):
|
||||||
|
os.remove(extern_path)
|
||||||
|
onnx.save_model(
|
||||||
|
model,
|
||||||
|
f"./{dist_name}_rank{rank}.onnx",
|
||||||
|
save_as_external_data=True,
|
||||||
|
location=extern_path,
|
||||||
|
)
|
||||||
|
infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
|
||||||
|
runtime = backend.BangRuntime(local_rank)
|
||||||
|
# print("init comm")
|
||||||
|
runtime.init_comm(
|
||||||
|
dist_name,
|
||||||
|
world_size,
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
run_and_compare(name, model, runtime, world_size, rank)
|
||||||
|
|
||||||
|
|
||||||
|
def start_single(name, model):
|
||||||
|
runtime = backend.BangRuntime(0)
|
||||||
|
run_and_compare(name, model, runtime)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_input_output(model):
|
||||||
|
os.makedirs(os.path.dirname("./data/"), exist_ok=True)
|
||||||
|
runtime = backend.BangRuntime(0)
|
||||||
|
stub = OnnxStub(model, runtime)
|
||||||
|
position_id = 0
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input = tensor.copyout_numpy()
|
||||||
|
if np.issubdtype(input.dtype, np.integer):
|
||||||
|
if input.size == 1:
|
||||||
|
# input = np.array([position_id])
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
else:
|
||||||
|
input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
|
||||||
|
elif input.dtype == np.bool_:
|
||||||
|
input = np.random.randint(0,2,size=input.shape) > 0
|
||||||
|
else:
|
||||||
|
if i == 0:
|
||||||
|
input = np.ones(input.shape).astype(input.dtype)
|
||||||
|
position_id = input.shape[-1] - 1
|
||||||
|
else:
|
||||||
|
input = np.random.rand(*input.shape).astype(input.dtype)
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
np.save(f"./data/input_{i}", input)
|
||||||
|
stub.run()
|
||||||
|
time.sleep(0.01)
|
||||||
|
output = next(stub.outputs.values().__iter__()).copyout_numpy()
|
||||||
|
if np.isnan(output).any():
|
||||||
|
print("Nan in output")
|
||||||
|
np.save(f"./data/output", output)
|
||||||
|
|
||||||
|
|
||||||
|
def load_inputs(stub, world_size=1, rank=0):
|
||||||
|
for i, (name, tensor) in enumerate(stub.inputs.items()):
|
||||||
|
input = np.load(f"./data/input_{i}.npy")
|
||||||
|
if all(x == y for x,y in zip(input.shape,tensor.shape())):
|
||||||
|
tensor.copyin_numpy(input)
|
||||||
|
else:
|
||||||
|
tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
|
||||||
|
|
||||||
|
def getDiff(base, test):
|
||||||
|
absolute_diff = np.abs(np.subtract(base, test))
|
||||||
|
max_absolute_diff = np.max(absolute_diff)
|
||||||
|
|
||||||
|
baseCopy = base.astype(np.float64).ravel()
|
||||||
|
testCopy = test.astype(np.float64).ravel()
|
||||||
|
upValue = np.sum(np.abs(baseCopy - testCopy))
|
||||||
|
downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
|
||||||
|
max_relative_diff = upValue / downValue
|
||||||
|
print(f"Max absolute difference: {max_absolute_diff}\n"
|
||||||
|
f"Max relative difference: {max_relative_diff}")
|
||||||
|
return max_absolute_diff, max_relative_diff
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
|
||||||
|
|
||||||
|
model = onnx.load(model_path)
|
||||||
|
|
||||||
|
# generate standart output
|
||||||
|
if gen_std:
|
||||||
|
print("Generate inputs and outputs.")
|
||||||
|
p = mp.Process(target=generate_input_output, args=[model])
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
return
|
||||||
|
|
||||||
|
# run single process.
|
||||||
|
# use standalone process to isolate cuda.
|
||||||
|
print("run model by single MLU.")
|
||||||
|
p = mp.Process(target=start_single, args=(name, model))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
|
||||||
|
# run distributed parallel.
|
||||||
|
world_size = nnodes * nproc_per_node
|
||||||
|
print(f"run model by {world_size} MLUs in parallel.")
|
||||||
|
workers = [
|
||||||
|
mp.Process(
|
||||||
|
target=start_worker,
|
||||||
|
args=(name, world_size, rank, rank % nproc_per_node, model),
|
||||||
|
)
|
||||||
|
for rank in range(world_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.start()
|
||||||
|
|
||||||
|
for w in workers:
|
||||||
|
w.join()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -115,7 +115,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
|
||||||
assert out_dims[s_dim] % tp_world_size == 0, out_dims
|
assert out_dims[s_dim] % tp_world_size == 0, out_dims
|
||||||
out_dims[s_dim] //= tp_world_size
|
out_dims[s_dim] //= tp_world_size
|
||||||
# if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
|
# if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
|
||||||
# node.input[1] = node.output[0] + "_shape"
|
node.input[1] = node.output[0] + "_shape"
|
||||||
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
|
data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
|
||||||
place[node.output[0]] = Shard(s_dim)
|
place[node.output[0]] = Shard(s_dim)
|
||||||
|
|
||||||
|
|
|
@ -7,17 +7,19 @@ namespace infini {
|
||||||
class BangRuntimeObj : public RuntimeObj {
|
class BangRuntimeObj : public RuntimeObj {
|
||||||
private:
|
private:
|
||||||
cnnlHandle_t cnnl;
|
cnnlHandle_t cnnl;
|
||||||
|
cnrtQueue_t queue;
|
||||||
|
std::unique_ptr<CommunicatorObj> comm;
|
||||||
BangPtr workspace;
|
BangPtr workspace;
|
||||||
size_t workspaceSize;
|
size_t workspaceSize;
|
||||||
mutable size_t cursor;
|
mutable size_t cursor;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
BangRuntimeObj() : RuntimeObj(Device::BANG) {
|
explicit BangRuntimeObj(int deviceId = 0)
|
||||||
|
: RuntimeObj(Device::BANG, deviceId) {
|
||||||
cnInit(0);
|
cnInit(0);
|
||||||
CNdev dev;
|
CNdev dev;
|
||||||
cnDeviceGet(&dev, 0);
|
cnDeviceGet(&dev, deviceId);
|
||||||
checkBangError(cnrtSetDevice(dev));
|
checkBangError(cnrtSetDevice(dev));
|
||||||
cnrtQueue_t queue;
|
|
||||||
checkBangError(cnrtQueueCreate(&queue));
|
checkBangError(cnrtQueueCreate(&queue));
|
||||||
|
|
||||||
checkCnnlError(cnnlCreate(&cnnl));
|
checkCnnlError(cnnlCreate(&cnnl));
|
||||||
|
@ -30,6 +32,7 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
}
|
}
|
||||||
virtual ~BangRuntimeObj() {
|
virtual ~BangRuntimeObj() {
|
||||||
dealloc(workspace);
|
dealloc(workspace);
|
||||||
|
checkBangError(cnrtQueueDestroy(queue));
|
||||||
checkCnnlError(cnnlDestroy(cnnl));
|
checkCnnlError(cnnlDestroy(cnnl));
|
||||||
}
|
}
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
@ -73,10 +76,9 @@ class BangRuntimeObj : public RuntimeObj {
|
||||||
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
|
||||||
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
CNRT_MEM_TRANS_DIR_PEER2PEER));
|
||||||
}
|
}
|
||||||
|
void initComm(const string &name, int worldSize, int rank) final;
|
||||||
void initComm(const string &, int, int) override { IT_TODO_HALT(); }
|
CommunicatorObj &getCommunicator() const override { return *comm; }
|
||||||
|
cnrtQueue_t getBangQueue() const { return queue; }
|
||||||
CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
#pragma once
|
||||||
|
#include "bang_common.h"
|
||||||
|
#include "core/communicator.h"
|
||||||
|
#include <chrono>
|
||||||
|
#include <cncl.h>
|
||||||
|
#include <cnrt.h>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <mutex>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class CnclCommunicatorObj final : public CommunicatorObj {
|
||||||
|
private:
|
||||||
|
cnclComm_t *comms;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CnclCommunicatorObj(const string &name, int worldSize, int rank)
|
||||||
|
: CommunicatorObj(worldSize, rank) {
|
||||||
|
const std::string filePath("./" + name + "_cncl_id.bin");
|
||||||
|
cnclCliqueId clique_id;
|
||||||
|
if (rank == 0) {
|
||||||
|
CNCL_CHECK(cnclGetCliqueId(&clique_id));
|
||||||
|
std::ofstream ofs(filePath, std::ios::binary);
|
||||||
|
ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
auto begin = std::chrono::steady_clock::now();
|
||||||
|
while (!std::filesystem::exists(filePath)) {
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
_IT_ASSERT_2(now < begin + std::chrono::seconds(10),
|
||||||
|
"time limit (10s) exceeded.");
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
}
|
||||||
|
std::ifstream ifs(filePath, std::ios::binary);
|
||||||
|
ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_comms = 1;
|
||||||
|
int *dev_list = new int[num_comms];
|
||||||
|
int *rank_list = new int[num_comms];
|
||||||
|
comms = new cnclComm_t[num_comms];
|
||||||
|
uint32_t num_dev = 0;
|
||||||
|
checkBangError(cnrtGetDeviceCount(&num_dev));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_comms; i++) {
|
||||||
|
rank_list[i] = rank;
|
||||||
|
dev_list[i] = rank_list[i] % num_dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
|
||||||
|
worldSize, &clique_id));
|
||||||
|
|
||||||
|
if (rank == 0) {
|
||||||
|
std::filesystem::remove(filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] dev_list;
|
||||||
|
delete[] rank_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
~CnclCommunicatorObj() {
|
||||||
|
CNCL_CHECK(cnclDestroyComms(comms, 1));
|
||||||
|
delete[] comms;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the actual cnclComm_t
|
||||||
|
cnclComm_t getCnclComm() { return comms[0]; }
|
||||||
|
|
||||||
|
virtual string toString() const final {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "CNCL communicator";
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -65,12 +65,18 @@ class GraphHandlerObj {
|
||||||
std::optional<float> max);
|
std::optional<float> max);
|
||||||
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
|
Tensor transpose(Tensor data, Tensor transposed, Shape perm);
|
||||||
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
|
Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
|
||||||
|
Tensor resize(Tensor input, Tensor output,
|
||||||
|
const std::optional<vector<int>> &axes, Tensor sizes,
|
||||||
|
Tensor scales, Tensor roi, vector<uint32_t> sizes_,
|
||||||
|
vector<float> scales_, vector<float> roi_, string mode,
|
||||||
|
string ratioPolicy, string nearestMode,
|
||||||
|
string coordTransMode);
|
||||||
Tensor concat(TensorVec inputs, Tensor output, int dim);
|
Tensor concat(TensorVec inputs, Tensor output, int dim);
|
||||||
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
|
Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
|
||||||
Tensor input_q, Tensor input_k, Tensor input_v,
|
Tensor input_q, Tensor input_k, Tensor input_v,
|
||||||
Tensor position_id, Tensor output_matmul);
|
Tensor position_id, Tensor output_matmul);
|
||||||
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
|
||||||
int num_outputs);
|
std::variant<int, vector<int>> numOrRatio);
|
||||||
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
|
||||||
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
|
Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
|
||||||
Tensor reduceMean(Tensor data, Tensor reduced,
|
Tensor reduceMean(Tensor data, Tensor reduced,
|
||||||
|
@ -99,6 +105,8 @@ class GraphHandlerObj {
|
||||||
int outputType, Tensor input);
|
int outputType, Tensor input);
|
||||||
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
|
Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
|
||||||
std::string mode);
|
std::string mode);
|
||||||
|
Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
|
||||||
|
int size);
|
||||||
|
|
||||||
//------ modifiers
|
//------ modifiers
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#include "core/common.h"
|
#include "core/common.h"
|
||||||
#include "core/operator.h"
|
#include "core/operator.h"
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
#include "utils/operator_utils.h"
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
@ -102,11 +103,9 @@ class KernelRegistry {
|
||||||
}
|
}
|
||||||
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
|
Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
|
||||||
auto it = kernels.find(kernelAttrs);
|
auto it = kernels.find(kernelAttrs);
|
||||||
IT_ASSERT(it != kernels.end(),
|
IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
|
||||||
"Kernel not found for key {" +
|
get_kernel_attrs_str(kernelAttrs) +
|
||||||
to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
|
"}");
|
||||||
", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
|
|
||||||
std::get<2>(kernelAttrs).toString() + "}");
|
|
||||||
return std::get<0>(it->second);
|
return std::get<0>(it->second);
|
||||||
}
|
}
|
||||||
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
|
const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
|
||||||
|
|
|
@ -8,7 +8,9 @@
|
||||||
#if USE_CUDA
|
#if USE_CUDA
|
||||||
#include "cuda/cuda_runtime.h"
|
#include "cuda/cuda_runtime.h"
|
||||||
#endif
|
#endif
|
||||||
|
#if USE_BANG
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#endif
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
// TODO: how to deal with this
|
// TODO: how to deal with this
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
#pragma once
|
||||||
|
#include "core/operator.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class LRNObj : public OperatorObj {
|
||||||
|
|
||||||
|
public:
|
||||||
|
LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
|
||||||
|
float beta, float bias, int size);
|
||||||
|
OP_CLONE(LRNObj);
|
||||||
|
|
||||||
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||||
|
|
||||||
|
std::string toString() const override;
|
||||||
|
int numInputs() const override { return inputs.size(); }
|
||||||
|
int numOutputs() const override { return 1; }
|
||||||
|
auto getAlphaBetaBias() const {
|
||||||
|
return tuple(alpha_value, beta_value, bias_value);
|
||||||
|
}
|
||||||
|
auto getSize() const { return size_value; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
float alpha_value, beta_value, bias_value;
|
||||||
|
int size_value;
|
||||||
|
vector<int> getWorkloadVector() const override;
|
||||||
|
vector<int> getOpAttrVector() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -27,6 +27,60 @@ class ResizeObj : public OperatorObj {
|
||||||
enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
|
enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
|
||||||
enum class ECoeffMode { nearest, linear, cubic };
|
enum class ECoeffMode { nearest, linear, cubic };
|
||||||
|
|
||||||
|
static ECoordinateTransMode fromECoordinateTransModeStr(string mode) {
|
||||||
|
if (mode == "half_pixel") {
|
||||||
|
return ECoordinateTransMode::halfPixel;
|
||||||
|
} else if (mode == "asymmetric") {
|
||||||
|
return ECoordinateTransMode::asymmetric;
|
||||||
|
} else if (mode == "align_corners") {
|
||||||
|
return ECoordinateTransMode::alignCorners;
|
||||||
|
} else if (mode == "pytorch_half_pixel") {
|
||||||
|
return ECoordinateTransMode::pytorchHalfPixel;
|
||||||
|
} else if (mode == "tf_crop_and_resize") {
|
||||||
|
return ECoordinateTransMode::tfCropAndResize;
|
||||||
|
} else {
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static ENearestMode fromENearestModeStr(string mode) {
|
||||||
|
if (mode == "round_prefer_floor") {
|
||||||
|
return ENearestMode::roundPreferFloor;
|
||||||
|
} else if (mode == "round_prefer_ceil") {
|
||||||
|
return ENearestMode::roundPreferCeil;
|
||||||
|
} else if (mode == "floor") {
|
||||||
|
return ENearestMode::floor;
|
||||||
|
} else if (mode == "ceil") {
|
||||||
|
return ENearestMode::ceil;
|
||||||
|
} else {
|
||||||
|
return ENearestMode::none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static EKeepAspectRatioPolicy fromRatioPolicyStr(string ratioPolicyStr) {
|
||||||
|
if (ratioPolicyStr == "stretch") {
|
||||||
|
return EKeepAspectRatioPolicy::stretch;
|
||||||
|
} else if (ratioPolicyStr == "not_larger") {
|
||||||
|
return EKeepAspectRatioPolicy::notLarger;
|
||||||
|
} else if (ratioPolicyStr == "not_smaller") {
|
||||||
|
return EKeepAspectRatioPolicy::notSmaller;
|
||||||
|
} else {
|
||||||
|
return EKeepAspectRatioPolicy::none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static ECoeffMode fromECoeffModeStr(string mode) {
|
||||||
|
if (mode == "nearest") {
|
||||||
|
return ECoeffMode::nearest;
|
||||||
|
} else if (mode == "linear") {
|
||||||
|
return ECoeffMode::linear;
|
||||||
|
} else if (mode == "cubic") {
|
||||||
|
return ECoeffMode::cubic;
|
||||||
|
} else {
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
vector<int> axes;
|
vector<int> axes;
|
||||||
vector<float> scales;
|
vector<float> scales;
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#ifndef OPERATOR_UTIL_H
|
#ifndef OPERATOR_UTIL_H
|
||||||
#define OPERATOR_UTIL_H
|
#define OPERATOR_UTIL_H
|
||||||
|
|
||||||
|
#include "core/operator.h"
|
||||||
#include "core/tensor.h"
|
#include "core/tensor.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
@ -10,8 +11,15 @@ namespace infini {
|
||||||
Shape infer_broadcast(const Shape &A, const Shape &B);
|
Shape infer_broadcast(const Shape &A, const Shape &B);
|
||||||
// Launch the real axis based on rank and current axis
|
// Launch the real axis based on rank and current axis
|
||||||
int get_real_axis(const int &axis, const int &rank);
|
int get_real_axis(const int &axis, const int &rank);
|
||||||
// check if tensor B is unidirectional broadcastable to tensor A
|
// Check if tensor B is unidirectional broadcastable to tensor A
|
||||||
bool is_unidirectional_broadcasting(const Shape &A, const Shape &B);
|
bool is_unidirectional_broadcasting(const Shape &A, const Shape &B);
|
||||||
|
// Locate the index with size from Shape
|
||||||
|
Shape locate_index(size_t inputN, const Shape &shape);
|
||||||
|
// Delocate the ShapeIndex from Shape with broadcast
|
||||||
|
size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
|
||||||
|
const Shape &stride);
|
||||||
|
// Convert KernelAttrs to a string representation
|
||||||
|
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -535,6 +535,65 @@ class OnnxStub:
|
||||||
tensors.get(node.output[0]),
|
tensors.get(node.output[0]),
|
||||||
shape,
|
shape,
|
||||||
)
|
)
|
||||||
|
elif node.op_type == "Resize":
|
||||||
|
output = tensors.get(node.output[0])
|
||||||
|
attributes = _parse_attribute(
|
||||||
|
node,
|
||||||
|
{
|
||||||
|
"antialias": 0,
|
||||||
|
"axes": None,
|
||||||
|
"coordinate_transformation_mode": "half_pixel",
|
||||||
|
"cubic_coeff_a": -0.75,
|
||||||
|
"exclude_outside": 0,
|
||||||
|
"extrapolation_value": 0.0,
|
||||||
|
"keep_aspect_ratio_policy": "none",
|
||||||
|
"mode": "nearest",
|
||||||
|
"nearest_mode": "none",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
(
|
||||||
|
axes,
|
||||||
|
keep_aspect_ratio_policy,
|
||||||
|
coordinate_transformation_mode,
|
||||||
|
mode,
|
||||||
|
nearest_mode,
|
||||||
|
) = (
|
||||||
|
attributes[name]
|
||||||
|
for name in [
|
||||||
|
"axes",
|
||||||
|
"keep_aspect_ratio_policy",
|
||||||
|
"coordinate_transformation_mode",
|
||||||
|
"mode",
|
||||||
|
"nearest_mode",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if len(node.input) > 1:
|
||||||
|
roiVal = _parse_data(data[node.input[1]])
|
||||||
|
else:
|
||||||
|
roiVal = []
|
||||||
|
if len(node.input) > 2:
|
||||||
|
scalesVal = _parse_data(data[node.input[2]])
|
||||||
|
else:
|
||||||
|
scalesVal = []
|
||||||
|
if len(node.input) > 3:
|
||||||
|
sizesVal = _parse_data(data[node.input[3]])
|
||||||
|
else:
|
||||||
|
sizesVal = []
|
||||||
|
tensors[node.output[0]] = self.handler.resize(
|
||||||
|
tensors[node.input[0]],
|
||||||
|
output,
|
||||||
|
axes,
|
||||||
|
tensors[node.input[3]] if len(node.input) > 3 else None,
|
||||||
|
tensors[node.input[2]] if len(node.input) > 2 else None,
|
||||||
|
tensors[node.input[1]] if len(node.input) > 1 else None,
|
||||||
|
sizesVal,
|
||||||
|
scalesVal,
|
||||||
|
roiVal,
|
||||||
|
mode,
|
||||||
|
keep_aspect_ratio_policy,
|
||||||
|
nearest_mode,
|
||||||
|
coordinate_transformation_mode,
|
||||||
|
)
|
||||||
elif node.op_type == "Squeeze":
|
elif node.op_type == "Squeeze":
|
||||||
input_shape = _search_shape(model, node.input[0])
|
input_shape = _search_shape(model, node.input[0])
|
||||||
axes = set(
|
axes = set(
|
||||||
|
@ -585,6 +644,20 @@ class OnnxStub:
|
||||||
tensors.get(node.output[0]),
|
tensors.get(node.output[0]),
|
||||||
)
|
)
|
||||||
elif node.op_type == "Split":
|
elif node.op_type == "Split":
|
||||||
|
split = (
|
||||||
|
_parse_data(data[node.input[1]])
|
||||||
|
if (len(node.input) > 1)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
if split is None:
|
||||||
|
split = next(
|
||||||
|
(
|
||||||
|
attr.ints
|
||||||
|
for attr in node.attribute
|
||||||
|
if attr.name == "split"
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
for name, tensor in zip(
|
for name, tensor in zip(
|
||||||
node.output,
|
node.output,
|
||||||
self.handler.split(
|
self.handler.split(
|
||||||
|
@ -598,7 +671,7 @@ class OnnxStub:
|
||||||
),
|
),
|
||||||
0,
|
0,
|
||||||
),
|
),
|
||||||
len(node.output),
|
split if split is not None else len(node.output),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
tensors[name] = tensor
|
tensors[name] = tensor
|
||||||
|
@ -857,6 +930,22 @@ class OnnxStub:
|
||||||
tensors[output_name] = self.handler.tensor(dims, tensor.data_type)
|
tensors[output_name] = self.handler.tensor(dims, tensor.data_type)
|
||||||
data[output_name] = tensor
|
data[output_name] = tensor
|
||||||
tensors[output_name].set_weight()
|
tensors[output_name].set_weight()
|
||||||
|
elif node.op_type == "LRN":
|
||||||
|
attributes = _parse_attribute(
|
||||||
|
node, {"alpha": 0.0001, "beta": 0.75, "bias": 1.0, "size": 1}
|
||||||
|
)
|
||||||
|
(alpha, beta, bias, size) = (
|
||||||
|
attributes[name]
|
||||||
|
for name in ["alpha", "beta", "bias", "size"]
|
||||||
|
)
|
||||||
|
tensors[node.output[0]] = self.handler.lrn(
|
||||||
|
tensors[node.input[0]],
|
||||||
|
tensors.get(node.output[0]),
|
||||||
|
alpha,
|
||||||
|
beta,
|
||||||
|
bias,
|
||||||
|
size,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||||
new_node_name.append(node.name)
|
new_node_name.append(node.name)
|
||||||
|
@ -1195,6 +1284,20 @@ class OnnxStub:
|
||||||
elif ty == backend.OpTypeId.Expand:
|
elif ty == backend.OpTypeId.Expand:
|
||||||
shape = backend.expand_shape_of(op)
|
shape = backend.expand_shape_of(op)
|
||||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, shape=shape))
|
ctx.push_node(make_node(ty.name, inputs, outputs, name, shape=shape))
|
||||||
|
elif ty == backend.OpTypeId.LRN:
|
||||||
|
alpha, beta, bias, size = backend.lrn_attrs_of(op)
|
||||||
|
ctx.push_node(
|
||||||
|
make_node(
|
||||||
|
ty.name,
|
||||||
|
inputs,
|
||||||
|
outputs,
|
||||||
|
name,
|
||||||
|
alpha,
|
||||||
|
beta,
|
||||||
|
bias,
|
||||||
|
size,
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise Exception("Unsupported OpType", ty)
|
raise Exception("Unsupported OpType", ty)
|
||||||
|
|
||||||
|
|
|
@ -295,6 +295,14 @@ class TestStringMethods(unittest.TestCase):
|
||||||
make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data])
|
make_graph([reshape], "reshape", [data, shape], [reshaped], [shape_data])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_resize(self):
|
||||||
|
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 128, 40, 40])
|
||||||
|
roi = make_tensor("roi", TensorProto.FLOAT, [0], [])
|
||||||
|
scales = make_tensor("scales", TensorProto.FLOAT, [4], [1, 1, 2, 2])
|
||||||
|
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 128, 80, 80])
|
||||||
|
reshape = make_node("Resize", ["x", "roi", "scales"], ["y"], name="resize")
|
||||||
|
make_and_import_model(make_graph([reshape], "resize", [x], [y], [roi, scales]))
|
||||||
|
|
||||||
def test_concat(self):
|
def test_concat(self):
|
||||||
input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4])
|
input1 = make_tensor_value_info("input1", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||||
input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5])
|
input2 = make_tensor_value_info("input2", TensorProto.FLOAT, [1, 3, 2, 5])
|
||||||
|
@ -435,6 +443,12 @@ class TestStringMethods(unittest.TestCase):
|
||||||
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
|
split = make_node("Split", ["input"], ["output"], name="split", axis=0)
|
||||||
make_and_import_model(make_graph([split], "split", [input], []))
|
make_and_import_model(make_graph([split], "split", [input], []))
|
||||||
|
|
||||||
|
def test_split1(self):
|
||||||
|
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||||
|
splitAttr = make_tensor_value_info("split", TensorProto.INT64, [2, 1])
|
||||||
|
split = make_node("Split", ["input", "split"], ["output"], name="split", axis=1)
|
||||||
|
make_and_import_model(make_graph([split], "split", [input, splitAttr], []))
|
||||||
|
|
||||||
def test_allBroadcast(self):
|
def test_allBroadcast(self):
|
||||||
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
input = make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
|
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 2, 4])
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
#include "bang/bang_runtime.h"
|
#include "bang/bang_runtime.h"
|
||||||
#include "core/kernel.h"
|
#include "core/kernel.h"
|
||||||
#include "core/perf_engine.h"
|
#include "core/perf_engine.h"
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
@ -59,4 +62,15 @@ void BangRuntimeObj::sync() const { cnrtSyncDevice(); }
|
||||||
|
|
||||||
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
|
string BangRuntimeObj::toString() const { return "BANG Runtime"; }
|
||||||
|
|
||||||
|
void BangRuntimeObj::initComm(const string &name, int worldSize, int rank) {
|
||||||
|
IT_ASSERT(worldSize > 0);
|
||||||
|
IT_ASSERT(rank >= 0);
|
||||||
|
IT_ASSERT(rank < worldSize);
|
||||||
|
IT_ASSERT(!comm) << "communicator is already initialized.";
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
comm = std::make_unique<CnclCommunicatorObj>(name, worldSize, rank);
|
||||||
|
#else
|
||||||
|
IT_TODO_HALT_MSG("Not compiled with CNCL.");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -10,12 +10,14 @@
|
||||||
#include "operators/expand.h"
|
#include "operators/expand.h"
|
||||||
#include "operators/gather.h"
|
#include "operators/gather.h"
|
||||||
#include "operators/layer_norm.h"
|
#include "operators/layer_norm.h"
|
||||||
|
#include "operators/lrn.h"
|
||||||
#include "operators/matmul.h"
|
#include "operators/matmul.h"
|
||||||
#include "operators/pad.h"
|
#include "operators/pad.h"
|
||||||
#include "operators/pooling.h"
|
#include "operators/pooling.h"
|
||||||
#include "operators/recv.h"
|
#include "operators/recv.h"
|
||||||
#include "operators/reduce.h"
|
#include "operators/reduce.h"
|
||||||
#include "operators/reshape.h"
|
#include "operators/reshape.h"
|
||||||
|
#include "operators/resize.h"
|
||||||
#include "operators/send.h"
|
#include "operators/send.h"
|
||||||
#include "operators/slice.h"
|
#include "operators/slice.h"
|
||||||
#include "operators/softmax.h"
|
#include "operators/softmax.h"
|
||||||
|
@ -24,6 +26,7 @@
|
||||||
#include "operators/unary.h"
|
#include "operators/unary.h"
|
||||||
#include "operators/where.h"
|
#include "operators/where.h"
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
@ -252,6 +255,64 @@ Tensor GraphHandlerObj::reshape(Tensor data, Tensor reshaped, Shape shape) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::resize(Tensor input, Tensor output,
|
||||||
|
const std::optional<vector<int>> &axes,
|
||||||
|
Tensor sizes, Tensor scales, Tensor roi,
|
||||||
|
vector<uint32_t> sizes_, vector<float> scales_,
|
||||||
|
vector<float> roi_, string mode,
|
||||||
|
string ratioPolicy, string nearestMode,
|
||||||
|
string coordTransMode) {
|
||||||
|
if (sizes_.size() > 0) {
|
||||||
|
sizes->dataMalloc();
|
||||||
|
sizes->copyin<uint32_t>(sizes_);
|
||||||
|
}
|
||||||
|
if (scales_.size() > 0) {
|
||||||
|
scales->dataMalloc();
|
||||||
|
scales->copyin<float>(scales_);
|
||||||
|
}
|
||||||
|
if (roi_.size() > 0) {
|
||||||
|
roi->dataMalloc();
|
||||||
|
roi->copyin<float>(roi_);
|
||||||
|
}
|
||||||
|
ResizeObj::EKeepAspectRatioPolicy ratioPolicy_ =
|
||||||
|
ResizeObj::fromRatioPolicyStr(ratioPolicy);
|
||||||
|
ResizeObj::ENearestMode nearestMode_ =
|
||||||
|
ResizeObj::fromENearestModeStr(nearestMode);
|
||||||
|
ResizeObj::ECoordinateTransMode coordTransMode_ =
|
||||||
|
ResizeObj::fromECoordinateTransModeStr(coordTransMode);
|
||||||
|
ResizeObj::ECoeffMode mode_ = ResizeObj::fromECoeffModeStr(mode);
|
||||||
|
if (output) {
|
||||||
|
if (mode == "nearest") {
|
||||||
|
g->addOpWithOutputs<ResizeObj>(
|
||||||
|
std::move(input), output, std::move(axes), std::move(sizes),
|
||||||
|
std::move(scales), std::move(roi), ratioPolicy_, nearestMode_,
|
||||||
|
coordTransMode_);
|
||||||
|
} else {
|
||||||
|
g->addOpWithOutputs<ResizeObj>(
|
||||||
|
std::move(input), output, std::move(axes), std::move(sizes),
|
||||||
|
std::move(scales), std::move(roi), mode_, ratioPolicy_,
|
||||||
|
coordTransMode_);
|
||||||
|
}
|
||||||
|
return output;
|
||||||
|
} else {
|
||||||
|
if (mode == "nearest") {
|
||||||
|
return g
|
||||||
|
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
|
||||||
|
std::move(sizes), std::move(scales),
|
||||||
|
std::move(roi), ratioPolicy_, nearestMode_,
|
||||||
|
coordTransMode_)
|
||||||
|
->getOutput();
|
||||||
|
} else {
|
||||||
|
return g
|
||||||
|
->addOp<ResizeObj>(std::move(input), output, std::move(axes),
|
||||||
|
std::move(sizes), std::move(scales),
|
||||||
|
std::move(roi), mode_, ratioPolicy_,
|
||||||
|
coordTransMode_)
|
||||||
|
->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
|
Tensor GraphHandlerObj::concat(TensorVec inputs, Tensor output, int dim) {
|
||||||
if (output) {
|
if (output) {
|
||||||
g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim);
|
g->addOpWithOutputs<ConcatObj>(std::move(inputs), output, dim);
|
||||||
|
@ -283,14 +344,29 @@ Tensor GraphHandlerObj::attentionKVCache(Tensor input_k_cache,
|
||||||
}
|
}
|
||||||
|
|
||||||
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
|
TensorVec GraphHandlerObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||||
int axis, int num_outputs) {
|
int axis,
|
||||||
|
std::variant<int, vector<int>> numOrRatio) {
|
||||||
if (outputs) {
|
if (outputs) {
|
||||||
|
if (std::holds_alternative<int>(numOrRatio)) {
|
||||||
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||||
num_outputs);
|
std::get<int>(numOrRatio));
|
||||||
|
} else {
|
||||||
|
g->addOpWithOutputs<SplitObj>(std::move(input), outputs, axis,
|
||||||
|
std::get<vector<int>>(numOrRatio));
|
||||||
|
}
|
||||||
return *outputs;
|
return *outputs;
|
||||||
} else {
|
} else {
|
||||||
return g->addOp<SplitObj>(std::move(input), outputs, axis, num_outputs)
|
if (std::holds_alternative<int>(numOrRatio)) {
|
||||||
|
return g
|
||||||
|
->addOp<SplitObj>(std::move(input), outputs, axis,
|
||||||
|
std::get<int>(numOrRatio))
|
||||||
->getOutputs();
|
->getOutputs();
|
||||||
|
} else {
|
||||||
|
return g
|
||||||
|
->addOp<SplitObj>(std::move(input), outputs, axis,
|
||||||
|
std::get<vector<int>>(numOrRatio))
|
||||||
|
->getOutputs();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -519,6 +595,19 @@ Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor GraphHandlerObj::lrn(Tensor input, Tensor output, float alpha,
|
||||||
|
float beta, float bias, int size) {
|
||||||
|
if (output) {
|
||||||
|
g->addOpWithOutputs<LRNObj>(std::move(input), output, alpha, beta, bias,
|
||||||
|
size);
|
||||||
|
return output;
|
||||||
|
} else {
|
||||||
|
return g
|
||||||
|
->addOp<LRNObj>(std::move(input), output, alpha, beta, bias, size)
|
||||||
|
->getOutput();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static CastType inferCastType(Tensor input, int to) {
|
static CastType inferCastType(Tensor input, int to) {
|
||||||
auto iType = input->getDType();
|
auto iType = input->getDType();
|
||||||
auto oType = DataType(to);
|
auto oType = DataType(to);
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include "operators/conv.h"
|
#include "operators/conv.h"
|
||||||
#include "operators/expand.h"
|
#include "operators/expand.h"
|
||||||
#include "operators/gather.h"
|
#include "operators/gather.h"
|
||||||
|
#include "operators/lrn.h"
|
||||||
#include "operators/matmul.h"
|
#include "operators/matmul.h"
|
||||||
#include "operators/pad.h"
|
#include "operators/pad.h"
|
||||||
#include "operators/pooling.h"
|
#include "operators/pooling.h"
|
||||||
|
@ -113,6 +114,7 @@ void export_values(py::module &m) {
|
||||||
.VALUE(OpType, Erf)
|
.VALUE(OpType, Erf)
|
||||||
.VALUE(OpType, Where)
|
.VALUE(OpType, Where)
|
||||||
.VALUE(OpType, DepthToSpace)
|
.VALUE(OpType, DepthToSpace)
|
||||||
|
.VALUE(OpType, LRN)
|
||||||
.export_values();
|
.export_values();
|
||||||
|
|
||||||
#undef VALUE
|
#undef VALUE
|
||||||
|
@ -296,6 +298,14 @@ static std::tuple<int, std::string> depth_to_space_attrs_of(Operator op) {
|
||||||
depth_to_space->getModeString());
|
depth_to_space->getModeString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::tuple<float, float, float, int> lrn_attrs_of(Operator op) {
|
||||||
|
IT_ASSERT(op->getOpType() == OpType::LRN);
|
||||||
|
auto lrn = dynamic_cast<const LRNObj *>(op.get());
|
||||||
|
auto [alpha, beta, bias] = lrn->getAlphaBetaBias();
|
||||||
|
auto size = lrn->getSize();
|
||||||
|
return std::make_tuple(alpha, beta, bias, size);
|
||||||
|
}
|
||||||
|
|
||||||
void export_functions(py::module &m) {
|
void export_functions(py::module &m) {
|
||||||
#define FUNCTION(NAME) def(#NAME, &NAME)
|
#define FUNCTION(NAME) def(#NAME, &NAME)
|
||||||
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
|
m.def("cpu_runtime", &NativeCpuRuntimeObj::getInstance)
|
||||||
|
@ -332,7 +342,8 @@ void export_functions(py::module &m) {
|
||||||
.FUNCTION(gather_axis_of)
|
.FUNCTION(gather_axis_of)
|
||||||
.FUNCTION(flatten_axis_of)
|
.FUNCTION(flatten_axis_of)
|
||||||
.FUNCTION(cast_to_of)
|
.FUNCTION(cast_to_of)
|
||||||
.FUNCTION(depth_to_space_attrs_of);
|
.FUNCTION(depth_to_space_attrs_of)
|
||||||
|
.FUNCTION(lrn_attrs_of);
|
||||||
#undef FUNCTION
|
#undef FUNCTION
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -388,7 +399,9 @@ void init_graph_builder(py::module &m) {
|
||||||
#endif
|
#endif
|
||||||
#ifdef USE_BANG
|
#ifdef USE_BANG
|
||||||
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
|
py::class_<BangRuntimeObj, std::shared_ptr<BangRuntimeObj>, RuntimeObj>(
|
||||||
m, "BangRuntime");
|
m, "BangRuntime")
|
||||||
|
.def(py::init<int>(), py::arg("device") = 0)
|
||||||
|
.def("init_comm", &BangRuntimeObj::initComm);
|
||||||
#endif
|
#endif
|
||||||
#ifdef USE_KUNLUN
|
#ifdef USE_KUNLUN
|
||||||
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
|
py::class_<KUNLUNRuntimeObj, std::shared_ptr<KUNLUNRuntimeObj>, RuntimeObj>(
|
||||||
|
@ -495,6 +508,7 @@ void init_graph_builder(py::module &m) {
|
||||||
.def("transpose", &Handler::transpose, policy::move)
|
.def("transpose", &Handler::transpose, policy::move)
|
||||||
.def("depthToSpace", &Handler::depthToSpace, policy::move)
|
.def("depthToSpace", &Handler::depthToSpace, policy::move)
|
||||||
.def("reshape", &Handler::reshape, policy::move)
|
.def("reshape", &Handler::reshape, policy::move)
|
||||||
|
.def("resize", &Handler::resize, policy::move)
|
||||||
.def("concat", &Handler::concat, policy::move)
|
.def("concat", &Handler::concat, policy::move)
|
||||||
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
|
.def("attentionKVCache", &Handler::attentionKVCache, policy::move)
|
||||||
.def("split", &Handler::split, policy::move)
|
.def("split", &Handler::split, policy::move)
|
||||||
|
@ -517,6 +531,7 @@ void init_graph_builder(py::module &m) {
|
||||||
.def("expand", &Handler::expand, policy::move)
|
.def("expand", &Handler::expand, policy::move)
|
||||||
.def("erf", &Handler::erf, policy::move)
|
.def("erf", &Handler::erf, policy::move)
|
||||||
.def("where", &Handler::where, policy::move)
|
.def("where", &Handler::where, policy::move)
|
||||||
|
.def("lrn", &Handler::lrn, policy::move)
|
||||||
.def("topo_sort", &Handler::topo_sort, policy::automatic)
|
.def("topo_sort", &Handler::topo_sort, policy::automatic)
|
||||||
.def("optimize", &Handler::optimize, policy::automatic)
|
.def("optimize", &Handler::optimize, policy::automatic)
|
||||||
.def("operators", &Handler::operators, policy::move)
|
.def("operators", &Handler::operators, policy::move)
|
||||||
|
|
|
@ -30,8 +30,9 @@ class UnaryCnnl : public BangKernelWithoutConfig {
|
||||||
cDim.data()));
|
cDim.data()));
|
||||||
cnnlActivationDescriptor_t opDesc;
|
cnnlActivationDescriptor_t opDesc;
|
||||||
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
checkCnnlError(cnnlCreateActivationDescriptor(&opDesc));
|
||||||
checkCnnlError(cnnlSetActivationDescriptor(
|
checkCnnlError(cnnlSetActivationDescriptor_v2(
|
||||||
opDesc, getOpType(), CNNL_NOT_PROPAGATE_NAN, getCoef()));
|
opDesc, getOpType(), CNNL_ACTIVATION_HIGH_PRECISION,
|
||||||
|
CNNL_NOT_PROPAGATE_NAN, getCoef()));
|
||||||
|
|
||||||
auto [alpha, beta] = getAlphBeta();
|
auto [alpha, beta] = getAlphBeta();
|
||||||
cnnlStatus_t stat =
|
cnnlStatus_t stat =
|
||||||
|
@ -131,6 +132,7 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
||||||
std::vector<int> inDim = {1, 1, 1};
|
std::vector<int> inDim = {1, 1, 1};
|
||||||
std::vector<int> outDim = inDim;
|
std::vector<int> outDim = inDim;
|
||||||
|
|
||||||
|
if (aDim.size() >= 3) {
|
||||||
if (axis == 0) {
|
if (axis == 0) {
|
||||||
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||||
inDim[0] = aDim[0];
|
inDim[0] = aDim[0];
|
||||||
|
@ -158,6 +160,25 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
||||||
}
|
}
|
||||||
outDim = inDim;
|
outDim = inDim;
|
||||||
}
|
}
|
||||||
|
} else if (aDim.size() == 2) {
|
||||||
|
if (axis == 0) {
|
||||||
|
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||||
|
inDim = aDim;
|
||||||
|
inDim.push_back(1);
|
||||||
|
outDim = inDim;
|
||||||
|
} else {
|
||||||
|
mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
|
||||||
|
inDim = aDim;
|
||||||
|
inDim.insert(inDim.begin(), 1);
|
||||||
|
outDim = inDim;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
mode = CNNL_SOFTMAX_MODE_HIGH_DIMENSION;
|
||||||
|
inDim = aDim;
|
||||||
|
inDim.push_back(1);
|
||||||
|
inDim.push_back(1);
|
||||||
|
outDim = inDim;
|
||||||
|
}
|
||||||
|
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||||
|
@ -171,8 +192,8 @@ class SoftmaxCnnl : public BangKernelWithoutConfig {
|
||||||
float beta = 0.0;
|
float beta = 0.0;
|
||||||
cnnlStatus_t stat =
|
cnnlStatus_t stat =
|
||||||
cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
|
cnnlSoftmaxForward_v2(context->cnnlHandle(), CNNL_SOFTMAX_ACCURATE,
|
||||||
mode, CNNL_COMPUTATION_HIGH_PRECISION, &alpha,
|
mode, CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
|
||||||
aDesc, aData, &beta, cDesc, cData);
|
&alpha, aDesc, aData, &beta, cDesc, cData);
|
||||||
if (stat != CNNL_STATUS_SUCCESS)
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
return;
|
return;
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "operators/all_gather.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include <thread>
|
||||||
|
namespace infini {
|
||||||
|
class AllGatherCNCL : public BangKernelWithoutConfig {
|
||||||
|
public:
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<AllGatherObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
int world_size = op->getWorldSize();
|
||||||
|
// Check if world size info in operator matches runtime
|
||||||
|
IT_ASSERT(world_size == context->getCommunicator().getWorldSize());
|
||||||
|
|
||||||
|
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||||
|
BangPtr output_temp =
|
||||||
|
context->getWorkspace(op->getInputs(0)->getBytes() * world_size);
|
||||||
|
// void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||||
|
// IT_ASSERT(op->getDType() == DataType::Float32);
|
||||||
|
checkBangError(cnrtMalloc(&output_temp,
|
||||||
|
op->getInputs(0)->getBytes() * world_size));
|
||||||
|
size_t bytes = op->getInputs(0)->getBytes();
|
||||||
|
size_t count = bytes / op->getDType().getSize();
|
||||||
|
|
||||||
|
cnclComm_t comm =
|
||||||
|
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||||
|
.getCnclComm();
|
||||||
|
cnrtQueue_t queue = context->getBangQueue();
|
||||||
|
CNCL_CHECK(
|
||||||
|
cnclAllGather(input, output_temp, count, cnclFloat32, comm, queue));
|
||||||
|
checkBangError(cnrtQueueSync(queue));
|
||||||
|
for (int i = 0; i < world_size; ++i) {
|
||||||
|
Tensor output = op->getOutput(i);
|
||||||
|
context->copyBlobInsideRuntime(
|
||||||
|
output->getRawDataPtr<float *>(),
|
||||||
|
static_cast<float *>(output_temp) + i * count, bytes);
|
||||||
|
}
|
||||||
|
checkBangError(cnrtFree(output_temp));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::AllGather, DataType::Float32,
|
||||||
|
AllGatherCNCL, "AllGather_CNCL_BANG_Float32");
|
||||||
|
} // namespace infini
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,53 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "operators/all_reduce.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include <thread>
|
||||||
|
namespace infini {
|
||||||
|
class AllReduceCNCL : public BangKernelWithoutConfig {
|
||||||
|
public:
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<AllReduceBaseObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||||
|
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||||
|
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||||
|
size_t count = op->getInputs(0)->size();
|
||||||
|
cnclComm_t comm =
|
||||||
|
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||||
|
.getCnclComm();
|
||||||
|
cnrtQueue_t queue = context->getBangQueue();
|
||||||
|
// checkBangError(cnrtQueueSync(queue));
|
||||||
|
CNCL_CHECK(cnclAllReduce(input, output, count, cnclFloat, getRedOp(),
|
||||||
|
comm, queue));
|
||||||
|
checkBangError(cnrtQueueSync(queue));
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual cnclReduceOp_t getRedOp() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class AllReduceSumCNCL : public AllReduceCNCL {
|
||||||
|
cnclReduceOp_t getRedOp() const override { return cnclSum; }
|
||||||
|
};
|
||||||
|
class AllReduceProdCNCL : public AllReduceCNCL {
|
||||||
|
cnclReduceOp_t getRedOp() const override { return cnclProd; }
|
||||||
|
};
|
||||||
|
class AllReduceMinCNCL : public AllReduceCNCL {
|
||||||
|
cnclReduceOp_t getRedOp() const override { return cnclMin; }
|
||||||
|
};
|
||||||
|
class AllReduceMaxCNCL : public AllReduceCNCL {
|
||||||
|
cnclReduceOp_t getRedOp() const override { return cnclMax; }
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::AllReduceSum, DataType::Float32,
|
||||||
|
AllReduceSumCNCL, "AllReduce_Sum_CNCL_BANG_Float32");
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::AllReduceProd, DataType::Float32,
|
||||||
|
AllReduceProdCNCL, "AllReduce_Prod_CNCL_BANG_Float32");
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMin, DataType::Float32,
|
||||||
|
AllReduceMinCNCL, "AllReduce_Min_CNCL_BANG_Float32");
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::AllReduceMax, DataType::Float32,
|
||||||
|
AllReduceMaxCNCL, "AllReduce_Max_CNCL_BANG_Float32");
|
||||||
|
} // namespace infini
|
||||||
|
#endif
|
|
@ -17,51 +17,87 @@ class BatchNormCnnl : public BangKernelWithoutConfig {
|
||||||
void *const output = (op->getOutput()->getRawDataPtr<void *>());
|
void *const output = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
auto dims = op->getInputs(0)->getDims();
|
auto dims = op->getInputs(0)->getDims();
|
||||||
|
auto outDims = op->getOutput()->getDims();
|
||||||
if (dims.size() != 4)
|
if (dims.size() != 4)
|
||||||
IT_TODO_HALT();
|
IT_TODO_HALT();
|
||||||
|
|
||||||
int dimArray[4], strideArray[4], dimPArray[1], stridePArray[1];
|
int dimsTrans[4] = {dims[0], dims[2], dims[3], dims[1]};
|
||||||
|
int dimsOutTrans[4] = {outDims[0], outDims[2], outDims[3], outDims[1]};
|
||||||
|
int permute[4] = {0, 2, 3, 1};
|
||||||
|
int permuteOut[4] = {0, 3, 1, 2};
|
||||||
|
|
||||||
for (size_t i = 0; i < dims.size(); ++i) {
|
|
||||||
dimArray[i] = dims[i];
|
|
||||||
strideArray[i] = op->getInputs(0)->getStride()[i];
|
|
||||||
}
|
|
||||||
int w = dimArray[3];
|
|
||||||
dimArray[3] = dimArray[1];
|
|
||||||
int h = dimArray[2];
|
|
||||||
dimArray[1] = h;
|
|
||||||
dimArray[2] = w;
|
|
||||||
|
|
||||||
dimPArray[0] = op->getInputs(1)->getDims()[0];
|
|
||||||
stridePArray[0] = op->getInputs(1)->getDims()[0];
|
|
||||||
// get inputs
|
// get inputs
|
||||||
cnnlTensorDescriptor_t inDesc;
|
cnnlTensorDescriptor_t inDesc, intransDesc, outDesc, outtransDesc;
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_NHWC,
|
checkCnnlError(cnnlCreateTensorDescriptor(&intransDesc));
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&outtransDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
|
||||||
CNNL_DTYPE_FLOAT, dims.size(),
|
CNNL_DTYPE_FLOAT, dims.size(),
|
||||||
dimArray, strideArray));
|
dims.data()));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(intransDesc, CNNL_LAYOUT_NHWC,
|
||||||
|
CNNL_DTYPE_FLOAT, dims.size(),
|
||||||
|
dimsTrans));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||||
|
outDims.data()));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(outtransDesc, CNNL_LAYOUT_NHWC,
|
||||||
|
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||||
|
dimsOutTrans));
|
||||||
|
cnnlTransposeDescriptor_t opDesc;
|
||||||
|
checkCnnlError(cnnlCreateTransposeDescriptor(&opDesc));
|
||||||
|
checkCnnlError(cnnlSetTransposeDescriptor(opDesc, 4, permute));
|
||||||
|
size_t wsSize;
|
||||||
|
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), inDesc, opDesc,
|
||||||
|
&wsSize);
|
||||||
|
BangPtr wsData = context->getWorkspace(wsSize);
|
||||||
|
BangPtr inputTrans = context->getWorkspace(
|
||||||
|
cnnlGetTensorElementNum(inDesc) * sizeof(float));
|
||||||
|
BangPtr outputTrans = context->getWorkspace(
|
||||||
|
cnnlGetTensorElementNum(inDesc) * sizeof(float));
|
||||||
|
cnnlStatus_t stat =
|
||||||
|
cnnlTranspose_v2(context->cnnlHandle(), opDesc, inDesc, input,
|
||||||
|
intransDesc, inputTrans, wsData, wsSize);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
// get bnScaleBiasMeanVarDesc
|
// get bnScaleBiasMeanVarDesc
|
||||||
|
auto dimsScaleBiasMeanVar = op->getInputs(1)->getDims();
|
||||||
cnnlTensorDescriptor_t paraDesc;
|
cnnlTensorDescriptor_t paraDesc;
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(¶Desc));
|
checkCnnlError(cnnlCreateTensorDescriptor(¶Desc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptorEx(paraDesc, CNNL_LAYOUT_ARRAY,
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
CNNL_DTYPE_FLOAT, 1, dimPArray,
|
paraDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||||
stridePArray));
|
dimsScaleBiasMeanVar.size(), dimsScaleBiasMeanVar.data()));
|
||||||
|
|
||||||
float alpha = 1.f, beta = 0.f;
|
float alpha = 1.f, beta = 0.f;
|
||||||
// This mode is intended for use after convolutional layers
|
// This mode is intended for use after convolutional layers
|
||||||
cnnlStatus_t stat = cnnlBatchNormForwardInference(
|
stat = cnnlBatchNormForwardInference(
|
||||||
context->cnnlHandle(), &alpha, &beta, inDesc, input, paraDesc,
|
context->cnnlHandle(), &alpha, &beta, intransDesc, inputTrans,
|
||||||
scale, bias, mean, var, op->getEps(), inDesc, output);
|
paraDesc, scale, bias, mean, var, op->getEps(), outtransDesc,
|
||||||
|
outputTrans);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
cnnlTransposeDescriptor_t op2Desc;
|
||||||
|
checkCnnlError(cnnlCreateTransposeDescriptor(&op2Desc));
|
||||||
|
checkCnnlError(cnnlSetTransposeDescriptor(op2Desc, 4, permuteOut));
|
||||||
|
cnnlGetTransposeWorkspaceSize(context->cnnlHandle(), intransDesc,
|
||||||
|
op2Desc, &wsSize);
|
||||||
|
BangPtr ws2Data = context->getWorkspace(wsSize);
|
||||||
|
stat = cnnlTranspose_v2(context->cnnlHandle(), op2Desc, outtransDesc,
|
||||||
|
outputTrans, outDesc, output, ws2Data, wsSize);
|
||||||
if (stat != CNNL_STATUS_SUCCESS)
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// Destories in BANG does not require sync. But cnnl does not state
|
// Destories in BANG does not require sync. But cnnl does not state
|
||||||
// whether sync is required before destories.
|
// whether sync is required before destories.
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(intransDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(outtransDesc));
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(paraDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(paraDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTransposeDescriptor(opDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTransposeDescriptor(op2Desc));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "operators/broadcast.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include <thread>
|
||||||
|
namespace infini {
|
||||||
|
class BroadcastCNCL : public BangKernelWithoutConfig {
|
||||||
|
public:
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<BroadcastObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||||
|
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||||
|
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||||
|
size_t count = op->getInputs(0)->getBytes() / op->getDType().getSize();
|
||||||
|
|
||||||
|
cnclComm_t comm =
|
||||||
|
dynamic_cast<CnclCommunicatorObj &>(context->getCommunicator())
|
||||||
|
.getCnclComm();
|
||||||
|
cnrtQueue_t queue = context->getBangQueue();
|
||||||
|
// TODO: Using default stream 0 for now.
|
||||||
|
CNCL_CHECK(cnclBroadcast(input, output, count, cnclFloat32,
|
||||||
|
op->getRoot(), comm, queue));
|
||||||
|
checkBangError(cnrtQueueSync(queue));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::Broadcast, DataType::Float32,
|
||||||
|
BroadcastCNCL, "Broadcast_CNCL_BANG_Float32");
|
||||||
|
} // namespace infini
|
||||||
|
|
||||||
|
#endif
|
|
@ -23,6 +23,8 @@ class GatherCnnl : public BangKernelWithoutConfig {
|
||||||
CNNL_DTYPE_FLOAT, aDim.size(),
|
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||||
aDim.data()));
|
aDim.data()));
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&bDesc));
|
||||||
|
checkCnnlError(
|
||||||
|
cnnlSetTensorDescriptorPointerMode(bDesc, CNNL_POINTER_MODE_HOST));
|
||||||
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
|
checkCnnlError(cnnlSetTensorDescriptor(bDesc, CNNL_LAYOUT_ARRAY,
|
||||||
CNNL_DTYPE_INT32, bDim.size(),
|
CNNL_DTYPE_INT32, bDim.size(),
|
||||||
bDim.data()));
|
bDim.data()));
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
#include "operators/layer_norm.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
class LayerNormCnnl : public BangKernelWithoutConfig {
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<LayerNormObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const scaleData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
|
void *biasData = NULL;
|
||||||
|
if (op->numInputs() == 3) {
|
||||||
|
biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||||
|
}
|
||||||
|
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
auto inDims = op->getInputs(0)->getDims();
|
||||||
|
auto outDims = op->getOutput()->getDims();
|
||||||
|
auto fiterDims = op->getOutput(1)->getDims();
|
||||||
|
|
||||||
|
float eps = op->getEps();
|
||||||
|
const int axis = op->getAxis();
|
||||||
|
|
||||||
|
cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY,
|
||||||
|
CNNL_DTYPE_FLOAT, inDims.size(),
|
||||||
|
inDims.data()));
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&fiterDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
fiterDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, fiterDims.size(),
|
||||||
|
fiterDims.data()));
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&outDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(outDesc, CNNL_LAYOUT_ARRAY,
|
||||||
|
CNNL_DTYPE_FLOAT, outDims.size(),
|
||||||
|
outDims.data()));
|
||||||
|
size_t wsSize;
|
||||||
|
cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
|
||||||
|
&wsSize);
|
||||||
|
BangPtr wsData = context->getWorkspace(wsSize);
|
||||||
|
|
||||||
|
cnnlStatus_t stat = cnnlLayerNormForward(
|
||||||
|
context->cnnlHandle(), inDesc, inputData, axis, fiterDesc,
|
||||||
|
scaleData, biasData, eps, wsData, wsSize, outDesc, outputData,
|
||||||
|
inDesc, NULL, NULL);
|
||||||
|
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(inDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(fiterDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(outDesc));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::LayerNormalization, DataType::Float32,
|
||||||
|
LayerNormCnnl, "LayerNorm_BANG_Float32");
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -0,0 +1,62 @@
|
||||||
|
#include "operators/lrn.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class LRNCnnl : public BangKernelWithoutConfig {
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<LRNObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
cnnlTensorDescriptor_t aDesc, cDesc;
|
||||||
|
auto aDim = op->getInputs(0)->getDims();
|
||||||
|
auto cDim = op->getOutput()->getDims();
|
||||||
|
auto [alpha, beta, bias] = op->getAlphaBetaBias();
|
||||||
|
auto size = op->getSize();
|
||||||
|
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, aDim.size(),
|
||||||
|
aDim.data()));
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
|
||||||
|
CNNL_DTYPE_FLOAT, cDim.size(),
|
||||||
|
cDim.data()));
|
||||||
|
|
||||||
|
size_t extra_size;
|
||||||
|
cnnlGetLrnExtraInputSize_v2(context->cnnlHandle(), cDesc,
|
||||||
|
CNNL_LRN_LOCAL_SIZE, size, &extra_size);
|
||||||
|
void *extra_cpu = NULL;
|
||||||
|
extra_cpu = malloc(extra_size);
|
||||||
|
BangPtr extra_mlu = context->getWorkspace(extra_size);
|
||||||
|
cnnlInitLrnExtraInput(context->cnnlHandle(), CNNL_LRN_LOCAL_SIZE, size,
|
||||||
|
(double)alpha, (double)beta, (double)bias, aDesc,
|
||||||
|
cDesc, extra_cpu);
|
||||||
|
cnrtMemcpy(extra_mlu, extra_cpu, extra_size,
|
||||||
|
CNRT_MEM_TRANS_DIR_HOST2DEV);
|
||||||
|
|
||||||
|
size_t wsSize;
|
||||||
|
cnnlGetLrnWorkspaceSize_v2(context->cnnlHandle(), aDesc, cDesc,
|
||||||
|
CNNL_LRN_LOCAL_SIZE, size, &wsSize);
|
||||||
|
BangPtr wsData = context->getWorkspace(wsSize);
|
||||||
|
|
||||||
|
cnnlStatus_t stat = cnnlLrn_v2(
|
||||||
|
context->cnnlHandle(), CNNL_LRN_LOCAL_SIZE, size, (double)alpha,
|
||||||
|
(double)beta, (double)bias, wsData, wsSize, aDesc, aData, extra_mlu,
|
||||||
|
extra_size, cDesc, cData);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::LRN, DataType::Float32, LRNCnnl,
|
||||||
|
"LRN_cnnl_BANG_Float32");
|
||||||
|
|
||||||
|
}; // namespace infini
|
|
@ -10,15 +10,29 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
||||||
auto op = as<MatmulObj>(_op);
|
auto op = as<MatmulObj>(_op);
|
||||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
auto input_num = op->numInputs();
|
||||||
|
|
||||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
|
void *biasData = NULL;
|
||||||
|
if (input_num > 2) {
|
||||||
|
biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||||
|
}
|
||||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
|
cnnlTensorDescriptor_t aDesc, bDesc, cDesc, biasDesc;
|
||||||
auto dimInputs0 = op->getInputs(0)->getDims();
|
auto dimInputs0 = op->getInputs(0)->getDims();
|
||||||
auto dimInputs1 = op->getInputs(1)->getDims();
|
auto dimInputs1 = op->getInputs(1)->getDims();
|
||||||
|
std::vector<int> dimBias;
|
||||||
|
if (input_num > 2) {
|
||||||
|
dimBias = op->getInputs(2)->getDims();
|
||||||
|
}
|
||||||
|
|
||||||
auto dimOutput = op->getOutput()->getDims();
|
auto dimOutput = op->getOutput()->getDims();
|
||||||
|
|
||||||
|
float alpha = 1.0;
|
||||||
|
float beta = 0.0;
|
||||||
|
|
||||||
int32_t transA = op->getTransA();
|
int32_t transA = op->getTransA();
|
||||||
int32_t transB = op->getTransB();
|
int32_t transB = op->getTransB();
|
||||||
|
|
||||||
|
@ -37,6 +51,13 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
||||||
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT,
|
||||||
dimOutput.size(), dimOutput.data()));
|
dimOutput.size(), dimOutput.data()));
|
||||||
|
|
||||||
|
if (input_num > 2) {
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&biasDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
biasDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dimBias.size(),
|
||||||
|
dimBias.data()));
|
||||||
|
}
|
||||||
|
|
||||||
cnnlMatMulDescriptor_t bmm_desc;
|
cnnlMatMulDescriptor_t bmm_desc;
|
||||||
cnnlMatMulDescCreate(&bmm_desc);
|
cnnlMatMulDescCreate(&bmm_desc);
|
||||||
cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSA, &transA,
|
cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSA, &transA,
|
||||||
|
@ -47,8 +68,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
||||||
cnnlMatMulAlgo_t bmm_algo;
|
cnnlMatMulAlgo_t bmm_algo;
|
||||||
cnnlMatMulAlgoCreate(&bmm_algo);
|
cnnlMatMulAlgoCreate(&bmm_algo);
|
||||||
|
|
||||||
float alpha = 1.0;
|
|
||||||
float beta = 0.0;
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
||||||
cnnlMatMulHeuristicResult_t desc;
|
cnnlMatMulHeuristicResult_t desc;
|
||||||
|
@ -66,9 +85,22 @@ class MatmulCnnl : public BangKernelWithoutConfig {
|
||||||
if (stat != CNNL_STATUS_SUCCESS)
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
wsData = NULL;
|
||||||
|
if (input_num > 2) {
|
||||||
|
cnnlGetBiasAddWorkspaceSize(context->cnnlHandle(), biasDesc, cDesc,
|
||||||
|
&wsSize);
|
||||||
|
stat = cnnlBiasAdd(context->cnnlHandle(), &alpha, biasDesc,
|
||||||
|
biasData, wsData, wsSize, &alpha, cDesc, cData);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(bDesc));
|
||||||
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||||
|
if (input_num > 2) {
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(biasDesc));
|
||||||
|
}
|
||||||
checkCnnlError(cnnlMatMulDescDestroy(bmm_desc));
|
checkCnnlError(cnnlMatMulDescDestroy(bmm_desc));
|
||||||
checkCnnlError(cnnlMatMulAlgoDestroy(bmm_algo));
|
checkCnnlError(cnnlMatMulAlgoDestroy(bmm_algo));
|
||||||
checkCnnlError(cnnlDestroyMatMulHeuristicResult(desc));
|
checkCnnlError(cnnlDestroyMatMulHeuristicResult(desc));
|
||||||
|
|
|
@ -13,14 +13,14 @@ class PadCnnl : public BangKernelWithoutConfig {
|
||||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
cnnlTensorDescriptor_t aDesc, cDesc;
|
cnnlTensorDescriptor_t aDesc, cDesc;
|
||||||
auto dim = op->getOutput()->getDims();
|
auto dimIn = op->getInputs(0)->getDims();
|
||||||
int dim_size = dim.size();
|
auto dimOut = op->getOutput()->getDims();
|
||||||
int dim_array[dim_size];
|
|
||||||
for (int i = 0; i < dim_size; ++i) {
|
int dim_size = dimIn.size();
|
||||||
dim_array[i] = dim[i];
|
|
||||||
}
|
|
||||||
int paddings[dim_size * 2];
|
int paddings[dim_size * 2];
|
||||||
|
|
||||||
std::vector<int> pads = op->getPads();
|
std::vector<int> pads = op->getPads();
|
||||||
|
|
||||||
if (pads.size() == 2 && dim_size != 1) {
|
if (pads.size() == 2 && dim_size != 1) {
|
||||||
for (int i = 0; i < dim_size * 2; i += 2) {
|
for (int i = 0; i < dim_size * 2; i += 2) {
|
||||||
paddings[i] = pads[0];
|
paddings[i] = pads[0];
|
||||||
|
@ -32,20 +32,18 @@ class PadCnnl : public BangKernelWithoutConfig {
|
||||||
paddings[i + 1] = pads[i / 2 + dim_size];
|
paddings[i + 1] = pads[i / 2 + dim_size];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int dimout_array[dim_size];
|
|
||||||
for (int i = 0; i < dim_size; ++i) {
|
|
||||||
dimout_array[i] = dim[i] + paddings[2 * i] + paddings[2 * i + 1];
|
|
||||||
}
|
|
||||||
float paddingValue = 0.0;
|
float paddingValue = 0.0;
|
||||||
// input
|
// input
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptor(
|
checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_ARRAY,
|
||||||
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, dim_size, dim_array));
|
CNNL_DTYPE_FLOAT, dimIn.size(),
|
||||||
|
dimIn.data()));
|
||||||
// output
|
// output
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_ARRAY,
|
||||||
CNNL_DTYPE_FLOAT, dim_size,
|
CNNL_DTYPE_FLOAT, dimOut.size(),
|
||||||
dimout_array));
|
dimOut.data()));
|
||||||
|
|
||||||
cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
|
cnnlStatus_t stat = cnnlPad(context->cnnlHandle(), aDesc, aData,
|
||||||
paddings, &paddingValue, cDesc, cData);
|
paddings, &paddingValue, cDesc, cData);
|
||||||
|
|
|
@ -21,13 +21,14 @@ class PoolingCnnl : public BangKernelWithoutConfig {
|
||||||
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
|
||||||
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
|
checkCnnlError(cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_NCHW,
|
||||||
CNNL_DTYPE_FLOAT, 4, inArray));
|
CNNL_DTYPE_FLOAT, 4, inArray));
|
||||||
|
bool mode = op->getCeilMode();
|
||||||
|
|
||||||
// get maxpool descriptor
|
// get maxpool descriptor
|
||||||
cnnlPoolingDescriptor_t poolingDesc;
|
cnnlPoolingDescriptor_t poolingDesc;
|
||||||
checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
|
checkCnnlError(cnnlCreatePoolingDescriptor(&poolingDesc));
|
||||||
checkCnnlError(cnnlSetPooling2dDescriptor_v2(
|
checkCnnlError(cnnlSetPooling2dDescriptor_v2(
|
||||||
poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
|
poolingDesc, getPoolingMode(), CNNL_NOT_PROPAGATE_NAN, kh, kw, ph,
|
||||||
ph, pw, pw, sh, sw, dh, dw, false));
|
ph, pw, pw, sh, sw, dh, dw, mode));
|
||||||
|
|
||||||
// get outputs
|
// get outputs
|
||||||
// TODO: verify ceiling mode
|
// TODO: verify ceiling mode
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
|
#include "operators/reduce.h"
|
||||||
#include "bang/bang_kernel_without_config.h"
|
#include "bang/bang_kernel_without_config.h"
|
||||||
#include "bang/bang_runtime.h"
|
#include "bang/bang_runtime.h"
|
||||||
#include "operators/reduce.h"
|
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
class ReduceMeanCnnl : public BangKernelWithoutConfig {
|
class ReduceCnnlBase : public BangKernelWithoutConfig {
|
||||||
|
virtual cnnlReduceOp_t getReduceOp() const = 0;
|
||||||
|
|
||||||
void compute(const Operator &_op,
|
void compute(const Operator &_op,
|
||||||
const RuntimeObj *_context) const override {
|
const RuntimeObj *_context) const override {
|
||||||
auto op = as<ReduceMeanObj>(_op);
|
auto op = as<ReduceBaseObj>(_op);
|
||||||
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
@ -34,7 +36,7 @@ class ReduceMeanCnnl : public BangKernelWithoutConfig {
|
||||||
cnnlReduceDescriptor_t reduceDesc;
|
cnnlReduceDescriptor_t reduceDesc;
|
||||||
checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
|
checkCnnlError(cnnlCreateReduceDescriptor(&reduceDesc));
|
||||||
checkCnnlError(cnnlSetReduceDescriptor_v2(
|
checkCnnlError(cnnlSetReduceDescriptor_v2(
|
||||||
reduceDesc, axes.data(), axes.size(), CNNL_REDUCE_AVG,
|
reduceDesc, axes.data(), axes.size(), getReduceOp(),
|
||||||
CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
|
CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
|
||||||
CNNL_32BIT_INDICES, 0.0));
|
CNNL_32BIT_INDICES, 0.0));
|
||||||
|
|
||||||
|
@ -63,7 +65,17 @@ class ReduceMeanCnnl : public BangKernelWithoutConfig {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class ReduceMeanCnnl : public ReduceCnnlBase {
|
||||||
|
cnnlReduceOp_t getReduceOp() const override { return CNNL_REDUCE_AVG; }
|
||||||
|
};
|
||||||
|
|
||||||
|
class ReduceSumCnnl : public ReduceCnnlBase {
|
||||||
|
cnnlReduceOp_t getReduceOp() const override { return CNNL_REDUCE_ADD; }
|
||||||
|
};
|
||||||
|
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::ReduceMean, DataType::Float32,
|
REGISTER_KERNEL(Device::BANG, OpType::ReduceMean, DataType::Float32,
|
||||||
ReduceMeanCnnl, "ReduceMean_cnnl_BANG_Float32");
|
ReduceMeanCnnl, "ReduceMean_cnnl_BANG_Float32");
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::ReduceSum, DataType::Float32,
|
||||||
|
ReduceSumCnnl, "ReduceSum_cnnl_BANG_Float32");
|
||||||
|
|
||||||
}; // namespace infini
|
}; // namespace infini
|
|
@ -27,6 +27,8 @@ class CopyBang : public BangKernelWithoutConfig {
|
||||||
// reshape/flatten/identity all act as copying from input to output.
|
// reshape/flatten/identity all act as copying from input to output.
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Float32, CopyBang,
|
REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Float32, CopyBang,
|
||||||
"Reshape_BANG_Float32");
|
"Reshape_BANG_Float32");
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::Reshape, DataType::Int64, CopyBang,
|
||||||
|
"Reshape_BANG_Int64");
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::Flatten, DataType::Float32, CopyBang,
|
REGISTER_KERNEL(Device::BANG, OpType::Flatten, DataType::Float32, CopyBang,
|
||||||
"Flatten_BANG_Float32");
|
"Flatten_BANG_Float32");
|
||||||
REGISTER_KERNEL(Device::BANG, OpType::Identity, DataType::Float32, CopyBang,
|
REGISTER_KERNEL(Device::BANG, OpType::Identity, DataType::Float32, CopyBang,
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
#include "operators/slice.h"
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
class SliceCnnl : public BangKernelWithoutConfig {
|
||||||
|
void compute(const Operator &_op,
|
||||||
|
const RuntimeObj *_context) const override {
|
||||||
|
auto op = as<SliceObj>(_op);
|
||||||
|
auto context = dynamic_cast<const BangRuntimeObj *>(_context);
|
||||||
|
|
||||||
|
auto starts = op->getStarts();
|
||||||
|
auto ends = op->getEnds();
|
||||||
|
auto steps = op->getSteps();
|
||||||
|
|
||||||
|
int32_t starts_array[starts.size()];
|
||||||
|
int32_t ends_array[ends.size()];
|
||||||
|
int32_t steps_array[steps.size()];
|
||||||
|
|
||||||
|
for (size_t i = 0; i < starts.size(); i++) {
|
||||||
|
starts_array[i] = starts[i];
|
||||||
|
ends_array[i] = ends[i];
|
||||||
|
steps_array[i] = steps[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
|
void *const cData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
|
auto aDim = op->getInputs(0)->getDims();
|
||||||
|
int aDim_size = aDim.size();
|
||||||
|
int aDim_array[aDim_size];
|
||||||
|
for (int i = 0; i < aDim_size; ++i) {
|
||||||
|
aDim_array[i] = aDim[i];
|
||||||
|
}
|
||||||
|
auto cDim = op->getOutput()->getDims();
|
||||||
|
int cDim_size = cDim.size();
|
||||||
|
int cDim_array[cDim_size];
|
||||||
|
for (int i = 0; i < cDim_size; ++i) {
|
||||||
|
cDim_array[i] = cDim[i];
|
||||||
|
}
|
||||||
|
cnnlTensorDescriptor_t aDesc, cDesc;
|
||||||
|
// input
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
aDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, aDim_size, aDim_array));
|
||||||
|
// output
|
||||||
|
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||||
|
checkCnnlError(cnnlSetTensorDescriptor(
|
||||||
|
cDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, cDim_size, cDim_array));
|
||||||
|
|
||||||
|
cnnlStatus_t stat =
|
||||||
|
cnnlStridedSlice(context->cnnlHandle(), aDesc, aData, starts_array,
|
||||||
|
ends_array, steps_array, cDesc, cData);
|
||||||
|
if (stat != CNNL_STATUS_SUCCESS)
|
||||||
|
return;
|
||||||
|
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
|
||||||
|
checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL(Device::BANG, OpType::Slice, DataType::Float32, SliceCnnl,
|
||||||
|
"Slice_cnnl_BANG_Float32");
|
||||||
|
}; // namespace infini
|
|
@ -1,5 +1,6 @@
|
||||||
#include "operators/element_wise.h"
|
#include "operators/element_wise.h"
|
||||||
#include "core/kernel.h"
|
#include "core/kernel.h"
|
||||||
|
#include "utils/operator_utils.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
||||||
|
@ -11,37 +12,34 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
||||||
T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
|
T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
|
||||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||||
|
|
||||||
int a[4] = {1, 1, 1, 1};
|
auto shapeA = op->getInputs(0)->getDims();
|
||||||
int b[4] = {1, 1, 1, 1};
|
auto shapeB = op->getInputs(1)->getDims();
|
||||||
int c[4] = {1, 1, 1, 1};
|
auto shapeC = op->getOutput()->getDims();
|
||||||
auto a_input = op->getInputs(0)->getDims();
|
auto rank = op->getOutput()->getRank();
|
||||||
auto b_input = op->getInputs(1)->getDims();
|
Shape a(rank, 1);
|
||||||
auto c_output = op->getOutput()->getDims();
|
Shape b(rank, 1);
|
||||||
std::copy(a_input.begin(), a_input.end(), a + (4 - a_input.size()));
|
std::copy(shapeA.begin(), shapeA.end(),
|
||||||
std::copy(b_input.begin(), b_input.end(), b + (4 - b_input.size()));
|
a.begin() + (rank - shapeA.size()));
|
||||||
std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size()));
|
std::copy(shapeB.begin(), shapeB.end(),
|
||||||
|
b.begin() + (rank - shapeB.size()));
|
||||||
|
auto getStride = [&](const Shape &shape) {
|
||||||
|
int p = 1;
|
||||||
|
Shape stride(rank);
|
||||||
|
for (auto i = rank; i > 0; --i) {
|
||||||
|
stride[i - 1] = p;
|
||||||
|
p = p * shape[i - 1];
|
||||||
|
}
|
||||||
|
return stride;
|
||||||
|
};
|
||||||
|
Shape strideA = getStride(a);
|
||||||
|
Shape strideB = getStride(b);
|
||||||
|
|
||||||
auto n = op->getOutput()->size();
|
auto n = op->getOutput()->size();
|
||||||
for (size_t i = 0; i < n; ++i) {
|
for (size_t i = 0; i < n; ++i) {
|
||||||
int c0_index = i / (c[1] * c[2] * c[3]);
|
auto shapeIndexC = locate_index(i, shapeC);
|
||||||
int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]);
|
auto indexA = delocate_index(shapeIndexC, a, strideA);
|
||||||
int c2_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) / c[3];
|
auto indexB = delocate_index(shapeIndexC, b, strideB);
|
||||||
int c3_index = ((i % (c[1] * c[2] * c[3])) % (c[2] * c[3])) % c[3];
|
outptr[i] = doCompute(inptr0[indexA], inptr1[indexB]);
|
||||||
|
|
||||||
int a0_index = c0_index % a[0];
|
|
||||||
int a1_index = c1_index % a[1];
|
|
||||||
int a2_index = c2_index % a[2];
|
|
||||||
int a3_index = c3_index % a[3];
|
|
||||||
|
|
||||||
int b0_index = c0_index % b[0];
|
|
||||||
int b1_index = c1_index % b[1];
|
|
||||||
int b2_index = c2_index % b[2];
|
|
||||||
int b3_index = c3_index % b[3];
|
|
||||||
outptr[i] = doCompute(
|
|
||||||
inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] +
|
|
||||||
a2_index * a[3] + a3_index],
|
|
||||||
inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] +
|
|
||||||
b2_index * b[3] + b3_index]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -35,7 +35,7 @@ __global__ void _pad_slice_kernel(T *part, T *whole, TransMetaData metaData,
|
||||||
whole[tid] = 0;
|
whole[tid] = 0;
|
||||||
else
|
else
|
||||||
whole[tid] = part[offset];
|
whole[tid] = part[offset];
|
||||||
else
|
else if (offset >= 0)
|
||||||
part[offset] = whole[tid];
|
part[offset] = whole[tid];
|
||||||
tid += stride;
|
tid += stride;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
#include "operators/lrn.h"
|
||||||
|
#include "utils/operator_utils.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
LRNObj::LRNObj(GraphObj *graph, Tensor input, Tensor output, float alpha,
|
||||||
|
float beta, float bias, int size)
|
||||||
|
: OperatorObj(OpType::LRN, TensorVec{input}, {output}), alpha_value(alpha),
|
||||||
|
beta_value(beta), bias_value(bias), size_value(size) {
|
||||||
|
IT_ASSERT(checkValid(graph));
|
||||||
|
}
|
||||||
|
|
||||||
|
optional<vector<Shape>> LRNObj::inferShape(const TensorVec &inputs) {
|
||||||
|
const auto A = inputs[0];
|
||||||
|
return {{A->getDims()}};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string LRNObj::toString() const {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "LRN[" << getGuid() << "]";
|
||||||
|
os << "(";
|
||||||
|
os << vecToString(inputs[0]->getDims()) << ",";
|
||||||
|
os << "input=" << inputs[0]->getGuid() << ",";
|
||||||
|
os << "output=" << outputs[0]->getGuid() << ")";
|
||||||
|
return os.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<int> LRNObj::getWorkloadVector() const {
|
||||||
|
vector<int> ret = getOutput()->getDims();
|
||||||
|
ret.emplace(ret.begin(), type.underlying());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<int> LRNObj::getOpAttrVector() const { return {type.underlying()}; }
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -1,4 +1,5 @@
|
||||||
#include "utils/operator_utils.h"
|
#include "utils/operator_utils.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
|
@ -64,4 +65,54 @@ bool is_unidirectional_broadcasting(const Shape &A, const Shape &B) {
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Shape locate_index(size_t inputN, const Shape &shape) {
|
||||||
|
Shape ans(shape.size());
|
||||||
|
auto i = ans.rbegin();
|
||||||
|
auto j = shape.rbegin(), ej = shape.rend();
|
||||||
|
while (j != ej) {
|
||||||
|
auto div = std::div(inputN, *j++);
|
||||||
|
*i++ = div.rem;
|
||||||
|
inputN = div.quot;
|
||||||
|
}
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
|
||||||
|
const Shape &stride) {
|
||||||
|
size_t ans = 0;
|
||||||
|
Shape index(shapeIndex.size());
|
||||||
|
IT_ASSERT(shapeIndex.size() == shape.size());
|
||||||
|
IT_ASSERT(shape.size() == stride.size());
|
||||||
|
for (size_t i = 0; i < shape.size(); ++i) {
|
||||||
|
index[i] = shapeIndex[i] % shape[i];
|
||||||
|
ans += index[i] * stride[i];
|
||||||
|
}
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string device_to_str(Device device) {
|
||||||
|
std::string deviceStr;
|
||||||
|
switch (device) {
|
||||||
|
case Device::CPU:
|
||||||
|
return "CPU";
|
||||||
|
case Device::CUDA:
|
||||||
|
return "CUDA";
|
||||||
|
case Device::BANG:
|
||||||
|
return "BANG";
|
||||||
|
case Device::INTELCPU:
|
||||||
|
return "INTELCPU";
|
||||||
|
case Device::KUNLUN:
|
||||||
|
return "KUNLUN";
|
||||||
|
default:
|
||||||
|
IT_TODO_HALT();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs) {
|
||||||
|
std::string deviceStr = device_to_str(std::get<0>(kernelAttrs));
|
||||||
|
std::string opStr = OpType(std::get<1>(kernelAttrs)).toString();
|
||||||
|
std::string datatypeStr = std::get<2>(kernelAttrs).toString();
|
||||||
|
return deviceStr + ", " + opStr + ", " + datatypeStr;
|
||||||
|
}
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
static int WORLD_SIZE = 2;
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void allReduceSum(float *data, int deviceId) {
|
||||||
|
// Create Runtime and setup communication
|
||||||
|
BangRuntimeObj *bang_runtime = new BangRuntimeObj(deviceId);
|
||||||
|
int rank = deviceId;
|
||||||
|
bang_runtime->initComm("test_cncl_comm", WORLD_SIZE, rank);
|
||||||
|
cnclComm_t comm =
|
||||||
|
dynamic_cast<CnclCommunicatorObj &>(bang_runtime->getCommunicator())
|
||||||
|
.getCnclComm();
|
||||||
|
cnrtQueue_t queue = bang_runtime->getBangQueue();
|
||||||
|
// Copy data
|
||||||
|
float *data_mlu;
|
||||||
|
checkBangError(cnrtMalloc((void **)&data_mlu, sizeof(float)));
|
||||||
|
checkBangError(
|
||||||
|
cnrtMemcpy(data_mlu, data, sizeof(float), cnrtMemcpyHostToDev));
|
||||||
|
// Do AllReduce
|
||||||
|
CNCL_CHECK(
|
||||||
|
cnclAllReduce(data_mlu, data_mlu, 1, cnclFloat, cnclSum, comm, queue));
|
||||||
|
|
||||||
|
checkBangError(cnrtQueueSync(queue));
|
||||||
|
// Copy data back and sync device
|
||||||
|
checkBangError(
|
||||||
|
cnrtMemcpy(data, data_mlu, sizeof(float), cnrtMemcpyDevToHost));
|
||||||
|
ASSERT_EQ(*data, 5.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup communication between 2 threads, each controlling 1 MLU.
|
||||||
|
// Do AllReduce Sum on {1.0, 4.0}. Results should be {5.0, 5.0}.
|
||||||
|
TEST(CNCL, multi_mlu_communication) {
|
||||||
|
float data[] = {1.0, 4.0};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allReduceSum(&data[i], i);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
||||||
|
#endif
|
|
@ -0,0 +1,60 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/all_gather.h"
|
||||||
|
#include "test.h"
|
||||||
|
#include <cncl.h>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
static int WORLD_SIZE = 2;
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void allGather(const string taskName, int deviceID, vector<float> data,
|
||||||
|
vector<vector<float>> ans) {
|
||||||
|
// Create Runtimes and initiate communication
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Runtime bangRuntime = make_ref<BangRuntimeObj>(deviceID);
|
||||||
|
bangRuntime->initComm(taskName, WORLD_SIZE, deviceID);
|
||||||
|
// Create Graph and insert allReduce operation
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto input =
|
||||||
|
g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
|
||||||
|
auto op = g->addOp<AllGatherObj>(input, std::nullopt, WORLD_SIZE);
|
||||||
|
// Copy data from CPU to MLU
|
||||||
|
g->dataMalloc();
|
||||||
|
input->copyin(data);
|
||||||
|
// Run operation
|
||||||
|
bangRuntime->run(g);
|
||||||
|
// Copy output from MLU to CPU
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
auto result = op->getOutputs()[i]->clone(cpuRuntime);
|
||||||
|
EXPECT_TRUE(result->equalData(ans[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_AllGather, run) {
|
||||||
|
vector<float> data[2] = {{2., 3.}, {5., 6.}};
|
||||||
|
vector<vector<float>> ans = {{2., 3.}, {5., 6.}};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allGather("test_all_gather", i, data[i], ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
||||||
|
#endif
|
|
@ -0,0 +1,124 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/all_reduce.h"
|
||||||
|
#include "test.h"
|
||||||
|
#include <cncl.h>
|
||||||
|
#include <future>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
static int WORLD_SIZE = 2;
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <typename OperatorObj>
|
||||||
|
void allReduce(const string taskName, int deviceID, vector<float> data,
|
||||||
|
vector<float> ans) {
|
||||||
|
// Create Runtimes and initiate communication
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Runtime bangRuntime = make_ref<BangRuntimeObj>(deviceID);
|
||||||
|
bangRuntime->initComm(taskName, WORLD_SIZE, deviceID);
|
||||||
|
// Create Graph and insert allReduce operation
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto input =
|
||||||
|
g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
|
||||||
|
auto op = g->addOp<OperatorObj>(input, nullptr);
|
||||||
|
// Copy data from CPU to MLU
|
||||||
|
g->dataMalloc();
|
||||||
|
input->copyin(data);
|
||||||
|
// Run operation
|
||||||
|
bangRuntime->run(g);
|
||||||
|
// Copy output from MLU to CPU
|
||||||
|
auto result = op->getOutput()->clone(cpuRuntime);
|
||||||
|
|
||||||
|
EXPECT_TRUE(result->equalData(ans));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_AllReduce, sum) {
|
||||||
|
vector<float> data[2] = {{2., 3.}, {5., 6.}};
|
||||||
|
vector<float> ans = {7., 9.};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allReduce<AllReduceSumObj>("test_allreduce_sum", i, data[i], ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_AllReduce, prod) {
|
||||||
|
vector<float> data[2] = {{2., 3.}, {5., 6.}};
|
||||||
|
vector<float> ans = {10., 18.};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allReduce<AllReduceProdObj>("test_allreduce_prod", i, data[i], ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_AllReduce, min) {
|
||||||
|
vector<float> data[2] = {{2., 3.}, {5., 6.}};
|
||||||
|
vector<float> ans = {2., 3.};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allReduce<AllReduceMinObj>("test_allreduce_min", i, data[i], ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_AllReduce, max) {
|
||||||
|
vector<float> data[2] = {{2., 3.}, {5., 6.}};
|
||||||
|
vector<float> ans = {5., 6.};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
allReduce<AllReduceMaxObj>("test_allreduce_max", i, data[i], ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
||||||
|
#endif
|
|
@ -0,0 +1,57 @@
|
||||||
|
#include "bang/bang_kernel_without_config.h"
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/batch_norm.h"
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
TEST(BANG_BatchNorm, run) {
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build cpu graph
|
||||||
|
Graph gCpu = make_ref<GraphObj>(cpuRuntime);
|
||||||
|
auto iCpu = gCpu->addTensor(Shape{1, 3, 2, 2}, DataType::Float32);
|
||||||
|
auto meanCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
|
||||||
|
auto varCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
|
||||||
|
auto scaleCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
|
||||||
|
auto biasCpu = gCpu->addTensor(Shape{3}, DataType::Float32);
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
iCpu->setData(IncrementalGenerator());
|
||||||
|
meanCpu->copyin(vector<float>{1, 6, 9});
|
||||||
|
varCpu->copyin(vector<float>{4, 1, 9});
|
||||||
|
scaleCpu->setData(OneGenerator());
|
||||||
|
biasCpu->setData(ZeroGenerator());
|
||||||
|
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
|
||||||
|
auto i = g->cloneTensor(iCpu);
|
||||||
|
auto mean = g->cloneTensor(meanCpu);
|
||||||
|
auto var = g->cloneTensor(varCpu);
|
||||||
|
auto scale = g->cloneTensor(scaleCpu);
|
||||||
|
auto bias = g->cloneTensor(biasCpu);
|
||||||
|
auto op =
|
||||||
|
g->addOp<BatchNormObj>(i, nullptr, mean, var, scale, bias, 0.9, 0);
|
||||||
|
|
||||||
|
g->dataMalloc();
|
||||||
|
i->setData(IncrementalGenerator());
|
||||||
|
mean->copyin(vector<float>{1, 6, 9});
|
||||||
|
var->copyin(vector<float>{4, 1, 9});
|
||||||
|
scale->setData(OneGenerator());
|
||||||
|
bias->setData(ZeroGenerator());
|
||||||
|
|
||||||
|
bangRuntime->run(g);
|
||||||
|
|
||||||
|
auto o = op->getOutput();
|
||||||
|
auto ocpu = o->clone(cpuRuntime);
|
||||||
|
|
||||||
|
// check results on CPU
|
||||||
|
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 2}));
|
||||||
|
EXPECT_TRUE(ocpu->equalData(vector<float>{
|
||||||
|
-0.5, 0, 0.5, 1, -2, -1, 0, 1, -0.333333, 0, 0.3333333, 0.6666667}));
|
||||||
|
}
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,65 @@
|
||||||
|
#ifdef INFINI_USE_CNCL
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "bang/cncl_communicator.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/broadcast.h"
|
||||||
|
#include "test.h"
|
||||||
|
#include <cncl.h>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
static int WORLD_SIZE = 2;
|
||||||
|
static int root = 0;
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void broadcast(const string taskName, int deviceID, vector<float> data,
|
||||||
|
vector<float> ans) {
|
||||||
|
// Create Runtimes and initiate communication
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Runtime bangRuntime = make_ref<BangRuntimeObj>(deviceID);
|
||||||
|
bangRuntime->initComm(taskName, WORLD_SIZE, deviceID);
|
||||||
|
// Create Graph and insert allReduce operation
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto input =
|
||||||
|
g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
|
||||||
|
auto op = g->addOp<BroadcastObj>(input, nullptr, root);
|
||||||
|
// Copy data from CPU to GPU
|
||||||
|
g->dataMalloc();
|
||||||
|
// Only rank 0 has the data
|
||||||
|
if (deviceID == root) {
|
||||||
|
input->copyin(data);
|
||||||
|
}
|
||||||
|
// Run broadcast operation
|
||||||
|
bangRuntime->run(g);
|
||||||
|
// Copy output from GPU to CPU
|
||||||
|
auto result = op->getOutput()->clone(cpuRuntime);
|
||||||
|
|
||||||
|
EXPECT_TRUE(result->equalData(ans));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_Broadcast, run) {
|
||||||
|
// Only 1 device gets data. Every rank should have the same data after
|
||||||
|
// broadcast.
|
||||||
|
vector<float> data = {2., 3., 5., 6.};
|
||||||
|
vector<float> ans = {2., 3., 5., 6.};
|
||||||
|
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process
|
||||||
|
broadcast("test_broadcast", i, data, ans);
|
||||||
|
exit(0); // Ensure child process exits to avoid unnecessary
|
||||||
|
// repetition in parent
|
||||||
|
} else if (pid < 0) {
|
||||||
|
std::cerr << "Error creating process" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Wait for all child processes to finish
|
||||||
|
for (int i = 0; i < WORLD_SIZE; ++i) {
|
||||||
|
wait(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
||||||
|
#endif
|
|
@ -32,6 +32,8 @@ void testConcat(const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
auto gpuOp =
|
auto gpuOp =
|
||||||
bangGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
|
bangGraph->addOp<T>(TensorVec{inputGpu1, inputGpu2}, nullptr, 2);
|
||||||
bangGraph->dataMalloc();
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu1->setData(generator);
|
||||||
|
inputGpu2->setData(generator);
|
||||||
bangRuntime->run(bangGraph);
|
bangRuntime->run(bangGraph);
|
||||||
auto outputGpu = gpuOp->getOutput();
|
auto outputGpu = gpuOp->getOutput();
|
||||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
|
|
@ -18,8 +18,14 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
|
|
||||||
// Build input data on CPU
|
// Build input data on CPU
|
||||||
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||||
inputCpu->dataMalloc();
|
Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);
|
||||||
|
auto cpuOp =
|
||||||
|
cpuGraph->addOp<T>(inputCpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
|
||||||
|
cpuGraph->addTensor(inputCpu);
|
||||||
|
cpuGraph->dataMalloc();
|
||||||
inputCpu->setData(generator);
|
inputCpu->setData(generator);
|
||||||
|
cpuRuntime->run(cpuGraph);
|
||||||
|
auto outputCpu = cpuOp->getOutput();
|
||||||
|
|
||||||
// GPU
|
// GPU
|
||||||
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
@ -27,17 +33,16 @@ void testPooling(const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
auto gpuOp =
|
auto gpuOp =
|
||||||
bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
|
bangGraph->addOp<T>(inputGpu, nullptr, 3, 3, 1, 1, 1, 1, 2, 2, 0);
|
||||||
bangGraph->dataMalloc();
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->setData(generator);
|
||||||
bangRuntime->run(bangGraph);
|
bangRuntime->run(bangGraph);
|
||||||
auto outputGpu = gpuOp->getOutput();
|
auto outputGpu = gpuOp->getOutput();
|
||||||
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
inputCpu->printData();
|
|
||||||
outputGpu2Cpu->printData();
|
|
||||||
EXPECT_TRUE(1);
|
EXPECT_TRUE(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(cnnl_Pooling, run) {
|
TEST(cnnl_Pooling, run) {
|
||||||
testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
|
testPooling<MaxPoolObj>(IncrementalGenerator(), Shape{1, 3, 5, 5});
|
||||||
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 1, 5, 5});
|
testPooling<AvgPoolObj>(IncrementalGenerator(), Shape{1, 3, 5, 5});
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/reduce.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
template <typename ReduceObjT>
|
||||||
|
void test_reduce(const Shape &shape, const vector<float> &data,
|
||||||
|
const optional<const vector<int>> &axis, bool keepDims,
|
||||||
|
const vector<float> &ExpectData) {
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor icpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// Build BANG graph
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto i = g->cloneTensor(icpu);
|
||||||
|
auto op = g->addOp<ReduceObjT>(i, nullptr, axis, keepDims);
|
||||||
|
|
||||||
|
// allocate BANG memory
|
||||||
|
g->dataMalloc();
|
||||||
|
i->copyin(data);
|
||||||
|
|
||||||
|
// Execute on BANG
|
||||||
|
bangRuntime->run(g);
|
||||||
|
|
||||||
|
// clone BANG output to CPU
|
||||||
|
auto o = op->getOutput();
|
||||||
|
auto ocpu = o->clone(cpuRuntime);
|
||||||
|
|
||||||
|
// check results on CPU
|
||||||
|
EXPECT_TRUE(ocpu->equalData(ExpectData));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_ReduceMean, run) {
|
||||||
|
test_reduce<ReduceMeanObj>(
|
||||||
|
Shape{3, 2, 2}, vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2},
|
||||||
|
std::nullopt, true, vector<float>{18.25});
|
||||||
|
test_reduce<ReduceMeanObj>(
|
||||||
|
Shape{1, 3, 2, 2, 1},
|
||||||
|
vector<float>{5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2}, std::nullopt,
|
||||||
|
false, vector<float>{18.25});
|
||||||
|
|
||||||
|
test_reduce<ReduceMeanObj>(
|
||||||
|
Shape{2, 3, 2, 2},
|
||||||
|
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||||
|
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
|
||||||
|
vector<int>{1, 2}, false, vector<float>{5, 6, 17, 18});
|
||||||
|
test_reduce<ReduceMeanObj>(
|
||||||
|
Shape{2, 3, 2, 2, 1},
|
||||||
|
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||||
|
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
|
||||||
|
vector<int>{1, 2}, true, vector<float>{5, 6, 17, 18});
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(BANG_ReduceSum, run) {
|
||||||
|
test_reduce<ReduceSumObj>(Shape{3, 2, 2},
|
||||||
|
vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||||
|
std::nullopt, true, vector<float>{12});
|
||||||
|
test_reduce<ReduceSumObj>(Shape{1, 3, 2, 2, 1},
|
||||||
|
vector<float>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||||
|
std::nullopt, false, vector<float>{12});
|
||||||
|
|
||||||
|
test_reduce<ReduceSumObj>(
|
||||||
|
Shape{2, 3, 2, 2},
|
||||||
|
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||||
|
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
|
||||||
|
vector<int>{1, 2}, false, vector<float>{30, 36, 102, 108});
|
||||||
|
test_reduce<ReduceSumObj>(
|
||||||
|
Shape{2, 3, 2, 2, 1},
|
||||||
|
vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||||
|
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
|
||||||
|
vector<int>{1, 2}, true, vector<float>{30, 36, 102, 108});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,39 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/slice.h"
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
TEST(BANG_Slice, run) {
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor icpu =
|
||||||
|
make_ref<TensorObj>(Shape{3, 2, 1, 5}, DataType::Float32, cpuRuntime);
|
||||||
|
icpu->dataMalloc();
|
||||||
|
icpu->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
// Build CUDA graph;
|
||||||
|
Graph g = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto i = g->cloneTensor(icpu);
|
||||||
|
auto op =
|
||||||
|
g->addOp<SliceObj>(i, nullptr, vector<int>{1, 1}, vector<int>{2, 5},
|
||||||
|
vector<int>{0, 3}, std::nullopt);
|
||||||
|
|
||||||
|
// allocate CUDA memory
|
||||||
|
g->dataMalloc();
|
||||||
|
i->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
// Execute on CUDA
|
||||||
|
bangRuntime->run(g);
|
||||||
|
|
||||||
|
// clone CUDA output to CPU
|
||||||
|
auto o = op->getOutput();
|
||||||
|
auto cpuo = o->clone(cpuRuntime);
|
||||||
|
// bangPrintTensor(o);
|
||||||
|
// check results on CPU
|
||||||
|
EXPECT_TRUE(cpuo->equalData(vector<float>{11, 12, 13, 14, 16, 17, 18, 19}));
|
||||||
|
}
|
||||||
|
} // namespace infini
|
|
@ -0,0 +1,131 @@
|
||||||
|
#include "bang/bang_runtime.h"
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/kernel.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/softmax.h"
|
||||||
|
#include "test.h"
|
||||||
|
#include <cmath>
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
TEST(cuDNN_Softmax, run_axis1) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu =
|
||||||
|
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||||
|
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(outputGpu2Cpu->equalData(
|
||||||
|
vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
|
||||||
|
0.032058604, 0.08714432, 0.23688284, 0.6439143}));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cuDNN_Softmax, run_axis0) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu =
|
||||||
|
make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||||
|
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(
|
||||||
|
outputGpu2Cpu->equalData(vector<float>{0., 0., 0., 0., 1, 1, 1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cuDNN_Softmax2, run_axis1) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu =
|
||||||
|
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||||
|
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->setData(IncrementalGenerator());
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
|
||||||
|
0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
|
||||||
|
0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
|
||||||
|
0.9820138, 0.9820138, 0.9820138, 0.9820138}));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cuDNN_Softmax2, run_axis2) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu =
|
||||||
|
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||||
|
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->setData(IncrementalGenerator());
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
|
||||||
|
0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
|
||||||
|
0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
|
||||||
|
0.1192029, 0.1192029, 0.8807971, 0.8807971}));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(cuDNN_Softmax2, run_axis3) {
|
||||||
|
// Runtime
|
||||||
|
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
auto bangRuntime = make_ref<BangRuntimeObj>();
|
||||||
|
|
||||||
|
// Build input data on CPU
|
||||||
|
Tensor inputCpu =
|
||||||
|
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
|
||||||
|
auto inputGpu = bangGraph->cloneTensor(inputCpu);
|
||||||
|
auto gpuOp = bangGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
|
||||||
|
bangGraph->dataMalloc();
|
||||||
|
inputGpu->setData(IncrementalGenerator());
|
||||||
|
bangRuntime->run(bangGraph);
|
||||||
|
auto outputGpu = gpuOp->getOutput();
|
||||||
|
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
|
||||||
|
// Check
|
||||||
|
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
|
||||||
|
0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
|
||||||
|
0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
|
||||||
|
0.2689414, 0.7310586, 0.2689414, 0.7310586}));
|
||||||
|
}
|
||||||
|
} // namespace infini
|
|
@ -73,6 +73,38 @@ TEST(Split, CudaHigh) {
|
||||||
44., 45., 46., 47.}));
|
44., 45., 46., 47.}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(Split, SplitWithRatio) {
|
||||||
|
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||||
|
|
||||||
|
auto input = gCpu->addTensor({2, 6, 2, 1, 2}, DataType::Float32);
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
input->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||||
|
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
|
||||||
|
|
||||||
|
auto inputGpu = gCuda->cloneTensor(input);
|
||||||
|
vector<int> split = {2, 4};
|
||||||
|
auto op = gCuda->addOp<SplitObj>(inputGpu, std::nullopt, 1, split);
|
||||||
|
gCuda->dataMalloc();
|
||||||
|
inputGpu->setData(IncrementalGenerator());
|
||||||
|
|
||||||
|
cudaRuntime->run(gCuda);
|
||||||
|
|
||||||
|
// copy output from CUDA to CPU
|
||||||
|
EXPECT_EQ(op->getOutputs().size(), (size_t)2);
|
||||||
|
auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
|
||||||
|
auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
|
||||||
|
EXPECT_TRUE(
|
||||||
|
o0Cpu->equalData(vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 24., 25.,
|
||||||
|
26., 27., 28., 29., 30., 31.}));
|
||||||
|
EXPECT_TRUE(o1Cpu->equalData(
|
||||||
|
vector<float>{8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18.,
|
||||||
|
19., 20., 21., 22., 23., 32., 33., 34., 35., 36., 37.,
|
||||||
|
38., 39., 40., 41., 42., 43., 44., 45., 46., 47.}));
|
||||||
|
}
|
||||||
|
|
||||||
TEST(Split, Cuda_dim0) {
|
TEST(Split, Cuda_dim0) {
|
||||||
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||||
Graph gCpu = make_ref<GraphObj>(runtime);
|
Graph gCpu = make_ref<GraphObj>(runtime);
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
#include "core/graph.h"
|
||||||
|
#include "core/runtime.h"
|
||||||
|
#include "operators/element_wise.h"
|
||||||
|
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
using ExpectOutput = vector<float>;
|
||||||
|
template <class T>
|
||||||
|
void testElementWiseNativeCpu(
|
||||||
|
const std::function<void(void *, size_t, DataType)> &generator1,
|
||||||
|
const std::function<void(void *, size_t, DataType)> &generator2,
|
||||||
|
const Shape &shape1, const Shape &shape2, const ExpectOutput &ansVec) {
|
||||||
|
Runtime runtime = NativeCpuRuntimeObj::getInstance();
|
||||||
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
|
auto t1 = g->addTensor(shape1, DataType::Float32);
|
||||||
|
auto t2 = g->addTensor(shape2, DataType::Float32);
|
||||||
|
|
||||||
|
auto op = g->addOp<T>(t1, t2, nullptr);
|
||||||
|
g->dataMalloc();
|
||||||
|
t1->setData(generator1);
|
||||||
|
t2->setData(generator2);
|
||||||
|
|
||||||
|
runtime->run(g);
|
||||||
|
EXPECT_TRUE(op->getOutput()->equalData(ansVec));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(ElementWise, NativeCpu) {
|
||||||
|
testElementWiseNativeCpu<AddObj>(
|
||||||
|
IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
|
||||||
|
Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 4, 5, 6, 6, 7, 8, 10, 11, 12});
|
||||||
|
testElementWiseNativeCpu<MulObj>(
|
||||||
|
IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
|
||||||
|
Shape{2, 1, 1}, ExpectOutput{0, 0, 0, 3, 4, 5, 0, 0, 0, 9, 10, 11});
|
||||||
|
testElementWiseNativeCpu<SubObj>(
|
||||||
|
IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
|
||||||
|
Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 2, 3, 4, 6, 7, 8, 8, 9, 10});
|
||||||
|
testElementWiseNativeCpu<DivObj>(
|
||||||
|
IncrementalGenerator(), OneGenerator(), Shape{1, 2, 2, 3, 1},
|
||||||
|
Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace infini
|
Loading…
Reference in New Issue