forked from jiuyuan/InfiniTensor

Merge branch 'master' into xpu

commit 879f2f0315
@@ -16,6 +16,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)

set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Build Type
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    message("Configuring for Debug build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
    add_compile_definitions(DEBUG_MODE)
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
    message("Configuring for Release build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    add_compile_definitions(NDEBUG)
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
    message("Configuring for RelWithDebInfo build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
else()
    message("Build type not specified. Configuring for RelWithDebInfo build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
endif()

if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
    message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
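The `DEBUG_MODE` definition added for Debug builds is what gates the extra CUDA error checking introduced later in this diff (see the `cuda_runtime.cc` hunk). A minimal sketch of that pattern, assuming only the standard CUDA runtime API; the helper name is illustrative:

```cuda
#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

#ifdef DEBUG_MODE
// Compiled only when CMake adds -DDEBUG_MODE (Debug builds).
inline void checkKernelError(const char *what) {
    cudaError_t err = cudaGetLastError(); // picks up asynchronous launch failures
    if (err != cudaSuccess) {
        std::cerr << "CUDA kernel error: " << cudaGetErrorString(err)
                  << " after " << what << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
#else
inline void checkKernelError(const char *) {} // no-op outside Debug builds
#endif
```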
Makefile (28 changed lines)
@@ -8,6 +8,20 @@ INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
DOCKER_IMAGE_NAME ?= infinitensor
DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
DOCKER_RUN_OPTION ?=

# CUDA option.
ifeq ($(CUDA), ON)
	DOCKER_IMAGE_NAME = infinitensor_cuda
	DOCKER_NAME = infinitensor_cuda
	DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
	DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
endif

CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)

@@ -41,3 +55,17 @@ test-cpp:
test-onnx:
	@echo
	python3 pyinfinitensor/tests/test_onnx.py

docker-build:
	docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .

docker-run:
	docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)

docker-start:
	docker start $(DOCKER_IMAGE_NAME)

docker-exec:
	docker exec -it $(DOCKER_IMAGE_NAME) bash

@@ -137,6 +137,29 @@

After a successful installation, you can write and run code with this project's Python API. For concrete usage, see the sample code example/Resnet/resnet.py and the user manual.

## Docker

This project also provides a Docker environment. Use `make docker-build` or `make docker-build CUDA=ON` to build the Dockerfile; the docker image name and the Dockerfile that is used can be changed by passing build options or editing the Makefile variables.

Because pulling the GitHub repo requires adding an SSH key to a GitHub profile, the steps that clone the repo and build the project are commented out for now; users maintain their own SSH key after entering the container (copying the host's SSH key into the container may run into environment mismatches).

```shell
# Build docker container.
make docker-build
# Run docker image.
make docker-run
# Execute docker image.
make docker-exec
```

To build the CUDA version, use the following commands:
```shell
# Build docker container.
make docker-build CUDA=ON
# Run docker image.
make docker-run CUDA=ON
```

## Technical Support

If you run into any problems, please contact our technical support team.
@@ -1,3 +1,4 @@
#pragma once
#include "core/common.h"

namespace infini {

@@ -69,23 +70,6 @@ class DataType {
    int getIndex() const { return index; }
};

// to be consistent with onnx
// https://github.com/onnx/onnx/blob/aeb21329122b96df1d3ef33b500a35ca140b1431/onnx/onnx.proto#L484
inline const DataType DataType::Undefine(0);
inline const DataType DataType::Float32(1);
inline const DataType DataType::UInt8(2);
inline const DataType DataType::Int8(3);
inline const DataType DataType::UInt16(4);
inline const DataType DataType::Int16(5);
inline const DataType DataType::Int32(6);
inline const DataType DataType::Int64(7);
inline const DataType DataType::String(8);
inline const DataType DataType::Bool(9);
inline const DataType DataType::Float16(10);
inline const DataType DataType::Double(11);
inline const DataType DataType::UInt32(12);
inline const DataType DataType::UInt64(13);
inline const DataType DataType::BFloat16(16);
// Method definitions are out of the declaration due to GCC bug:
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
template <> inline int DataType::get<float>() { return 0; }

@@ -41,10 +41,13 @@ class GraphHandlerObj {
    Tensor mul(Tensor a, Tensor b, Tensor c);
    Tensor div(Tensor a, Tensor b, Tensor c);
    Tensor pow(Tensor a, Tensor b, Tensor c);
    Tensor min(Tensor a, Tensor b, Tensor c);
    Tensor max(Tensor a, Tensor b, Tensor c);

    Tensor relu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
    Tensor tanh(Tensor x, Tensor y);
    Tensor erf(Tensor x, Tensor y);
    Tensor softmax(Tensor x, Tensor y, int axis);
    Tensor abs(Tensor x, Tensor y);
    Tensor sqrt(Tensor x, Tensor y);

@@ -68,6 +71,8 @@ class GraphHandlerObj {
    Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
               const optional<vector<int>> &axes);
    Tensor cast(Tensor input, Tensor output, int to);
    Tensor expand(Tensor input, Tensor output, Shape dims);
    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);

    //------ modifiers

@@ -37,7 +37,7 @@ class TensorObj : public TensorBaseObj {

    Shape getDims() const { return shape; }
    size_t getRank() const { return shape.size(); }
    vector<size_t> getStride() const;
    Shape getStride() const;
    size_t getOffset(const vector<int> &ds) const;
    void dataMalloc();
    UidBaseType getFuid() const { return fuid; }

@@ -0,0 +1,9 @@
#pragma once

#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void expand_kernel(float *input, float *output, int nDims, int outputsize,
                   SmallArray inputShape, SmallArray outputShape);

}; // namespace infini
@@ -0,0 +1,11 @@
#pragma once

#include "operators/transpose.h"
#include "utils/small_array.h"

namespace infini {

void transpose_kernel(float *input, float *output, int nDims, int size,
                      SmallArray strides, SmallArray outputShape);

}; // namespace infini
@@ -0,0 +1,13 @@
#pragma once
#include "operators/unary.h"
#include "utils/small_array.h"

namespace infini {
void where_kernel(const float *inputx, const float *inputy,
                  const float *condition, float *output, int nDims,
                  infini::SmallArray inputxShape,
                  infini::SmallArray inputyShape,
                  infini::SmallArray conditionShape,
                  infini::SmallArray outputShape);

}; // namespace infini
@@ -1,7 +1,10 @@
#pragma once
#include "core/data_type.h"

typedef struct {
    int *indexValue;
namespace infini {
struct GatherMetaData {
    void *indexValue;
    DataType indexType;
    int axis;
    int inNDim;
    int outNDim;

@@ -10,8 +13,7 @@ typedef struct {
    int idxDim[4];
    int idxStride[4];
    int inStride[4];
} GatherMetaData;
};

namespace infini {
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
}
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
} // namespace infini

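`GatherMetaData::indexValue` is now a type-erased `void *` tagged with `indexType`, since ONNX allows Gather indices to be int32 or int64. A minimal sketch of how such a pointer is read back, mirroring the `static_cast<T *>(metaData.indexValue)` dispatch in the `gather.cu` hunk further down; the helper names are illustrative:

```cuda
#include <cstdint>

// Recover the concrete element type from the erased pointer.
template <typename T>
T readIndex(const void *indexValue, size_t offset) {
    return static_cast<const T *>(indexValue)[offset];
}

// Widen to int64_t so both index types share one call site.
int64_t readIndexAs(const void *indexValue, size_t offset, bool isInt64) {
    return isInt64 ? readIndex<int64_t>(indexValue, offset)
                   : static_cast<int64_t>(readIndex<int32_t>(indexValue, offset));
}
```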
@ -0,0 +1,36 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief Broadcast the input tensor following the given shape and the
|
||||
* broadcast rule.
|
||||
*
|
||||
*/
|
||||
class ExpandObj : public OperatorObj {
|
||||
Shape dims;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new Expand object.
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param output The output tensor.
|
||||
* @param dims The shape you want to expand to, following the broadcast
|
||||
* rule.
|
||||
*/
|
||||
ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
|
||||
OP_CLONE(ExpandObj);
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return 1; }
|
||||
Shape getShape() const { return dims; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,36 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief Return elements, either from X or Y, depending on condition.
|
||||
*
|
||||
*/
|
||||
class WhereObj : public OperatorObj {
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new Where object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param inputX The input tensor X.
|
||||
* @param inputY The input tensor Y.
|
||||
* @param output The output tensor.
|
||||
* @param condition The condition tensor.
|
||||
*/
|
||||
WhereObj(GraphObj *graph, Tensor inputX, Tensor inputY, Tensor condition,
|
||||
Tensor output);
|
||||
OP_CLONE(WhereObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@@ -0,0 +1,8 @@
namespace infini {

#define SMALL_ARRAY_SIZE 8
struct SmallArray {
    int data[SMALL_ARRAY_SIZE];
};

} // namespace infini
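The point of this fixed-size struct is that a shape or stride vector packed into it can be passed to a `__global__` kernel by value, with no device-side allocation. A minimal sketch of how the CUDA kernels in this diff fill it on the host, assuming shapes never exceed eight dimensions; `packShape` is an illustrative name:

```cuda
#include <cassert>
#include <vector>

#define SMALL_ARRAY_SIZE 8
struct SmallArray {
    int data[SMALL_ARRAY_SIZE];
};

// Copy a runtime shape into the fixed-size struct; callers in this diff
// assert nDims <= SMALL_ARRAY_SIZE before doing so.
SmallArray packShape(const std::vector<int> &shape) {
    assert(shape.size() <= SMALL_ARRAY_SIZE);
    SmallArray a{};
    for (size_t i = 0; i < shape.size(); ++i)
        a.data[i] = shape[i];
    return a;
}
```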
@ -348,6 +348,18 @@ class OnnxStub:
|
|||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Min":
|
||||
tensors[node.output[0]] = self.handler.min(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Max":
|
||||
tensors[node.output[0]] = self.handler.max(
|
||||
tensors[node.input[0]],
|
||||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Relu":
|
||||
tensors[node.output[0]] = self.handler.relu(
|
||||
tensors[node.input[0]],
|
||||
|
@ -397,7 +409,8 @@ class OnnxStub:
|
|||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
next(
|
||||
(attr.i for attr in node.attribute if attr.name == "axis")
|
||||
(attr.i for attr in node.attribute if attr.name == "axis"),
|
||||
1,
|
||||
),
|
||||
)
|
||||
elif node.op_type == "PRelu":
|
||||
|
@ -505,7 +518,8 @@ class OnnxStub:
|
|||
tensors[node.input[1]],
|
||||
tensors.get(node.output[0]),
|
||||
next(
|
||||
(attr.i for attr in node.attribute if attr.name == "axis"), 0
|
||||
(attr.i for attr in node.attribute if attr.name == "axis"),
|
||||
0,
|
||||
),
|
||||
)
|
||||
elif node.op_type == "ReduceMean":
|
||||
|
@ -527,7 +541,7 @@ class OnnxStub:
|
|||
for attr in node.attribute
|
||||
if attr.name == "keepdims"
|
||||
),
|
||||
1
|
||||
1,
|
||||
)
|
||||
!= 0,
|
||||
)
|
||||
|
@ -577,6 +591,25 @@ class OnnxStub:
|
|||
tensors.get(node.output[0]),
|
||||
next((attr.i for attr in node.attribute if attr.name == "to")),
|
||||
)
|
||||
elif node.op_type == "Expand":
|
||||
shape = _parse_data(data[node.input[1]])
|
||||
tensors[node.output[0]] = self.handler.expand(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
shape,
|
||||
)
|
||||
elif node.op_type == "Erf":
|
||||
tensors[node.output[0]] = self.handler.erf(
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
elif node.op_type == "Where":
|
||||
tensors[node.output[0]] = self.handler.where(
|
||||
tensors[node.input[1]],
|
||||
tensors[node.input[2]],
|
||||
tensors[node.input[0]],
|
||||
tensors.get(node.output[0]),
|
||||
)
|
||||
else:
|
||||
raise Exception('Unsupported operator "{}"'.format(node.op_type))
|
||||
new_node_name.append(node.name)
|
||||
|
@ -802,6 +835,8 @@ class OnnxStub:
|
|||
backend.OpTypeId.Abs,
|
||||
backend.OpTypeId.Identity,
|
||||
backend.OpTypeId.PRelu,
|
||||
backend.OpTypeId.Sqrt,
|
||||
backend.OpTypeId.Erf,
|
||||
]:
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name))
|
||||
elif ty == backend.OpTypeId.Flatten:
|
||||
|
@ -892,6 +927,13 @@ class OnnxStub:
|
|||
elif ty == backend.OpTypeId.Cast:
|
||||
to = backend.cast_to_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, to=to))
|
||||
elif ty == backend.OpTypeId.Where:
|
||||
assert len(inputs) == 3, "Check Where Op must have three inputs."
|
||||
new_inputs = [inputs[2], inputs[0], inputs[1]]
|
||||
ctx.push_node(make_node(ty.name, new_inputs, outputs, name))
|
||||
elif ty == backend.OpTypeId.Expand:
|
||||
shape = backend.expand_shape_of(op)
|
||||
ctx.push_node(make_node(ty.name, inputs, outputs, name, shape=shape))
|
||||
else:
|
||||
raise Exception("Unsupported OpType", ty)
|
||||
|
||||
|
@ -942,6 +984,17 @@ def _search_shape(model: ModelProto, name: str) -> List[int]:
|
|||
),
|
||||
None,
|
||||
)
|
||||
or next(
|
||||
(
|
||||
[
|
||||
(d.dim_value if d.dim_value > 0 else 1)
|
||||
for d in tensor.type.tensor_type.shape.dim
|
||||
]
|
||||
for tensor in model.graph.output
|
||||
if tensor.name == name
|
||||
),
|
||||
None,
|
||||
)
|
||||
or next(
|
||||
[int(d) for d in tensor.dims]
|
||||
for tensor in model.graph.initializer
|
||||
|
|
|
@ -207,6 +207,18 @@ class TestStringMethods(unittest.TestCase):
|
|||
relu = make_node("Relu", ["x"], ["y"], name="relu")
|
||||
make_and_import_model(make_graph([relu], "relu", [x], [y]))
|
||||
|
||||
def test_erf(self):
|
||||
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
erf = make_node("Erf", ["x"], ["y"], name="erf")
|
||||
make_and_import_model(make_graph([erf], "erf", [x], [y]))
|
||||
|
||||
def test_sqrt(self):
|
||||
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
sqrt = make_node("Sqrt", ["x"], ["y"], name="sqrt")
|
||||
make_and_import_model(make_graph([sqrt], "sqrt", [x], [y]))
|
||||
|
||||
def test_sigmoid(self):
|
||||
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
|
@ -352,6 +364,24 @@ class TestStringMethods(unittest.TestCase):
|
|||
)
|
||||
make_and_import_model(make_graph([cast], "cast", [input1], [output]))
|
||||
|
||||
def test_expand(self):
|
||||
data = make_tensor_value_info("data", TensorProto.FLOAT, [3, 1])
|
||||
dim = make_tensor_value_info("dim", TensorProto.INT64, [3])
|
||||
dim_data = make_tensor("dim", TensorProto.INT64, [3], [2, 1, 6])
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [2, 3, 6])
|
||||
expand = make_node("Expand", ["data", "dim"], ["output"], name="expand")
|
||||
make_and_import_model(
|
||||
make_graph([expand], "expand", [data, dim], [output], [dim_data])
|
||||
)
|
||||
|
||||
def test_where(self):
|
||||
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
con = make_tensor_value_info("con", TensorProto.BOOL, [1, 3, 5, 7])
|
||||
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 5, 7])
|
||||
where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
|
||||
make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
FROM ubuntu:22.04
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install dependencies.
|
||||
RUN apt update && apt-get install -y git make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev openssh-client
|
||||
# Generate ssh key.
|
||||
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -P ""
|
||||
|
||||
# Update pip and switch to Tsinghua source.
|
||||
RUN python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# TODO: Since SSH clone repo requires adding the SSH key to the GitHub profile,
|
||||
# the process of pulling the project and compiling it has been temporarily commented.
|
||||
|
||||
# Download InfiniTensor.
|
||||
# RUN git clone git@github.com:InfiniTensor/InfiniTensor.git /root/InfiniTensor --branch master --single-branch --recursive
|
||||
|
||||
# Build and Install InfiniTensor
|
||||
# RUN cd /root/InfiniTensor && make install-python
|
|
@@ -0,0 +1,18 @@
FROM nvcr.io/nvidia/pytorch:23.07-py3

# Install dependencies.
RUN apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update && apt-get install -y git make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev openssh-client
# Generate ssh key.
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -P ""

# Update pip and switch to Tsinghua source.
RUN python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# TODO: Since cloning over SSH requires adding the SSH key to a GitHub profile,
# the steps that pull and build the project are temporarily commented out.

# Download InfiniTensor.
# RUN git clone git@github.com:InfiniTensor/InfiniTensor.git /root/InfiniTensor --branch master --single-branch --recursive

# Build and Install InfiniTensor.
# RUN cd /root/InfiniTensor && make install-python CUDA=ON

@@ -0,0 +1,23 @@
#include "core/data_type.h"

namespace infini {
// Move implementation here to avoid compile time error on some platform
// to be consistent with onnx
// https://github.com/onnx/onnx/blob/aeb21329122b96df1d3ef33b500a35ca140b1431/onnx/onnx.proto#L484
const DataType DataType::Undefine(0);
const DataType DataType::Float32(1);
const DataType DataType::UInt8(2);
const DataType DataType::Int8(3);
const DataType DataType::UInt16(4);
const DataType DataType::Int16(5);
const DataType DataType::Int32(6);
const DataType DataType::Int64(7);
const DataType DataType::String(8);
const DataType DataType::Bool(9);
const DataType DataType::Float16(10);
const DataType DataType::Double(11);
const DataType DataType::UInt32(12);
const DataType DataType::UInt64(13);
// TODO: Reserved for complex data type.
const DataType DataType::BFloat16(16);
} // namespace infini

@ -3,6 +3,7 @@
|
|||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/element_wise.h"
|
||||
#include "operators/expand.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pad.h"
|
||||
|
@ -14,6 +15,7 @@
|
|||
#include "operators/split.h"
|
||||
#include "operators/transpose.h"
|
||||
#include "operators/unary.h"
|
||||
#include "operators/where.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -135,6 +137,8 @@ DEFINE_ELEMENT_WISE_METHOD(sub, Sub)
|
|||
DEFINE_ELEMENT_WISE_METHOD(mul, Mul)
|
||||
DEFINE_ELEMENT_WISE_METHOD(div, Div)
|
||||
DEFINE_ELEMENT_WISE_METHOD(pow, Pow)
|
||||
DEFINE_ELEMENT_WISE_METHOD(min, Minimum)
|
||||
DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
|
||||
|
||||
// see operators/unary.h
|
||||
#define DEFINE_UNARY_METHOD(name, obj) \
|
||||
|
@ -153,6 +157,7 @@ DEFINE_UNARY_METHOD(tanh, Tanh)
|
|||
DEFINE_UNARY_METHOD(abs, Abs)
|
||||
DEFINE_UNARY_METHOD(sqrt, Sqrt)
|
||||
DEFINE_UNARY_METHOD(shape, Shape)
|
||||
DEFINE_UNARY_METHOD(erf, Erf)
|
||||
|
||||
// see operators/reshape.h
|
||||
DEFINE_UNARY_METHOD(identity, Identity)
|
||||
|
@ -307,6 +312,31 @@ Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::expand(Tensor input, Tensor output, Shape dims) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<ExpandObj>(std::move(input), output,
|
||||
std::move(dims));
|
||||
return output;
|
||||
} else {
|
||||
return g->addOp<ExpandObj>(std::move(input), output, std::move(dims))
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::where(Tensor inputX, Tensor inputY, Tensor condition,
|
||||
Tensor output) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<WhereObj>(std::move(inputX), std::move(inputY),
|
||||
std::move(condition), output);
|
||||
return output;
|
||||
} else {
|
||||
return g
|
||||
->addOp<WhereObj>(std::move(inputX), std::move(inputY),
|
||||
std::move(condition), output)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
static CastType inferCastType(Tensor input, int to) {
|
||||
auto iType = input->getDType();
|
||||
auto oType = DataType(to);
|
||||
|
|
|
@@ -51,15 +51,14 @@ size_t TensorObj::getOffset(const vector<int> &pos) const {
    return idx;
}

vector<size_t> TensorObj::getStride() const {
    vector<size_t> ret;
    size_t stride = 1;
    for (int i = shape.size() - 1; i >= 1; i--) {
        ret.emplace(ret.begin(), stride);
        stride *= shape.at(i);
Shape TensorObj::getStride() const {
    Shape stride(getRank());
    ShapeElem p = 1;
    for (auto i = getRank(); i > 0; --i) {
        stride[i - 1] = p;
        p = p * shape[i - 1];
    }
    ret.emplace(ret.begin(), stride);
    return ret;
    return stride;
}

void TensorObj::printData() const {
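The rewritten `getStride` returns row-major strides: the stride of a dimension is the product of all later extents. A standalone sketch of the same rule using `std::vector` instead of the project's `Shape` alias, with a worked example:

```cuda
#include <cstdio>
#include <vector>

std::vector<size_t> rowMajorStrides(const std::vector<size_t> &shape) {
    std::vector<size_t> stride(shape.size());
    size_t p = 1;
    for (size_t i = shape.size(); i > 0; --i) {
        stride[i - 1] = p; // stride of dim i-1 = product of all later extents
        p *= shape[i - 1];
    }
    return stride;
}

int main() {
    auto s = rowMajorStrides({2, 3, 4});
    // Prints 12 4 1: element (n, c, w) lives at offset n*12 + c*4 + w*1.
    for (auto v : s)
        std::printf("%zu ", v);
    std::printf("\n");
    return 0;
}
```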
|
@ -4,6 +4,19 @@
|
|||
#include "core/runtime.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/matmul.h"
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
|
||||
cudaError_t kernelError = cudaGetLastError();
|
||||
if (kernelError != cudaSuccess) {
|
||||
std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
|
||||
<< std::endl
|
||||
<< "Failed Operator: " << op->toString() << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace infini {
|
||||
|
||||
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
||||
|
@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
|||
} else {
|
||||
kernel->compute(op, this);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
CHECK_CUDA_KERNEL_ERROR(op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
|||
opTime[op->getOpType()] += t;
|
||||
opCnt[op->getOpType()]++;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
CHECK_CUDA_KERNEL_ERROR(op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#include "operators/batch_norm.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/expand.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/pad.h"
|
||||
|
@ -99,6 +100,10 @@ void export_values(py::module &m) {
|
|||
.VALUE(OpType, Resize)
|
||||
.VALUE(OpType, Dropout)
|
||||
.VALUE(OpType, Cast)
|
||||
.VALUE(OpType, Sqrt)
|
||||
.VALUE(OpType, Expand)
|
||||
.VALUE(OpType, Erf)
|
||||
.VALUE(OpType, Where)
|
||||
.export_values();
|
||||
|
||||
#undef VALUE
|
||||
|
@ -233,6 +238,15 @@ static vector<int64_t> reshape_shape_of(Operator op) {
|
|||
return ans;
|
||||
}
|
||||
|
||||
static vector<int64_t> expand_shape_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Expand);
|
||||
auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape();
|
||||
vector<int64_t> ans(shape.size());
|
||||
std::transform(shape.begin(), shape.end(), ans.begin(),
|
||||
[](auto x) { return static_cast<int64_t>(x); });
|
||||
return ans;
|
||||
}
|
||||
|
||||
static vector<int64_t> pad_pads_of(Operator op) {
|
||||
IT_ASSERT(op->getOpType() == OpType::Pad);
|
||||
auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
|
||||
|
@ -287,6 +301,7 @@ void export_functions(py::module &m) {
|
|||
.FUNCTION(reduce_mean_attrs_of)
|
||||
.FUNCTION(tensor_dtype)
|
||||
.FUNCTION(reshape_shape_of)
|
||||
.FUNCTION(expand_shape_of)
|
||||
.FUNCTION(pad_pads_of)
|
||||
.FUNCTION(transpose_permute_of)
|
||||
.FUNCTION(concat_axis_of)
|
||||
|
@ -374,6 +389,9 @@ void init_graph_builder(py::module &m) {
|
|||
.def("slice", &Handler::slice, policy::move)
|
||||
.def("pad", &Handler::pad, policy::move)
|
||||
.def("cast", &Handler::cast, policy::move)
|
||||
.def("expand", &Handler::expand, policy::move)
|
||||
.def("erf", &Handler::erf, policy::move)
|
||||
.def("where", &Handler::where, policy::move)
|
||||
.def("topo_sort", &Handler::topo_sort, policy::automatic)
|
||||
.def("optimize", &Handler::optimize, policy::automatic)
|
||||
.def("operators", &Handler::operators, policy::move)
|
||||
|
|
|
@ -22,142 +22,142 @@ class CastCnnl : public BangKernelWithoutConfig {
|
|||
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
|
||||
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
|
||||
cnnlCastDataType_t NlCastType;
|
||||
CastObj::CastType type = op->getType();
|
||||
CastType type = op->getType();
|
||||
switch (type) {
|
||||
case CastObj::Float2Int64:
|
||||
case CastType::Float2Int64:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_FLOAT_TO_INT64;
|
||||
break;
|
||||
case CastObj::Float2Int32:
|
||||
case CastType::Float2Int32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_FLOAT_TO_INT32;
|
||||
break;
|
||||
case CastObj::Float2Int16:
|
||||
case CastType::Float2Int16:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_FLOAT_TO_INT16;
|
||||
break;
|
||||
case CastObj::Float2Int8:
|
||||
case CastType::Float2Int8:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_FLOAT_TO_INT8;
|
||||
break;
|
||||
case CastObj::Int322Float:
|
||||
case CastType::Int322Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT32_TO_FLOAT;
|
||||
break;
|
||||
case CastObj::Int322Int8:
|
||||
case CastType::Int322Int8:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT32_TO_INT8;
|
||||
break;
|
||||
case CastObj::Int322Int16:
|
||||
case CastType::Int322Int16:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT32_TO_INT16;
|
||||
break;
|
||||
case CastObj::Int162Float:
|
||||
case CastType::Int162Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT16_TO_FLOAT;
|
||||
break;
|
||||
case CastObj::Int162Int32:
|
||||
case CastType::Int162Int32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT16_TO_INT32;
|
||||
break;
|
||||
case CastObj::Int82Float:
|
||||
case CastType::Int82Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT8_TO_FLOAT;
|
||||
break;
|
||||
case CastObj::Int82Int16:
|
||||
case CastType::Int82Int16:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT8_TO_INT16;
|
||||
break;
|
||||
case CastObj::Int82Int32:
|
||||
case CastType::Int82Int32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT8_TO_INT32;
|
||||
break;
|
||||
case CastObj::Uint82Float:
|
||||
case CastType::Uint82Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_UINT8_TO_FLOAT;
|
||||
break;
|
||||
case CastObj::Uint82Int32:
|
||||
case CastType::Uint82Int32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_UINT8_TO_INT32;
|
||||
break;
|
||||
case CastObj::Uint82Int64:
|
||||
case CastType::Uint82Int64:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_UINT8_TO_INT64;
|
||||
break;
|
||||
case CastObj::Int322Int64:
|
||||
case CastType::Int322Int64:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT32_TO_INT64;
|
||||
break;
|
||||
case CastObj::Int642Int32:
|
||||
case CastType::Int642Int32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT64_TO_INT32;
|
||||
break;
|
||||
case CastObj::Int642Uint32:
|
||||
case CastType::Int642Uint32:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT64_TO_UINT32;
|
||||
break;
|
||||
case CastObj::Int642Float:
|
||||
case CastType::Int642Float:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
|
||||
NlCastType = CNNL_CAST_INT64_TO_FLOAT;
|
||||
break;
|
||||
case CastObj::Uint322Int64:
|
||||
case CastType::Uint322Int64:
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
|
||||
checkCnnlError(cnnlSetTensorDescriptor(
|
||||
|
|
|
@@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
                 float maxValue) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
    _clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
                                          maxValue);
}

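This hunk fixes a swapped kernel launch configuration, a pattern corrected in several kernels in this diff: in `kernel<<<A, B>>>(...)`, `A` is the grid size (number of blocks) and `B` the block size (threads per block), so the grid size must come first. A minimal sketch of the corrected pattern with a hypothetical elementwise kernel (`_scale_kernel` is illustrative, not part of the diff):

```cuda
#include <cuda_runtime.h>

__global__ void _scale_kernel(float *x, float *y, int num, float k) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < num)
        y[i] = k * x[i];
}

void scale_kernel(float *x, float *y, int num, float k) {
    int blocksize = 32 * 4;                           // threads per block
    int gridsize = (num + blocksize - 1) / blocksize; // blocks per grid (ceil)
    // Grid size first, block size second; swapping them (as the old code did)
    // launches far too few threads for large num.
    _scale_kernel<<<gridsize, blocksize>>>(x, y, num, k);
}
```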
|
@ -98,6 +98,14 @@ class MulCudnn : public ElementWiseCudnn {
|
|||
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MUL; }
|
||||
};
|
||||
|
||||
class MinCudnn : public ElementWiseCudnn {
|
||||
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MIN; }
|
||||
};
|
||||
|
||||
class MaxCudnn : public ElementWiseCudnn {
|
||||
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MAX; }
|
||||
};
|
||||
|
||||
class ElementWiseCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
|
@ -137,6 +145,10 @@ REGISTER_KERNEL(Device::CUDA, OpType::Sub, DataType::Float32, SubCudnn,
|
|||
"Sub_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Mul, DataType::Float32, MulCudnn,
|
||||
"Mul_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Min, DataType::Float32, MinCudnn,
|
||||
"Min_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Max, DataType::Float32, MaxCudnn,
|
||||
"Max_cuDNN_CUDA_Float32");
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Div, DataType::Float32, ElementWiseCuda,
|
||||
"Div_CUDA_Float32");
|
||||
|
|
|
@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
|
|||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c0_index = i / (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
|
|||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
|
||||
z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
|
||||
a3_index] /
|
||||
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
|
||||
b3_index];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
|
||||
int a2, int a3, int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c0_index = i / (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
|
|||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
|
||||
z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
|
||||
a2_index * a3 + a3_index],
|
||||
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
|
||||
b2_index * b3 + b3_index]);
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int num = c0*c1*c2*c3;
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
|
||||
b3, c0, c1, c2, c3);
|
||||
}
|
||||
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
int blocksize = block_work_size();
|
||||
int num = c0*c1*c2*c3;
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
_pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
|
||||
b3, c0, c1, c2, c3);
|
||||
}
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#include "operators/expand.h"
|
||||
#include "cuda/cuda_expand.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class ExpandCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ExpandObj>(_op);
|
||||
|
||||
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
|
||||
const auto &out_Shape = op->getShape(); // output shape
|
||||
|
||||
SmallArray inputShape, outputShape;
|
||||
int nDims = op->getInputs(0)->getDims().size();
|
||||
|
||||
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
|
||||
int outputsize = 1; // the length of the output vector after flatten
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
outputShape.data[i] = out_Shape[i];
|
||||
inputShape.data[i] = in_Shape[i];
|
||||
outputsize *= out_Shape[i];
|
||||
}
|
||||
expand_kernel((float *)inputData, (float *)outputData, nDims,
|
||||
outputsize, inputShape, outputShape);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Expand, DataType::Float32, ExpandCuda,
|
||||
"Expand_CUDA_Float32");
|
||||
|
||||
}; // namespace infini
|
|
@ -0,0 +1,49 @@
|
|||
#include "core/common.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
constexpr unsigned int num_threads() { return 32 * 4; }
|
||||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _expand_kernel(float *input, float *output, int nDims,
|
||||
int outputsize, infini::SmallArray inputShape,
|
||||
infini::SmallArray outputShape) {
|
||||
|
||||
int outputIdx =
|
||||
blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s
|
||||
if (outputIdx < outputsize) {
|
||||
int inputIdx = 0; // record input index
|
||||
int temp = 1; // stored S, KS, JKS, in order
|
||||
int tmp = 1; // stored s,k,j,i in order
|
||||
int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
|
||||
for (int i = nDims - 1; i >= 0; --i) {
|
||||
if (i == 0) {
|
||||
tmp = v; // i = outputIdx/(JKS)
|
||||
} else {
|
||||
tmp = v % outputShape.data[i]; // store s,k,j in order
|
||||
}
|
||||
if (inputShape.data[i] ==
|
||||
1) { // if input shape = 1, the index only equal 0
|
||||
inputIdx += 0;
|
||||
} else {
|
||||
inputIdx +=
|
||||
tmp * temp; // otherwise +i(JKS) or j(KS) or k(S) or s
|
||||
}
|
||||
temp *= inputShape.data[i];
|
||||
v = v / outputShape.data[i];
|
||||
}
|
||||
output[outputIdx] = input[inputIdx];
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void expand_kernel(float *input, float *output, int nDims, int outputsize,
|
||||
SmallArray inputShape, SmallArray outputShape) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
|
||||
_expand_kernel<<<gridsize, blocksize>>>(input, output, nDims, outputsize,
|
||||
inputShape, outputShape);
|
||||
}
|
||||
|
||||
} // namespace infini
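The index arithmetic in `_expand_kernel` decomposes a flattened output index one dimension at a time (last dimension first) and re-encodes it against the input's strides, with broadcast dimensions (input extent 1) contributing offset 0. A host-side sketch of the same mapping, with illustrative names:

```cuda
#include <cassert>
#include <vector>

int broadcastInputIndex(int outputIdx, const std::vector<int> &inShape,
                        const std::vector<int> &outShape) {
    assert(inShape.size() == outShape.size());
    int inputIdx = 0;
    int inStride = 1; // running product of input extents
    int v = outputIdx;
    for (int i = (int)outShape.size() - 1; i >= 0; --i) {
        int coord = (i == 0) ? v : v % outShape[i]; // coordinate along dim i
        if (inShape[i] != 1)                        // broadcast dims map to 0
            inputIdx += coord * inStride;
        inStride *= inShape[i];
        v /= outShape[i];
    }
    return inputIdx;
}

// e.g. inShape {3, 1}, outShape {3, 4}: output index 7 -> coords (1, 3)
// -> input index 1, i.e. element (1, 0) of the input.
```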
|
|
@@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
                   int oSize) {
    int blocksize = 32 * 16;
    int gridsize = (oSize + blocksize - 1) / blocksize;
    _extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter,
    _extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
                                            oSize);
}
} // namespace infini

@@ -12,7 +12,8 @@ class GatherCuda : public CudaKernelWithoutConfig {
        auto in = op->getInputs(0);
        auto index = op->getInputs(1);
        auto out = op->getOutput();
        metaData.indexValue = index->getRawDataPtr<int *>();
        metaData.indexValue = index->getRawDataPtr<void *>();
        metaData.indexType = index->getDType();
        metaData.axis = op->getAxis();
        metaData.inNDim = in->getRank();
        metaData.outNDim = out->getRank();

@ -1,19 +1,21 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/gather.h"
|
||||
|
||||
__device__ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
|
||||
int offset = 0;
|
||||
template <typename T>
|
||||
__device__ T gatheredOffset2Offset(int gOffset,
|
||||
infini::GatherMetaData metaData) {
|
||||
T offset = 0;
|
||||
for (int i = metaData.inNDim - 1, k = metaData.outNDim - 1; i >= 0; --i) {
|
||||
int idx = 0;
|
||||
T idx = 0;
|
||||
if (i == metaData.axis) {
|
||||
int idxOffset = 0;
|
||||
T idxOffset = 0;
|
||||
for (int j = metaData.idxNDim - 1; j >= 0; --j) {
|
||||
int p = gOffset % metaData.idxDim[j];
|
||||
T p = gOffset % metaData.idxDim[j];
|
||||
gOffset = gOffset / metaData.idxDim[j];
|
||||
idxOffset += p * metaData.idxStride[j];
|
||||
}
|
||||
|
||||
idx = metaData.indexValue[idxOffset];
|
||||
idx = static_cast<T *>(metaData.indexValue)[idxOffset];
|
||||
k = k - metaData.idxNDim;
|
||||
|
||||
} else {
|
||||
|
@ -26,22 +28,27 @@ __device__ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
|
|||
return offset;
|
||||
}
|
||||
|
||||
__global__ void _gather_kernel(float *in, float *out, GatherMetaData metaData,
|
||||
int num) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
template <typename T>
|
||||
__global__ void _gather_kernel(float *in, float *out,
|
||||
infini::GatherMetaData metaData, size_t num) {
|
||||
T tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
while (tid < num) {
|
||||
int offset = gatheredOffset2Offset(tid, metaData);
|
||||
T offset = gatheredOffset2Offset<T>(tid, metaData);
|
||||
out[tid] = in[offset];
|
||||
tid += stride;
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num) {
|
||||
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num) {
|
||||
int blockSize = 32 * 16;
|
||||
int gridSize = (num + blockSize - 1) / blockSize;
|
||||
|
||||
_gather_kernel<<<gridSize, blockSize>>>(in, out, metaData, num);
|
||||
if (metaData.indexType == DataType::Int64) {
|
||||
_gather_kernel<int64_t>
|
||||
<<<gridSize, blockSize>>>(in, out, metaData, num);
|
||||
} else {
|
||||
_gather_kernel<int><<<gridSize, blockSize>>>(in, out, metaData, num);
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
#include "operators/transpose.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_transpose.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class TransposeCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<TransposeObj>(_op);
|
||||
|
||||
auto input = op->getInputs(0);
|
||||
auto output = op->getOutput();
|
||||
void *const inputData = input->getRawDataPtr<void *>();
|
||||
void *const outputData = output->getRawDataPtr<void *>();
|
||||
const auto &inputShape = input->getDims();
|
||||
const auto &outputShape = output->getDims();
|
||||
|
||||
const auto &perm = op->getPermute();
|
||||
int size = input->size();
|
||||
int nDims = input->getDims().size();
|
||||
|
||||
// Compute strides
|
||||
SmallArray strides, buffer;
|
||||
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
|
||||
int curStride = 1;
|
||||
for (int i = nDims - 1; i >= 0; --i) {
|
||||
buffer.data[i] = curStride;
|
||||
curStride *= inputShape[i];
|
||||
}
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
strides.data[i] = buffer.data[perm[i]];
|
||||
}
|
||||
|
||||
SmallArray outputDims;
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
outputDims.data[i] = outputShape[i];
|
||||
}
|
||||
|
||||
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
|
||||
strides, outputDims);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
|
||||
TransposeCuda, "Transpose_CUDA_Float32");
|
||||
|
||||
} // namespace infini
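`TransposeCuda::compute` first builds the input's row-major strides and then reorders them by the permutation, so each output dimension is paired with the stride of the input dimension it came from; the kernel then multiplies every output coordinate by the matching entry. A host-side sketch of that stride preparation, with illustrative names:

```cuda
#include <vector>

std::vector<int> permutedStrides(const std::vector<int> &inShape,
                                 const std::vector<int> &perm) {
    int nDims = (int)inShape.size();
    std::vector<int> inStride(nDims);
    int cur = 1;
    for (int i = nDims - 1; i >= 0; --i) { // row-major strides of the input
        inStride[i] = cur;
        cur *= inShape[i];
    }
    std::vector<int> strides(nDims);
    for (int i = 0; i < nDims; ++i)
        strides[i] = inStride[perm[i]]; // stride the kernel uses per output dim
    return strides;
}

// e.g. inShape {2, 3, 4}, perm {2, 0, 1}: input strides {12, 4, 1},
// permuted strides {1, 12, 4}.
```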
|
|
@ -0,0 +1,37 @@
|
|||
#include "core/common.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
constexpr unsigned int num_threads() { return 32 * 4; }
|
||||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _transpose_kernel(float *input, float *output, int nDims,
|
||||
int size, infini::SmallArray strides,
|
||||
infini::SmallArray outputShape) {
|
||||
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (outputIdx < size) {
|
||||
int inputIdx = 0;
|
||||
int v = outputIdx;
|
||||
for (int i = nDims - 1; i >= 0; --i) {
|
||||
inputIdx += v % outputShape.data[i] * strides.data[i];
|
||||
v /= outputShape.data[i];
|
||||
}
|
||||
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
|
||||
output[outputIdx] = __ldg(input + inputIdx);
|
||||
#else
|
||||
output[outputIdx] = input[inputIdx];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
SmallArray strides, SmallArray outputShape) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (size + block_work_size() - 1) / block_work_size();
|
||||
_transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
|
||||
strides, outputShape);
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -25,19 +25,25 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
|
|||
|
||||
cudnnTensorDescriptor_t inputDesc, outputDesc;
|
||||
auto dim = op->getInputs(0)->getDims();
|
||||
if (dim.size() != 4)
|
||||
IT_TODO_HALT();
|
||||
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
||||
// assume input and output have the same strides.
|
||||
auto stride = op->getInputs(0)->getStride();
|
||||
// CUDNN requires that dim >= 4.
|
||||
while (dim.size() < 4)
|
||||
dim.push_back(1);
|
||||
while (stride.size() < 4)
|
||||
stride.push_back(1);
|
||||
|
||||
// get inputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(inputDesc, CUDNN_DATA_FLOAT,
|
||||
dim.size(), dim.data(),
|
||||
stride.data()));
|
||||
|
||||
// get outputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(outputDesc, CUDNN_DATA_FLOAT,
|
||||
dim.size(), dim.data(),
|
||||
stride.data()));
|
||||
|
||||
// get op descriptor
|
||||
cudnnActivationDescriptor_t activationDesc;
|
||||
|
|
|
@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
|
|||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_softmax_kernel1<<<1, 1>>>(input, output, num);
|
||||
_softmax_kernel2<<<blocksize, gridsize>>>(input, output, num);
|
||||
_softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void relu_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_relu_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_relu_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void sigmoid_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void tanh_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_tanh_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void abs_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_abs_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_abs_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
void sqrt_kernel(float *input, float *output, int num) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_sqrt_kernel<<<blocksize, gridsize>>>(input, output, num);
|
||||
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
|
||||
}
|
||||
}; // namespace infini
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
#include "operators/where.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_where.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class WhereCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<WhereObj>(_op);
|
||||
|
||||
void *const inputxData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const inputyData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const conditionData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
const auto &inputx_Shape = op->getInputs(0)->getDims();
|
||||
const auto &inputy_Shape = op->getInputs(1)->getDims();
|
||||
const auto &condition_Shape = op->getInputs(2)->getDims();
|
||||
const auto &output_Shape = op->getOutput()->getDims();
|
||||
|
||||
int nDims = op->getInputs(0)->getDims().size();
|
||||
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
|
||||
|
||||
SmallArray inputxShape, inputyShape, conditionShape, outputShape;
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
inputxShape.data[i] = inputx_Shape[i];
|
||||
inputyShape.data[i] = inputy_Shape[i];
|
||||
conditionShape.data[i] = condition_Shape[i];
|
||||
outputShape.data[i] = output_Shape[i];
|
||||
}
|
||||
where_kernel((float *)inputxData, (float *)inputyData,
|
||||
(float *)conditionData, (float *)outputData, nDims,
|
||||
inputxShape, inputyShape, conditionShape, outputShape);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Where, DataType::Float32, WhereCuda,
|
||||
"Where_CUDA_Float32");
|
||||
|
||||
}; // namespace infini
|
|
@ -0,0 +1,82 @@
#include "cuda/cuda_common.h"
#include "utils/small_array.h"

__global__ void _where_kernel(const float *inputx, const float *inputy,
                              const float *condition, float *output, int nDims,
                              int outputsize, infini::SmallArray inputxShape,
                              infini::SmallArray inputyShape,
                              infini::SmallArray conditionShape,
                              infini::SmallArray outputShape) {

    int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputIdx < outputsize) {
        int inputxIdx = 0;
        int temp_inputx = 1;

        int inputyIdx = 0;
        int temp_inputy = 1;

        int conditionIdx = 0;
        int temp_condition = 1;

        int tmp = 1;       // stored s,k,j,i in order
        int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
        for (int i = nDims - 1; i >= 0; --i) {
            if (i == 0) {
                tmp = v; // i = outputIdx/(JKS)
            } else {
                tmp = v % outputShape.data[i]; // store s,k,j in order
            }
            if (inputxShape.data[i] == 1) {
                inputxIdx += 0;
            } else {
                inputxIdx +=
                    tmp *
                    temp_inputx; // otherwise +i(JKS) or j(KS) or k(S) or s
            }
            temp_inputx *= inputxShape.data[i];
            //----------------------------
            if (inputyShape.data[i] == 1) {
                inputyIdx += 0;
            } else {
                inputyIdx +=
                    tmp *
                    temp_inputy; // otherwise +i(JKS) or j(KS) or k(S) or s
            }
            temp_inputy *= inputyShape.data[i];
            //--------------------------
            if (conditionShape.data[i] == 1) {
                conditionIdx += 0;
            } else {
                conditionIdx +=
                    tmp *
                    temp_condition; // otherwise +i(JKS) or j(KS) or k(S) or s
            }
            temp_condition *= conditionShape.data[i];
            //-------------------------
            v = v / outputShape.data[i];
        }
        output[outputIdx] =
            condition[conditionIdx] ? inputx[inputxIdx] : inputy[inputyIdx];
    }
}

namespace infini {
void where_kernel(const float *inputx, const float *inputy,
                  const float *condition, float *output, int nDims,
                  infini::SmallArray inputxShape,
                  infini::SmallArray inputyShape,
                  infini::SmallArray conditionShape,
                  infini::SmallArray outputShape) {
    int outputsize = 1;

    for (int i = 0; i < nDims; i++) {
        outputsize *= outputShape.data[i];
    }
    int blocksize = 32 * 16;
    int gridsize = (outputsize + blocksize - 1) / blocksize;
    _where_kernel<<<gridsize, blocksize>>>(
        inputx, inputy, condition, output, nDims, outputsize, inputxShape,
        inputyShape, conditionShape, outputShape);
}
} // namespace infini

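The device loop above decodes the flat output index one dimension at a time, from innermost to outermost: `%` and `/` peel off the coordinate along each dimension, and any input dimension of size 1 contributes nothing to that input's offset, which is how broadcasting is realized. A host-side sketch of the same mapping, with illustrative names only, is given here for checking indices by hand.

```cpp
#include <vector>

// Map a flat output index to the flat index of a (possibly broadcast) input,
// mirroring the stride bookkeeping done inside _where_kernel. Both shapes are
// assumed to have the same rank (already right-aligned), as in the kernel.
int broadcastOffset(int outputIdx, const std::vector<int> &outShape,
                    const std::vector<int> &inShape) {
    int inIdx = 0, inStride = 1, v = outputIdx;
    for (int i = static_cast<int>(outShape.size()) - 1; i >= 0; --i) {
        int coord = (i == 0) ? v : v % outShape[i]; // coordinate along dim i
        if (inShape[i] != 1)      // a size-1 input dim is broadcast: stride 0
            inIdx += coord * inStride;
        inStride *= inShape[i];
        v /= outShape[i];
    }
    return inIdx;
}
// e.g. broadcastOffset(7, {2, 2, 3}, {1, 2, 3}) == 1: the leading dimension of
// the input is broadcast, so only the trailing coordinates contribute.
```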
@ -0,0 +1,41 @@
#include "operators/expand.h"
#include "utils/operator_utils.h"

namespace infini {

ExpandObj::ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims)
    : OperatorObj(OpType::Expand, {input}, {output}), dims(std::move(dims)) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> ExpandObj::inferShape(const TensorVec &inputs) const {
    auto shape_input = inputs[0]->getDims();
    Shape ret = infer_broadcast(shape_input, dims);
    return {{ret}};
}

std::string ExpandObj::toString() const {
    std::ostringstream os;
    os << "Expand[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "dims=" << vecToString(dims) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> ExpandObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
    ret.insert(ret.end(), dims.begin(), dims.end());
    ret.emplace(ret.begin(), type.underlying());
    return ret;
}

vector<int> ExpandObj::getOpAttrVector() const {
    vector<int> ret = dims;
    ret.emplace(ret.begin(), type.underlying());
    return ret;
}

} // namespace infini

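`ExpandObj::inferShape` delegates to `infer_broadcast` from `utils/operator_utils.h`, which presumably implements ONNX/NumPy-style multidirectional broadcasting: shapes are aligned from the trailing dimension and a size-1 dimension stretches to match the other operand. A self-contained sketch of that rule, under that assumption (the real helper may report errors differently):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

using Shape = std::vector<int>;

// Hedged sketch of NumPy/ONNX multidirectional broadcasting.
Shape broadcastShapes(const Shape &a, const Shape &b) {
    Shape result(std::max(a.size(), b.size()));
    size_t offA = result.size() - a.size();
    size_t offB = result.size() - b.size();
    for (size_t i = 0; i < result.size(); ++i) {
        int da = i < offA ? 1 : a[i - offA]; // missing leading dims act as 1
        int db = i < offB ? 1 : b[i - offB];
        assert(da == db || da == 1 || db == 1); // otherwise not broadcastable
        result[i] = std::max(da, db);
    }
    return result;
}
// broadcastShapes({3, 1}, {2, 1, 6}) == Shape{2, 3, 6}, matching the Expand
// shape-inference test later in this diff.
```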
@ -24,8 +24,8 @@ optional<vector<Shape>> GatherObj::inferShape(const TensorVec &inputs) const {

vector<DataType> GatherObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 2);
    auto index_dtype = inputs[1]->getDType();
    IT_ASSERT(index_dtype == DataType::Int32 || index_dtype == DataType::Int64);
    return {inputs[0]->getDType()};
}

@ -36,19 +36,31 @@ bool GatherObj::CheckIndexValid() const {
        return true;

    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    bool ret = true;
    auto value = inputs[0]->getDims()[axis];
    if (index->getDType() == DataType::Int32) {
        int *data = (int *)runtime->alloc(index->getBytes());
        index->getRuntime()->copyBlobToCPU(
            (void *)data, index->getRawDataPtr<void *>(), index->getBytes());
        for (size_t i = 0; i < index->size(); ++i) {
            if (data[i] < 0 || data[i] >= value) {
                ret = false;
                break;
            }
        }
        runtime->dealloc(data);
    } else {
        int64_t *data = (int64_t *)runtime->alloc(index->getBytes());
        index->getRuntime()->copyBlobToCPU(
            (void *)data, index->getRawDataPtr<void *>(), index->getBytes());
        for (size_t i = 0; i < index->size(); ++i) {
            if (data[i] < 0 || data[i] >= value) {
                ret = false;
                break;
            }
        }
        runtime->dealloc(data);
    }
    return ret;
}

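The two branches above differ only in the element type read from the copied index buffer. If the duplication ever becomes a maintenance concern, a small templated helper could carry the bounds check; the sketch below is illustrative only, and `checkRange` is not a name from this commit.

```cpp
#include <cstddef>

// Hedged sketch: a templated bounds check that could replace the duplicated
// Int32/Int64 branches.
template <typename T>
static bool checkRange(const T *data, size_t count, int upperBound) {
    for (size_t i = 0; i < count; ++i)
        if (data[i] < 0 || data[i] >= upperBound)
            return false;
    return true;
}
// The caller would still dispatch once on the index dtype and pass either the
// copied int32_t* or int64_t* buffer.
```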
@ -20,7 +20,7 @@ MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
    if (ret.empty()) {
        b = 1;
    } else {
        b = std::accumulate(ret.begin(), ret.end(), 1, std::multiplies<int>());
    }
    auto kA = *(transA ? shape_a.rbegin() + 1 : shape_a.rbegin());
    auto kB = *(transB ? shape_b.rbegin() : shape_b.rbegin() + 1);

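The change above is needed because `std::accumulate` folds with `+` by default, so the old call summed the batch dimensions; the batch count `b` must be their product, which `std::multiplies<int>()` provides. A small standalone illustration:

```cpp
// Sum vs. product fold over batch dimensions {2, 3, 4}.
#include <functional>
#include <numeric>
#include <vector>

int main() {
    std::vector<int> batchDims{2, 3, 4};
    int summed = std::accumulate(batchDims.begin(), batchDims.end(), 1); // 1+2+3+4 = 10
    int batch = std::accumulate(batchDims.begin(), batchDims.end(), 1,
                                std::multiplies<int>());                 // 1*2*3*4 = 24
    return (summed == 10 && batch == 24) ? 0 : 1;
}
```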
@ -0,0 +1,42 @@
#include "operators/where.h"
#include "utils/operator_utils.h"

namespace infini {

WhereObj::WhereObj(GraphObj *graph, Tensor inputX, Tensor inputY,
                   Tensor condition, Tensor output)
    : OperatorObj(OpType::Where, TensorVec{inputX, inputY, condition},
                  {output}) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>> WhereObj::inferShape(const TensorVec &inputs) const {
    auto shapeX = inputs[0]->getDims();
    auto shapeY = inputs[1]->getDims();
    auto shapeCon = inputs[2]->getDims();
    auto retXY = infer_broadcast(shapeX, shapeY);
    auto ret = infer_broadcast(retXY, shapeCon);
    return {{ret}};
}

std::string WhereObj::toString() const {
    std::ostringstream os;
    os << "Where[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[2]->getDims()) << ",";
    os << "inputX=" << inputs[0]->getGuid() << ",";
    os << "inputY=" << inputs[1]->getGuid() << ",";
    os << "condition=" << inputs[2]->getGuid() << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> WhereObj::getWorkloadVector() const {
    vector<int> ret = getOutput()->getDims();
    ret.emplace(ret.begin(), type.underlying());
    return ret;
}

vector<int> WhereObj::getOpAttrVector() const { return {type.underlying()}; }

} // namespace infini

@ -23,7 +23,7 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
    // GPU
    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
    auto inputGpu = bangGraph->cloneTensor(inputCpu);
    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastType::Float2Int32);
    auto outputGpu = gpuOp->getOutput();
    bangGraph->dataMalloc();
    bangRuntime->run(bangGraph);

@ -58,7 +58,12 @@ TEST(cuDNN_ElementWise, run) {
    testElementWiseCudnn<DivObj>(
        OneGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});

    testElementWiseCudnn<MinimumObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    testElementWiseCudnn<MaximumObj>(
        IncrementalGenerator(), Shape{1, 2, 2, 3},
        ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    testElementWiseCudnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
                                 ExpectOutput{1, 1, 4, 27});
}

@ -0,0 +1,41 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/expand.h"

#include "test.h"

namespace infini {

TEST(Expand, Cuda) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto t1 = gCpu->addTensor({2, 1, 2, 1}, DataType::Float32);

    gCpu->dataMalloc();
    t1->setData(IncrementalGenerator());
    t1->printData();

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto t1Gpu = gCuda->cloneTensor(t1);

    auto op = gCuda->addOp<ExpandObj>(t1Gpu, nullptr, Shape{2, 2, 2, 3});
    gCuda->dataMalloc();
    t1Gpu->setData(IncrementalGenerator());

    cudaRuntime->run(gCuda);

    // cudaPrintTensor(op->getOutput());
    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput());
    oCpu->printData();
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
                                      2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3}));
}

} // namespace infini

@ -77,7 +77,7 @@ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
            idxOffset += p * metaData.idxStride[j];
        }

        idx = static_cast<int *>(metaData.indexValue)[idxOffset];
        k = k - metaData.idxNDim;

    } else {

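The added `static_cast<int *>` suggests `GatherMetaData::indexValue` is now a type-erased pointer, so the same metadata can describe Int32 or Int64 index tensors (the Int64 Gather test added below exercises that path). A hedged sketch of how such a read could be dispatched; `readIndex` and the boolean flag are illustrative, not part of the commit:

```cpp
#include <cstdint>

// Hedged sketch: reading one index element from type-erased storage. The
// actual kernel may instead template on the index type.
inline int64_t readIndex(const void *indexValue, bool indexIsInt64, int offset) {
    return indexIsInt64 ? static_cast<const int64_t *>(indexValue)[offset]
                        : static_cast<const int32_t *>(indexValue)[offset];
}
```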
@ -242,6 +242,31 @@ TEST(Gather, Cuda) {
        indexCuda->copyin(vector<int>{0, 3, 1});
        cudaRuntime->run(gCuda);

        // cudaPrintTensor(op->getOutput());
        // copy output from CUDA to CPU
        auto oCpu = gCpu->cloneTensor(op->getOutput());
        EXPECT_TRUE(oCpu->equalData(
            vector<float>{0, 1, 6, 7, 2, 3, 8, 9, 14, 15, 10, 11}));
    }
    {
        Runtime runtime = NativeCpuRuntimeObj::getInstance();
        Graph gCpu = make_ref<GraphObj>(runtime);
        auto input = gCpu->addTensor({2, 4, 2}, DataType::Float32);
        auto index = gCpu->addTensor({3, 1}, DataType::Int64);
        gCpu->dataMalloc();
        input->setData(IncrementalGenerator());
        index->copyin(vector<int64_t>{0, 3, 1});
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);

        auto inputCuda = gCuda->cloneTensor(input);
        auto indexCuda = gCuda->cloneTensor(index);
        auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
        gCuda->dataMalloc();
        inputCuda->setData(IncrementalGenerator());
        indexCuda->copyin(vector<int64_t>{0, 3, 1});
        cudaRuntime->run(gCuda);

        // cudaPrintTensor(op->getOutput());
        // copy output from CUDA to CPU
        auto oCpu = gCpu->cloneTensor(op->getOutput());

@ -0,0 +1,46 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/transpose.h"

#include "test.h"

namespace infini {

template <class T>
void testTranspose(
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &shape) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
    inputCpu->dataMalloc();
    inputCpu->setData(generator);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    vector<int> permute = {0, 2, 1, 3};
    auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, permute);
    cudaGraph->dataMalloc();
    inputGpu->copyData(inputCpu);
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto oCpu = outputGpu->clone(cpuRuntime);
    // Check
    // inputCpu->printData();
    // oCpu->printData();
    EXPECT_TRUE(oCpu->equalData(vector<float>{0, 1, 2,  3,  12, 13, 14, 15,
                                              4, 5, 6,  7,  16, 17, 18, 19,
                                              8, 9, 10, 11, 20, 21, 22, 23}));
}

TEST(cuda_Transpose, run) {
    testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 2, 3, 4});
}

} // namespace infini

@ -0,0 +1,63 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/where.h"

#include "test.h"

namespace infini {

void test_where(const Shape &inputxshape, const vector<float> &inputxdata,
                const Shape &inputyshape, const vector<float> &inputydata,
                const Shape &conditionshape, const vector<int> &conditiondata,
                const vector<float> &ExpectData) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);
    auto condition = gCpu->addTensor(conditionshape, DataType::Int32);
    auto inputx = gCpu->addTensor(inputxshape, DataType::Float32);
    auto inputy = gCpu->addTensor(inputyshape, DataType::Float32);

    gCpu->dataMalloc();
    condition->copyin(conditiondata);
    inputx->copyin(inputxdata);
    inputy->copyin(inputydata);

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto conditionGpu = gCuda->cloneTensor(condition);
    auto inputxGpu = gCuda->cloneTensor(inputx);
    auto inputyGpu = gCuda->cloneTensor(inputy);

    auto op = gCuda->addOp<WhereObj>(inputxGpu, inputyGpu, conditionGpu,
                                     nullptr);
    gCuda->dataMalloc();
    conditionGpu->copyin(conditiondata);
    inputxGpu->copyin(inputxdata);
    inputyGpu->copyin(inputydata);
    cudaRuntime->run(gCuda);

    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move data from gpu to cpu
    oCpu->printData();
    EXPECT_TRUE(oCpu->equalData(ExpectData));
}

TEST(CUDA_Where, run) {
    test_where(
        Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
        Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
        Shape{2, 2, 3, 1}, vector<int>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
        vector<float>{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.});

    test_where(Shape{2, 1, 1, 3},                                  // inputx
               vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
               vector<float>{1, 1}, Shape{2, 1, 3, 1},             // condition
               vector<int>{0, 1, 1, 0, 0, 0},
               vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2., 1., 1., 1.,
                             0., 1., 2., 0., 1., 2., 1., 1., 1., 1., 1., 1.,
                             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
} // expected values computed with Python

} // namespace infini

@ -0,0 +1,26 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/expand.h"

#include "test.h"

namespace infini {

TEST(Expand, ShapeInference) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({3, 1}, DataType::Float32);
        auto op = g->addOp<ExpandObj>(i, nullptr, Shape{2, 1, 6});
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 6}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({3, 1}, DataType::Float32);
        auto op = g->addOp<ExpandObj>(i, nullptr, Shape{3, 4});
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 4}));
    }
}

} // namespace infini

@ -9,11 +9,19 @@ namespace infini {

TEST(Gather, ShapeInference) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();

    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({1, 3, 4, 4}, DataType::Int32);
        Tensor index = g->addTensor({2, 1, 2}, DataType::Int32);
        auto op = g->addOp<GatherObj>(i, index, nullptr, 1);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({1, 3, 4, 4}, DataType::Int32);
        Tensor index = g->addTensor({2, 1, 2}, DataType::Int64);
        auto op = g->addOp<GatherObj>(i, index, nullptr, 1);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
    }
}
} // namespace infini

@ -21,6 +21,12 @@ TEST(ReduceMean, ShapeInference) {
        auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{1, 3}, true);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 3, 1}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
        auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{-3, 3}, true);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 3, 1}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);

@ -33,6 +39,13 @@ TEST(ReduceMean, ShapeInference) {
        auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{1, 3}, false);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
        auto op =
            g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{-3, 3}, false);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3}));
    }
}

} // namespace infini

@ -0,0 +1,46 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/where.h"

#include "test.h"

namespace infini {

TEST(Where, ShapeInference) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor x = g->addTensor({2, 2}, DataType::Float32);
        Tensor y = g->addTensor({2, 2}, DataType::Float32);
        Tensor con = g->addTensor({2, 2}, DataType::Bool);
        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 2}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor x = g->addTensor({1, 12, 224, 224}, DataType::Float32);
        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
        Tensor con = g->addTensor({1, 224, 1}, DataType::Bool);
        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor x = g->addTensor({12, 224, 224}, DataType::Float32);
        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
        Tensor con = g->addTensor({1, 224}, DataType::Bool);
        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
    }
    {
        Graph g = make_ref<GraphObj>(runtime);
        Tensor x = g->addTensor({12, 224, 224}, DataType::Float32);
        Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
        Tensor con = g->addTensor({2, 1, 1, 1, 224}, DataType::Bool);
        auto op = g->addOp<WhereObj>(x, y, con, nullptr);
        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
    }
}

} // namespace infini