Merge branch 'master' into xpu

Haojie Wang 2023-08-30 09:20:51 +08:00 committed by GitHub
commit 879f2f0315
51 changed files with 1203 additions and 124 deletions


@ -16,6 +16,23 @@ cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
# Build Type
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message("Configuring for Debug build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
add_compile_definitions(DEBUG_MODE)
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
message("Configuring for Release build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
add_compile_definitions(NDEBUG)
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
message("Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
else()
message("Build type not specified. Configuring for RelWithDebInfo build.")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
endif()
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")


@ -8,6 +8,20 @@ INTELCPU ?= off
BACKTRACE ?= ON
TEST ?= ON
FORMAT_ORIGIN ?=
# Docker build options
DOCKER_NAME ?= infinitensor
DOCKER_IMAGE_NAME ?= infinitensor
DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
DOCKER_RUN_OPTION ?=
# CUDA option.
ifeq ($(CUDA), ON)
DOCKER_IMAGE_NAME = infinitensor_cuda
DOCKER_NAME = infinitensor_cuda
DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
endif
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
CMAKE_OPT += -DUSE_CUDA=$(CUDA)
@ -41,3 +55,17 @@ test-cpp:
test-onnx:
@echo
python3 pyinfinitensor/tests/test_onnx.py
docker-build:
docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .
docker-run:
docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)
docker-start:
docker start $(DOCKER_IMAGE_NAME)
docker-exec:
docker exec -it $(DOCKER_IMAGE_NAME) bash


@ -137,6 +137,29 @@
After a successful installation, you can write and run code against the project's Python interface. For concrete usage, see the sample code example/Resnet/resnet.py and the user manual.
## Docker
This project also provides a Docker environment. You can build the image with `make docker-build` or `make docker-build CUDA=ON`; the docker image name and the Dockerfile used can be changed via build options or by editing the Makefile variables.
Because cloning the GitHub repo requires adding an SSH key to your GitHub profile, the steps that pull the repo and build the project are commented out for now; users should set up their own SSH key after entering the container (copying the host's SSH key into docker may run into environment mismatches).
```shell
# Build the docker image.
make docker-build
# Start a container from the image.
make docker-run
# Open a shell in the running container.
make docker-exec
```
To build the CUDA version, use the following commands instead:
```shell
# Build the docker image with CUDA support.
make docker-build CUDA=ON
# Start a container from the image.
make docker-run CUDA=ON
```
## Technical Support
If you run into any problems, please contact our technical support team.


@ -1,3 +1,4 @@
#pragma once
#include "core/common.h"
namespace infini {
@ -69,23 +70,6 @@ class DataType {
int getIndex() const { return index; }
};
// to be consistent with onnx
// https://github.com/onnx/onnx/blob/aeb21329122b96df1d3ef33b500a35ca140b1431/onnx/onnx.proto#L484
inline const DataType DataType::Undefine(0);
inline const DataType DataType::Float32(1);
inline const DataType DataType::UInt8(2);
inline const DataType DataType::Int8(3);
inline const DataType DataType::UInt16(4);
inline const DataType DataType::Int16(5);
inline const DataType DataType::Int32(6);
inline const DataType DataType::Int64(7);
inline const DataType DataType::String(8);
inline const DataType DataType::Bool(9);
inline const DataType DataType::Float16(10);
inline const DataType DataType::Double(11);
inline const DataType DataType::UInt32(12);
inline const DataType DataType::UInt64(13);
inline const DataType DataType::BFloat16(16);
// Method definitions are out of the declaration due to GCC bug:
// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
template <> inline int DataType::get<float>() { return 0; }


@ -41,10 +41,13 @@ class GraphHandlerObj {
Tensor mul(Tensor a, Tensor b, Tensor c);
Tensor div(Tensor a, Tensor b, Tensor c);
Tensor pow(Tensor a, Tensor b, Tensor c);
Tensor min(Tensor a, Tensor b, Tensor c);
Tensor max(Tensor a, Tensor b, Tensor c);
Tensor relu(Tensor x, Tensor y);
Tensor sigmoid(Tensor x, Tensor y);
Tensor tanh(Tensor x, Tensor y);
Tensor erf(Tensor x, Tensor y);
Tensor softmax(Tensor x, Tensor y, int axis);
Tensor abs(Tensor x, Tensor y);
Tensor sqrt(Tensor x, Tensor y);
@ -68,6 +71,8 @@ class GraphHandlerObj {
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
Tensor cast(Tensor input, Tensor output, int to);
Tensor expand(Tensor input, Tensor output, Shape dims);
Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
//------ modifiers


@ -37,7 +37,7 @@ class TensorObj : public TensorBaseObj {
Shape getDims() const { return shape; }
size_t getRank() const { return shape.size(); }
vector<size_t> getStride() const;
Shape getStride() const;
size_t getOffset(const vector<int> &ds) const;
void dataMalloc();
UidBaseType getFuid() const { return fuid; }


@ -0,0 +1,9 @@
#pragma once
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void expand_kernel(float *input, float *output, int nDims, int outputsize,
SmallArray inputShape, SmallArray outputShape);
}; // namespace infini


@ -0,0 +1,11 @@
#pragma once
#include "operators/transpose.h"
#include "utils/small_array.h"
namespace infini {
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray strides, SmallArray outputShape);
}; // namespace infini

include/cuda/cuda_where.h (new file)

@ -0,0 +1,13 @@
#pragma once
#include "operators/unary.h"
#include "utils/small_array.h"
namespace infini {
void where_kernel(const float *inputx, const float *inputy,
const float *condition, float *output, int nDims,
infini::SmallArray inputxShape,
infini::SmallArray inputyShape,
infini::SmallArray conditionShape,
infini::SmallArray outputShape);
}; // namespace infini


@ -1,7 +1,10 @@
#pragma once
#include "core/data_type.h"
typedef struct {
int *indexValue;
namespace infini {
struct GatherMetaData {
void *indexValue;
DataType indexType;
int axis;
int inNDim;
int outNDim;
@ -10,8 +13,7 @@ typedef struct {
int idxDim[4];
int idxStride[4];
int inStride[4];
} GatherMetaData;
};
namespace infini {
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
}
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
} // namespace infini


@ -0,0 +1,36 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Broadcast the input tensor following the given shape and the
* broadcast rule.
*
*/
class ExpandObj : public OperatorObj {
Shape dims;
public:
/**
* @brief Construct a new Expand object.
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param output The output tensor.
* @param dims The shape you want to expand to, following the broadcast
* rule.
*/
ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
OP_CLONE(ExpandObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape getShape() const { return dims; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
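// Illustration only (not part of this commit): the multidirectional
// broadcast rule that ExpandObj::inferShape relies on, assuming the
// semantics of infer_broadcast from utils/operator_utils.h. Shapes are
// right-aligned; paired dims must be equal or one of them must be 1, and
// the result takes the larger value.
#include <algorithm>
#include <cassert>
#include <vector>
std::vector<int> broadcastShape(std::vector<int> a, std::vector<int> b) {
    // Left-pad the shorter shape with 1s so both ranks match.
    if (a.size() < b.size()) a.insert(a.begin(), b.size() - a.size(), 1);
    if (b.size() < a.size()) b.insert(b.begin(), a.size() - b.size(), 1);
    std::vector<int> out(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);
        out[i] = std::max(a[i], b[i]);
    }
    return out;
}
// broadcastShape({3, 1}, {2, 1, 6}) == {2, 3, 6}, matching the test below.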

include/operators/where.h (new file)

@ -0,0 +1,36 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Return elements, either from X or Y, depending on condition.
*
*/
class WhereObj : public OperatorObj {
public:
/**
* @brief Construct a new Where object.
*
* @param graph The computation graph that this operator belongs to.
* @param inputX The input tensor X.
* @param inputY The input tensor Y.
* @param output The output tensor.
* @param condition The condition tensor.
*/
WhereObj(GraphObj *graph, Tensor inputX, Tensor inputY, Tensor condition,
Tensor output);
OP_CLONE(WhereObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini


@ -0,0 +1,8 @@
namespace infini {
#define SMALL_ARRAY_SIZE 8
struct SmallArray {
int data[SMALL_ARRAY_SIZE];
};
} // namespace infini
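// Sketch (not part of this commit): packing a runtime shape into a
// SmallArray. Kernels receive SmallArray by value because device code
// cannot dereference a host std::vector; a fixed-size POD travels in the
// kernel's parameter space instead.
#include <vector>
inline infini::SmallArray packShape(const std::vector<int> &shape) {
    infini::SmallArray arr{};
    // Ranks beyond SMALL_ARRAY_SIZE are rejected upstream by IT_ASSERT.
    for (size_t i = 0; i < shape.size() && i < SMALL_ARRAY_SIZE; ++i)
        arr.data[i] = shape[i];
    return arr;
}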


@ -348,6 +348,18 @@ class OnnxStub:
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Min":
tensors[node.output[0]] = self.handler.min(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Max":
tensors[node.output[0]] = self.handler.max(
tensors[node.input[0]],
tensors[node.input[1]],
tensors.get(node.output[0]),
)
elif node.op_type == "Relu":
tensors[node.output[0]] = self.handler.relu(
tensors[node.input[0]],
@ -397,7 +409,8 @@ class OnnxStub:
tensors[node.input[0]],
tensors.get(node.output[0]),
next(
(attr.i for attr in node.attribute if attr.name == "axis")
(attr.i for attr in node.attribute if attr.name == "axis"),
1,
),
)
elif node.op_type == "PRelu":
@ -505,7 +518,8 @@ class OnnxStub:
tensors[node.input[1]],
tensors.get(node.output[0]),
next(
(attr.i for attr in node.attribute if attr.name == "axis"), 0
(attr.i for attr in node.attribute if attr.name == "axis"),
0,
),
)
elif node.op_type == "ReduceMean":
@ -527,7 +541,7 @@ class OnnxStub:
for attr in node.attribute
if attr.name == "keepdims"
),
1
1,
)
!= 0,
)
@ -577,6 +591,25 @@ class OnnxStub:
tensors.get(node.output[0]),
next((attr.i for attr in node.attribute if attr.name == "to")),
)
elif node.op_type == "Expand":
shape = _parse_data(data[node.input[1]])
tensors[node.output[0]] = self.handler.expand(
tensors[node.input[0]],
tensors.get(node.output[0]),
shape,
)
elif node.op_type == "Erf":
tensors[node.output[0]] = self.handler.erf(
tensors[node.input[0]],
tensors.get(node.output[0]),
)
elif node.op_type == "Where":
tensors[node.output[0]] = self.handler.where(
tensors[node.input[1]],
tensors[node.input[2]],
tensors[node.input[0]],
tensors.get(node.output[0]),
)
else:
raise Exception('Unsupported operator "{}"'.format(node.op_type))
new_node_name.append(node.name)
@ -802,6 +835,8 @@ class OnnxStub:
backend.OpTypeId.Abs,
backend.OpTypeId.Identity,
backend.OpTypeId.PRelu,
backend.OpTypeId.Sqrt,
backend.OpTypeId.Erf,
]:
ctx.push_node(make_node(ty.name, inputs, outputs, name))
elif ty == backend.OpTypeId.Flatten:
@ -892,6 +927,13 @@ class OnnxStub:
elif ty == backend.OpTypeId.Cast:
to = backend.cast_to_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, to=to))
elif ty == backend.OpTypeId.Where:
assert len(inputs) == 3, "Where operator must have exactly three inputs."
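# Restore the ONNX operand order (condition, X, Y) from the handler order (X, Y, condition).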
new_inputs = [inputs[2], inputs[0], inputs[1]]
ctx.push_node(make_node(ty.name, new_inputs, outputs, name))
elif ty == backend.OpTypeId.Expand:
shape = backend.expand_shape_of(op)
ctx.push_node(make_node(ty.name, inputs, outputs, name, shape=shape))
else:
raise Exception("Unsupported OpType", ty)
@ -942,6 +984,17 @@ def _search_shape(model: ModelProto, name: str) -> List[int]:
),
None,
)
or next(
(
[
(d.dim_value if d.dim_value > 0 else 1)
for d in tensor.type.tensor_type.shape.dim
]
for tensor in model.graph.output
if tensor.name == name
),
None,
)
or next(
[int(d) for d in tensor.dims]
for tensor in model.graph.initializer


@ -207,6 +207,18 @@ class TestStringMethods(unittest.TestCase):
relu = make_node("Relu", ["x"], ["y"], name="relu")
make_and_import_model(make_graph([relu], "relu", [x], [y]))
def test_erf(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
erf = make_node("Erf", ["x"], ["y"], name="erf")
make_and_import_model(make_graph([erf], "erf", [x], [y]))
def test_sqrt(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
sqrt = make_node("Sqrt", ["x"], ["y"], name="sqrt")
make_and_import_model(make_graph([sqrt], "sqrt", [x], [y]))
def test_sigmoid(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
@ -352,6 +364,24 @@ class TestStringMethods(unittest.TestCase):
)
make_and_import_model(make_graph([cast], "cast", [input1], [output]))
def test_expand(self):
data = make_tensor_value_info("data", TensorProto.FLOAT, [3, 1])
dim = make_tensor_value_info("dim", TensorProto.INT64, [3])
dim_data = make_tensor("dim", TensorProto.INT64, [3], [2, 1, 6])
output = make_tensor_value_info("output", TensorProto.FLOAT, [2, 3, 6])
expand = make_node("Expand", ["data", "dim"], ["output"], name="expand")
make_and_import_model(
make_graph([expand], "expand", [data, dim], [output], [dim_data])
)
def test_where(self):
x = make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 5, 7])
y = make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 5, 7])
con = make_tensor_value_info("con", TensorProto.BOOL, [1, 3, 5, 7])
output = make_tensor_value_info("output", TensorProto.FLOAT, [1, 3, 5, 7])
where = make_node("Where", ["x", "y", "con"], ["output"], name="where")
make_and_import_model(make_graph([where], "where", [x, y, con], [output]))
if __name__ == "__main__":
unittest.main()


@ -0,0 +1,19 @@
FROM ubuntu:22.04
ENV DEBIAN_FRONTEND=noninteractive
# Install dependencies.
RUN apt-get update && apt-get install -y git make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev openssh-client
# Generate ssh key.
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -P ""
# Update pip and switch to Tsinghua source.
RUN python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# TODO: Since SSH clone repo requires adding the SSH key to the GitHub profile,
# the process of pulling the project and compiling it has been temporarily commented.
# Download InfiniTensor.
# RUN git clone git@github.com:InfiniTensor/InfiniTensor.git /root/InfiniTensor --branch master --single-branch --recursive
# Build and Install InfiniTensor
# RUN cd /root/InfiniTensor && make install-python


@ -0,0 +1,18 @@
FROM nvcr.io/nvidia/pytorch:23.07-py3
# Install dependencies.
RUN apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update && apt-get install -y git make cmake build-essential python-is-python3 python-dev-is-python3 python3-pip libdw-dev openssh-client
# Generate ssh key.
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -P ""
# Update pip and switch to Tsinghua source.
RUN python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# TODO: Since SSH clone repo requires adding the SSH key to the GitHub profile,
# the process of pulling the project and compiling it has been temporarily commented.
# Download InfiniTensor.
# RUN git clone git@github.com:InfiniTensor/InfiniTensor.git /root/InfiniTensor --branch master --single-branch --recursive
# Build and Install InfiniTensor
# RUN cd /root/InfiniTensor && make install-python CUDA=ON

src/core/data_type.cc (new file)

@ -0,0 +1,23 @@
#include "core/data_type.h"
namespace infini {
// Implementations live here to avoid compile-time errors on some platforms.
// to be consistent with onnx
// https://github.com/onnx/onnx/blob/aeb21329122b96df1d3ef33b500a35ca140b1431/onnx/onnx.proto#L484
const DataType DataType::Undefine(0);
const DataType DataType::Float32(1);
const DataType DataType::UInt8(2);
const DataType DataType::Int8(3);
const DataType DataType::UInt16(4);
const DataType DataType::Int16(5);
const DataType DataType::Int32(6);
const DataType DataType::Int64(7);
const DataType DataType::String(8);
const DataType DataType::Bool(9);
const DataType DataType::Float16(10);
const DataType DataType::Double(11);
const DataType DataType::UInt32(12);
const DataType DataType::UInt64(13);
// TODO: indices 14 and 15 are reserved for ONNX complex types, not yet supported.
const DataType DataType::BFloat16(16);
} // namespace infini


@ -3,6 +3,7 @@
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/expand.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/pad.h"
@ -14,6 +15,7 @@
#include "operators/split.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "operators/where.h"
namespace infini {
@ -135,6 +137,8 @@ DEFINE_ELEMENT_WISE_METHOD(sub, Sub)
DEFINE_ELEMENT_WISE_METHOD(mul, Mul)
DEFINE_ELEMENT_WISE_METHOD(div, Div)
DEFINE_ELEMENT_WISE_METHOD(pow, Pow)
DEFINE_ELEMENT_WISE_METHOD(min, Minimum)
DEFINE_ELEMENT_WISE_METHOD(max, Maximum)
// see operators/unary.h
#define DEFINE_UNARY_METHOD(name, obj) \
@ -153,6 +157,7 @@ DEFINE_UNARY_METHOD(tanh, Tanh)
DEFINE_UNARY_METHOD(abs, Abs)
DEFINE_UNARY_METHOD(sqrt, Sqrt)
DEFINE_UNARY_METHOD(shape, Shape)
DEFINE_UNARY_METHOD(erf, Erf)
// see operators/reshape.h
DEFINE_UNARY_METHOD(identity, Identity)
@ -307,6 +312,31 @@ Tensor GraphHandlerObj::cast(Tensor input, Tensor output, int to) {
}
}
Tensor GraphHandlerObj::expand(Tensor input, Tensor output, Shape dims) {
if (output) {
g->addOpWithOutputs<ExpandObj>(std::move(input), output,
std::move(dims));
return output;
} else {
return g->addOp<ExpandObj>(std::move(input), output, std::move(dims))
->getOutput();
}
}
Tensor GraphHandlerObj::where(Tensor inputX, Tensor inputY, Tensor condition,
Tensor output) {
if (output) {
g->addOpWithOutputs<WhereObj>(std::move(inputX), std::move(inputY),
std::move(condition), output);
return output;
} else {
return g
->addOp<WhereObj>(std::move(inputX), std::move(inputY),
std::move(condition), output)
->getOutput();
}
}
static CastType inferCastType(Tensor input, int to) {
auto iType = input->getDType();
auto oType = DataType(to);


@ -51,15 +51,14 @@ size_t TensorObj::getOffset(const vector<int> &pos) const {
return idx;
}
vector<size_t> TensorObj::getStride() const {
vector<size_t> ret;
size_t stride = 1;
for (int i = shape.size() - 1; i >= 1; i--) {
ret.emplace(ret.begin(), stride);
stride *= shape.at(i);
Shape TensorObj::getStride() const {
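// Row-major (C-order) strides: e.g. shape {2, 3, 4} yields {12, 4, 1}.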
Shape stride(getRank());
ShapeElem p = 1;
for (auto i = getRank(); i > 0; --i) {
stride[i - 1] = p;
p = p * shape[i - 1];
}
ret.emplace(ret.begin(), stride);
return ret;
return stride;
}
void TensorObj::printData() const {


@ -4,6 +4,19 @@
#include "core/runtime.h"
#include "operators/conv.h"
#include "operators/matmul.h"
#ifdef DEBUG_MODE
void CHECK_CUDA_KERNEL_ERROR(infini::Operator op) {
cudaError_t kernelError = cudaGetLastError();
if (kernelError != cudaSuccess) {
std::cerr << "CUDA kernel error: " << cudaGetErrorString(kernelError)
<< std::endl
<< "Failed Operator: " << op->toString() << std::endl;
exit(EXIT_FAILURE);
}
}
#endif
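// A stricter, debug-only variant (hypothetical sketch, not part of this
// commit): cudaGetLastError alone only reports errors already visible on
// the host, such as bad launch configurations; synchronizing first also
// surfaces faults raised while the kernel was executing, at the cost of
// serializing the stream.
void CHECK_CUDA_KERNEL_ERROR_SYNC(infini::Operator op) {
    cudaError_t err = cudaDeviceSynchronize(); // waits; returns async errors
    if (err == cudaSuccess)
        err = cudaGetLastError(); // picks up launch-time errors
    if (err != cudaSuccess) {
        std::cerr << "CUDA kernel error: " << cudaGetErrorString(err)
                  << std::endl
                  << "Failed Operator: " << op->toString() << std::endl;
        exit(EXIT_FAILURE);
    }
}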
namespace infini {
void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
@ -22,6 +35,10 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
} else {
kernel->compute(op, this);
}
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
}
}
@ -57,6 +74,10 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
opTime[op->getOpType()] += t;
opCnt[op->getOpType()]++;
}
#ifdef DEBUG_MODE
CHECK_CUDA_KERNEL_ERROR(op);
#endif
}
}


@ -2,6 +2,7 @@
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/expand.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/pad.h"
@ -99,6 +100,10 @@ void export_values(py::module &m) {
.VALUE(OpType, Resize)
.VALUE(OpType, Dropout)
.VALUE(OpType, Cast)
.VALUE(OpType, Sqrt)
.VALUE(OpType, Expand)
.VALUE(OpType, Erf)
.VALUE(OpType, Where)
.export_values();
#undef VALUE
@ -233,6 +238,15 @@ static vector<int64_t> reshape_shape_of(Operator op) {
return ans;
}
static vector<int64_t> expand_shape_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Expand);
auto shape = dynamic_cast<const ExpandObj *>(op.get())->getShape();
vector<int64_t> ans(shape.size());
std::transform(shape.begin(), shape.end(), ans.begin(),
[](auto x) { return static_cast<int64_t>(x); });
return ans;
}
static vector<int64_t> pad_pads_of(Operator op) {
IT_ASSERT(op->getOpType() == OpType::Pad);
auto shape = dynamic_cast<const PadObj *>(op.get())->getPads();
@ -287,6 +301,7 @@ void export_functions(py::module &m) {
.FUNCTION(reduce_mean_attrs_of)
.FUNCTION(tensor_dtype)
.FUNCTION(reshape_shape_of)
.FUNCTION(expand_shape_of)
.FUNCTION(pad_pads_of)
.FUNCTION(transpose_permute_of)
.FUNCTION(concat_axis_of)
@ -374,6 +389,9 @@ void init_graph_builder(py::module &m) {
.def("slice", &Handler::slice, policy::move)
.def("pad", &Handler::pad, policy::move)
.def("cast", &Handler::cast, policy::move)
.def("expand", &Handler::expand, policy::move)
.def("erf", &Handler::erf, policy::move)
.def("where", &Handler::where, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("optimize", &Handler::optimize, policy::automatic)
.def("operators", &Handler::operators, policy::move)

View File

@ -22,142 +22,142 @@ class CastCnnl : public BangKernelWithoutConfig {
checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
cnnlCastDataType_t NlCastType;
CastObj::CastType type = op->getType();
CastType type = op->getType();
switch (type) {
case CastObj::Float2Int64:
case CastType::Float2Int64:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
NlCastType = CNNL_CAST_FLOAT_TO_INT64;
break;
case CastObj::Float2Int32:
case CastType::Float2Int32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
NlCastType = CNNL_CAST_FLOAT_TO_INT32;
break;
case CastObj::Float2Int16:
case CastType::Float2Int16:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
NlCastType = CNNL_CAST_FLOAT_TO_INT16;
break;
case CastObj::Float2Int8:
case CastType::Float2Int8:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
NlCastType = CNNL_CAST_FLOAT_TO_INT8;
break;
case CastObj::Int322Float:
case CastType::Int322Float:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
NlCastType = CNNL_CAST_INT32_TO_FLOAT;
break;
case CastObj::Int322Int8:
case CastType::Int322Int8:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
NlCastType = CNNL_CAST_INT32_TO_INT8;
break;
case CastObj::Int322Int16:
case CastType::Int322Int16:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
NlCastType = CNNL_CAST_INT32_TO_INT16;
break;
case CastObj::Int162Float:
case CastType::Int162Float:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
NlCastType = CNNL_CAST_INT16_TO_FLOAT;
break;
case CastObj::Int162Int32:
case CastType::Int162Int32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
NlCastType = CNNL_CAST_INT16_TO_INT32;
break;
case CastObj::Int82Float:
case CastType::Int82Float:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
NlCastType = CNNL_CAST_INT8_TO_FLOAT;
break;
case CastObj::Int82Int16:
case CastType::Int82Int16:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT16, 4, dim_array));
NlCastType = CNNL_CAST_INT8_TO_INT16;
break;
case CastObj::Int82Int32:
case CastType::Int82Int32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
NlCastType = CNNL_CAST_INT8_TO_INT32;
break;
case CastObj::Uint82Float:
case CastType::Uint82Float:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
NlCastType = CNNL_CAST_UINT8_TO_FLOAT;
break;
case CastObj::Uint82Int32:
case CastType::Uint82Int32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
NlCastType = CNNL_CAST_UINT8_TO_INT32;
break;
case CastObj::Uint82Int64:
case CastType::Uint82Int64:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT8, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
NlCastType = CNNL_CAST_UINT8_TO_INT64;
break;
case CastObj::Int322Int64:
case CastType::Int322Int64:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
NlCastType = CNNL_CAST_INT32_TO_INT64;
break;
case CastObj::Int642Int32:
case CastType::Int642Int32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
NlCastType = CNNL_CAST_INT64_TO_INT32;
break;
case CastObj::Int642Uint32:
case CastType::Int642Uint32:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
NlCastType = CNNL_CAST_INT64_TO_UINT32;
break;
case CastObj::Int642Float:
case CastType::Int642Float:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT64, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(
cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
NlCastType = CNNL_CAST_INT64_TO_FLOAT;
break;
case CastObj::Uint322Int64:
case CastType::Uint322Int64:
checkCnnlError(cnnlSetTensorDescriptor(
aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_UINT32, 4, dim_array));
checkCnnlError(cnnlSetTensorDescriptor(


@ -25,7 +25,7 @@ void clip_kernel(float *input, float *output, int num, float minValue,
float maxValue) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_clip_kernel<<<blocksize, gridsize>>>(input, output, num, minValue,
_clip_kernel<<<gridsize, blocksize>>>(input, output, num, minValue,
maxValue);
}
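// In kernel<<<A, B>>>(...), A is the grid dimension (number of blocks) and
// B is the block dimension (threads per block); the old call had them
// swapped. A small helper (an illustrative sketch, not code from this
// commit) makes the order explicit:
struct LaunchConfig {
    int grid;  // number of thread blocks
    int block; // threads per block
};
inline LaunchConfig makeLaunchConfig(int numElements, int threadsPerBlock = 128) {
    // Round up so every element is covered by at least one thread.
    return {(numElements + threadsPerBlock - 1) / threadsPerBlock,
            threadsPerBlock};
}
// Usage: auto cfg = makeLaunchConfig(num);
// _clip_kernel<<<cfg.grid, cfg.block>>>(input, output, num, minValue, maxValue);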


@ -98,6 +98,14 @@ class MulCudnn : public ElementWiseCudnn {
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MUL; }
};
class MinCudnn : public ElementWiseCudnn {
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MIN; }
};
class MaxCudnn : public ElementWiseCudnn {
cudnnOpTensorOp_t getOpType() const override { return CUDNN_OP_TENSOR_MAX; }
};
class ElementWiseCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
@ -137,6 +145,10 @@ REGISTER_KERNEL(Device::CUDA, OpType::Sub, DataType::Float32, SubCudnn,
"Sub_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Mul, DataType::Float32, MulCudnn,
"Mul_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Min, DataType::Float32, MinCudnn,
"Min_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Max, DataType::Float32, MaxCudnn,
"Max_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Div, DataType::Float32, ElementWiseCuda,
"Div_CUDA_Float32");


@ -5,15 +5,15 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3);
int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -27,19 +27,22 @@ __global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1;
int b2_index = c2_index % b2;
int b3_index = c3_index % b3;
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] / y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
z[i] = x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 + a2_index * a3 +
a3_index] /
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 + b2_index * b3 +
b3_index];
}
}
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1,
int a2, int a3, int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
int n = c0 * c1 * c2 * c3;
for (int i = index; i < n; i += stride) {
int c0_index = i/ (c1 * c2 * c3);
int c0_index = i / (c1 * c2 * c3);
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
@ -53,27 +56,32 @@ __global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2
int b1_index = c1_index % b1;
int b2_index = c2_index % b2;
int b3_index = c3_index % b3;
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
z[i] = pow(x[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
a2_index * a3 + a3_index],
y[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
b2_index * b3 + b3_index]);
}
}
namespace infini {
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
int blocksize = block_work_size();
int num = c0*c1*c2*c3;
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_div_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
}
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
int b0, int b1, int b2, int b3,
int c0, int c1, int c2, int c3) {
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
int c3) {
int blocksize = block_work_size();
int num = c0*c1*c2*c3;
int num = c0 * c1 * c2 * c3;
int gridsize = (num + block_work_size() - 1) / block_work_size();
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
_pow_kernel<<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2,
b3, c0, c1, c2, c3);
}
}; // namespace infini


@ -0,0 +1,36 @@
#include "operators/expand.h"
#include "cuda/cuda_expand.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class ExpandCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<ExpandObj>(_op);
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
const auto &out_Shape = op->getShape(); // output shape
SmallArray inputShape, outputShape;
int nDims = op->getInputs(0)->getDims().size();
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
int outputsize = 1; // length of the flattened output
for (int i = 0; i < nDims; ++i) {
outputShape.data[i] = out_Shape[i];
inputShape.data[i] = in_Shape[i];
outputsize *= out_Shape[i];
}
expand_kernel((float *)inputData, (float *)outputData, nDims,
outputsize, inputShape, outputShape);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Expand, DataType::Float32, ExpandCuda,
"Expand_CUDA_Float32");
}; // namespace infini


@ -0,0 +1,49 @@
#include "core/common.h"
#include "cuda/cuda_common.h"
#include "utils/small_array.h"
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _expand_kernel(float *input, float *output, int nDims,
int outputsize, infini::SmallArray inputShape,
infini::SmallArray outputShape) {
int outputIdx =
blockIdx.x * blockDim.x + threadIdx.x; // i(JKS) + j(KS) + k(S) + s
if (outputIdx < outputsize) {
int inputIdx = 0;  // accumulated input index
int temp = 1;      // running input stride: S, then KS, then JKS
int tmp = 1;       // output coordinate s, k, j, i in turn
int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
for (int i = nDims - 1; i >= 0; --i) {
if (i == 0) {
tmp = v; // i = outputIdx/(JKS)
} else {
tmp = v % outputShape.data[i]; // store s,k,j in order
}
if (inputShape.data[i] ==
1) { // if the input dim is 1, its index is always 0
inputIdx += 0;
} else {
inputIdx +=
tmp * temp; // otherwise +i(JKS) or j(KS) or k(S) or s
}
temp *= inputShape.data[i];
v = v / outputShape.data[i];
}
output[outputIdx] = input[inputIdx];
}
}
namespace infini {
void expand_kernel(float *input, float *output, int nDims, int outputsize,
SmallArray inputShape, SmallArray outputShape) {
int blocksize = block_work_size();
int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
_expand_kernel<<<gridsize, blocksize>>>(input, output, nDims, outputsize,
inputShape, outputShape);
}
} // namespace infini
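// Host-side reference of the same index mapping (a sketch for validating
// the kernel against a CPU result; not part of this commit).
#include <vector>
std::vector<float> expandRef(const std::vector<float> &in,
                             const std::vector<int> &inShape,
                             const std::vector<int> &outShape) {
    int nDims = (int)outShape.size();
    int outSize = 1;
    for (int d : outShape)
        outSize *= d;
    std::vector<float> out(outSize);
    for (int o = 0; o < outSize; ++o) {
        int inIdx = 0, stride = 1, v = o;
        for (int i = nDims - 1; i >= 0; --i) {
            int coord = v % outShape[i]; // coordinate along dim i
            if (inShape[i] != 1)         // broadcast dims stay at index 0
                inIdx += coord * stride;
            stride *= inShape[i];
            v /= outShape[i];
        }
        out[o] = in[inIdx];
    }
    return out;
}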


@ -19,7 +19,7 @@ void extend_kernel(float *in, float *out, int blockSize, int blockSizeOuter,
int oSize) {
int blocksize = 32 * 16;
int gridsize = (oSize + blocksize - 1) / blocksize;
_extend_kernel<<<blocksize, gridsize>>>(in, out, blockSize, blockSizeOuter,
_extend_kernel<<<gridsize, blocksize>>>(in, out, blockSize, blockSizeOuter,
oSize);
}
} // namespace infini


@ -12,7 +12,8 @@ class GatherCuda : public CudaKernelWithoutConfig {
auto in = op->getInputs(0);
auto index = op->getInputs(1);
auto out = op->getOutput();
metaData.indexValue = index->getRawDataPtr<int *>();
metaData.indexValue = index->getRawDataPtr<void *>();
metaData.indexType = index->getDType();
metaData.axis = op->getAxis();
metaData.inNDim = in->getRank();
metaData.outNDim = out->getRank();


@ -1,19 +1,21 @@
#include "cuda/cuda_common.h"
#include "cuda/gather.h"
__device__ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
int offset = 0;
template <typename T>
__device__ T gatheredOffset2Offset(int gOffset,
infini::GatherMetaData metaData) {
T offset = 0;
for (int i = metaData.inNDim - 1, k = metaData.outNDim - 1; i >= 0; --i) {
int idx = 0;
T idx = 0;
if (i == metaData.axis) {
int idxOffset = 0;
T idxOffset = 0;
for (int j = metaData.idxNDim - 1; j >= 0; --j) {
int p = gOffset % metaData.idxDim[j];
T p = gOffset % metaData.idxDim[j];
gOffset = gOffset / metaData.idxDim[j];
idxOffset += p * metaData.idxStride[j];
}
idx = metaData.indexValue[idxOffset];
idx = static_cast<T *>(metaData.indexValue)[idxOffset];
k = k - metaData.idxNDim;
} else {
@ -26,22 +28,27 @@ __device__ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
return offset;
}
__global__ void _gather_kernel(float *in, float *out, GatherMetaData metaData,
int num) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
template <typename T>
__global__ void _gather_kernel(float *in, float *out,
infini::GatherMetaData metaData, size_t num) {
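// T matches the index tensor's element type (int or int64_t), chosen by
// the host-side dispatch in gather_kernel below.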
T tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while (tid < num) {
int offset = gatheredOffset2Offset(tid, metaData);
T offset = gatheredOffset2Offset<T>(tid, metaData);
out[tid] = in[offset];
tid += stride;
}
}
namespace infini {
void gather_kernel(float *in, float *out, GatherMetaData metaData, int num) {
void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num) {
int blockSize = 32 * 16;
int gridSize = (num + blockSize - 1) / blockSize;
_gather_kernel<<<gridSize, blockSize>>>(in, out, metaData, num);
if (metaData.indexType == DataType::Int64) {
_gather_kernel<int64_t>
<<<gridSize, blockSize>>>(in, out, metaData, num);
} else {
_gather_kernel<int><<<gridSize, blockSize>>>(in, out, metaData, num);
}
}
} // namespace infini


@ -0,0 +1,49 @@
#include "operators/transpose.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_transpose.h"
namespace infini {
class TransposeCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<TransposeObj>(_op);
auto input = op->getInputs(0);
auto output = op->getOutput();
void *const inputData = input->getRawDataPtr<void *>();
void *const outputData = output->getRawDataPtr<void *>();
const auto &inputShape = input->getDims();
const auto &outputShape = output->getDims();
const auto &perm = op->getPermute();
int size = input->size();
int nDims = input->getDims().size();
// Compute strides
SmallArray strides, buffer;
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
int curStride = 1;
for (int i = nDims - 1; i >= 0; --i) {
buffer.data[i] = curStride;
curStride *= inputShape[i];
}
for (int i = 0; i < nDims; ++i) {
strides.data[i] = buffer.data[perm[i]];
}
SmallArray outputDims;
for (int i = 0; i < nDims; ++i) {
outputDims.data[i] = outputShape[i];
}
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
strides, outputDims);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
TransposeCuda, "Transpose_CUDA_Float32");
} // namespace infini
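// Sketch of the stride permutation TransposeCuda relies on (illustration,
// not code from this commit): advancing one step along output dim i moves
// the input offset by the stride of input dim perm[i]. For inputShape
// {1, 2, 3, 4} (row-major strides {24, 12, 4, 1}) and perm {0, 2, 1, 3},
// the kernel receives strides {24, 4, 12, 1} and outputShape {1, 3, 2, 4}.
#include <vector>
std::vector<int> permutedStrides(const std::vector<int> &inShape,
                                 const std::vector<int> &perm) {
    int n = (int)inShape.size();
    std::vector<int> inStride(n), out(n);
    for (int i = n - 1, s = 1; i >= 0; --i) {
        inStride[i] = s;
        s *= inShape[i];
    }
    for (int i = 0; i < n; ++i)
        out[i] = inStride[perm[i]];
    return out;
}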


@ -0,0 +1,37 @@
#include "core/common.h"
#include "cuda/cuda_common.h"
#include "utils/small_array.h"
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _transpose_kernel(float *input, float *output, int nDims,
int size, infini::SmallArray strides,
infini::SmallArray outputShape) {
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (outputIdx < size) {
int inputIdx = 0;
int v = outputIdx;
for (int i = nDims - 1; i >= 0; --i) {
inputIdx += v % outputShape.data[i] * strides.data[i];
v /= outputShape.data[i];
}
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
output[outputIdx] = __ldg(input + inputIdx);
#else
output[outputIdx] = input[inputIdx];
#endif
}
}
namespace infini {
void transpose_kernel(float *input, float *output, int nDims, int size,
SmallArray strides, SmallArray outputShape) {
int blocksize = block_work_size();
int gridsize = (size + block_work_size() - 1) / block_work_size();
_transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
strides, outputShape);
}
} // namespace infini


@ -25,19 +25,25 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
cudnnTensorDescriptor_t inputDesc, outputDesc;
auto dim = op->getInputs(0)->getDims();
if (dim.size() != 4)
IT_TODO_HALT();
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
// assume input and output have the same strides.
auto stride = op->getInputs(0)->getStride();
// CUDNN requires that dim >= 4.
while (dim.size() < 4)
dim.push_back(1);
while (stride.size() < 4)
stride.push_back(1);
// get inputs
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
checkCudnnError(cudnnSetTensorNdDescriptor(inputDesc, CUDNN_DATA_FLOAT,
dim.size(), dim.data(),
stride.data()));
// get outputs
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
checkCudnnError(cudnnSetTensorNdDescriptor(outputDesc, CUDNN_DATA_FLOAT,
dim.size(), dim.data(),
stride.data()));
// get op descriptor
cudnnActivationDescriptor_t activationDesc;


@ -72,36 +72,36 @@ void softmax_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_softmax_kernel1<<<1, 1>>>(input, output, num);
_softmax_kernel2<<<blocksize, gridsize>>>(input, output, num);
_softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
}
void relu_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_relu_kernel<<<blocksize, gridsize>>>(input, output, num);
_relu_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sigmoid_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sigmoid_kernel<<<blocksize, gridsize>>>(input, output, num);
_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void tanh_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_tanh_kernel<<<blocksize, gridsize>>>(input, output, num);
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void abs_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_abs_kernel<<<blocksize, gridsize>>>(input, output, num);
_abs_kernel<<<gridsize, blocksize>>>(input, output, num);
}
void sqrt_kernel(float *input, float *output, int num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<blocksize, gridsize>>>(input, output, num);
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
}
}; // namespace infini

src/kernels/cuda/where.cc (new file)

@ -0,0 +1,41 @@
#include "operators/where.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_where.h"
namespace infini {
class WhereCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<WhereObj>(_op);
void *const inputxData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const inputyData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const conditionData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
const auto &inputx_Shape = op->getInputs(0)->getDims();
const auto &inputy_Shape = op->getInputs(1)->getDims();
const auto &condition_Shape = op->getInputs(2)->getDims();
const auto &output_Shape = op->getOutput()->getDims();
int nDims = op->getInputs(0)->getDims().size();
IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
SmallArray inputxShape, inputyShape, conditionShape, outputShape;
for (int i = 0; i < nDims; ++i) {
inputxShape.data[i] = inputx_Shape[i];
inputyShape.data[i] = inputy_Shape[i];
conditionShape.data[i] = condition_Shape[i];
outputShape.data[i] = output_Shape[i];
}
where_kernel((float *)inputxData, (float *)inputyData,
(float *)conditionData, (float *)outputData, nDims,
inputxShape, inputyShape, conditionShape, outputShape);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Where, DataType::Float32, WhereCuda,
"Where_CUDA_Float32");
}; // namespace infini

src/kernels/cuda/where.cu (new file)

@ -0,0 +1,82 @@
#include "cuda/cuda_common.h"
#include "utils/small_array.h"
__global__ void _where_kernel(const float *inputx, const float *inputy,
const float *condition, float *output, int nDims,
int outputsize, infini::SmallArray inputxShape,
infini::SmallArray inputyShape,
infini::SmallArray conditionShape,
infini::SmallArray outputShape) {
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (outputIdx < outputsize) {
int inputxIdx = 0;
int temp_inputx = 1;
int inputyIdx = 0;
int temp_inputy = 1;
int conditionIdx = 0;
int temp_condition = 1;
int tmp = 1;       // output coordinate s, k, j, i in turn
int v = outputIdx; // v = i(JKS) + j(KS) + k(S) + s
for (int i = nDims - 1; i >= 0; --i) {
if (i == 0) {
tmp = v; // i = outputIdx/(JKS)
} else {
tmp = v % outputShape.data[i]; // store s,k,j in order
}
if (inputxShape.data[i] == 1) {
inputxIdx += 0;
} else {
inputxIdx +=
tmp *
temp_inputx; // otherwise +i(JKS) or j(KS) or k(S) or s
}
temp_inputx *= inputxShape.data[i];
//----------------------------
if (inputyShape.data[i] == 1) {
inputyIdx += 0;
} else {
inputyIdx +=
tmp *
temp_inputy; // otherwise +i(JKS) or j(KS) or k(S) or s
}
temp_inputy *= inputyShape.data[i];
//--------------------------
if (conditionShape.data[i] == 1) {
conditionIdx += 0;
} else {
conditionIdx +=
tmp *
temp_condition; // otherwise +i(JKS) or j(KS) or k(S) or s
}
temp_condition *= conditionShape.data[i];
//-------------------------
v = v / outputShape.data[i];
}
output[outputIdx] =
condition[conditionIdx] ? inputx[inputxIdx] : inputy[inputyIdx];
}
}
namespace infini {
void where_kernel(const float *inputx, const float *inputy,
const float *condition, float *output, int nDims,
infini::SmallArray inputxShape,
infini::SmallArray inputyShape,
infini::SmallArray conditionShape,
infini::SmallArray outputShape) {
int outputsize = 1;
for (int i = 0; i < nDims; i++) {
outputsize *= outputShape.data[i];
}
int blocksize = 32 * 16;
int gridsize = (outputsize + blocksize - 1) / blocksize;
_where_kernel<<<gridsize, blocksize>>>(
inputx, inputy, condition, output, nDims, outputsize, inputxShape,
inputyShape, conditionShape, outputShape);
}
} // namespace infini

src/operators/expand.cc (new file)

@ -0,0 +1,41 @@
#include "operators/expand.h"
#include "utils/operator_utils.h"
namespace infini {
ExpandObj::ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims)
: OperatorObj(OpType::Expand, {input}, {output}), dims(std::move(dims)) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> ExpandObj::inferShape(const TensorVec &inputs) const {
auto shape_input = inputs[0]->getDims();
Shape ret = infer_broadcast(shape_input, dims);
return {{ret}};
}
std::string ExpandObj::toString() const {
std::ostringstream os;
os << "Expand[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "dims=" << vecToString(dims) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> ExpandObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.insert(ret.end(), dims.begin(), dims.end());
ret.emplace(ret.begin(), type.underlying());
return ret;
}
vector<int> ExpandObj::getOpAttrVector() const {
vector<int> ret = dims;
ret.emplace(ret.begin(), type.underlying());
return ret;
}
} // namespace infini


@ -24,8 +24,8 @@ optional<vector<Shape>> GatherObj::inferShape(const TensorVec &inputs) const {
vector<DataType> GatherObj::inferDataType(const TensorVec &inputs) const {
IT_ASSERT(inputs.size() == 2);
auto index = inputs[1];
IT_ASSERT(index->getDType() == DataType::Int32);
auto index_dtype = inputs[1]->getDType();
IT_ASSERT(index_dtype == DataType::Int32 || index_dtype == DataType::Int64);
return {inputs[0]->getDType()};
}
@ -36,19 +36,31 @@ bool GatherObj::CheckIndexValid() const {
return true;
Runtime runtime = NativeCpuRuntimeObj::getInstance();
int *data = (int *)runtime->alloc(index->getBytes());
index->getRuntime()->copyBlobToCPU(
(void *)data, index->getRawDataPtr<void *>(), index->getBytes());
bool ret = true;
auto value = inputs[0]->getDims()[axis];
for (size_t i = 0; i < index->size(); ++i) {
if (data[i] < 0 || data[i] >= value) {
ret = false;
break;
if (index->getDType() == DataType::Int32) {
int *data = (int *)runtime->alloc(index->getBytes());
index->getRuntime()->copyBlobToCPU(
(void *)data, index->getRawDataPtr<void *>(), index->getBytes());
for (size_t i = 0; i < index->size(); ++i) {
if (data[i] < 0 || data[i] >= value) {
ret = false;
break;
}
}
runtime->dealloc(data);
} else {
int64_t *data = (int64_t *)runtime->alloc(index->getBytes());
index->getRuntime()->copyBlobToCPU(
(void *)data, index->getRawDataPtr<void *>(), index->getBytes());
for (size_t i = 0; i < index->size(); ++i) {
if (data[i] < 0 || data[i] >= value) {
ret = false;
break;
}
}
runtime->dealloc(data);
}
runtime->dealloc(data);
return ret;
}


@ -20,7 +20,7 @@ MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
if (ret.empty()) {
b = 1;
} else {
b = std::accumulate(ret.begin(), ret.end(), 1);
b = std::accumulate(ret.begin(), ret.end(), 1, std::multiplies<int>());
}
auto kA = *(transA ? shape_a.rbegin() + 1 : shape_a.rbegin());
auto kB = *(transB ? shape_b.rbegin() : shape_b.rbegin() + 1);
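// std::accumulate with an initial value of 1 sums by default; computing the
// batch size of the leading dims needs a product, hence the explicit
// std::multiplies. A standalone illustration:
// std::vector<int> batch{2, 3, 4};
// std::accumulate(batch.begin(), batch.end(), 1);                          // 10 (old, wrong)
// std::accumulate(batch.begin(), batch.end(), 1, std::multiplies<int>()); // 24 (intended)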

src/operators/where.cc (new file)

@ -0,0 +1,42 @@
#include "operators/where.h"
#include "utils/operator_utils.h"
namespace infini {
WhereObj::WhereObj(GraphObj *graph, Tensor inputX, Tensor inputY,
Tensor condition, Tensor output)
: OperatorObj(OpType::Where, TensorVec{inputX, inputY, condition},
{output}) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>> WhereObj::inferShape(const TensorVec &inputs) const {
auto shapeX = inputs[0]->getDims();
auto shapeY = inputs[1]->getDims();
auto shapeCon = inputs[2]->getDims();
auto retXY = infer_broadcast(shapeX, shapeY);
auto ret = infer_broadcast(retXY, shapeCon);
return {{ret}};
}
std::string WhereObj::toString() const {
std::ostringstream os;
os << "Where[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[2]->getDims()) << ",";
os << "inputX=" << inputs[0]->getGuid() << ",";
os << "inputY=" << inputs[1]->getGuid() << ",";
os << "condition=" << inputs[2]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> WhereObj::getWorkloadVector() const {
vector<int> ret = getOutput()->getDims();
ret.emplace(ret.begin(), type.underlying());
return ret;
}
vector<int> WhereObj::getOpAttrVector() const { return {type.underlying()}; }
} // namespace infini


@ -23,7 +23,7 @@ void testCast(const std::function<void(void *, size_t, DataType)> &generator,
// GPU
Graph bangGraph = make_ref<GraphObj>(bangRuntime);
auto inputGpu = bangGraph->cloneTensor(inputCpu);
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastObj::Float2Int32);
auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastType::Float2Int32);
auto outputGpu = gpuOp->getOutput();
bangGraph->dataMalloc();
bangRuntime->run(bangGraph);


@ -58,7 +58,12 @@ TEST(cuDNN_ElementWise, run) {
testElementWiseCudnn<DivObj>(
OneGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
testElementWiseCudnn<MinimumObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
testElementWiseCudnn<MaximumObj>(
IncrementalGenerator(), Shape{1, 2, 2, 3},
ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
testElementWiseCudnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
ExpectOutput{1, 1, 4, 27});
}


@ -0,0 +1,41 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/expand.h"
#include "test.h"
namespace infini {
TEST(Expand, Cuda) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto t1 = gCpu->addTensor({2, 1, 2, 1}, DataType::Float32);
gCpu->dataMalloc();
t1->setData(IncrementalGenerator());
t1->printData();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto t1Gpu = gCuda->cloneTensor(t1);
auto op = gCuda->addOp<ExpandObj>(t1Gpu, nullptr, Shape{2, 2, 2, 3});
gCuda->dataMalloc();
t1Gpu->setData(IncrementalGenerator());
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
oCpu->printData();
EXPECT_TRUE(
oCpu->equalData(vector<float>{0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3}));
}
} // namespace infini


@ -77,7 +77,7 @@ int gatheredOffset2Offset(int gOffset, GatherMetaData metaData) {
idxOffset += p * metaData.idxStride[j];
}
idx = metaData.indexValue[idxOffset];
idx = static_cast<int *>(metaData.indexValue)[idxOffset];
k = k - metaData.idxNDim;
} else {
@ -242,6 +242,31 @@ TEST(Gather, Cuda) {
indexCuda->copyin(vector<int>{0, 3, 1});
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
EXPECT_TRUE(oCpu->equalData(
vector<float>{0, 1, 6, 7, 2, 3, 8, 9, 14, 15, 10, 11}));
}
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor({2, 4, 2}, DataType::Float32);
auto index = gCpu->addTensor({3, 1}, DataType::Int64);
gCpu->dataMalloc();
input->setData(IncrementalGenerator());
index->copyin(vector<int64_t>{0, 3, 1});
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputCuda = gCuda->cloneTensor(input);
auto indexCuda = gCuda->cloneTensor(index);
auto op = gCuda->addOp<GatherObj>(inputCuda, indexCuda, nullptr, 1);
gCuda->dataMalloc();
inputCuda->setData(IncrementalGenerator());
indexCuda->copyin(vector<int64_t>{0, 3, 1});
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());


@ -0,0 +1,46 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/transpose.h"
#include "test.h"
namespace infini {
template <class T>
void testTranspose(
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &shape) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
inputCpu->dataMalloc();
inputCpu->setData(generator);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
vector<int> permute = {0, 2, 1, 3};
auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, permute);
cudaGraph->dataMalloc();
inputGpu->copyData(inputCpu);
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto oCpu = outputGpu->clone(cpuRuntime);
// Check
// inputCpu->printData();
// oCpu->printData();
EXPECT_TRUE(oCpu->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
4, 5, 6, 7, 16, 17, 18, 19,
8, 9, 10, 11, 20, 21, 22, 23}));
}
TEST(cuda_Transpose, run) {
testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 2, 3, 4});
}
} // namespace infini


@ -0,0 +1,63 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/where.h"
#include "test.h"
namespace infini {
void test_where(const Shape &inputxshape, const vector<float> &inputxdata,
const Shape &inputyshape, const vector<float> &inputydata,
const Shape &conditionshape, const vector<int> &conditiondata,
const vector<float> &ExpectData) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto condition = gCpu->addTensor(conditionshape, DataType::Int32);
auto inputx = gCpu->addTensor(inputxshape, DataType::Float32);
auto inputy = gCpu->addTensor(inputyshape, DataType::Float32);
gCpu->dataMalloc();
condition->copyin(conditiondata);
inputx->copyin(inputxdata);
inputy->copyin(inputydata);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto conditionGpu = gCuda->cloneTensor(condition);
auto inputxGpu = gCuda->cloneTensor(inputx);
auto inputyGpu = gCuda->cloneTensor(inputy);
auto op = gCuda->addOp<WhereObj>(inputxGpu, inputyGpu, conditionGpu,
nullptr); // WhereObj
gCuda->dataMalloc();
conditionGpu->copyin(conditiondata);
inputxGpu->copyin(inputxdata);
inputyGpu->copyin(inputydata);
cudaRuntime->run(gCuda);
auto oCpu = gCpu->cloneTensor(op->getOutput()); // move data from gpu to cpu
oCpu->printData();
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
TEST(CUDA_Where, run) {
test_where(
Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Shape{2, 2, 3, 1}, vector<int>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
vector<float>{0., 1., 2., 0., 0., 0., 6., 7., 0., 9., 10., 11.});
test_where(Shape{2, 1, 1, 3}, // inputx
vector<float>{0, 1, 2, 3, 4, 5}, Shape{1, 2, 1, 1}, // inputy
vector<float>{1, 1}, Shape{2, 1, 3, 1}, // condition
vector<int>{0, 1, 1, 0, 0, 0},
vector<float>{1., 1., 1., 0., 1., 2., 0., 1., 2., 1., 1., 1.,
0., 1., 2., 0., 1., 2., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
} // expected values computed with Python
} // namespace infini


@ -0,0 +1,26 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/expand.h"
#include "test.h"
namespace infini {
TEST(Expand, ShapeInference) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({3, 1}, DataType::Float32);
auto op = g->addOp<ExpandObj>(i, nullptr, Shape{2, 1, 6});
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 6}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({3, 1}, DataType::Float32);
auto op = g->addOp<ExpandObj>(i, nullptr, Shape{3, 4});
EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 4}));
}
}
} // namespace infini


@ -9,11 +9,19 @@ namespace infini {
TEST(Gather, ShapeInference) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({1, 3, 4, 4}, DataType::Int32);
Tensor index = g->addTensor({2, 1, 2}, DataType::Int32);
auto op = g->addOp<GatherObj>(i, index, nullptr, 1);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({1, 3, 4, 4}, DataType::Int32);
Tensor index = g->addTensor({2, 1, 2}, DataType::Int32);
auto op = g->addOp<GatherObj>(i, index, nullptr, 1);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({1, 3, 4, 4}, DataType::Int32);
Tensor index = g->addTensor({2, 1, 2}, DataType::Int64);
auto op = g->addOp<GatherObj>(i, index, nullptr, 1);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
}
}
} // namespace infini


@ -21,6 +21,12 @@ TEST(ReduceMean, ShapeInference) {
auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{1, 3}, true);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 3, 1}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{-3, 3}, true);
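// Negative axes count from the end: for rank 4, axis -3 normalizes to 1.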
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 3, 1}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
@ -33,6 +39,13 @@ TEST(ReduceMean, ShapeInference) {
auto op = g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{1, 3}, false);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor i = g->addTensor({2, 3, 3, 4}, DataType::Float32);
auto op =
g->addOp<ReduceMeanObj>(i, nullptr, vector<int>{-3, 3}, false);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3}));
}
}
} // namespace infini


@ -0,0 +1,46 @@
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/where.h"
#include "test.h"
namespace infini {
TEST(Where, ShapeInference) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
{
Graph g = make_ref<GraphObj>(runtime);
Tensor x = g->addTensor({2, 2}, DataType::Float32);
Tensor y = g->addTensor({2, 2}, DataType::Float32);
Tensor con = g->addTensor({2, 2}, DataType::Bool);
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 2}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor x = g->addTensor({1, 12, 224, 224}, DataType::Float32);
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
Tensor con = g->addTensor({1, 224, 1}, DataType::Bool);
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor x = g->addTensor({12, 224, 224}, DataType::Float32);
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
Tensor con = g->addTensor({1, 224}, DataType::Bool);
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 12, 224, 224}));
}
{
Graph g = make_ref<GraphObj>(runtime);
Tensor x = g->addTensor({12, 224, 224}, DataType::Float32);
Tensor y = g->addTensor({1, 1, 224, 224}, DataType::Float32);
Tensor con = g->addTensor({2, 1, 1, 1, 224}, DataType::Bool);
auto op = g->addOp<WhereObj>(x, y, con, nullptr);
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 1, 12, 224, 224}));
}
}
} // namespace infini