Compare commits

...

39 Commits

Author SHA1 Message Date
kilinchange dc6befb549 fix: fix re-dataMalloc for weight tensor and use of naive allocator 2023-12-29 17:27:36 +08:00
zhangyunze 935b465cf2 fix: fix matmul fp16 2023-12-29 16:55:38 +08:00
panzezhong a91ed84354 fix (slice): add guard for area out of range 2023-12-28 16:35:47 +08:00
kilinchange e5ca66db66 feat: support int8 llama 2023-12-27 15:28:05 +08:00
panzezhong 85de28ef1e fix: provide a tensor-to-node mapping for intermediate results 2023-12-27 10:48:06 +08:00
YdrMaster c34946a0d8 refactor(frontend): sort nodes before building the graph (Signed-off-by: YdrMaster <ydrml@hotmail.com>) 2023-12-25 17:58:29 +08:00
xgqdut2016 ce23b8356f modified dynamic_quantize_linear 2023-12-25 17:33:43 +08:00
kilinchange 7b48b93fb3 feat: add field tensors for stub 2023-12-25 15:49:21 +08:00
kilinchange 8d901ba7aa fix: run int8 llama but has nan output 2023-12-19 17:17:22 +08:00
OdinaryWord 8ae5958b29 feat: add support for dynamic_quantize_linear 2023-12-19 16:41:17 +08:00
kilinchange 0e75f99e7e feat: add dynamic quantize linear kernel 2023-12-19 14:46:14 +08:00
zhangyunze 97e3377ca5 feat: add matmulinteger op 2023-12-19 14:14:33 +08:00
xgqdut2016 9c82936386 add half and float dequantizeLinear 2023-12-18 17:47:53 +08:00
kilinchange 03ed8c4de7 feat: support unary int8 2023-12-18 17:32:22 +08:00
kilinchange c63ed4326d feat: add frontend DynamicQuantizeLinear and DequantizeLinear kernels 2023-12-18 13:58:20 +08:00
xgqdut2016 f51ce3231a where support int8 2023-12-15 15:40:30 +08:00
kilinchange 9d9e996713 fix(graph.cc): fix topo_sort 2023-12-15 10:13:18 +08:00
xgqdut2016 c859e655d3 Merge branch 'master' into support_fp16 2023-12-15 10:02:03 +08:00
zhangyunze e66f1c0421 fix: fix dist code to support fp16 2023-12-14 18:02:08 +08:00
xgqdut2016 ff98241db7 modified test_cuda_conv_transposed 2023-12-14 14:44:28 +08:00
OdinaryWord 046b2d68d8 style:fix style 2023-12-14 13:35:08 +08:00
OdinaryWord 2af4c1276b feat:support int8 for gather 2023-12-14 13:28:41 +08:00
OdinaryWord db8c3eec15 style: fix style 2023-12-14 11:32:07 +08:00
OdinaryWord c29dcf1e6d add cuda cast & support half-precision for gather 2023-12-14 11:24:25 +08:00
zhangyunze 5ed7db1506 feat: support powOp int8 2023-12-14 11:15:28 +08:00
zhangyunze bdb8d8d65f feat: support matmulOp/expandOp fp16 2023-12-14 11:07:45 +08:00
kilinchange cbdeb73e86 - feat: support reduceOp fp16 2023-12-13 17:39:39 +08:00
kilinchange 5af7f1e753 - unary support fp16 2023-12-13 17:06:27 +08:00
zhangyunze ee4ecd27e2 feat: support sliceOp fp16 2023-12-13 16:55:16 +08:00
zhangyunze d5e775397d feat: support transpose fp16 2023-12-13 16:36:37 +08:00
kilinchange 4b02de7e17 - element_wise support fp16 2023-12-13 15:57:25 +08:00
xgqdut2016 e07516ebe9 Merge branch 'master' into support_fp16 2023-12-11 16:50:02 +08:00
xgqdut2016 dd4a90fb5e add split_concat fp16 2023-12-11 16:45:16 +08:00
xgqdut2016 fda0a5f982 add layernorm fp16 2023-12-11 15:05:34 +08:00
xgqdut2016 8b2e3b8e19 add where fp16 2023-12-08 16:57:49 +08:00
xgqdut2016 a000cb0304 modified all register kernel 2023-12-07 17:53:28 +08:00
kilinchange c587901586 - cpu kernel: adapt the new registration mechanism 2023-12-07 13:43:40 +08:00
kilinchange c19256bca6 - support fp16 for conv 2023-12-04 16:56:16 +08:00
kilinchange 4db6699e09 - Remove dataType from the kernel registration. 2023-11-30 13:51:24 +08:00
106 changed files with 4923 additions and 2148 deletions

View File

@ -212,7 +212,7 @@ if(USE_CUDA)
         ${CMAKE_CXX_COMPILER}
         CACHE STRING "Set cuda host compiler path")
     # CMP0104 requires CUDA_ARCHITECTURES
-    set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70;80")
+    set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES 80)
     enable_language(CUDA)
     find_package(CUDAToolkit) # For nvrtc and cuda driver
     target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)

View File

@ -137,11 +137,19 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
place[node.output[0]] = Shard(list(perm).index(plc.dim)) place[node.output[0]] = Shard(list(perm).index(plc.dim))
def shard_node(node: NodeProto): def shard_node(node: NodeProto):
if node.op_type in ["Relu", "Tanh", "Softmax"]: if node.op_type in [
"Relu",
"Tanh",
"Softmax",
"Cast",
"DequantizeLinear",
"DynamicQuantizeLinear",
"Sigmoid",
]:
place[node.output[0]] = place[node.input[0]] place[node.output[0]] = place[node.input[0]]
elif node.op_type in ["Where"]: elif node.op_type in ["Where"]:
place[node.output[0]] = place[node.input[1]] place[node.output[0]] = place[node.input[1]]
if node.op_type in {"Add", "Mul", "Div", "Max"}: if node.op_type in {"Add", "Mul", "Div", "Max", "Pow"}:
shard_binary(node) shard_binary(node)
elif node.op_type == "Reshape": elif node.op_type == "Reshape":
shard_reshape(node) shard_reshape(node)
@ -149,7 +157,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
shard_transpose(node) shard_transpose(node)
elif node.op_type == "Split": elif node.op_type == "Split":
shard_split(node) shard_split(node)
elif node.op_type == "MatMul": elif node.op_type in {"MatMul", "MatMulInteger"}:
assert ( assert (
place[node.input[0]] == place[node.input[1]] place[node.input[0]] == place[node.input[1]]
), f"{place[node.input[0]]} != {place[node.input[1]]}" ), f"{place[node.input[0]]} != {place[node.input[1]]}"
@ -173,11 +181,23 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
for index, node in enumerate(model.graph.node): for index, node in enumerate(model.graph.node):
nodes.append(node) nodes.append(node)
# linear # linear
if (node.op_type == "MatMul" or node.op_type == "Gemm") and any( if (
input in data for input in node.input node.op_type == "MatMul"
): or node.op_type == "Gemm"
or node.op_type == "MatMulInteger"
) and any(input in data for input in node.input):
# FIXME(constroy): the last MatMul should not be sharded as TP. # FIXME(constroy): the last MatMul should not be sharded as TP.
if node.output[0] in output: if (
node.output[0] in output
or (
index + 1 < len(model.graph.node)
and model.graph.node[index + 1].output[0] in output
)
or (
index + 2 < len(model.graph.node)
and model.graph.node[index + 2].output[0] in output
)
):
continue continue
groups = 1 groups = 1
# If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
@ -202,7 +222,9 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
) )
place[node.output[0]] = Replicate() place[node.output[0]] = Replicate()
node.output[0] = new_name node.output[0] = new_name
if len(node.input) > 2: # split bias to add if (
len(node.input) > 2 and node.op_type != "MatMulInteger"
): # split bias to add
prev = nodes[-1] prev = nodes[-1]
new_name = prev.output[0] + "_no_bias" new_name = prev.output[0] + "_no_bias"
place[new_name] = place[node.output[0]] place[new_name] = place[node.output[0]]

View File

@ -85,6 +85,8 @@ class GraphHandlerObj {
     Tensor cast(Tensor input, Tensor output, int to);
     Tensor expand(Tensor input, Tensor output, Shape dims);
     Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
+    Tensor dequantizeLinear(Tensor inputX, Tensor inputScale, Tensor output,
+                            Tensor inputZeroPoint, int axis);

     std::vector<int> getDims(Tensor x) { return x->getDims(); }

     Tensor allReduceSum(Tensor input, Tensor output);
@ -99,6 +101,10 @@ class GraphHandlerObj {
                           int outputType, Tensor input);
     Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
                         std::string mode);
+    TensorVec dynamicQuantizeLinear(Tensor input,
+                                    std::optional<TensorVec> outputs);
+    Tensor matmulInteger(Tensor inputA, Tensor inputB, Tensor output,
+                         Tensor a_zero_point, Tensor b_zero_point);

     //------ modifiers

View File

@ -29,7 +29,6 @@ class Kernel {
   public:
     Kernel() {}
     virtual ~Kernel() {}
-
     /**
      * @param op The operator to be executed.
      * @param record The parameters for kernel execution. If extra parameters
@ -105,8 +104,7 @@
         IT_ASSERT(it != kernels.end(),
                   "Kernel not found for key {" +
                       to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
-                      ", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
-                      std::get<2>(kernelAttrs).toString() + "}");
+                      ", " + std::to_string(std::get<1>(kernelAttrs)) + "}");
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -131,15 +129,16 @@ class CpuKernelWithoutConfig : public Kernel {
 } // namespace infini

-#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt)       \
+#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                 \
     namespace infini {                                                        \
     static const bool _CAT(_register_kernel_, cnt) =                         \
-        KernelRegistry::getInstance().registerKernel(                        \
-            KernelAttrs{device, opType, dataType}, new kernel(), name);      \
+        KernelRegistry::getInstance().registerKernel(KernelAttrs{device,     \
+                                                                 opType},    \
+                                                     new kernel(), name);    \
     }

-#define REGISTER_KERNEL(device, opType, dataType, kernel, name)              \
-    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, kernel, name)                        \
+    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)

 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt)                      \
     namespace infini {                                                       \
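This change drops the data type from the kernel key: a kernel is registered once per (device, op type) and picks the element type at run time from the operator. A rough sketch of the resulting pattern, abridged from the CPU kernels further down in this diff (the templated body is elided; the getIndex() values follow the comments in those kernels):

    // A dtype-agnostic kernel now looks roughly like this:
    class NaiveMatmul : public CpuKernelWithoutConfig {
        template <typename T>
        void doCompute(const Operator &_op, const RuntimeObj *context) const {
            /* typed implementation */
        }
        void compute(const Operator &_op,
                     const RuntimeObj *context) const override {
            // Dispatch on the operator's runtime data type index.
            switch (_op->getDType().getIndex()) {
            case 1:  // DataType::Float32
                return doCompute<float>(_op, context);
            case 12: // DataType::UInt32
                return doCompute<uint32_t>(_op, context);
            default:
                IT_TODO_HALT();
            }
        }
    };

    // One registration per (device, op type); the data type argument is gone:
    REGISTER_KERNEL(Device::CPU, OpType::MatMul, NaiveMatmul, "MatmulNaive_CPU");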

View File

@ -4,7 +4,7 @@
 #include "core/tensor.h"

 namespace infini {
-using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
+using KernelAttrs = std::tuple<Device, OpType::underlying_t>;

 struct OpPerfKey {
     HashType hash;
@ -90,6 +90,8 @@ class OperatorObj : public Object {
     OpType getOpType() const { return type; }
     // HACK: set correct data type
     DataType getDType() const { return getInputs(0)->getDType(); }
+    DataType getInDType() const { return getInputs(0)->getDType(); }
+    DataType getOutDType() const { return getOutput()->getDType(); }
     virtual int numInputs() const = 0;
     virtual int numOutputs() const = 0;

View File

@ -42,8 +42,16 @@ class TensorObj : public TensorBaseObj {
     bool isOutput() const { return tensorType == TensorType::output; }
     bool isOthers() const { return tensorType == TensorType::others; }
     void setWeight() { tensorType = TensorType::weight; }
-    void setInput() { tensorType = TensorType::input; }
-    void setOutput() { tensorType = TensorType::output; }
+    void setInput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::input;
+        }
+    }
+    void setOutput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::output;
+        }
+    }
     string tensorTypeToString() const {
         switch (tensorType) {
         case TensorType::weight:

View File

@ -0,0 +1,17 @@
#pragma once
#include "operators/dequantize_linear.h"
namespace infini {
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
float *output, const int dimsize, const int stride,
const uint8_t *inputZeroPoint, const int size);
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
float *output, const int dimsize, const int stride,
const int size);
void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
half *output, const int dimsize, const int stride,
const uint8_t *inputZeroPoint, const int size);
void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
half *output, const int dimsize, const int stride,
const int size);
}; // namespace infini

View File

@ -0,0 +1,7 @@
#pragma once
#include "operators/dynamic_quantize_linear.h"
namespace infini {
void dynamicQuantizeLinearKernel(float *input, uint8_t *outputY, float *yScale,
uint8_t *yZeroPoint, int size);
}; // namespace infini

View File

@ -1,13 +1,16 @@
 #pragma once

 namespace infini {
-void div_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void add_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void pow_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void less_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                 int b0, int b1, int b2, int b3, int c0, int c1, int c2,
-                 int c3);
+void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                 int c2, int c3);
 }; // namespace infini

View File

@ -3,7 +3,8 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"

 namespace infini {
-void expandKernel(float *input, float *output, int nDims, int outputsize,
-                  SmallArray inputShape, SmallArray outputShape);
+void expandKernel(int dType, void *input, void *output, int nDims,
+                  int outputsize, SmallArray inputShape,
+                  SmallArray outputShape);
 }; // namespace infini

View File

@ -8,4 +8,10 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
 void LaynormKernel(const float *input, const float *scale, const float eps,
                    int size, int scaleSize, const int dimsize, const int stride,
                    float *output);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output, const half *bias, int biasSize);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output);
 }; // namespace infini

View File

@ -0,0 +1,7 @@
#pragma once
namespace infini {
void subA_kernel(int dType, void *a, void *b, int size, int k, int delta);
void subB_kernel(int dType, void *a, void *b, int size, int k, int n,
int delta);
}; // namespace infini

View File

@ -3,4 +3,6 @@
 namespace infini {
 void softmax_kernel(int num_blocks, float *input, float *output, int size,
                     int dimsize, int stride);
-}
+void softmax_kernel(int num_blocks, half *input, half *output, int size,
+                    int dimsize, int stride);
+} // namespace infini

View File

@ -8,8 +8,8 @@ const int DIM_MAX_SIZE = 8;
 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-struct ElementTensorMetadata {
-    float *data[BATCH_SIZE];
+template <typename T> struct ElementTensorMetadata {
+    T *data[BATCH_SIZE];
     int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                              // the composed tensor.
     int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@ -20,16 +20,17 @@ struct ElementTensorMetadata {
                data[i], dimBgNo[i], dimSize[i], nElements[i]);
     }
 };
-
-struct ComposedTensorMetadata {
+template <typename T> struct ComposedTensorMetadata {
     int dimSize[DIM_MAX_SIZE];
     int stride[DIM_MAX_SIZE];
-    float *data;
+    T *data;
 };

 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata &eleMeta,
-                         const ComposedTensorMetadata &compMeta, int dim,
+void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
+                         const ComposedTensorMetadata<float> &compMeta, int dim,
+                         int batchSize, int nDims, bool isSplit);
+void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
+                         const ComposedTensorMetadata<half> &compMeta, int dim,
                          int batchSize, int nDims, bool isSplit);
 } // namespace infini

View File

@ -5,7 +5,7 @@
 namespace infini {
-void transpose_kernel(float *input, float *output, int nDims, int size,
+void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
                       SmallArray strides, SmallArray outputShape);

 }; // namespace infini

View File

@ -3,48 +3,21 @@
#include "operators/unary.h" #include "operators/unary.h"
namespace infini { namespace infini {
void softmax_kernel(float *input, float *output, size_t num); template <typename T> void softmax_kernel(T *input, T *output, size_t num);
void relu_kernel(float *input, float *output, size_t num); template <typename T> void relu_kernel(T *input, T *output, size_t num);
void sigmoid_kernel(float *input, float *output, size_t num); template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
void tanh_kernel(float *input, float *output, size_t num); template <typename T> void tanh_kernel(T *input, T *output, size_t num);
void abs_kernel(float *input, float *output, size_t num); template <typename T> void abs_kernel(T *input, T *output, size_t num);
void sqrt_kernel(float *input, float *output, size_t num); template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
void neg_kernel(float *input, float *output, size_t num); template <typename T> void neg_kernel(T *input, T *output, size_t num);
void gelu_kernel(float *input, float *output, size_t num); template <typename T> void gelu_kernel(T *input, T *output, size_t num);
void erf_kernel(float *input, float *output, size_t num); template <typename T> void erf_kernel(T *input, T *output, size_t num);
void hard_sigmoid_kernel(float *input, float *output, size_t num); template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
void hard_swish_kernel(float *input, float *output, size_t num); template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
void unary_kernel(const Operator &_op) { template <typename INPUT, typename OUTPUT>
auto op = as<UnaryObj>(_op); void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
size_t num = op->getOutput()->size(); void unary_kernel(const Operator &_op);
if (op->getOpType() == OpType::Softmax)
softmax_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Relu)
relu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sigmoid)
sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSigmoid)
hard_sigmoid_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::HardSwish)
hard_swish_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Tanh)
tanh_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Abs)
abs_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Sqrt)
sqrt_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Gelu)
gelu_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Neg)
neg_kernel(inputData, outputData, num);
else if (op->getOpType() == OpType::Erf)
erf_kernel(inputData, outputData, num);
else
IT_TODO_HALT();
}
}; // namespace infini }; // namespace infini

View File

@ -1,11 +1,29 @@
+#pragma once
 #include "core/tensor.h"
+#include "cuda/cuda_common.h"

 namespace infini {

 void cudaPrintFloat(float *x, int len);

-void cudaPrintTensor(const Tensor &tensor) {
-    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
-}
+void cudaPrintTensor(const Tensor &tensor);

-} // namespace infini
+cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
+cudaDataType cublasDataTypeConvert(DataType);
+
+template <int index> struct DT_CUDA {};
+template <> struct DT_CUDA<0> { using t = bool; };
+template <> struct DT_CUDA<1> { using t = float; };
+template <> struct DT_CUDA<2> { using t = unsigned char; };
+template <> struct DT_CUDA<3> { using t = char; };
+template <> struct DT_CUDA<4> { using t = unsigned short; };
+template <> struct DT_CUDA<5> { using t = short; };
+template <> struct DT_CUDA<6> { using t = int; };
+template <> struct DT_CUDA<7> { using t = long long; };
+template <> struct DT_CUDA<9> { using t = bool; };
+template <> struct DT_CUDA<10> { using t = half; };
+template <> struct DT_CUDA<11> { using t = double; };
+template <> struct DT_CUDA<12> { using t = unsigned int; };
+template <> struct DT_CUDA<13> { using t = unsigned long long; };
+template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
+} // namespace infini

View File

@ -3,10 +3,10 @@
 #include "utils/small_array.h"

 namespace infini {

-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                  int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                  SmallArray conditionShape, SmallArray outputShape, int xSize,
                  int ySize, int cSize);
 }; // namespace infini

View File

@ -53,7 +53,8 @@ inline void initGatherMetaData(GatherMetaData &metaData,
         metaData.inStride[i] = in->getStride()[i];
     }
 }
-void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
+template <typename T>
+void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);

 void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
                             size_t num);

View File

@ -0,0 +1,44 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief y = (x - x_zero_point) *x_scale
*
*/
class DequantizeLinearObj : public OperatorObj {
int axis;
public:
/**
* @brief Construct a new DequantizeLinear object.
*
* @param graph The computation graph that this operator belongs to.
* @param inputX The input tensor X.
* @param inputScale The input tensor x_scale.
* @param output The output tensor.
* @param inputZeroPoint The z_zero_point.
*/
DequantizeLinearObj(GraphObj *graph, Tensor inputX, Tensor inputScale,
Tensor output, Tensor inputZeroPoint = nullptr,
int axis = 1);
OP_CLONE(DequantizeLinearObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
Tensor getZeroPoint() const {
return inputs.size() > 2 ? inputs[2] : nullptr;
}
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
int getAxis() const { return axis; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
};
} // namespace infini
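For reference, the computation this operator describes, per the @brief above and the usual ONNX DequantizeLinear definition, is an element-wise affine mapping with the scale and zero point broadcast along axis. A plain C++ sketch under those assumptions, simplified to a single per-tensor scale and zero point; dequantizeRef is an illustrative name, not code from this diff:

    #include <cstdint>
    #include <vector>

    // y = (x - x_zero_point) * x_scale
    std::vector<float> dequantizeRef(const std::vector<uint8_t> &x,
                                     float x_scale, uint8_t x_zero_point) {
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i)
            y[i] = (static_cast<int>(x[i]) - static_cast<int>(x_zero_point)) *
                   x_scale;
        return y;
    }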

View File

@ -0,0 +1,37 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief A Function to fuse calculation for Scale, Zero Point and FP32->8Bit
* conversion of FP32 Input data.
*
*/
class DynamicQuantizeLinearObj : public OperatorObj {
public:
/**
* @brief Construct a new DynamicQuantizeLinear object.
*
* @param graph The computation graph that this operator belongs to.
* @param input The input tensor.
* @param outputs The output tensors.
*/
DynamicQuantizeLinearObj(GraphObj *graph, Tensor input,
std::optional<TensorVec> outputs);
OP_CLONE(DynamicQuantizeLinearObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
std::string toString() const override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 3; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
vector<DataType> inferDataType(const TensorVec &inputs) const override;
};
} // namespace infini
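The three outputs are the quantized tensor, its scale, and its zero point. A reference sketch of the scale and zero-point math, assuming the standard ONNX DynamicQuantizeLinear definition (uint8 target range [0, 255], widened so zero is exactly representable); this only describes the math and is not the CUDA kernel added in this changeset:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Assumes a non-empty input.
    void dynamicQuantizeRef(const std::vector<float> &x, std::vector<uint8_t> &y,
                            float &yScale, uint8_t &yZeroPoint) {
        float rmin = std::min(0.0f, *std::min_element(x.begin(), x.end()));
        float rmax = std::max(0.0f, *std::max_element(x.begin(), x.end()));
        yScale = (rmax - rmin) / 255.0f;
        if (yScale == 0.0f) // all-zero input: any scale works
            yScale = 1.0f;
        float zp = std::round(-rmin / yScale);
        yZeroPoint = static_cast<uint8_t>(std::clamp(zp, 0.0f, 255.0f));
        y.resize(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            float q = std::round(x[i] / yScale) + yZeroPoint;
            y[i] = static_cast<uint8_t>(std::clamp(q, 0.0f, 255.0f));
        }
    }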

View File

@ -0,0 +1,63 @@
#pragma once
#include "core/operator.h"
namespace infini {
/**
* @brief Matrix multiplication.
*
*/
class MatmulIntegerObj : public OperatorObj {
private:
// Auxiliary attributes which are not a part of operator attributes.
int b, m, n, k;
public:
/**
* @brief Matmul operator with batch broadcast and tensor transpose
* supports. Only one tensor with singe batch can be broadcasted due to the
* BLAS interface restriction. Tranpose indicates whether the last two
* dimensions should be transposed before Matmul and does not affect other
* leading dimensions.
*
* Matmul show how operators are defined in InfiniTensor. The constructor of
* an operator can create output tensors for the operator or not, which
* depends on `graph`.
*
* @param graph The computation graph that this operator belongs to.
* @param A The input tensor.
* @param B The input tensor.
* @param C C is the output of Matmul. If outputs are going to be created in
* the constructor, C should be an empty Ref.
* @param a_zero_point Zero point tensor for input 'A'.
* @param b_zero_point Zero point tensor for input 'B'.
*/
MatmulIntegerObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
Tensor a_zero_point = nullptr,
Tensor b_zero_point = nullptr);
OP_CLONE(MatmulIntegerObj);
std::string toString() const override;
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
Tensor getZeroPointA() const {
return inputs.size() > 2 ? inputs[2] : nullptr;
}
Tensor getZeroPointB() const {
return inputs.size() > 3 ? inputs[3] : nullptr;
}
int getB() const { return b; }
int getM() const { return m; }
int getN() const { return n; }
int getK() const { return k; }
auto getBMNK() const { return tuple{b, m, n, k}; }
vector<DataType> inferDataType(const TensorVec &inputs) const override;
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
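As with ONNX MatMulInteger, the inputs are 8-bit integers, the optional zero points are subtracted before the multiply, and accumulation happens in int32 (which is why the test below declares an INT32 output). A minimal single-matrix reference sketch under that assumption; matmulIntegerRef is illustrative, not code from this diff:

    #include <cstdint>
    #include <vector>

    // C[m][n] = sum_k (A[m][k] - a_zero_point) * (B[k][n] - b_zero_point)
    std::vector<int32_t> matmulIntegerRef(const std::vector<uint8_t> &A,
                                          const std::vector<uint8_t> &B,
                                          int M, int K, int N,
                                          uint8_t a_zero_point,
                                          uint8_t b_zero_point) {
        std::vector<int32_t> C(static_cast<size_t>(M) * N, 0);
        for (int m = 0; m < M; ++m)
            for (int k = 0; k < K; ++k)
                for (int n = 0; n < N; ++n)
                    C[m * N + n] +=
                        (static_cast<int32_t>(A[m * K + k]) - a_zero_point) *
                        (static_cast<int32_t>(B[k * N + n]) - b_zero_point);
        return C;
    }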

View File

@ -91,6 +91,12 @@ template <int val> class ValGenerator : public DataGenerator {
         fill<uint32_t>(data, size);
     }
     void fill(float *data, size_t size) override { fill<float>(data, size); }
+    void fill_fp16(uint16_t *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            float x = 1.0f * val;
+            data[i] = float_to_fp16(x);
+        }
+    }
 };
 typedef ValGenerator<1> OneGenerator;
 typedef ValGenerator<0> ZeroGenerator;

File diff suppressed because it is too large

View File

@ -102,6 +102,18 @@ class TestStringMethods(unittest.TestCase):
         matmul = make_node("MatMul", ["x", "a"], ["xa"], name="matmul")
         make_and_import_model(make_graph([matmul], "matmul", [x, a], [xa]))

+    def test_matmul_integer(self):
+        A = make_tensor_value_info("A", TensorProto.INT8, [1, 2, 4])
+        B = make_tensor_value_info("B", TensorProto.UINT8, [1, 4, 4])
+        A_ZeroPoint = make_tensor_value_info("A_ZeroPoint", TensorProto.INT8, [1, 2, 1])
+        y = make_tensor_value_info("y", TensorProto.INT32, [1, 2, 4])
+        matmulInteger = make_node(
+            "MatMulInteger", ["A", "B", "A_ZeroPoint"], ["y"], name="matmul_integer"
+        )
+        make_and_import_model(
+            make_graph([matmulInteger], "matmul_integer", [A, B, A_ZeroPoint], [y])
+        )
+
     def test_gemm(self):
         a = make_tensor_value_info("a", TensorProto.FLOAT, [1, 2, 3])
         b = make_tensor_value_info("b", TensorProto.FLOAT, [1, 4, 3])

View File

@ -87,48 +87,33 @@ string GraphObj::toString() const {
 }

 bool GraphObj::topo_sort() {
-    if (this->sorted)
+    if (this->sorted) {
         return true;
-
-    // std::unordered_set<Tensor> inputs;
-    std::unordered_set<Operator> waiting(this->ops.begin(), this->ops.end());
+    }
     std::vector<Operator> sorted;
-
-    while (!waiting.empty()) {
+    std::unordered_set<OperatorObj *> flags;
+    sorted.reserve(ops.size());
+    flags.reserve(ops.size());
+    while (sorted.size() < ops.size()) {
         // Any node is move to sorted in this loop.
         auto modified = false;
-        // Find head nodes.
-        for (auto it = waiting.begin(); it != waiting.end();) {
-            const auto &this_inputs = (*it)->getInputs();
-            // If none of the input tensors is in waiting list,
-            // this node is a head node.
-            const auto is_head = std::all_of(
-                this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
-                    auto src = input->getSource();
-                    return src // If the source node is in the waiting
-                               // list, means that this node is not the
-                               // head node.
-                               ? waiting.find(src) == waiting.end()
-                               // This tensor has no source node,
-                               // it must be a input tensor.
-                               : (/*inputs.insert(input),*/ true);
-                });
-            // Moves head node to sorted.
-            if (is_head) {
+        for (auto const &op : ops) {
+            if (auto const &inputs = op->getInputs();
+                flags.find(op.get()) == flags.end() &&
+                std::all_of(inputs.begin(), inputs.end(),
+                            [&flags](auto const &input) {
+                                auto ptr = input->getSource().get();
+                                return !ptr || flags.find(ptr) != flags.end();
+                            })) {
                 modified = true;
-                sorted.emplace_back(std::move(*it));
-                it = waiting.erase(it);
-            } else {
-                ++it;
+                sorted.emplace_back(op);
+                flags.insert(op.get());
             }
         }
-        // Waiting list never modifies during a pass,
-        // sorting fails.
         if (!modified) {
             return false;
         }
     }
-    // Done.
     this->ops = std::move(sorted);
     return this->sorted = true;
 }
@ -182,7 +167,10 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
         // note: behavior may not match running in non-naive mode, and it may
         // not reproduce the bug
         for (auto &tensor : tensors) {
-            tensor->dataMalloc();
+            if (!tensor->isWeight() ||
+                (tensor->isWeight() && !weightAllocated)) {
+                tensor->dataMalloc();
+            }
         }
         return;
     }

View File

@ -6,11 +6,14 @@
#include "operators/broadcast.h" #include "operators/broadcast.h"
#include "operators/concat.h" #include "operators/concat.h"
#include "operators/conv.h" #include "operators/conv.h"
#include "operators/dequantize_linear.h"
#include "operators/dynamic_quantize_linear.h"
#include "operators/element_wise.h" #include "operators/element_wise.h"
#include "operators/expand.h" #include "operators/expand.h"
#include "operators/gather.h" #include "operators/gather.h"
#include "operators/layer_norm.h" #include "operators/layer_norm.h"
#include "operators/matmul.h" #include "operators/matmul.h"
#include "operators/matmul_integer.h"
#include "operators/pad.h" #include "operators/pad.h"
#include "operators/pooling.h" #include "operators/pooling.h"
#include "operators/recv.h" #include "operators/recv.h"
@ -145,6 +148,23 @@ Tensor GraphHandlerObj::avgPool(Tensor input, Tensor output, int kh, int kw,
} }
} }
Tensor GraphHandlerObj::matmulInteger(Tensor inputA, Tensor inputB,
Tensor output, Tensor a_zero_point,
Tensor b_zero_point) {
if (output) {
g->addOpWithOutputs<MatmulIntegerObj>(
std::move(inputA), std::move(inputB), output,
std::move(a_zero_point), std::move(b_zero_point));
return output;
} else {
return g
->addOp<MatmulIntegerObj>(std::move(inputA), std::move(inputB),
output, std::move(a_zero_point),
std::move(b_zero_point))
->getOutput();
}
}
// see operators/element_wise.h // see operators/element_wise.h
#define DEFINE_ELEMENT_WISE_METHOD(name, obj) \ #define DEFINE_ELEMENT_WISE_METHOD(name, obj) \
Tensor GraphHandlerObj::name(Tensor a, Tensor b, Tensor c) { \ Tensor GraphHandlerObj::name(Tensor a, Tensor b, Tensor c) { \
@ -506,6 +526,36 @@ Tensor GraphHandlerObj::where(Tensor inputX, Tensor inputY, Tensor condition,
} }
} }
TensorVec
GraphHandlerObj::dynamicQuantizeLinear(Tensor input,
std::optional<TensorVec> outputs) {
if (outputs) {
g->addOpWithOutputs<DynamicQuantizeLinearObj>(std::move(input),
outputs);
return *outputs;
} else {
return g->addOp<DynamicQuantizeLinearObj>(std::move(input), outputs)
->getOutputs();
}
}
Tensor GraphHandlerObj::dequantizeLinear(Tensor inputX, Tensor inputScale,
Tensor output, Tensor inputZeroPoint,
int axis) {
if (output) {
g->addOpWithOutputs<DequantizeLinearObj>(
std::move(inputX), std::move(inputScale), output,
std::move(inputZeroPoint), axis);
return output;
} else {
return g
->addOp<DequantizeLinearObj>(std::move(inputX),
std::move(inputScale), output,
std::move(inputZeroPoint), axis)
->getOutput();
}
}
Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize, Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
std::string mode) { std::string mode) {
if (output) { if (output) {

View File

@ -58,12 +58,13 @@ HashType OperatorObj::hash() const {
 bool OperatorObj::checkValid(GraphObj *graph) {
     auto optShapes = inferShape();
-    if (!optShapes) // shape inference failed
+    if (!optShapes) { // shape inference failed
         return false;
+    }

     const vector<Shape> &shapes = *optShapes;
-    if (shapes.size() != outputs.size())
+    if (shapes.size() != outputs.size()) {
         return false;
+    }

     if (graph) { // if graph != nullptr, outputs should be created
         auto dataTypes = inferDataType();
         for (size_t i = 0; i < outputs.size(); i++) {

View File

@ -17,8 +17,7 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
     std::map<OpType, int> opCnt;

     for (auto &op : graph->getOperators()) {
-        auto kernelAttrs =
-            KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
+        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
@ -66,8 +65,7 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
     std::map<OpType, int> opCnt;

     for (auto &op : graph->getOperators()) {
-        auto kernelAttrs =
-            KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
+        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);

View File

@ -25,8 +25,7 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
     auto &perfEngine = PerfEngine::getInstance();
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
-        auto kernelAttrs =
-            KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
+        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
@ -48,8 +47,7 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
     std::map<OpType, int> opCnt;
     for (auto &op : graph->getOperators()) {
         // HACK: set correct data type
-        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying(),
-                                       DataType::Float32};
+        auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
         Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);

View File

@ -1,4 +1,6 @@
#include "core/data_type.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include <cstdio> #include <cstdio>
__global__ void cudaPrintFloatImpl(float *x, int len) { __global__ void cudaPrintFloatImpl(float *x, int len) {
@ -18,4 +20,55 @@ void cudaPrintFloat(float *x, int len) {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} }
void cudaPrintTensor(const Tensor &tensor) {
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
}
cudnnDataType_t cudnnDataTypeConvert(DataType dataType) {
if (dataType == DataType::Float32) {
return CUDNN_DATA_FLOAT;
}
if (dataType == DataType::Double) {
return CUDNN_DATA_DOUBLE;
}
if (dataType == DataType::Float16) {
return CUDNN_DATA_HALF;
}
if (dataType == DataType::Int8) {
return CUDNN_DATA_INT8;
}
if (dataType == DataType::Int32) {
return CUDNN_DATA_INT32;
}
if (dataType == DataType::UInt8) {
return CUDNN_DATA_UINT8;
}
if (dataType == DataType::BFloat16) {
return CUDNN_DATA_BFLOAT16;
}
if (dataType == DataType::Int64) {
return CUDNN_DATA_INT64;
}
if (dataType == DataType::Bool) {
return CUDNN_DATA_BOOLEAN;
}
IT_ASSERT(false, "Unsupported data type");
}
cudaDataType cublasDataTypeConvert(DataType dataType) {
switch (dataType.getIndex()) {
case 1:
return CUDA_R_32F;
// case 3:
// return CUDA_R_8I;
case 10:
return CUDA_R_16F;
case 11:
return CUDA_R_64F;
// case 16:
// return CUDA_R_16BF;
default:
IT_ASSERT(false, "MatMul Unsupported data type");
}
}
} // namespace infini } // namespace infini

View File

@ -517,6 +517,10 @@ void init_graph_builder(py::module &m) {
         .def("expand", &Handler::expand, policy::move)
         .def("erf", &Handler::erf, policy::move)
         .def("where", &Handler::where, policy::move)
+        .def("dequantizeLinear", &Handler::dequantizeLinear, policy::move)
+        .def("dynamicQuantizeLinear", &Handler::dynamicQuantizeLinear,
+             policy::move)
+        .def("matmulInteger", &Handler::matmulInteger, policy::move)
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
         .def("optimize", &Handler::optimize, policy::automatic)
         .def("operators", &Handler::operators, policy::move)

View File

@ -3,9 +3,9 @@
namespace infini { namespace infini {
template <typename T> class NaiveConcat : public CpuKernelWithoutConfig { class NaiveConcat : public CpuKernelWithoutConfig {
void compute(const Operator &_op, template <typename T>
const RuntimeObj *context) const override { void doCompute(const Operator &_op, const RuntimeObj *context) const {
auto op = as<ConcatObj>(_op); auto op = as<ConcatObj>(_op);
auto inputs = op->getInputs(), outputs = op->getOutputs(); auto inputs = op->getInputs(), outputs = op->getOutputs();
auto dim = op->getDim(); auto dim = op->getDim();
@ -41,11 +41,25 @@ template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
} }
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
}; };
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::UInt32, REGISTER_KERNEL(Device::CPU, OpType::Concat, NaiveConcat, "ConcatNaive_CPU");
NaiveConcat<uint32_t>, "ConcatNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::Float32,
NaiveConcat<float>, "ConcatNaive_CPU_float32");
} // namespace infini } // namespace infini

View File

@ -3,9 +3,9 @@
namespace infini { namespace infini {
template <typename T> class NaiveConv : public CpuKernelWithoutConfig { class NaiveConv : public CpuKernelWithoutConfig {
void compute(const Operator &_op, template <typename T>
const RuntimeObj *context) const override { void doCompute(const Operator &_op, const RuntimeObj *context) const {
auto op = as<ConvObj>(_op); auto op = as<ConvObj>(_op);
T *iptr = op->getInputs(0)->getRawDataPtr<T *>(); T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
T *wptr = op->getInputs(1)->getRawDataPtr<T *>(); T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
@ -50,11 +50,25 @@ template <typename T> class NaiveConv : public CpuKernelWithoutConfig {
} }
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
}; };
REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::UInt32, REGISTER_KERNEL(Device::CPU, OpType::Conv, NaiveConv, "ConvNaive_CPU");
NaiveConv<uint32_t>, "ConvNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::Float32, NaiveConv<float>,
"ConvNaive_CPU_float32");
} // namespace infini } // namespace infini

View File

@ -2,10 +2,45 @@
#include "core/kernel.h" #include "core/kernel.h"
namespace infini { namespace infini {
template <typename T> class NativeElementWise : public CpuKernelWithoutConfig { class NativeElementWise : public CpuKernelWithoutConfig {
virtual T doCompute(T val0, T val1) const = 0; template <typename T> static T addCompute(T val0, T val1) {
void compute(const Operator &_op, return val0 + val1;
const RuntimeObj *context) const override { }
template <typename T> static T subCompute(T val0, T val1) {
return val0 - val1;
}
template <typename T> static T mulCompute(T val0, T val1) {
return val0 * val1;
}
template <typename T> static T divCompute(T val0, T val1) {
return (T)(val0 / val1);
}
template <typename T> static T equalCompute(T val0, T val1) {
return (T)(val0 == val1);
}
template <typename T> static T greaterOrEqualCompute(T val0, T val1) {
return (T)(val0 >= val1);
}
template <typename T> static T greaterCompute(T val0, T val1) {
return (T)(val0 > val1);
}
template <typename T> static T lessOrEqualCompute(T val0, T val1) {
return (T)(val0 <= val1);
}
template <typename T> static T lessCompute(T val0, T val1) {
return (T)(val0 < val1);
}
template <typename T>
void doCompute(const Operator &_op, const RuntimeObj *context) const {
auto op = as<ElementWiseObj>(_op); auto op = as<ElementWiseObj>(_op);
T *inptr0 = op->getInputs(0)->getRawDataPtr<T *>(); T *inptr0 = op->getInputs(0)->getRawDataPtr<T *>();
T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>(); T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
@ -22,6 +57,39 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size())); std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size()));
auto n = op->getOutput()->size(); auto n = op->getOutput()->size();
T (*_doCompute)(T val0, T val1);
switch (op->getOpType().underlying()) {
case OpType::Add:
_doCompute = addCompute<T>;
break;
case OpType::Sub:
_doCompute = subCompute<T>;
break;
case OpType::Mul:
_doCompute = mulCompute<T>;
break;
case OpType::Div:
_doCompute = divCompute<T>;
break;
case OpType::Equal:
_doCompute = equalCompute<T>;
break;
case OpType::GreaterOrEqual:
_doCompute = greaterOrEqualCompute<T>;
break;
case OpType::Greater:
_doCompute = greaterCompute<T>;
break;
case OpType::LessOrEqual:
_doCompute = lessOrEqualCompute<T>;
break;
case OpType::Less:
_doCompute = lessCompute<T>;
break;
default:
IT_TODO_HALT();
}
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
int c0_index = i / (c[1] * c[2] * c[3]); int c0_index = i / (c[1] * c[2] * c[3]);
int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]); int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]);
@ -37,77 +105,44 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
int b1_index = c1_index % b[1]; int b1_index = c1_index % b[1];
int b2_index = c2_index % b[2]; int b2_index = c2_index % b[2];
int b3_index = c3_index % b[3]; int b3_index = c3_index % b[3];
outptr[i] = doCompute( outptr[i] = _doCompute(
inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] + inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] +
a2_index * a[3] + a3_index], a2_index * a[3] + a3_index],
inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] + inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] +
b2_index * b[3] + b3_index]); b2_index * b[3] + b3_index]);
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
}; };
template <typename T> class NaiveAdd : public NativeElementWise<T> { REGISTER_KERNEL(Device::CPU, OpType::Add, NativeElementWise, "addNaive_CPU");
T doCompute(T val0, T val1) const override { return val0 + val1; } REGISTER_KERNEL(Device::CPU, OpType::Sub, NativeElementWise, "subNaive_CPU");
}; REGISTER_KERNEL(Device::CPU, OpType::Mul, NativeElementWise, "mulNaive_CPU");
template <typename T> class NaiveSub : public NativeElementWise<T> { REGISTER_KERNEL(Device::CPU, OpType::Div, NativeElementWise, "divNaive_CPU");
T doCompute(T val0, T val1) const override { return val0 - val1; } REGISTER_KERNEL(Device::CPU, OpType::Equal, NativeElementWise,
}; "equalNaive_CPU");
template <typename T> class NaiveMul : public NativeElementWise<T> { REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, NativeElementWise,
T doCompute(T val0, T val1) const override { return val0 * val1; } "greaterEqualNaive_CPU");
}; REGISTER_KERNEL(Device::CPU, OpType::Greater, NativeElementWise,
template <typename T> class NaiveDiv : public NativeElementWise<T> { "greaterThanNaive_CPU");
T doCompute(T val0, T val1) const override { return (T)(val0 / val1); } REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, NativeElementWise,
}; "lessEqualNaive_CPU");
template <typename T> class NaiveEqual : public NativeElementWise<T> { REGISTER_KERNEL(Device::CPU, OpType::Less, NativeElementWise,
T doCompute(T val0, T val1) const override { return (T)(val0 == val1); } "lessEqualNaive_CPU");
};
template <typename T> class NaiveGreaterEqual : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 >= val1); }
};
template <typename T> class NaiveGreaterThan : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 > val1); }
};
template <typename T> class NaiveLessEqual : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 <= val1); }
};
template <typename T> class NaiveLessThan : public NativeElementWise<T> {
T doCompute(T val0, T val1) const override { return (T)(val0 < val1); }
};
REGISTER_KERNEL(Device::CPU, OpType::Add, DataType::UInt32, NaiveAdd<uint32_t>,
"addNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Add, DataType::Float32, NaiveAdd<float>,
"addNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sub, DataType::UInt32, NaiveSub<uint32_t>,
"subNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Sub, DataType::Float32, NaiveSub<float>,
"subNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Mul, DataType::UInt32, NaiveMul<uint32_t>,
"mulNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Mul, DataType::Float32, NaiveMul<float>,
"mulNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv<uint32_t>,
"divNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>,
"divNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::UInt32,
NaiveEqual<uint32_t>, "equalNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::Float32,
NaiveEqual<float>, "equalNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::UInt32,
NaiveGreaterEqual<uint32_t>, "greaterEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::Float32,
NaiveGreaterEqual<float>, "greaterEqualNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::UInt32,
NaiveGreaterThan<uint32_t>, "greaterThanNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::Float32,
NaiveGreaterThan<float>, "greaterThanNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::UInt32,
NaiveLessEqual<uint32_t>, "lessEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::Float32,
NaiveLessEqual<float>, "lessEqualNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::UInt32,
NaiveLessThan<uint32_t>, "lessEqualNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::Float32,
NaiveLessThan<float>, "lessEqualNaive_CPU_float32");
}; // namespace infini }; // namespace infini

View File

@ -3,9 +3,9 @@
namespace infini { namespace infini {
template <typename T> class NaiveMatmul : public CpuKernelWithoutConfig { class NaiveMatmul : public CpuKernelWithoutConfig {
void compute(const Operator &_op, template <typename T>
const RuntimeObj *context) const override { void doCompute(const Operator &_op, const RuntimeObj *context) const {
auto op = as<MatmulObj>(_op); auto op = as<MatmulObj>(_op);
IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet."); IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
T *A = op->getInputs(0)->getRawDataPtr<T *>(); T *A = op->getInputs(0)->getRawDataPtr<T *>();
@ -23,11 +23,25 @@ template <typename T> class NaiveMatmul : public CpuKernelWithoutConfig {
} }
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
}; };
REGISTER_KERNEL(Device::CPU, OpType::MatMul, DataType::UInt32, REGISTER_KERNEL(Device::CPU, OpType::MatMul, NaiveMatmul, "MatmulNaive_CPU");
NaiveMatmul<uint32_t>, "MatmulNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::MatMul, DataType::Float32,
NaiveMatmul<float>, "MatmulNaive_CPU_float32");
} // namespace infini } // namespace infini

View File

@ -80,8 +80,8 @@ class MemboundInterpreter : public Kernel {
     }
 };

-REGISTER_KERNEL(Device::CPU, OpType::MemBound, DataType::UInt32,
-                MemboundInterpreter, "MemboundInterpreter_CPU");
+REGISTER_KERNEL(Device::CPU, OpType::MemBound, MemboundInterpreter,
+                "MemboundInterpreter_CPU");

 } // namespace infini

View File

@ -2,42 +2,10 @@
#include "core/kernel.h" #include "core/kernel.h"
namespace infini { namespace infini {
template <typename T> class NativePooling : public CpuKernelWithoutConfig { class NativePooling : public CpuKernelWithoutConfig {
virtual T getPoolingValue(int kh, int kw, int posh, int posw, int ih, template <typename T>
int iw, T *inptr) const = 0; static T getMaxPoolingValue(int kh, int kw, int posh, int posw, int ih,
void compute(const Operator &_op, int iw, T *inptr) {
const RuntimeObj *context) const override {
auto op = as<PoolingObj>(_op);
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
T *outptr = op->getOutput()->getRawDataPtr<T *>();
const auto [n, c, ih, iw, kh, kw] = op->getNCHWRS();
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
if (dh != 1 || dw != 1)
IT_TODO_HALT(); // To support dailated pooling
auto outDim = op->getOutput()->getDims();
int oh = outDim[2], ow = outDim[3];
for (auto i = 0; i < n; i++) {
for (auto j = 0; j < c; j++) {
auto inoffset = i * (c * ih * iw) + j * ih * iw;
for (auto h = 0; h < oh; h++) {
for (auto w = 0; w < ow; w++) {
// TODO: verify ceil mode
T val =
getPoolingValue(kh, kw, h * sh - ph, w * sw - pw,
ih, iw, inptr + inoffset);
auto outoffset =
w + h * ow + j * (oh * ow) + i * (c * oh * ow);
outptr[outoffset] = val;
}
}
}
}
}
};
template <typename T> class NaiveMaxPool : public NativePooling<T> {
T getPoolingValue(int kh, int kw, int posh, int posw, int ih, int iw,
T *inptr) const override {
T maxval = 0; T maxval = 0;
for (auto k = 0; k < kh; k++) { for (auto k = 0; k < kh; k++) {
for (auto l = 0; l < kw; l++) { for (auto l = 0; l < kw; l++) {
@ -53,11 +21,10 @@ template <typename T> class NaiveMaxPool : public NativePooling<T> {
} }
return maxval; return maxval;
} }
};
template <typename T> class NaiveAvgPool : public NativePooling<T> { template <typename T>
T getPoolingValue(int kh, int kw, int posh, int posw, int ih, int iw, static T getAvgPoolingValue(int kh, int kw, int posh, int posw, int ih,
T *inptr) const override { int iw, T *inptr) {
T sum = 0; T sum = 0;
for (auto k = 0; k < kh; k++) { for (auto k = 0; k < kh; k++) {
for (auto l = 0; l < kw; l++) { for (auto l = 0; l < kw; l++) {
@ -71,12 +38,70 @@ template <typename T> class NaiveAvgPool : public NativePooling<T> {
} }
return T(sum / (kh * kw)); return T(sum / (kh * kw));
} }
template <typename T>
void doCompute(const Operator &_op, const RuntimeObj *context) const {
auto op = as<PoolingObj>(_op);
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
T *outptr = op->getOutput()->getRawDataPtr<T *>();
const auto [n, c, ih, iw, kh, kw] = op->getNCHWRS();
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
if (dh != 1 || dw != 1)
            IT_TODO_HALT(); // To support dilated pooling
auto outDim = op->getOutput()->getDims();
int oh = outDim[2], ow = outDim[3];
T(*_doCompute)
(int kh, int kw, int posh, int posw, int ih, int iw, T *inptr);
switch (op->getOpType().underlying()) {
case OpType::MaxPool:
_doCompute = getMaxPoolingValue<T>;
break;
case OpType::AveragePool:
_doCompute = getAvgPoolingValue<T>;
break;
default:
IT_TODO_HALT();
}
for (auto i = 0; i < n; i++) {
for (auto j = 0; j < c; j++) {
auto inoffset = i * (c * ih * iw) + j * ih * iw;
for (auto h = 0; h < oh; h++) {
for (auto w = 0; w < ow; w++) {
// TODO: verify ceil mode
T val = _doCompute(kh, kw, h * sh - ph, w * sw - pw, ih,
iw, inptr + inoffset);
auto outoffset =
w + h * ow + j * (oh * ow) + i * (c * oh * ow);
outptr[outoffset] = val;
}
}
}
}
}
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};

REGISTER_KERNEL(Device::CPU, OpType::MaxPool, DataType::UInt32,
                NaiveMaxPool<uint32_t>, "maxPoolNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::MaxPool, DataType::Float32,
                NaiveMaxPool<float>, "maxPoolNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::AveragePool, DataType::Float32,
                NaiveAvgPool<float>, "AvgPoolNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::MaxPool, NativePooling,
                "maxPoolNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::AveragePool, NativePooling,
                "avgPoolNaive_CPU");

} // namespace infini
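Note: the rewritten pooling kernel replaces the per-mode subclasses (NaiveMaxPool/NaiveAvgPool) with static, templated reducers chosen through a plain function pointer keyed on the operator type. A small stand-alone sketch of that selection pattern (the enum and reducer names here are illustrative, not the framework's):

// Selecting a templated static reducer via a function pointer; one loop body
// serves both pooling modes.
#include <algorithm>
#include <cstdio>

enum class PoolKind { Max, Avg };

template <typename T> static T maxReduce(const T *v, int n) {
    T m = v[0];
    for (int i = 1; i < n; ++i)
        m = std::max(m, v[i]);
    return m;
}
template <typename T> static T avgReduce(const T *v, int n) {
    T s = 0;
    for (int i = 0; i < n; ++i)
        s += v[i];
    return T(s / n);
}

template <typename T> T pool(PoolKind kind, const T *window, int n) {
    T (*reduce)(const T *, int) =
        (kind == PoolKind::Max) ? maxReduce<T> : avgReduce<T>;
    return reduce(window, n);
}

int main() {
    float w[4] = {1.f, 4.f, 2.f, 3.f};
    std::printf("max=%g avg=%g\n", pool(PoolKind::Max, w, 4),
                pool(PoolKind::Avg, w, 4)); // max=4 avg=2.5
}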

View File

@ -3,9 +3,9 @@
namespace infini {

template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
class NaiveSplit : public CpuKernelWithoutConfig {
    template <typename T>
    void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<SplitObj>(_op);
        auto inputs = op->getInputs(), outputs = op->getOutputs();
        auto dim = op->getDim();
@ -40,11 +40,24 @@ template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
} }
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};

REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::UInt32,
                NaiveSplit<uint32_t>, "SplitNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::Float32,
                NaiveSplit<float>, "SplitNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Split, NaiveSplit, "SplitNaive_CPU");

} // namespace infini

View File

@ -14,9 +14,9 @@ inline Shape idx2Pos(const Shape &shape, size_t idx) {
    return pos;
}

template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
class NaiveTranspose : public CpuKernelWithoutConfig {
    template <typename T>
    void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<TransposeObj>(_op);
        auto inputs = op->getInputs(), outputs = op->getOutputs();
        const auto &inDim = inputs[0]->getDims();
@ -35,11 +35,26 @@ template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
outPtr[outIdx] = inPtr[inIdx]; outPtr[outIdx] = inPtr[inIdx];
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};

REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::UInt32,
                NaiveTranspose<uint32_t>, "TransposeNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::Float32,
                NaiveTranspose<float>, "TransposeNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Transpose, NaiveTranspose,
                "TransposeNaive_CPU");

} // namespace infini

View File

@ -4,25 +4,170 @@
#include "operators/softmax.h" #include "operators/softmax.h"
namespace infini { namespace infini {
template <typename T> class NativeUnary : public CpuKernelWithoutConfig { class NativeUnary : public CpuKernelWithoutConfig {
virtual T doCompute(T val) const = 0; template <typename T> static T reluCompute(T val) {
void compute(const Operator &_op, return std::max(T(0), val);
const RuntimeObj *context) const override { }
template <typename T> static T sigmoidCompute(T val) {
return 1 / (1 + pow(E_CONSTANT, -val));
}
template <typename T> static T hardSigmoidCompute(T val) {
return std::max(T(0), std::min(T(1), T(0.2) * val + T(0.5)));
}
template <typename T> static T hardSwishCompute(T val) {
return val *
std::max(T(0), std::min(T(1), val * T(1.0 / 6.0) + T(0.5)));
}
template <typename T> static T tanhCompute(T val) {
return (pow(E_CONSTANT, val) - pow(E_CONSTANT, -val)) /
(pow(E_CONSTANT, val) + pow(E_CONSTANT, -val));
}
template <typename T> static T absCompute(T val) {
return val < 0 ? -val : val;
}
template <typename T> static T sqrtCompute(T val) { return std::sqrt(val); }
template <typename T> static T cosCompute(T val) { return std::cos(val); }
template <typename T> static T sinCompute(T val) { return std::sin(val); }
template <typename T> static T tanCompute(T val) { return std::tan(val); }
template <typename T> static T sinhCompute(T val) { return std::sinh(val); }
template <typename T> static T coshCompute(T val) { return std::cosh(val); }
template <typename T> static T geluCompute(T val) {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
template <typename T> static T erfCompute(T val) { return std::erf(val); }
template <typename T> static T aCosCompute(T val) { return std::acos(val); }
template <typename T> static T aCoshCompute(T val) {
return std::acosh(val);
}
template <typename T> static T aSinCompute(T val) { return std::asin(val); }
template <typename T> static T aSinhCompute(T val) {
return std::asinh(val);
}
template <typename T> static T aTanCompute(T val) { return std::atan(val); }
template <typename T> static T aTanhCompute(T val) {
return std::atanh(val);
}
template <typename T> static T negCompute(T val) { return -val; }
template <typename T>
void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<UnaryObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
        auto outDim = op->getOutput()->getDims();
        auto n = op->getOutput()->size();
T (*_doCompute)(T val);
switch (op->getOpType().underlying()) {
case OpType::Relu:
_doCompute = reluCompute<T>;
break;
case OpType::Gelu:
_doCompute = geluCompute<T>;
break;
case OpType::Sigmoid:
_doCompute = sigmoidCompute<T>;
break;
case OpType::HardSigmoid:
_doCompute = hardSigmoidCompute<T>;
break;
case OpType::HardSwish:
_doCompute = hardSwishCompute<T>;
break;
case OpType::Tanh:
_doCompute = tanhCompute<T>;
break;
case OpType::Abs:
_doCompute = absCompute<T>;
break;
case OpType::Sqrt:
_doCompute = sqrtCompute<T>;
break;
case OpType::Erf:
_doCompute = erfCompute<T>;
break;
case OpType::Neg:
_doCompute = negCompute<T>;
break;
case OpType::Cos:
_doCompute = cosCompute<T>;
break;
case OpType::Sin:
_doCompute = sinCompute<T>;
break;
case OpType::Tan:
_doCompute = tanCompute<T>;
break;
case OpType::Sinh:
_doCompute = sinhCompute<T>;
break;
case OpType::Cosh:
_doCompute = coshCompute<T>;
break;
case OpType::Acos:
_doCompute = aCosCompute<T>;
break;
case OpType::Asin:
_doCompute = aSinCompute<T>;
break;
case OpType::Asinh:
_doCompute = aSinhCompute<T>;
break;
case OpType::Atan:
_doCompute = aTanCompute<T>;
break;
case OpType::Atanh:
_doCompute = aTanhCompute<T>;
break;
default:
IT_TODO_HALT();
}
for (size_t offset = 0; offset < n; offset++) { for (size_t offset = 0; offset < n; offset++) {
outptr[offset] = doCompute(inptr[offset]); outptr[offset] = _doCompute(inptr[offset]);
}
}
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
} }
} }
}; };
template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
class NaiveSoftmax : public CpuKernelWithoutConfig {
    template <typename T>
    void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<SoftmaxObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
@ -37,98 +182,28 @@ template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
            outptr[offset] = pow(E_CONSTANT, inptr[offset]) / sum;
        }
    }
};
template <typename T> class NaiveRelu : public NativeUnary<T> {
T doCompute(T val) const override { return std::max(T(0), val); }
};
template <typename T> class NaiveSigmoid : public NativeUnary<T> {
T doCompute(T val) const override {
return 1 / (1 + pow(E_CONSTANT, -val));
}
};
template <typename T> class NaiveHardSigmoid : public NativeUnary<T> {
T doCompute(T val) const override {
return std::max(T(0), std::min(T(1), T(0.2) * val + T(0.5)));
}
};
template <typename T> class NaiveHardSwish : public NativeUnary<T> {
T doCompute(T val) const override {
return val *
std::max(T(0), std::min(T(1), val * T(1.0 / 6.0) + T(0.5)));
}
};
template <typename T> class NaiveTanh : public NativeUnary<T> {
T doCompute(T val) const override {
return (pow(E_CONSTANT, val) - pow(E_CONSTANT, -val)) /
(pow(E_CONSTANT, val) + pow(E_CONSTANT, -val));
}
};
template <typename T> class NaiveAbs : public NativeUnary<T> {
T doCompute(T val) const override { return val < 0 ? -val : val; }
};
template <typename T> class NaiveSqrt : public NativeUnary<T> {
T doCompute(T val) const override { return std::sqrt(val); }
};
template <typename T> class NaiveCos : public NativeUnary<T> {
T doCompute(T val) const override { return std::cos(val); }
};
template <typename T> class NaiveSin : public NativeUnary<T> {
T doCompute(T val) const override { return std::sin(val); }
};
template <typename T> class NaiveTan : public NativeUnary<T> {
T doCompute(T val) const override { return std::tan(val); }
};
template <typename T> class NaiveSinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::sinh(val); }
};
template <typename T> class NaiveCosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::cosh(val); }
};
template <typename T> class NaiveGelu : public NativeUnary<T> {
T doCompute(T val) const override {
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
}
};
template <typename T> class NaiveErf : public NativeUnary<T> {
T doCompute(T val) const override { return std::erf(val); }
};
template <typename T> class NaiveACos : public NativeUnary<T> {
T doCompute(T val) const override { return std::acos(val); }
};
template <typename T> class NaiveACosh : public NativeUnary<T> {
T doCompute(T val) const override { return std::acosh(val); }
};
template <typename T> class NaiveASin : public NativeUnary<T> {
T doCompute(T val) const override { return std::asin(val); }
};
template <typename T> class NaiveASinh : public NativeUnary<T> {
T doCompute(T val) const override { return std::asinh(val); }
};
template <typename T> class NaiveATanh : public NativeUnary<T> {
T doCompute(T val) const override { return std::atanh(val); }
};
template <typename T> class NaiveNeg : public NativeUnary<T> {
T doCompute(T val) const override { return -val; }
};
template <typename T> class Clip : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};
class Clip : public CpuKernelWithoutConfig {
template <typename T>
void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<ClipObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
@ -143,11 +218,28 @@ template <typename T> class Clip : public CpuKernelWithoutConfig {
: val; : val;
} }
} }
};
template <typename T> class Log : public CpuKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};
class Log : public CpuKernelWithoutConfig {
template <typename T>
void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<LogObj>(_op);
        T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
        T *outptr = op->getOutput()->getRawDataPtr<T *>();
@ -176,70 +268,50 @@ template <typename T> class Log : public CpuKernelWithoutConfig {
} }
} }
} }
void compute(const Operator &_op,
const RuntimeObj *context) const override {
#define CASE(N) \
case N: \
doCompute<DT<N>::t>(_op, context)
int dataTypeIdx = _op->getDType().getIndex();
switch (dataTypeIdx) {
CASE(1); // DataType::Float32
break;
CASE(12); // DataType::UInt32
break;
default:
IT_TODO_HALT();
}
}
};

template <typename T> class NaiveATan : public NativeUnary<T> {
    T doCompute(T val) const override { return std::atan(val); }
};

REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, NativeUnary, "geluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, NativeUnary, "sigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, NativeUnary,
"hardSigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSwish, NativeUnary,
"hardSwishNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Tanh, NativeUnary, "tanhNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Abs, NativeUnary, "absNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sqrt, NativeUnary, "sqrtNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Erf, NativeUnary, "erfNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Neg, NativeUnary, "negNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Cos, NativeUnary, "Cos_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sin, NativeUnary, "Sin_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Tan, NativeUnary, "Tan_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sinh, NativeUnary, "Sinh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Cosh, NativeUnary, "Cosh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Acos, NativeUnary, "ACos_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Acosh, NativeUnary, "ACosh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Asin, NativeUnary, "ASin_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Asinh, NativeUnary, "ASinh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Atan, NativeUnary, "Atan_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Atanh, NativeUnary, "ATanh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, NaiveSoftmax, "softmaxNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Clip, Clip, "Clip_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Log, Log, "Log_CPU");

REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
                NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
                "reluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::UInt32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::Float32, NaiveGelu<float>,
"geluNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32,
NaiveSigmoid<uint32_t>, "sigmoidNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32,
NaiveSigmoid<float>, "sigmoidNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, DataType::Float32,
NaiveHardSigmoid<float>, "hardSigmoidNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::HardSwish, DataType::Float32,
NaiveHardSwish<float>, "hardSwishNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Tanh, DataType::UInt32,
NaiveTanh<uint32_t>, "tanhNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Tanh, DataType::Float32, NaiveTanh<float>,
"tanhNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Abs, DataType::UInt32, NaiveAbs<uint32_t>,
"absNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Abs, DataType::Float32, NaiveAbs<float>,
"absNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sqrt, DataType::Float32, NaiveSqrt<float>,
"sqrtNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Erf, DataType::Float32, NaiveErf<float>,
"erfNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Neg, DataType::Float32, NaiveNeg<float>,
"negNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32,
NaiveSoftmax<uint32_t>, "softmaxNaive_CPU_uint32");
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
"Clip_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atan, DataType::Float32, NaiveATan<float>,
"Atan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Log, DataType::Float32, Log<float>,
"Log_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cos, DataType::Float32, NaiveCos<float>,
"Cos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sin, DataType::Float32, NaiveSin<float>,
"Sin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Tan, DataType::Float32, NaiveTan<float>,
"Tan_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Sinh, DataType::Float32, NaiveSinh<float>,
"Sinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Cosh, DataType::Float32, NaiveCosh<float>,
"Cosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acos, DataType::Float32, NaiveACos<float>,
"ACos_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Acosh, DataType::Float32,
NaiveACosh<float>, "ACosh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asin, DataType::Float32, NaiveASin<float>,
"ASin_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Asinh, DataType::Float32,
NaiveASinh<float>, "ASinh_CPU_float32");
REGISTER_KERNEL(Device::CPU, OpType::Atanh, DataType::Float32,
NaiveATanh<float>, "ATanh_CPU_float32");
}; // namespace infini
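Note: the per-op NaiveRelu/NaiveGelu/... subclasses above collapse into a table of static element-wise functions inside NativeUnary. For reference, a few of the consolidated formulas can be sanity-checked with a short standalone program (standard library only; the expected values are easy to verify by hand):

// Quick numerical check of the GELU / hard-sigmoid / hard-swish formulas.
#include <algorithm>
#include <cmath>
#include <cstdio>

static double gelu(double x) {
    return 0.5 * x * (1 + std::erf(x / std::sqrt(2.0)));
}
static double hardSigmoid(double x) {
    return std::max(0.0, std::min(1.0, 0.2 * x + 0.5));
}
static double hardSwish(double x) {
    return x * std::max(0.0, std::min(1.0, x / 6.0 + 0.5));
}

int main() {
    std::printf("gelu(1)        = %.6f (expect ~0.841345)\n", gelu(1.0));
    std::printf("hardSigmoid(1) = %.6f (expect 0.700000)\n", hardSigmoid(1.0));
    std::printf("hardSwish(1)   = %.6f (expect ~0.666667)\n", hardSwish(1.0));
}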

View File

@ -54,7 +54,6 @@ class G2BMMCudnn : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::G2BMM, DataType::Float32, G2BMMCudnn,
                "G2BMM_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::G2BMM, G2BMMCudnn, "G2BMM_cuDNN_CUDA");

} // namespace infini

View File

@ -55,7 +55,6 @@ class GBMMCudnn : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::GBMM, DataType::Float32, GBMMCudnn,
                "GBMM_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::GBMM, GBMMCudnn, "GBMM_cuDNN_CUDA");

} // namespace infini

View File

@ -39,8 +39,8 @@ class AllGatherNCCL : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::AllGather, DataType::Float32,
                AllGatherNCCL, "AllGather_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllGather, AllGatherNCCL,
                "AllGather_NCCL_CUDA");

} // namespace infini
#endif

View File

@ -13,15 +13,20 @@ class AllReduceNCCL : public CudaKernelWithoutConfig {
        auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
        void *input = op->getInputs(0)->getRawDataPtr<void *>();
        void *output = op->getOutput()->getRawDataPtr<void *>();

        IT_ASSERT(op->getDType() == DataType::Float32);
        ncclDataType_t ncclType = ncclFloat;
        if (op->getDType() == DataType::Float16) {
            ncclType = ncclFloat16;
        } else if (op->getDType() == DataType::Int8) {
            ncclType = ncclInt8;
        }

        size_t count = op->getInputs(0)->size();
        ncclComm_t comm =
            dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
                .getNcclComm();
        // TODO: Using default stream 0 for now.

        checkNcclError(ncclAllReduce(input, output, count, ncclFloat,
                                     getRedOp(), comm, 0));
        checkNcclError(
            ncclAllReduce(input, output, count, ncclType, getRedOp(), comm, 0));
} }
virtual ncclRedOp_t getRedOp() const = 0; virtual ncclRedOp_t getRedOp() const = 0;
@ -43,16 +48,16 @@ class AllReduceAvgNCCL : public AllReduceNCCL {
    ncclRedOp_t getRedOp() const override { return ncclAvg; }
};

REGISTER_KERNEL(Device::CUDA, OpType::AllReduceSum, DataType::Float32,
                AllReduceSumNCCL, "AllReduce_Sum_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceProd, DataType::Float32,
                AllReduceProdNCCL, "AllReduce_Prod_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMin, DataType::Float32,
                AllReduceMinNCCL, "AllReduce_Min_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMax, DataType::Float32,
                AllReduceMaxNCCL, "AllReduce_Max_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceAvg, DataType::Float32,
                AllReduceAvgNCCL, "AllReduce_Avg_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceSum, AllReduceSumNCCL,
                "AllReduce_Sum_NCCL_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceProd, AllReduceProdNCCL,
                "AllReduce_Prod_NCCL_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMin, AllReduceMinNCCL,
                "AllReduce_Min_NCCL_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMax, AllReduceMaxNCCL,
                "AllReduce_Max_NCCL_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceAvg, AllReduceAvgNCCL,
                "AllReduce_Avg_NCCL_CUDA");

} // namespace infini
#endif
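Note: the AllReduce kernel now derives the NCCL element type from the operator's dtype instead of hard-coding ncclFloat. A hedged sketch of how that mapping could be factored into a helper is below; the DType enum is a stand-in for the framework's DataType, while ncclFloat/ncclFloat16/ncclInt8 are standard NCCL constants. As in the diff, any unhandled dtype silently falls back to ncclFloat.

// Sketch: dtype -> ncclDataType_t for collective calls (assumes <nccl.h>).
#include <nccl.h>

enum class DType { Float32, Float16, Int8 };

inline ncclDataType_t toNcclType(DType dt) {
    switch (dt) {
    case DType::Float16:
        return ncclFloat16;
    case DType::Int8:
        return ncclInt8;
    default:
        return ncclFloat; // Float32 and anything unhandled
    }
}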

View File

@ -47,6 +47,6 @@ class AttentionKVCacheCuda : private AttentionKVCacheCompute,
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::AttentionKVCache, DataType::Float32,
                AttentionKVCacheCuda, "AttentionKVCache_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AttentionKVCache, AttentionKVCacheCuda,
                "AttentionKVCache_CUDA");

} // namespace infini

View File

@ -59,6 +59,6 @@ class BatchNormCudnn : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::BatchNormalization, DataType::Float32,
                BatchNormCudnn, "BatchNorm_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::BatchNormalization, BatchNormCudnn,
                "BatchNorm_cuDNN_CUDA");

} // namespace infini

View File

@ -25,8 +25,8 @@ class BroadcastNCCL : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Broadcast, DataType::Float32,
                BroadcastNCCL, "Broadcast_NCCL_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Broadcast, BroadcastNCCL,
                "Broadcast_NCCL_CUDA");

} // namespace infini
#endif

View File

@ -21,7 +21,6 @@ class ClipCuda : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
                "Clip_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Clip, ClipCuda, "Clip_CUDA");

}; // namespace infini

View File

@ -1,10 +1,12 @@
#include "operators/conv.h" #include "operators/conv.h"
#include "core/kernel.h" #include "core/kernel.h"
#include "cuda/cuda_runtime.h" #include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include <chrono>
#include <functional>
#include <limits>
#include <tuple>

namespace infini {

struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
@ -56,8 +58,11 @@ class convCudnn : public Kernel {
                          const ConvCuDnnPerfRecord &record) const {
        void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
        // Bias is not supported yet
        if (op->getInputs().size() > 2) {
            IT_TODO_HALT();
}
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>()); // void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>()); void *const outData = (op->getOutput()->getRawDataPtr<void *>());
@ -72,27 +77,26 @@ class convCudnn : public Kernel {
cudnnTensorDescriptor_t inDesc; cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor( checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w)); inDesc, CUDNN_TENSOR_NCHW, cudnnDataType, n, channels, h, w));
// get kernels // get kernels
cudnnFilterDescriptor_t knDesc; cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc)); checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT, checkCudnnError(cudnnSetFilter4dDescriptor(
CUDNN_TENSOR_NCHW, f, knDesc, cudnnDataType, CUDNN_TENSOR_NCHW, f, channelsPerGrp, r, s));
channelsPerGrp, r, s));
// get bias // get bias
cudnnTensorDescriptor_t biasDesc; cudnnTensorDescriptor_t biasDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
checkCudnnError(cudnnSetTensor4dDescriptor( checkCudnnError(cudnnSetTensor4dDescriptor(biasDesc, CUDNN_TENSOR_NCHW,
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1)); cudnnDataType, 1, f, 1, 1));
// get convlution descriptor // get convolution descriptor
cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionDescriptor_t convDesc;
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc)); checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
// TODO: CUDNN_CONVOLUTION is a tunable argument // TODO: CUDNN_CONVOLUTION is a tunable argument
checkCudnnError(cudnnSetConvolution2dDescriptor( checkCudnnError(cudnnSetConvolution2dDescriptor(
convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode], convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode],
CUDNN_DATA_FLOAT)); cudnnDataType));
if (g > 1) { if (g > 1) {
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g)); checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
} }
@ -120,14 +124,14 @@ class convCudnn : public Kernel {
assert(false); assert(false);
} }
// get output descriptor
int outn, outc, outh, outw; int outn, outc, outh, outw;
checkCudnnError(cudnnGetConvolution2dForwardOutputDim( checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw)); convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc; cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc)); checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, checkCudnnError(cudnnSetTensor4dDescriptor(
CUDNN_DATA_FLOAT, outn, outc, outDesc, CUDNN_TENSOR_NCHW, cudnnDataType, outn, outc, outh, outw));
outh, outw));
IT_ASSERT((vector{outn, outc, outh, outw}) == IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(), op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape"); "cuDNN output shape mismatches with OP output shape");
@ -151,55 +155,9 @@ class convCudnn : public Kernel {
inData, knDesc, knData, convDesc, inData, knDesc, knData, convDesc,
ALGOS[record->algo], wsData, wsSize, ALGOS[record->algo], wsData, wsSize,
&beta, outDesc, outData); &beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS) if (stat != CUDNN_STATUS_SUCCESS) {
return false; return false;
// TODO: }
// // bias
// if (bias != nullptr) {
// auto sz = op.getOutputs()[0]->size();
// // TODO: element wise
// t += sz * 2 / 400;
// }
// // act
// if (act != None) {
// stat = cudnnActivationForward(cudnnHandle(), actDesc,
// &alpha, inDesc, inData,
// &beta, outDesc, outData);
// checkCudaError(cudaDeviceSynchronize());
// end = ch::high_resolution_clock::now();
// if (stat != CUDNN_STATUS_SUCCESS) {
// durtime = INFINITY;
// break;
// }
// t +=
// ch::duration_cast<ch::duration<double>>(end -
// beg).count() * 1000; // ms
// }
// best = ConvResult{durtime, ALGOS[i], wsSize, false};
// // w/ bias & act
// for (int j = 0; j < rounds + warmupRounds; ++j) {
// cudnnStatus_t stat;
// if (j == warmupRounds) {
// checkCudaError(cudaDeviceSynchronize());
// beg = ch::high_resolution_clock::now();
// }
// stat = cudnnConvolutionBiasActivationForward(
// cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
// convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc,
// outData, biasDesc, biasData, actDesc, outDesc, outData);
// if (stat != CUDNN_STATUS_SUCCESS) {
// // checkCudnnError(stat);
// // Do not checkCudnnError since not all algorithms are
// // supported
// durtime_fuse = INFINITY;
// break;
// }
// }
// Destories in CUDA does not require sync. But cuDNN does not state
// whether sync is required before destories.
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc)); checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc)); checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc)); checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
@ -238,10 +196,12 @@ class convCudnn : public Kernel {
stat = cudnnGetConvolutionForwardWorkspaceSize( stat = cudnnGetConvolutionForwardWorkspaceSize(
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc, context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
ALGOS[record.algo], &record.workspaceSize); ALGOS[record.algo], &record.workspaceSize);
if (stat != CUDNN_STATUS_SUCCESS) if (stat != CUDNN_STATUS_SUCCESS) {
continue; continue;
if (record.workspaceSize > context->getWorkspaceSize()) }
if (record.workspaceSize > context->getWorkspaceSize()) {
continue; continue;
}
CudaPtr wsData = context->getWorkspace(record.workspaceSize); CudaPtr wsData = context->getWorkspace(record.workspaceSize);
float alpha = 1.f, beta = 0.f; float alpha = 1.f, beta = 0.f;
@ -249,8 +209,9 @@ class convCudnn : public Kernel {
context->cudnnHandle(), &alpha, inDesc, inData, knDesc, context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData, knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData); record.workspaceSize, &beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS) if (stat != CUDNN_STATUS_SUCCESS) {
continue; continue;
}
record.time = timeit( record.time = timeit(
[&]() { [&]() {
cudnnConvolutionForward(context->cudnnHandle(), &alpha, cudnnConvolutionForward(context->cudnnHandle(), &alpha,
@ -263,8 +224,9 @@ class convCudnn : public Kernel {
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time); // printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
// Update the tune result // Update the tune result
if (ret.time > record.time) if (ret.time > record.time) {
ret = record; ret = record;
}
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc)); checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc)); checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc)); checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
@ -291,8 +253,7 @@ class convCudnn : public Kernel {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
                "Conv_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Conv, convCudnn, "Conv_cuDNN_CUDA");
REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json);

} // namespace infini
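Note: the conv kernel now feeds cuDNN a data type obtained from cudnnDataTypeConvert(op->getDType()) (pulled in through cuda_utility.h) instead of hard-coded CUDNN_DATA_FLOAT, which is what lets one kernel serve fp32 and fp16 and makes the separate convCudnnFP16 kernel below removable. The converter itself is not part of this diff; a plausible sketch, assuming a DataType with Float32/Float16/Int8/Double members, could look like the following (the real helper may differ):

// Hypothetical dtype -> cudnnDataType_t mapping; the actual cudnnDataTypeConvert
// lives in cuda_utility.h and is not shown in this diff.
#include <cudnn.h>
#include <stdexcept>

enum class DType { Float32, Float16, Int8, Double };

inline cudnnDataType_t toCudnnDataType(DType dt) {
    switch (dt) {
    case DType::Float32:
        return CUDNN_DATA_FLOAT;
    case DType::Float16:
        return CUDNN_DATA_HALF;
    case DType::Int8:
        return CUDNN_DATA_INT8;
    case DType::Double:
        return CUDNN_DATA_DOUBLE;
    default:
        throw std::runtime_error("unsupported dtype for cuDNN");
    }
}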

View File

@ -1,261 +0,0 @@
#include "core/kernel.h"
#include "cuda/cuda_runtime.h"
#include "operators/conv.h"
#include <chrono>
#include <functional>
#include <limits>
#include <tuple>
namespace infini {
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
int algo = 0; // cudnnConvolutionFwdAlgo_t
int mode = 1;
size_t workspaceSize = 100000;
bool fuseAct = false;
void to_json(json &j) override {
j["type"] = 1;
j["data"] = std::make_tuple(algo, mode, fuseAct, time, workspaceSize);
}
static PerfRecord from_json(const json &j) {
ConvCuDnnPerfRecordObj tmp;
auto [Algo, Mode, FuseAct, Time, WorkspaceSize] =
j["data"].get<tuple<int, int, bool, double, size_t>>();
tmp.algo = Algo;
tmp.mode = Mode;
tmp.fuseAct = FuseAct;
tmp.time = Time;
tmp.workspaceSize = WorkspaceSize;
return make_ref<ConvCuDnnPerfRecordObj>(tmp);
}
};
using ConvCuDnnPerfRecord = Ref<ConvCuDnnPerfRecordObj>;
class convCudnnFP16 : public Kernel {
static constexpr int N_ALGO = 8;
static constexpr int N_MODE = 2;
static constexpr cudnnConvolutionFwdAlgo_t ALGOS[8] = {
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
static constexpr cudnnConvolutionMode_t MODES[2] = {
CUDNN_CONVOLUTION, CUDNN_CROSS_CORRELATION};
std::tuple<void *, void *, void *, cudnnTensorDescriptor_t,
cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
cudnnTensorDescriptor_t>
createCuDNNDescriptor(const Ref<ConvObj> &op,
const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
// Bias is not supported yet
if (op->getInputs().size() > 2) {
IT_TODO_HALT();
}
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
const int cpg = op->getChannelPerGroup();
const int g = c / cpg;
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
int channelsPerGrp = cpg, channels = c;
// get inputs
cudnnTensorDescriptor_t inDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(inDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_HALF, n, channels,
h, w)); /*fp16 type*/
// get kernels
cudnnFilterDescriptor_t knDesc;
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
checkCudnnError(cudnnSetFilter4dDescriptor(
knDesc, CUDNN_DATA_HALF, /*fp16 type*/
CUDNN_TENSOR_NCHW, f, channelsPerGrp, r, s));
// get bias
cudnnTensorDescriptor_t biasDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(biasDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_HALF, 1, f, 1,
1)); /*fp16 type*/
// get convolution descriptor
cudnnConvolutionDescriptor_t convDesc;
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
// TODO: CUDNN_CONVOLUTION is a tunable argument
checkCudnnError(cudnnSetConvolution2dDescriptor(
convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode],
CUDNN_DATA_HALF)); /*fp16 type*/
if (g > 1) {
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
}
// get activation descriptor
cudnnActivationDescriptor_t actDesc;
checkCudnnError(cudnnCreateActivationDescriptor(&actDesc));
// NOT_PROPAGATE_NAN is requierd by
// cudnnConvolotionBiasActivationForward
switch (op->getAct()) {
case ActType::Relu:
checkCudnnError(cudnnSetActivationDescriptor(
actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0));
break;
case ActType::Sigmoid:
checkCudnnError(cudnnSetActivationDescriptor(
actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0));
break;
case ActType::None:
checkCudnnError(
cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY,
CUDNN_NOT_PROPAGATE_NAN, 0));
break;
default:
assert(false);
}
// get output descriptor
int outn, outc, outh, outw;
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_HALF, outn, outc,
outh, outw));
IT_ASSERT((vector{outn, outc, outh, outw}) ==
op->getOutput()->getDims(),
"cuDNN output shape mismatches with OP output shape");
return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc);
}
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, record);
size_t wsSize = record->workspaceSize;
CudaPtr wsData = context->getWorkspace(wsSize);
float alpha = 1.f, beta = 0.f;
stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc,
inData, knDesc, knData, convDesc,
ALGOS[record->algo], wsData, wsSize,
&beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS) {
return false;
}
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
return true;
}
void compute(const Operator &op, const RuntimeObj *context) const override {
auto record = make_ref<ConvCuDnnPerfRecordObj>(); // with paramters in
// default ctor
compute(op, record, context);
}
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op);
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef;
record.mode = mode;
record.algo = algo;
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, recordRef);
// get workspace
stat = cudnnGetConvolutionForwardWorkspaceSize(
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
ALGOS[record.algo], &record.workspaceSize);
if (stat != CUDNN_STATUS_SUCCESS) {
continue;
}
if (record.workspaceSize > context->getWorkspaceSize()) {
continue;
}
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
float alpha = 1.f, beta = 0.f;
stat = cudnnConvolutionForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS) {
continue;
}
record.time = timeit(
[&]() {
cudnnConvolutionForward(context->cudnnHandle(), &alpha,
inDesc, inData, knDesc, knData,
convDesc, ALGOS[record.algo],
wsData, record.workspaceSize,
&beta, outDesc, outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
// Update the tune result
if (ret.time > record.time) {
ret = record;
}
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
}
}
// printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
// ret.mode);
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");
return make_ref<ConvCuDnnPerfRecordObj>(ret);
}
void compute(const Operator &_op, const PerfRecord &_record,
const RuntimeObj *_context) const override {
auto op = as<ConvObj>(_op);
auto record = as<ConvCuDnnPerfRecordObj>(_record);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
bool success = cuDNNUnfused(op, record, context);
IT_ASSERT(success);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float16, convCudnnFP16,
"Conv_cuDNN_CUDA_Float16");
} // namespace infini

View File

@ -300,8 +300,9 @@ class convBackwardDataCudnn : public Kernel {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::ConvTranspose, DataType::Float32,
                convBackwardDataCudnn, "ConvTranposed_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::ConvTransNHWC, DataType::Float32,
                convBackwardDataCudnn, "ConvTranposedNHWC_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::ConvTranspose, convBackwardDataCudnn,
                "ConvTranposed_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::ConvTransNHWC, convBackwardDataCudnn,
                "ConvTranposedNHWC_cuDNN_CUDA");

} // namespace infini

View File

@ -0,0 +1,57 @@
#include "operators/dequantize_linear.h"
#include "cuda/cuda_dequantize_linear.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class DequantizeLinearCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DequantizeLinearObj>(_op);
void *const inputX = (op->getInputs(0)->getRawDataPtr<void *>());
void *const inputScale = (op->getInputs(1)->getRawDataPtr<void *>());
void *const output = (op->getOutput()->getRawDataPtr<void *>());
const int axis = op->getAxis();
const int stride = op->getInputs(0)->getStride().at(axis);
auto dims = op->getInputs(0)->getDims();
int dimsize = dims[op->getAxis()];
int size = op->getOutput()->size();
if (op->getInputs(1)->getDType() == DataType::Float32) {
if (op->numInputs() == 3) {
void *const inputZeroPoint =
(op->getInputs(2)->getRawDataPtr<void *>());
DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
(float *)output, dimsize, stride,
(uint8_t *)inputZeroPoint, size);
} else {
DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
(float *)output, dimsize, stride, size);
}
} else if (op->getInputs(1)->getDType() == DataType::Float16) {
if (op->numInputs() == 3) {
void *const inputZeroPoint =
(op->getInputs(2)->getRawDataPtr<void *>());
DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
(half *)output, dimsize, stride,
(uint8_t *)inputZeroPoint, size);
} else {
DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
(half *)output, dimsize, stride, size);
}
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::DequantizeLinear, DequantizeLinearCuda,
"DequantizeLinear_CUDA");
}; // namespace infini
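Note: for per-axis dequantization the kernel computes y = (x - zero_point) * scale, with scale and zero_point indexed along the chosen axis; stride is the distance between consecutive elements on that axis and dimsize its length. A compact CPU reference of the same math (contiguous row-major layout, illustrative signature) is useful for validating the CUDA output:

// CPU reference for per-axis dequantize-linear: y = (x - zp) * scale.
#include <cstdint>

void dequantizeLinearRef(const uint8_t *x, const float *scale,
                         const uint8_t *zeroPoint, float *y, int size,
                         int dimsize, int stride) {
    for (int i = 0; i < size; ++i) {
        // Recover the coordinate along the quantization axis from the flat
        // index, assuming a row-major contiguous tensor.
        int axisIdx = (i / stride) % dimsize;
        int zp = zeroPoint ? zeroPoint[axisIdx] : 0;
        y[i] = static_cast<float>(static_cast<int>(x[i]) - zp) * scale[axisIdx];
    }
}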

View File

@ -0,0 +1,345 @@
#include "cuda/cuda_common.h"
#include <cub/cub.cuh>
template <typename T, int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__
void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
T *output, const int dimsize,
const int stride,
const uint8_t *inputZeroPoint) {
// len(scale) = len(bias) = dimsize
int tmp = blockIdx.x % stride;
int tid = (blockIdx.x - tmp) * dimsize + tmp;
int remain = dimsize % BLOCK_DIM;
int step = (dimsize - remain) / BLOCK_DIM + 1;
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
static_cast<T>(
inputX[tid + (threadIdx.x * step + ind) * stride] -
inputZeroPoint[threadIdx.x * step + ind]) *
inputScale[threadIdx.x * step + ind];
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid +
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
stride] =
static_cast<T>(
inputX[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride] -
inputZeroPoint[remain * step +
(threadIdx.x - remain) * (step - 1) + ind]) *
inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
ind];
}
}
}
template <typename T, int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__
void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
T *output, const int dimsize,
const int stride) {
// len(scale) = len(bias) = dimsize
int tmp = blockIdx.x % stride;
int tid = (blockIdx.x - tmp) * dimsize + tmp;
int remain = dimsize % BLOCK_DIM;
int step = (dimsize - remain) / BLOCK_DIM + 1;
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
static_cast<T>(
inputX[tid + (threadIdx.x * step + ind) * stride]) *
inputScale[threadIdx.x * step + ind];
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid +
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
stride] =
static_cast<T>(
inputX[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride]) *
inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
ind];
}
}
}
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
__global__ void
warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
T *output, const int dimsize, const int otherSize,
const int stride, const uint8_t *inputZeroPoint) {
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
int remain = dimsize % BLOCK_DIM_x;
int step = (dimsize - remain) / BLOCK_DIM_x + 1;
if (otherIdx < otherSize) {
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
static_cast<T>(
inputX[tid + (threadIdx.x * step + ind) * stride] -
inputZeroPoint[threadIdx.x * step + ind]) *
inputScale[threadIdx.x * step + ind];
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride] =
static_cast<T>(
inputX[tid +
(remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride] -
inputZeroPoint[remain * step +
(threadIdx.x - remain) * (step - 1) +
ind]) *
inputScale[remain * step +
(threadIdx.x - remain) * (step - 1) + ind];
}
}
}
}
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
__global__ void
warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
T *output, const int dimsize, const int otherSize,
const int stride) {
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
int remain = dimsize % BLOCK_DIM_x;
int step = (dimsize - remain) / BLOCK_DIM_x + 1;
if (otherIdx < otherSize) {
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
static_cast<T>(
inputX[tid + (threadIdx.x * step + ind) * stride]) *
inputScale[threadIdx.x * step + ind];
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride] =
static_cast<T>(
inputX[tid +
(remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride]) *
inputScale[remain * step +
(threadIdx.x - remain) * (step - 1) + ind];
}
}
}
}
namespace infini {
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
float *output, const int dimsize, const int stride,
const uint8_t *inputZeroPoint, const int size) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
inputX, inputScale, output, dimsize, stride, inputZeroPoint);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 32, 32>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 16, 64>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 8, 128>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 4, 256>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
}
}
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
float *output, const int dimsize, const int stride,
const int size) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
inputX, inputScale, output, dimsize, stride);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 32, 32><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 16, 64><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 8, 128><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<float, 4, 256><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
}
}
//-------------
void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
half *output, const int dimsize, const int stride,
const uint8_t *inputZeroPoint, const int size) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
inputX, inputScale, output, dimsize, stride, inputZeroPoint);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 32, 32>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 16, 64>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 8, 128>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 4, 256>
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
num_block, stride, inputZeroPoint);
}
}
void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
half *output, const int dimsize, const int stride,
const int size) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
inputX, inputScale, output, dimsize, stride);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 32, 32><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 16, 64><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 8, 128><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpDequantizeLinearKernel<half, 4, 256><<<grid_dim, block_dim>>>(
inputX, inputScale, output, dimsize, num_block, stride);
}
}
} // namespace infini
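Note: every launcher above follows one heuristic: if the quantization axis is longer than 1024, each slice gets its own 1024-thread block (the blockDequantizeLinearKernel path); otherwise a warp-shaped block handles several slices at once, with the x-width shrinking (32/16/8/4) and the y-height growing so each block keeps 1024 threads. A small helper restating that selection (a sketch, not part of the diff) makes the thresholds explicit:

// Sketch of the block-shape selection used by the dequantize launchers.
#include <cstdio>

struct LaunchShape {
    int blockDimX, blockDimY; // blockDimY == 0 means "one block per slice"
};

inline LaunchShape pickShape(int dimsize) {
    if (dimsize > 1024)
        return {1024, 0};
    if (dimsize > 31)
        return {32, 32};
    if (dimsize > 15)
        return {16, 64};
    if (dimsize > 7)
        return {8, 128};
    return {4, 256};
}

int main() {
    int dims[] = {4, 12, 20, 100, 5000};
    for (int d : dims) {
        LaunchShape s = pickShape(d);
        std::printf("dimsize=%4d -> (%d, %d)\n", d, s.blockDimX, s.blockDimY);
    }
}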

View File

@ -0,0 +1,31 @@
#include "operators/dynamic_quantize_linear.h"
#include "cuda/cuda_dynamic_quantize_linear.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
namespace infini {
class DynamicQuantizeLinearCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<DynamicQuantizeLinearObj>(_op);
void *const input = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outputY = (op->getOutput(0)->getRawDataPtr<void *>());
void *const outputYScale = (op->getOutput(1)->getRawDataPtr<void *>());
void *const outputYZeroPoint =
(op->getOutput(2)->getRawDataPtr<void *>());
int size = op->getInputs(0)->size();
dynamicQuantizeLinearKernel((float *)input, (uint8_t *)outputY,
(float *)outputYScale,
(uint8_t *)outputYZeroPoint, size);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::DynamicQuantizeLinear,
DynamicQuantizeLinearCuda, "DynamicQuantizeLinear_CUDA");
}; // namespace infini
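
The three outputs unpacked here are the ONNX DynamicQuantizeLinear results (y, y_scale, y_zero_point). With saturate clamping to [0, 255], the kernel in the next file computes

s = \frac{\max(0,\max_i x_i) - \min(0,\min_i x_i)}{255}, \qquad
z = \mathrm{round}\!\big(\mathrm{saturate}(0 - \min_i x_i / s)\big), \qquad
y_i = \mathrm{saturate}\!\big(\mathrm{round}(x_i / s) + z\big)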

View File

@ -0,0 +1,182 @@
#include <cub/cub.cuh>
#include "cuda/cuda_common.h"
__device__ float _saturate(float x) {
return x < 0.f ? 0.f : (x > 255.0f ? 255.0f : x);
}
template <class T>
__device__ __forceinline__ static T max___(T a, T b) noexcept {
return a > b ? a : b;
}
template <class T>
__device__ __forceinline__ static T min___(T a, T b) noexcept {
return a < b ? a : b;
}
template <int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__
void _dynamicQuantizeLinearKernel(float *input, uint8_t *outputY,
float *yScale, uint8_t *yZeroPoint,
int size) {
int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
float maxData = -__FLT_MAX__;
float minData = __FLT_MAX__;
int remain = size % BLOCK_DIM;
int step = (size - remain) / BLOCK_DIM + 1;
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
maxData = max___(maxData, input[threadIdx.x * step + ind]);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
maxData = max___(maxData,
input[remain * step +
(threadIdx.x - remain) * (step - 1) + ind]);
}
}
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
minData = min___(minData, input[threadIdx.x * step + ind]);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
minData = min___(minData,
input[remain * step +
(threadIdx.x - remain) * (step - 1) + ind]);
}
}
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storageMax;
float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
__shared__ typename BlockReduce::TempStorage temp_storageMin;
float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
//-----
int qmax = 255;
int qmin = 0;
__shared__ float _yZeroPoint;
if (threadIdx.x == 0) {
yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
float intermediate_zero_point = qmin - blockMin / yScale[0];
_yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
}
__syncthreads();
if (i < size) {
outputY[i] = static_cast<uint8_t>(
_saturate(round(input[i] / yScale[0]) + _yZeroPoint));
}
}
//----------
template <int BLOCK_DIM, int numPerThread>
__launch_bounds__(BLOCK_DIM) __global__
void _dynamicQuantizeLinearKernel(float *input, uint8_t *outputY,
float *yScale, uint8_t *yZeroPoint,
int size) {
int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
float maxData = -__FLT_MAX__;
float minData = __FLT_MAX__;
int remain = size % BLOCK_DIM;
int step = (size - remain) / BLOCK_DIM + 1;
float dataPerThread[numPerThread];
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
dataPerThread[ind] = input[threadIdx.x * step + ind];
maxData = max___(maxData, dataPerThread[ind]);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
dataPerThread[ind] =
input[remain * step + (threadIdx.x - remain) * (step - 1) +
ind];
maxData = max___(maxData, dataPerThread[ind]);
}
}
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
minData = min___(minData, dataPerThread[ind]);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
minData = min___(minData, dataPerThread[ind]);
}
}
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storageMax;
float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());
__shared__ typename BlockReduce::TempStorage temp_storageMin;
float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
//-----
int qmax = 255;
int qmin = 0;
__shared__ float _yZeroPoint;
if (threadIdx.x == 0) {
yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
float intermediate_zero_point = qmin - blockMin / yScale[0];
_yZeroPoint = round(_saturate(intermediate_zero_point));
yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
}
__syncthreads();
if (i < size) {
outputY[i] = static_cast<uint8_t>(
_saturate(round(input[i] / yScale[0]) + _yZeroPoint));
}
}
namespace infini {
void dynamicQuantizeLinearKernel(float *input, uint8_t *outputY, float *yScale,
uint8_t *yZeroPoint, int size) {
if (size > 1024 * 128) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else if (size > 1024 * 64) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 128><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else if (size > 1024 * 32) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 64><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else if (size > 1024 * 16) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 32><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else if (size > 1024 * 4) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 16><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else if (size > 1024) {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 4><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
} else {
int BLOCK_DIM = 1024;
int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
_dynamicQuantizeLinearKernel<1024, 1><<<num_blocks, BLOCK_DIM>>>(
input, outputY, yScale, yZeroPoint, size);
}
}
} // namespace infini
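
Note that the per-thread partition in both kernels depends only on threadIdx.x, so every block reduces over the whole input and writes the same yScale[0] / yZeroPoint[0]; the numPerThread parameter of the second variant just sizes the per-thread register buffer that caches values between the max and min passes. A host-side reference of the same arithmetic can be useful for checking the device output; this sketch is illustrative and not part of the file:

// Illustrative host-side reference of the device arithmetic above.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void dynamic_quantize_ref(const std::vector<float> &x, std::vector<uint8_t> &y,
                          float &scale, uint8_t &zero_point) {
    float minv = *std::min_element(x.begin(), x.end());
    float maxv = *std::max_element(x.begin(), x.end());
    scale = (std::max(0.0f, maxv) - std::min(0.0f, minv)) / 255.0f;
    auto saturate = [](float v) { return std::min(255.0f, std::max(0.0f, v)); };
    float zp = std::round(saturate(0.0f - minv / scale));
    zero_point = static_cast<uint8_t>(zp);
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = static_cast<uint8_t>(saturate(std::round(x[i] / scale) + zp));
}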

View File

@ -2,6 +2,7 @@
#include "cuda/cuda_element_wise.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"

namespace infini {
class ElementWiseCudnn : public CudaKernelWithoutConfig {
@ -44,22 +45,21 @@ class ElementWiseCudnn : public CudaKernelWithoutConfig {
        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
        auto cudnnDataType = cudnnDataTypeConvert(op->getDType());

        // get inputs
        checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            aDesc, CUDNN_TENSOR_NCHW, cudnnDataType, a[0], a[1], a[2], a[3]));

        checkCudnnError(cudnnCreateTensorDescriptor(&bDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            bDesc, CUDNN_TENSOR_NCHW, cudnnDataType, b[0], b[1], b[2], b[3]));

        // get outputs
        checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
        checkCudnnError(cudnnSetTensor4dDescriptor(
            cDesc, CUDNN_TENSOR_NCHW, cudnnDataType, c[0], c[1], c[2], c[3]));

        // get op descriptor
        cudnnOpTensorDescriptor_t opDesc;
@ -127,40 +127,33 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));

        const int dType = _op->getDType().getIndex();
        if (op->getOpType() == OpType::Div) {
            div_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
                       b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
        } else if (op->getOpType() == OpType::Add) {
            add_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
                       b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
        } else if (op->getOpType() == OpType::Pow) {
            pow_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
                       b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
        } else if (op->getOpType() == OpType::Less) {
            less_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3],
                        b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
        } else {
            IT_TODO_HALT();
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Add, AddCudnn, "Add_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Sub, SubCudnn, "Sub_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Mul, MulCudnn, "Mul_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Min, MinCudnn, "Min_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Max, MaxCudnn, "Max_cuDNN_CUDA");

REGISTER_KERNEL(Device::CUDA, OpType::Div, ElementWiseCuda, "Div_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Pow, ElementWiseCuda, "Pow_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Less, ElementWiseCuda, "Less_CUDA");
}; // namespace infini

View File

@ -1,4 +1,5 @@
#include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include <math.h>

constexpr unsigned int num_threads() { return 32 * 4; }
@ -129,44 +130,113 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
    }
}

#define CASE(OP, T) \
    _##OP##_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
        a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);

#define SWITCH_DTYPE(OP, DTYPE) \
    switch (DTYPE) { \
    case 1: CASE(OP, 1) break; \
    case 2: CASE(OP, 2) break; \
    case 3: CASE(OP, 3) break; \
    case 4: CASE(OP, 4) break; \
    case 5: CASE(OP, 5) break; \
    case 6: CASE(OP, 6) break; \
    case 7: CASE(OP, 7) break; \
    case 10: CASE(OP, 10) break; \
    case 11: CASE(OP, 11) break; \
    case 12: CASE(OP, 12) break; \
    case 13: CASE(OP, 13) break; \
    case 16: CASE(OP, 16) break; \
    default: IT_TODO_HALT(); \
    }

namespace infini {
void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    SWITCH_DTYPE(div, dType)
}
void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    SWITCH_DTYPE(add, dType)
}
void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    if (dType == 1) {
        _pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
                                                    b1, b2, b3, c0, c1, c2, c3);
    } else if (dType == 3) {
        _pow_kernel<int8_t><<<gridsize, blocksize>>>(
            a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
    } else if (dType == 10) {
        int a_size = a0 * a1 * a2 * a3;
        int b_size = b0 * b1 * b2 * b3;
        int c_size = c0 * c1 * c2 * c3;
        vector<float> a_float(a_size);
        vector<float> b_float(b_size);
        vector<float> c_float(c_size);
        for (int i = 0; i < a_size; ++i) {
            a_float[i] = __half2float(((half *)a)[i]);
        }
        for (int i = 0; i < b_size; ++i) {
            b_float[i] = __half2float(((half *)b)[i]);
        }
        _pow_kernel<float><<<gridsize, blocksize>>>(
            a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3, b0,
            b1, b2, b3, c0, c1, c2, c3);
        for (int i = 0; i < c_size; ++i) {
            ((half *)c)[i] = __float2half(c_float[i]);
        }
    } else {
        IT_TODO_HALT();
    }
}
void less_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                 int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
                 int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    SWITCH_DTYPE(less, dType)
}
}; // namespace infini
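
The CASE/SWITCH_DTYPE pair dispatches on the dtype index now passed down from the operator layer; judging from the pow_kernel branches, index 1 is Float32, 3 is Int8 and 10 is Float16, and DT_CUDA<T>::t maps an index back to the matching CUDA scalar type. Under that assumption, CASE(add, 10) expands to roughly:

// Illustrative expansion only; DT_CUDA<10>::t is assumed to resolve to half.
_add_kernel<DT_CUDA<10>::t><<<gridsize, blocksize>>>(
    a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);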

View File

@ -25,12 +25,12 @@ class ExpandCuda : public CudaKernelWithoutConfig {
            inputShape.data[i] = in_Shape[i];
            outputsize *= out_Shape[i];
        }
        const int dType = op->getDType().getIndex();
        expandKernel(dType, inputData, outputData, nDims, outputsize,
                     inputShape, outputShape);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Expand, ExpandCuda, "Expand_CUDA");
}; // namespace infini

View File

@ -1,12 +1,14 @@
#include "core/common.h"
#include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include "utils/small_array.h"

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

template <class T>
__global__ void _expandKernel(void *input, void *output, int nDims,
                              int outputsize, infini::SmallArray inputShape,
                              infini::SmallArray outputShape) {
@ -33,17 +35,64 @@ __global__ void _expandKernel(float *input, float *output, int nDims,
            temp *= inputShape.data[i];
            v = v / outputShape.data[i];
        }
        ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
    }
}

namespace infini {

#define CASE(T) \
    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
        input, output, nDims, outputsize, inputShape, outputShape);

#define SWITCH_DTYPE(DTYPE) \
    switch (DTYPE) { \
    case 1: CASE(1) break; \
    case 2: CASE(2) break; \
    case 3: CASE(3) break; \
    case 4: CASE(4) break; \
    case 5: CASE(5) break; \
    case 6: CASE(6) break; \
    case 7: CASE(7) break; \
    case 10: CASE(10) break; \
    case 11: CASE(11) break; \
    case 12: CASE(12) break; \
    case 13: CASE(13) break; \
    case 16: CASE(16) break; \
    default: IT_TODO_HALT(); \
    }

void expandKernel(int dType, void *input, void *output, int nDims,
                  int outputsize, SmallArray inputShape,
                  SmallArray outputShape) {
    int blocksize = block_work_size();
    int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
    SWITCH_DTYPE(dType)
}

} // namespace infini
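
The part of _expandKernel outside this hunk walks the flat output index from the innermost dimension outwards and drops the coordinate wherever the input extent is 1, i.e. the usual broadcast mapping. A hedged host-side equivalent (illustrative, not taken verbatim from the kernel):

// Illustrative reference of the broadcast index mapping: dimensions of
// size 1 in the input repeat along that axis.
int broadcast_index(int outIdx, int nDims, const int *inShape,
                    const int *outShape) {
    int inIdx = 0, stride = 1;
    for (int i = nDims - 1; i >= 0; --i) {
        int coord = outIdx % outShape[i];
        inIdx += (coord % inShape[i]) * stride; // inShape[i] == 1 -> adds 0
        stride *= inShape[i];
        outIdx /= outShape[i];
    }
    return inIdx;
}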

View File

@ -22,6 +22,5 @@ class ExtendCuda : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Extend, ExtendCuda, "Extend_CUDA");
} // namespace infini

View File

@ -15,12 +15,26 @@ class GatherCuda : public CudaKernelWithoutConfig {
        GatherMetaData metaData;
        initGatherMetaData(metaData, op);

        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

        if (op->getDType() == DataType::Float32) {
            gather_kernel<float>((float *)inputData, (float *)outputData,
                                 metaData, op->getOutput()->size());
        } else if (op->getDType() == DataType::Float16) {
            gather_kernel<half>((half *)inputData, (half *)outputData, metaData,
                                op->getOutput()->size());
        } else if (op->getDType() == DataType::Int8) {
            gather_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData,
                                  metaData, op->getOutput()->size());
        } else if (op->getDType() == DataType::UInt8) {
            gather_kernel<uint8_t>((uint8_t *)inputData, (uint8_t *)outputData,
                                   metaData, op->getOutput()->size());
        } else {
            IT_ASSERT(false);
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Gather, GatherCuda, "Gather_CUDA");
} // namespace infini

View File

@ -28,27 +28,34 @@ __device__ T gatheredOffset2Offset(int gOffset,
    return offset;
}

template <typename dataT, typename T>
__global__ void _gather_kernel(dataT *in, dataT *out,
                               infini::GatherMetaData metaData, size_t num) {
    T tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < num) {
        T offset = gatheredOffset2Offset<T>(tid, metaData);
        out[tid] = in[offset];
    }
}

namespace infini {
template <typename T>
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num) {
    int blockSize = 32 * 16;
    int gridSize = (num + blockSize - 1) / blockSize;
    if (metaData.indexType == DataType::Int64) {
        _gather_kernel<T, int64_t>
            <<<gridSize, blockSize>>>(in, out, metaData, num);
    } else {
        _gather_kernel<T, int><<<gridSize, blockSize>>>(in, out, metaData, num);
    }
}

template void gather_kernel<float>(float *in, float *out,
                                   GatherMetaData metaData, size_t num);
template void gather_kernel<half>(half *in, half *out, GatherMetaData metaData,
                                  size_t num);
template void gather_kernel<int8_t>(int8_t *in, int8_t *out,
                                    GatherMetaData metaData, size_t num);
template void gather_kernel<uint8_t>(uint8_t *in, uint8_t *out,
                                     GatherMetaData metaData, size_t num);
} // namespace infini
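
gather_kernel<T> is now a template defined in this translation unit but called from gather.cc, so the explicit instantiations at the end are what keep the four supported element types linkable; the index type (int64 vs int32) is still chosen at runtime from metaData.indexType. A typical call site is the Float16 branch of gather.cc above:

// From gather.cc (Float16 case):
gather_kernel<half>((half *)inputData, (half *)outputData, metaData,
                    op->getOutput()->size());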

View File

@ -21,8 +21,7 @@ class GatherElementsCuda : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::GatherElements, GatherElementsCuda,
                "GatherELements_CUDA");
} // namespace infini

View File

@ -24,22 +24,39 @@ class LayerNormCuda : public CudaKernelWithoutConfig {
        int dimsize = dims[op->getAxis()];
        int size = op->getOutput(0)->size();
        int scaleSize = op->getInputs(1)->size();
        if (op->getDType() == DataType::Float32) {
            if (op->numInputs() == 3) {
                void *const biasData =
                    (op->getInputs(2)->getRawDataPtr<void *>());
                int biasSize = op->getInputs(2)->size();
                // printf("kernel bias:true:%d\n", 1);
                LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
                              scaleSize, dimsize, stride, (float *)outputData,
                              (float *)biasData, biasSize);
            } else {
                // printf("kernel bias:false:%d\n", 0);
                LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
                              scaleSize, dimsize, stride, (float *)outputData);
            }
        } else if (op->getDType() == DataType::Float16) {
            if (op->numInputs() == 3) {
                void *const biasData =
                    (op->getInputs(2)->getRawDataPtr<void *>());
                int biasSize = op->getInputs(2)->size();
                // printf("kernel bias:true:%d\n", 1);
                LaynormKernel((half *)inputData, (half *)scaleData, eps, size,
                              scaleSize, dimsize, stride, (half *)outputData,
                              (half *)biasData, biasSize);
            } else {
                // printf("kernel bias:false:%d\n", 0);
                LaynormKernel((half *)inputData, (half *)scaleData, eps, size,
                              scaleSize, dimsize, stride, (half *)outputData);
            }
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::LayerNormalization, LayerNormCuda,
                "LayerNorm_CUDA");
}; // namespace infini
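
Both dtype branches evaluate the same LayerNormalization expression over the axis of length dimsize, with scale (and the optional bias) read per element when its size equals dimsize and broadcast from a single value otherwise:

\mu = \frac{1}{\text{dimsize}} \sum_j x_j, \qquad
\sigma^2 = \frac{1}{\text{dimsize}} \sum_j (x_j - \mu)^2, \qquad
y_j = \gamma_j \, \frac{x_j - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta_j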

View File

@ -1,43 +1,41 @@
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include <cub/cub.cuh> #include <cub/cub.cuh>
template <int BLOCK_DIM> template <typename T, int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__ __launch_bounds__(BLOCK_DIM) __global__
void blockLaynormKernel(const float *input, const float *scale, void blockLaynormKernel(const T *input, const T *scale, const int dimsize,
const int dimsize, const int stride, float *output, const int stride, T *output, const T eps,
const float eps, int scaleSize, const float *bias, int scaleSize, const T *bias, int biasSize) {
int biasSize) {
// len(scale) = len(bias) = dimsize // len(scale) = len(bias) = dimsize
int tmp = blockIdx.x % stride; int tmp = blockIdx.x % stride;
int tid = (blockIdx.x - tmp) * dimsize + tmp; int tid = (blockIdx.x - tmp) * dimsize + tmp;
float muPartial = 0.0f; T muPartial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride]; muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
} }
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce; typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ float mu; __shared__ T mu;
float muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum()); T muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
if (threadIdx.x == if (threadIdx.x ==
0) { // must set threadIdx.x = 0 write the output to memory 0) { // must set threadIdx.x = 0 write the output to memory
mu = muBlock / dimsize; mu = muBlock * static_cast<T>(__fdividef(1.0F, dimsize));
} }
__syncthreads(); __syncthreads();
float sigma2Partial = 0.0f; T sigma2Partial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
sigma2Partial += sigma2Partial +=
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) * (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu); (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu);
} }
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce; typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
__shared__ float sigma2; __shared__ T sigma2;
float sigma2Block = T sigma2Block = BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
if (threadIdx.x == if (threadIdx.x ==
0) { // must set threadIdx.x = 0 write the output to memory 0) { // must set threadIdx.x = 0 write the output to memory
sigma2 = sigma2Block / dimsize; sigma2 = sigma2Block * static_cast<T>(__fdividef(1.0F, dimsize));
} }
__syncthreads(); __syncthreads();
if (biasSize == dimsize) { if (biasSize == dimsize) {
@ -47,8 +45,9 @@ __launch_bounds__(BLOCK_DIM) __global__
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[threadIdx.x + ph * BLOCK_DIM] * scale[threadIdx.x + ph * BLOCK_DIM] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
mu) / mu) *
sqrt(sigma2 + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
bias[threadIdx.x + ph * BLOCK_DIM]; bias[threadIdx.x + ph * BLOCK_DIM];
} }
} else { } else {
@ -57,8 +56,9 @@ __launch_bounds__(BLOCK_DIM) __global__
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[0] * scale[0] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
mu) / mu) *
sqrt(sigma2 + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
bias[threadIdx.x + ph * BLOCK_DIM]; bias[threadIdx.x + ph * BLOCK_DIM];
} }
} }
@ -69,8 +69,9 @@ __launch_bounds__(BLOCK_DIM) __global__
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[threadIdx.x + ph * BLOCK_DIM] * scale[threadIdx.x + ph * BLOCK_DIM] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
mu) / mu) *
sqrt(sigma2 + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
bias[0]; bias[0];
} }
} else { } else {
@ -79,50 +80,50 @@ __launch_bounds__(BLOCK_DIM) __global__
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[0] * scale[0] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
mu) / mu) *
sqrt(sigma2 + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
bias[0]; bias[0];
} }
} }
} }
} }
//----------------- //-----------------
template <int BLOCK_DIM> template <typename T, int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__ __launch_bounds__(BLOCK_DIM) __global__
void blockLaynormKernel(const float *input, const float *scale, void blockLaynormKernel(const T *input, const T *scale, const int dimsize,
const int dimsize, const int stride, float *output, const int stride, T *output, const T eps,
const float eps, int scaleSize) { int scaleSize) {
// len(scale) = len(bias) = dimsize // len(scale) = len(bias) = dimsize
int tmp = blockIdx.x % stride; int tmp = blockIdx.x % stride;
int tid = (blockIdx.x - tmp) * dimsize + tmp; int tid = (blockIdx.x - tmp) * dimsize + tmp;
float muPartial = 0.0f; T muPartial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride]; muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
} }
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce; typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ float mu; __shared__ T mu;
float muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum()); T muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
if (threadIdx.x == if (threadIdx.x ==
0) { // must set threadIdx.x = 0 write the output to memory 0) { // must set threadIdx.x = 0 write the output to memory
mu = muBlock / dimsize; mu = muBlock * static_cast<T>(__fdividef(1.0F, dimsize));
} }
__syncthreads(); __syncthreads();
float sigma2Partial = 0.0f; T sigma2Partial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
sigma2Partial += sigma2Partial +=
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) * (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu); (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu);
} }
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce; typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
__shared__ float sigma2; __shared__ T sigma2;
float sigma2Block = T sigma2Block = BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
if (threadIdx.x == if (threadIdx.x ==
0) { // must set threadIdx.x = 0 write the output to memory 0) { // must set threadIdx.x = 0 write the output to memory
sigma2 = sigma2Block / dimsize; sigma2 = sigma2Block * static_cast<T>(__fdividef(1.0F, dimsize));
} }
__syncthreads(); __syncthreads();
if (scaleSize == dimsize) { if (scaleSize == dimsize) {
@ -130,16 +131,18 @@ __launch_bounds__(BLOCK_DIM) __global__
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[threadIdx.x + ph * BLOCK_DIM] * scale[threadIdx.x + ph * BLOCK_DIM] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) / (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
sqrt(sigma2 + eps); static_cast<T>(
__fdividef(1.0F, sqrt(static_cast<float>(sigma2 + eps))));
} }
} else { } else {
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
scale[0] * scale[0] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) / (input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
sqrt(sigma2 + eps); static_cast<T>(
__fdividef(1.0F, sqrt(static_cast<float>(sigma2 + eps))));
} }
} }
} }
@ -158,33 +161,33 @@ __inline__ __device__ T WarpAllReduce(T val) {
} }
return val; return val;
} }
template <int BLOCK_DIM_x, int BLOCK_DIM_y> template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
__global__ void warpLaynormKernel(const float *input, const float *scale, __global__ void warpLaynormKernel(const T *input, const T *scale,
const int dimsize, const int stride, const int dimsize, const int stride,
float *output, const float eps, int scaleSize, T *output, const T eps, int scaleSize,
int otherSize, const float *bias, int otherSize, const T *bias, int biasSize) {
int biasSize) {
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y; int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize; int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
if (otherIdx < otherSize) { if (otherIdx < otherSize) {
__shared__ float muTotal[BLOCK_DIM_y]; __shared__ T muTotal[BLOCK_DIM_y];
__shared__ float sigma2Total[BLOCK_DIM_y]; __shared__ T sigma2Total[BLOCK_DIM_y];
float muPartial = 0.0f; T muPartial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride]; muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
} }
muPartial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(muPartial); muPartial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(muPartial);
if (threadIdx.x == 0) if (threadIdx.x == 0)
muTotal[threadIdx.y] = muPartial / dimsize; muTotal[threadIdx.y] =
muPartial * static_cast<T>(__fdividef(1.0F, dimsize));
//-------------------------------------------- //--------------------------------------------
float sigma2Partial = 0.0f; T sigma2Partial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
sigma2Partial += sigma2Partial +=
@ -194,10 +197,11 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
muTotal[threadIdx.y]); muTotal[threadIdx.y]);
} }
sigma2Partial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sigma2Partial); sigma2Partial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(sigma2Partial);
if (threadIdx.x == 0) if (threadIdx.x == 0)
sigma2Total[threadIdx.y] = sigma2Partial / dimsize; sigma2Total[threadIdx.y] =
sigma2Partial * static_cast<T>(__fdividef(1.0F, dimsize));
//-------------------------------------------- //--------------------------------------------
if (biasSize == dimsize) { if (biasSize == dimsize) {
@ -209,8 +213,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
scale[threadIdx.x + ph * BLOCK_DIM_x] * scale[threadIdx.x + ph * BLOCK_DIM_x] *
(input[tid + (input[tid +
(threadIdx.x + ph * BLOCK_DIM_x) * stride] - (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps)))) +
bias[threadIdx.x + ph * BLOCK_DIM_x]; bias[threadIdx.x + ph * BLOCK_DIM_x];
} }
} else { } else {
@ -221,8 +227,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
scale[0] * scale[0] *
(input[tid + (input[tid +
(threadIdx.x + ph * BLOCK_DIM_x) * stride] - (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps)))) +
bias[threadIdx.x + ph * BLOCK_DIM_x]; bias[threadIdx.x + ph * BLOCK_DIM_x];
} }
} }
@ -235,8 +243,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
scale[threadIdx.x + ph * BLOCK_DIM_x] * scale[threadIdx.x + ph * BLOCK_DIM_x] *
(input[tid + (input[tid +
(threadIdx.x + ph * BLOCK_DIM_x) * stride] - (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps)))) +
bias[0]; bias[0];
} }
} else { } else {
@ -247,40 +257,43 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
scale[0] * scale[0] *
(input[tid + (input[tid +
(threadIdx.x + ph * BLOCK_DIM_x) * stride] - (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps) + static_cast<T>(__fdividef(
1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps)))) +
bias[0]; bias[0];
} }
} }
} }
} }
} }
template <int BLOCK_DIM_x, int BLOCK_DIM_y> template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
__global__ void warpLaynormKernel(const float *input, const float *scale, __global__ void warpLaynormKernel(const T *input, const T *scale,
const int dimsize, const int stride, const int dimsize, const int stride,
float *output, const float eps, int scaleSize, T *output, const T eps, int scaleSize,
int otherSize) { int otherSize) {
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y; int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize; int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
if (otherIdx < otherSize) { if (otherIdx < otherSize) {
__shared__ float muTotal[BLOCK_DIM_y]; __shared__ T muTotal[BLOCK_DIM_y];
__shared__ float sigma2Total[BLOCK_DIM_y]; __shared__ T sigma2Total[BLOCK_DIM_y];
float muPartial = 0.0f; T muPartial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride]; muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
} }
muPartial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(muPartial); muPartial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(muPartial);
if (threadIdx.x == 0) if (threadIdx.x == 0)
muTotal[threadIdx.y] = muPartial / dimsize; muTotal[threadIdx.y] =
muPartial * static_cast<T>(__fdividef(1.0F, dimsize));
//-------------------------------------------- //--------------------------------------------
float sigma2Partial = 0.0f; T sigma2Partial = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
sigma2Partial += sigma2Partial +=
@ -290,10 +303,11 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
muTotal[threadIdx.y]); muTotal[threadIdx.y]);
} }
sigma2Partial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sigma2Partial); sigma2Partial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(sigma2Partial);
if (threadIdx.x == 0) if (threadIdx.x == 0)
sigma2Total[threadIdx.y] = sigma2Partial / dimsize; sigma2Total[threadIdx.y] =
sigma2Partial * static_cast<T>(__fdividef(1.0F, dimsize));
//-------------------------------------------- //--------------------------------------------
if (scaleSize == dimsize) { if (scaleSize == dimsize) {
@ -302,8 +316,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
scale[threadIdx.x + ph * BLOCK_DIM_x] * scale[threadIdx.x + ph * BLOCK_DIM_x] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps); static_cast<T>(
__fdividef(1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps))));
} }
} else { } else {
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) { for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
@ -311,8 +327,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] = output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
scale[0] * scale[0] *
(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] - (input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
muTotal[threadIdx.y]) / muTotal[threadIdx.y]) *
sqrt(sigma2Total[threadIdx.y] + eps); static_cast<T>(
__fdividef(1.0F, sqrt(static_cast<float>(
sigma2Total[threadIdx.y] + eps))));
} }
} }
} }
@ -325,7 +343,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
if (dimsize > 1024) { if (dimsize > 1024) {
int BLOCK_DIM = 1024; int BLOCK_DIM = 1024;
blockLaynormKernel<1024> blockLaynormKernel<float, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output, <<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize); eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) { } else if (dimsize > 31) {
@ -335,7 +353,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<32, 32><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block, input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize); bias, biasSize);
} else if (dimsize > 15) { } else if (dimsize > 15) {
@ -345,7 +363,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<16, 64><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block, input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize); bias, biasSize);
} else if (dimsize > 7) { } else if (dimsize > 7) {
@ -355,7 +373,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<8, 128><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block, input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize); bias, biasSize);
} else { } else {
@ -365,7 +383,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<4, 256><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block, input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize); bias, biasSize);
} }
@ -378,7 +396,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
if (dimsize > 1024) { if (dimsize > 1024) {
int BLOCK_DIM = 1024; int BLOCK_DIM = 1024;
blockLaynormKernel<1024><<<num_block, BLOCK_DIM>>>( blockLaynormKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize); input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) { } else if (dimsize > 31) {
int BLOCK_DIM_x = 32; int BLOCK_DIM_x = 32;
@ -387,7 +405,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<32, 32><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block); input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) { } else if (dimsize > 15) {
int BLOCK_DIM_x = 16; int BLOCK_DIM_x = 16;
@ -396,7 +414,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<16, 64><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block); input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) { } else if (dimsize > 7) {
int BLOCK_DIM_x = 8; int BLOCK_DIM_x = 8;
@ -405,7 +423,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<8, 128><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block); input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else { } else {
int BLOCK_DIM_x = 4; int BLOCK_DIM_x = 4;
@ -414,7 +432,108 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1); dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1); dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<4, 256><<<grid_dim, block_dim>>>( warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
}
}
//-----------------
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output, const half *bias, int biasSize) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024>
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
eps, scaleSize, bias, biasSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
bias, biasSize);
}
}
void LaynormKernel(const half *input, const half *scale, const half eps,
int size, int scaleSize, const int dimsize, const int stride,
half *output) {
int num_block = size / dimsize;
if (dimsize > 1024) {
int BLOCK_DIM = 1024;
blockLaynormKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
input, scale, dimsize, stride, output, eps, scaleSize);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
input, scale, dimsize, stride, output, eps, scaleSize, num_block); input, scale, dimsize, stride, output, eps, scaleSize, num_block);
} }
} }
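
Throughout the templated kernels above, the float version's divisions are replaced by multiplication with __fdividef(1.0F, ...) evaluated in float and cast back to T, presumably to avoid half-precision division and the missing half overload of sqrt. A condensed sketch of the recurring normalization step (illustrative helper, not part of the file):

// Illustrative sketch of the per-element step the kernels perform (T = float or half).
template <typename T>
__device__ T normalize_one(T x, T mu, T sigma2, T eps, T scale, T bias) {
    T inv_std = static_cast<T>(
        __fdividef(1.0F, sqrt(static_cast<float>(sigma2 + eps))));
    return scale * (x - mu) * inv_std + bias;
}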

View File

@ -2,6 +2,7 @@
#include "core/kernel.h"
#include "cuda/cuda_expand.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "utils/small_array.h"

namespace infini {
@ -48,11 +49,12 @@ class matmulCublas : public Kernel {
        auto opB = op->getTransB() ? CUBLAS_OP_T : CUBLAS_OP_N;
        const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n,
                  ldc = n;
        float alpha_naive = 1.f, beta_naive = 0.f;
        auto dataType = op->getDType();
        auto cuDataType = cublasDataTypeConvert(dataType);
        IT_ASSERT(cuDataType != CUDA_R_8I, "matmul don't support int8 dtype.");
        if (op->numInputs() == 3) { // have bias
            beta_naive = 1.f;
            auto inC = op->getInputs(2);
            auto out = op->getOutput();
            SmallArray inputShape, outputShape;
@ -69,8 +71,9 @@ class matmulCublas : public Kernel {
                if (i >= offset)
                    inputShape.data[i] = inC->getDims()[i - offset];
            }
            const int dType = dataType.getIndex();
            expandKernel(dType, inC->getRawDataPtr<void *>(),
                         out->getRawDataPtr<void *>(), nDims, outputsize,
                         inputShape, outputShape);
        }
        // TODO:use compute type
@ -89,16 +92,38 @@ class matmulCublas : public Kernel {
                 (dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
                    ? 0 // Broadcast the batch dimension if batch size is 1
                    : n * k;
            if (dataType == DataType::Float16) {
                half alpha_half = static_cast<half>(alpha_naive);
                half beta_half = static_cast<half>(beta_naive);
                stat = cublasGemmStridedBatchedEx(
                    context->cublasHandle(), opB, opA, n, m, k, &alpha_half,
                    inBData, cuDataType, ldb, strideB, inAData, cuDataType, lda,
                    strideA, &beta_half, outData, cuDataType, ldc, m * n, b,
                    cuDataType, (cublasGemmAlgo_t)record->algo);
            } else {
                stat = cublasGemmStridedBatchedEx(
                    context->cublasHandle(), opB, opA, n, m, k, &alpha_naive,
                    inBData, cuDataType, ldb, strideB, inAData, cuDataType, lda,
                    strideA, &beta_naive, outData, cuDataType, ldc, m * n, b,
                    cuDataType, (cublasGemmAlgo_t)record->algo);
            }
        } else {
            if (dataType == DataType::Float16) {
                half alpha_half = static_cast<half>(alpha_naive);
                half beta_half = static_cast<half>(beta_naive);
                stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k,
                                    &alpha_half, inBData, cuDataType, ldb,
                                    inAData, cuDataType, lda, &beta_half,
                                    outData, cuDataType, ldc, cuDataType,
                                    (cublasGemmAlgo_t)record->algo);
            } else {
                stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k,
                                    &alpha_naive, inBData, cuDataType, ldb,
                                    inAData, cuDataType, lda, &beta_naive,
                                    outData, cuDataType, ldc, cuDataType,
                                    (cublasGemmAlgo_t)record->algo);
            }
        }
        // if (stat != CUBLAS_STATUS_SUCCESS)
        //     cout << cublasGetErrorString(stat);
@ -140,8 +165,9 @@ class matmulCublas : public Kernel {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::MatMul, matmulCublas,
                "Matmul_cuBLAS_CUDA");

REGISTER_CONSTRUCTOR(2, MatmulCublasPerfRecordObj::from_json);
}; // namespace infini
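
Two details help when reading the cuBLAS calls: the operands are passed as (opB, opA, n, m, k) with B first because cuBLAS is column-major, so the row-major product is recovered from the transpose identity below; and since the compute type handed to cublasGemmEx / cublasGemmStridedBatchedEx is the same cuDataType as the operands, alpha and beta must be supplied in that type too, which is why the Float16 branches build half scalars instead of reusing the float ones.

C = A B \;\Longleftrightarrow\; C^{\top} = B^{\top} A^{\top}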

View File

@ -0,0 +1,109 @@
#include "operators/matmul_integer.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_matmul_integer.h"
#include "cuda/cuda_runtime.h"
#include "utils/small_array.h"
#include <thrust/transform.h>
namespace infini {
class matmulIntegerCublas : public CudaKernelWithoutConfig {
bool do_compute(const Operator &_op, const RuntimeObj *_context) const {
auto op = as<MatmulIntegerObj>(_op);
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
void *const inAData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const inBData = (op->getInputs(1)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
const auto [b, m, n, k] = op->getBMNK();
if (op->numInputs() >= 3) { // have a_zero_point
int aZeroSize = op->getInputs(2)->size();
int aSize = op->getInputs(0)->size();
void *const aZeroPointData =
(op->getInputs(2)->getRawDataPtr<void *>());
if (op->getInputs(0)->getDType() == DataType::Int8) {
if (aZeroSize > 1) {
subA_kernel(DataType::Int8.getIndex(), inAData,
aZeroPointData, aSize, k, 1);
} else {
subA_kernel(DataType::Int8.getIndex(), inAData,
aZeroPointData, aSize, k, 0);
}
}
if (op->getInputs(0)->getDType() == DataType::UInt8) {
if (aZeroSize > 1) {
subA_kernel(DataType::UInt8.getIndex(), inAData,
aZeroPointData, aSize, k, 1);
} else {
subA_kernel(DataType::UInt8.getIndex(), inAData,
aZeroPointData, aSize, k, 0);
}
}
}
if (op->numInputs() == 4) { // have b_zero_point
int bZeroSize = op->getInputs(3)->size();
int bSize = op->getInputs(1)->size();
void *const bZeroPointData =
(op->getInputs(3)->getRawDataPtr<void *>());
if (op->getInputs(1)->getDType() == DataType::Int8) {
if (bZeroSize > 1) {
subB_kernel(DataType::Int8.getIndex(), inBData,
bZeroPointData, bSize, k, n, 1);
} else {
subB_kernel(DataType::Int8.getIndex(), inBData,
bZeroPointData, bSize, k, n, 0);
}
}
if (op->getInputs(1)->getDType() == DataType::UInt8) {
if (bZeroSize > 1) {
subB_kernel(DataType::UInt8.getIndex(), inBData,
bZeroPointData, bSize, k, n, 1);
} else {
subB_kernel(DataType::UInt8.getIndex(), inBData,
bZeroPointData, bSize, k, n, 0);
}
}
}
int lda = k, ldb = n, ldc = n;
int32_t alpha = 1, beta = 0;
// TODO:use compute type
cublasStatus_t stat;
if (b > 1) {
// Support batch broadcast with zero stride
int dimA = op->getInputs(0)->getRank();
int dimB = op->getInputs(1)->getRank();
long long strideA =
(dimA == 2 ||
(dimA == 3 && op->getInputs(0)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: m * k;
long long strideB =
(dimB == 2 ||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
? 0 // Broadcast the batch dimension if batch size is 1
: n * k;
stat = cublasGemmStridedBatchedEx(
context->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, inBData, CUDA_R_8I, ldb, strideB, inAData, CUDA_R_8I,
lda, strideA, &beta, outData, CUDA_R_32I, ldc, m * n, b,
CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
} else {
stat = cublasGemmEx(
context->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, inBData, CUDA_R_8I, ldb, inAData, CUDA_R_8I, lda, &beta,
outData, CUDA_R_32I, ldc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
return (stat == CUBLAS_STATUS_SUCCESS);
}
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
IT_ASSERT(do_compute(_op, _context));
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MatMulInteger, matmulIntegerCublas,
"MatmulInteger_cuBLAS_CUDA");
}; // namespace infini
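For context, this kernel implements ONNX MatMulInteger semantics: the zero points are subtracted from A and B (the subA_kernel / subB_kernel launches above do this in place) and the product is accumulated in int32. A scalar reference for one output element, assuming per-tensor zero points (hypothetical helper, not project code):

#include <cstdint>
// Reference: Y[row][col] = sum_p (A[row][p] - a_zp) * (B[p][col] - b_zp)
int32_t matmul_integer_ref(const int8_t *A, const int8_t *B, int K, int N,
                           int row, int col, int32_t a_zp, int32_t b_zp) {
    int32_t acc = 0;
    for (int p = 0; p < K; ++p)
        acc += (int32_t(A[row * K + p]) - a_zp) *
               (int32_t(B[p * N + col]) - b_zp);
    return acc;
}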
View File
@ -0,0 +1,75 @@
#include "cuda/cuda_common.h"
constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _subA_kernel(void *a, void *b, int size, int k, int delta) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride) {
int j = delta * (i - i % k) / k;
((int8_t *)a)[i] = ((int8_t *)a)[i] - ((int8_t *)b)[j];
}
}
__global__ void _subA_u8_kernel(void *a, void *b, int size, int k, int delta) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride) {
int j = delta * (i - i % k) / k;
auto aData = static_cast<int16_t>(((uint8_t *)a)[i]);
auto bData = static_cast<int16_t>(((uint8_t *)b)[j]);
((int8_t *)a)[i] = static_cast<int8_t>(aData - bData);
}
}
__global__ void _subB_kernel(void *a, void *b, int size, int k, int n,
int delta) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride) {
int j = delta * ((i / k) + (i % n));
((int8_t *)a)[i] = ((int8_t *)a)[i] - ((int8_t *)b)[j];
}
}
__global__ void _subB_u8_kernel(void *a, void *b, int size, int k, int n,
int delta) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride) {
int j = delta * ((i / k) + (i % n));
auto aData = static_cast<int16_t>(((uint8_t *)a)[i]);
auto bData = static_cast<int16_t>(((uint8_t *)b)[j]);
((int8_t *)a)[i] = static_cast<int8_t>(aData - bData);
}
}
namespace infini {
void subA_kernel(int dType, void *a, void *b, int size, int k, int delta) {
int blocksize = block_work_size();
int gridsize = (size + block_work_size() - 1) / block_work_size();
if (dType == 3) {
_subA_kernel<<<gridsize, blocksize>>>(a, b, size, k, delta);
} else if (dType == 2) {
_subA_u8_kernel<<<gridsize, blocksize>>>(a, b, size, k, delta);
} else {
IT_TODO_HALT();
}
}
void subB_kernel(int dType, void *a, void *b, int size, int k, int n,
int delta) {
int blocksize = block_work_size();
int gridsize = (size + block_work_size() - 1) / block_work_size();
if (dType == 3) {
_subB_kernel<<<gridsize, blocksize>>>(a, b, size, k, n, delta);
} else if (dType == 2) {
_subB_u8_kernel<<<gridsize, blocksize>>>(a, b, size, k, n, delta);
} else {
IT_TODO_HALT();
}
}
}; // namespace infini
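Note that the uint8 variants widen to int16 before subtracting and then store the result back as int8: after removing the zero point, an unsigned activation such as 200 with zero point 128 becomes 200 - 128 = 72, which is representable in int8 and can therefore feed the CUDA_R_8I GEMM used by the MatMulInteger kernel earlier in this diff. The launch configuration follows the grid-stride convention used elsewhere in these kernels: block_work_size() = 4 * 128 = 512 elements per block of 512 threads, so a tensor of, say, 10000 elements is covered by (10000 + 511) / 512 = 20 blocks, with the in-kernel stride loop absorbing any remainder.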
View File
@ -229,9 +229,8 @@ class MemboundTVMExtractSource : public Kernel {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, MemboundTVMExtractSource,
"Memobund_TVM_Ansor_extract_source");
}; // namespace infini
#endif
View File
@ -216,9 +216,9 @@ class MemboundTVMPackedFunction : public Kernel {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, MemboundTVMPackedFunction,
"Memobund_TVM_Ansor_packed_funciton");
}; // namespace infini
#endif
View File
@ -39,10 +39,8 @@ class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Slice, SliceCuda, "Slice__CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Pad, PadCuda, "Pad__CUDA");
} // namespace infini
View File
@ -1,6 +1,7 @@
#include "core/data_type.h" #include "core/data_type.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_pad_slice.h" #include "cuda/cuda_pad_slice.h"
#include "cuda/cuda_utility.h"
__device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset, __device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
TransMetaData metaData, TransMetaData metaData,
@ -21,39 +22,83 @@ __device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
}
template <typename T>
__global__ void _pad_slice_kernel(void *part, void *whole,
TransMetaData metaData, int nDims, int num,
bool isPad) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= num) {
return;
}
int stride = blockDim.x * gridDim.x;
while (tid < num) {
int offset = WholeTensorOffset2PartTensorOffset(tid, metaData, nDims);
if (isPad) {
if (offset < 0) {
((T *)whole)[tid] = 0;
} else {
((T *)whole)[tid] = ((T *)part)[offset];
}
} else if (offset >= 0) {
((T *)part)[offset] = ((T *)whole)[tid];
}
tid += stride;
}
}
namespace infini {
#define CASE(T) \
_pad_slice_kernel<DT_CUDA<T>::t><<<gridSize, blockSize>>>( \
partData, wholeData, metadata, nDims, num, isPad);
#define SWITCH_DTYPE(DTYPE) \
switch (DTYPE) { \
case 1: \
CASE(1) \
break; \
case 2: \
CASE(2) \
break; \
case 3: \
CASE(3) \
break; \
case 4: \
CASE(4) \
break; \
case 5: \
CASE(5) \
break; \
case 6: \
CASE(6) \
break; \
case 7: \
CASE(7) \
break; \
case 10: \
CASE(10) \
break; \
case 11: \
CASE(11) \
break; \
case 12: \
CASE(12) \
break; \
case 13: \
CASE(13) \
break; \
case 16: \
CASE(16) \
break; \
default: \
IT_TODO_HALT(); \
}
void pad_slice_kernel(void *partData, void *wholeData,
const TransMetaData &metadata, int nDims, int num,
bool isPad) {
int blockSize = 32 * 16;
int gridSize = (num + blockSize - 1) / blockSize;
int dType = metadata.DType;
SWITCH_DTYPE(dType)
}
} // namespace infini
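The CASE / SWITCH_DTYPE macros above dispatch on the runtime dtype index through a DT_CUDA<index> trait whose definition lives elsewhere in the project (cuda_utility.h is included at the top of this file). A hypothetical sketch of such a trait; only the two index/type pairs visible in the MatMulInteger dispatch earlier in this diff (2 -> uint8_t, 3 -> int8_t) are shown, and the real mapping covers every index handled by the switch:

#include <cstdint>
template <int DTYPE_INDEX> struct DT_CUDA;            // primary template (sketch only)
template <> struct DT_CUDA<2> { using t = uint8_t; }; // matches the dType == 2 (UInt8) branch
template <> struct DT_CUDA<3> { using t = int8_t; };  // matches the dType == 3 (Int8) branch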
View File
@ -76,8 +76,9 @@ class avgPoolCudnn : public poolingCudnn {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::MaxPool, maxPoolCudnn,
"MaxPool_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AveragePool, avgPoolCudnn,
"AvgPool_cuDNN_CUDA");
}; // namespace infini
View File
@ -40,8 +40,7 @@ class RecvNCCL : public CudaKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Recv, RecvNCCL, "Recv_NCCL_CUDA");
} // namespace infini
#endif
View File
@ -1,6 +1,7 @@
#include "operators/reduce.h" #include "operators/reduce.h"
#include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h" #include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
namespace infini { namespace infini {
class ReduceCudnnBase : public CudaKernelWithoutConfig { class ReduceCudnnBase : public CudaKernelWithoutConfig {
@ -46,12 +47,12 @@ class ReduceCudnnBase : public CudaKernelWithoutConfig {
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
cudnnTensorDescriptor_t outDesc;
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
if (nInDims > 3) {
checkCudnnError(cudnnSetTensorNdDescriptor(
inDesc, cudnnDataType, nInDims, inDimArray, inStrideArray));
checkCudnnError(cudnnSetTensorNdDescriptor(
outDesc, cudnnDataType, nInDims, outDimArray, outStrideArray));
} else {
int idims[4] = {1, 1, 1, 1}, odims[4] = {1, 1, 1, 1};
for (int i = 0; i < nInDims; ++i) {
@ -62,11 +63,11 @@ class ReduceCudnnBase : public CudaKernelWithoutConfig {
}
checkCudnnError(cudnnSetTensor4dDescriptor(
inDesc, CUDNN_TENSOR_NCHW, cudnnDataType, idims[0], idims[1],
idims[2], idims[3]));
checkCudnnError(cudnnSetTensor4dDescriptor(
outDesc, CUDNN_TENSOR_NCHW, cudnnDataType, odims[0], odims[1],
odims[2], odims[3]));
}
// get reduce descriptor
@ -120,8 +121,9 @@ class ReduceSumCudnn : public ReduceCudnnBase {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::ReduceMean, ReduceMeanCudnn,
"ReduceMean_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::ReduceSum, ReduceSumCudnn,
"ReduceSum_cuDNN_CUDA");
}; // namespace infini
View File
@ -11,15 +11,10 @@ class CopyCuda : public CudaKernelWithoutConfig {
}
};
// reshape/flatten/identity all act as copying from input to output.
REGISTER_KERNEL(Device::CUDA, OpType::Reshape, CopyCuda, "Reshape_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Flatten, CopyCuda, "Flatten_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Identity, CopyCuda, "Identity_CUDA");
} // namespace infini
View File
@ -48,7 +48,6 @@ class ResizeCuda : public CudaKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Resize, ResizeCuda, "Resize_CUDA");
} // namespace infini
View File
@ -36,8 +36,7 @@ class SendNCCL : public CudaKernelWithoutConfig {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Send, SendNCCL, "Send_NCCL_CUDA");
} // namespace infini
#endif
View File
@ -20,11 +20,15 @@ class SoftmaxCuda : public CudaKernelWithoutConfig {
int stride = op->getInputs(0)->getStride().at(op->getAxis());
int num_blocks = size / dimsize;
if (op->getDType() == DataType::Float32) {
softmax_kernel(num_blocks, (float *)input, (float *)output, size,
dimsize, stride);
} else if (op->getDType() == DataType::Float16) {
softmax_kernel(num_blocks, (half *)input, (half *)output, size,
dimsize, stride);
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Softmax, SoftmaxCuda, "Softmax_CUDA");
} // namespace infini
View File
@ -1,6 +1,5 @@
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include <cub/cub.cuh> #include <cub/cub.cuh>
struct __align__(8) DataMaxSum { // update the global max and sum, store the struct __align__(8) DataMaxSum { // update the global max and sum, store the
// output at max_tmp and sum_tmp // output at max_tmp and sum_tmp
float max_tmp; // store max float max_tmp; // store max
@ -16,9 +15,9 @@ __device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a,
return bigger;
}
template <typename T, int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
T *__restrict input, T *__restrict output, int size, int dimsize,
int stride) { // if set axis = 1, inputShape=[I,J,K,S]
// tid = i(JKS) + j(KS) + k(S) + s
@ -33,15 +32,33 @@ __launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
dms_partial.max_tmp = -__FLT_MAX__;
dms_partial.sum_tmp = 0.0f;
DataMaxSum dms_input;
int remain = dimsize % BLOCK_DIM;
int step = (dimsize - remain) / BLOCK_DIM + 1; // step <= numPerThread
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
dms_input.max_tmp =
input[tid + (threadIdx.x * step + ind) * stride];
dms_input.sum_tmp = 1.0f;
dms_partial =
reduce_dms_op(dms_partial,
dms_input); // reduce the data to one block
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
dms_input.max_tmp =
input[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride];
dms_input.sum_tmp = 1.0f;
dms_partial =
reduce_dms_op(dms_partial,
dms_input); // reduce the data to one block
}
}
typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ DataMaxSum dms_total;
@ -53,12 +70,102 @@ __launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
}
__syncthreads();
//-----------------
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
__expf(static_cast<float>(
input[tid + (threadIdx.x * step + ind) * stride]) -
dms_total.max_tmp) *
__fdividef(1.0F, dms_total.sum_tmp);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid +
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
stride] =
__expf(static_cast<float>(
input[tid +
(remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride]) -
dms_total.max_tmp) *
__fdividef(1.0F, dms_total.sum_tmp);
}
}
}
template <typename T, int BLOCK_DIM, int numPerThread>
__global__ void
_blockSoftmaxKernel(T *__restrict input, T *__restrict output, int size,
int dimsize,
int stride) { // if set axis = 1, inputShape=[I,J,K,S]
// tid = i(JKS) + j(KS) + k(S) + s
// blockDim.x = size/dimsize = IKS
// blockIdx.x = i(KS) + k(S) + s,blockIdx.x%stride = k(S) + s
int tid =
blockIdx.x % stride + (blockIdx.x - blockIdx.x % stride) *
dimsize; // now, tid = i(JKS) + k(S) + s;
int remain = dimsize % BLOCK_DIM;
int step = (dimsize - remain) / BLOCK_DIM + 1; // step <= numPerThread
float dataPerThread[numPerThread];
DataMaxSum dms_partial;
dms_partial.max_tmp = -__FLT_MAX__;
dms_partial.sum_tmp = 0.0f;
DataMaxSum dms_input;
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
dataPerThread[ind] =
input[tid + (threadIdx.x * step + ind) * stride];
dms_input.max_tmp = dataPerThread[ind];
dms_input.sum_tmp = 1.0f;
dms_partial =
reduce_dms_op(dms_partial,
dms_input); // reduce the data to one block
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
dataPerThread[ind] =
input[tid + (remain * step +
(threadIdx.x - remain) * (step - 1) + ind) *
stride];
dms_input.max_tmp = dataPerThread[ind];
dms_input.sum_tmp = 1.0f;
dms_partial =
reduce_dms_op(dms_partial,
dms_input); // reduce the data to one block
}
}
typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ DataMaxSum dms_total;
DataMaxSum dms_block =
BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op);
if (threadIdx.x ==
0) { // must set threadIdx.x = 0 write the output to memory
dms_total = dms_block;
}
__syncthreads();
//-----------------
if (threadIdx.x < remain) {
for (int ind = 0; ind < step; ind++) {
output[tid + (threadIdx.x * step + ind) * stride] =
__expf(dataPerThread[ind] - dms_total.max_tmp) *
__fdividef(1.0F, dms_total.sum_tmp);
}
} else {
for (int ind = 0; ind < step - 1; ind++) {
output[tid +
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
stride] =
__expf(dataPerThread[ind] - dms_total.max_tmp) *
__fdividef(1.0F, dms_total.sum_tmp);
}
}
}
@ -81,14 +188,14 @@ __inline__ __device__ T WarpAllReduce(T val) {
}
return val;
}
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y, int numPerThreadx>
__global__ void _warpSoftmaxKernel(T *__restrict input, T *__restrict output,
int size, int dimsize, int stride) {
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
int otherSize = size / dimsize;
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
float dataPerThreadx[numPerThreadx];
if (otherIdx < otherSize) {
__shared__ float max_total[BLOCK_DIM_y];
@ -96,9 +203,9 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
float max_data = -__FLT_MAX__;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
dataPerThreadx[ph] =
input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
max_data = max(max_data, dataPerThreadx[ph]);
}
max_data = WarpAllReduce<MaxOp, float, BLOCK_DIM_x>(max_data);
@ -110,9 +217,9 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
float sum_data = 0.0f;
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
dataPerThreadx[ph] =
__expf(dataPerThreadx[ph] - max_total[threadIdx.y]);
sum_data += dataPerThreadx[ph];
}
sum_data = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sum_data);
@ -124,9 +231,7 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
dataPerThreadx[ph] * __fdividef(1.0F, sum_total[threadIdx.y]);
}
}
}
@ -137,10 +242,35 @@ namespace infini {
void softmax_kernel(int num_blocks, float *input, float *output, int size,
int dimsize, int stride) {
if (dimsize > 1024 * 128) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 64) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024, 128>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 32) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024, 64>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 16) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024, 32>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 4) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024, 16>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<float, 1024, 4>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
@ -149,7 +279,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<float, 32, 32, 32>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
@ -158,7 +288,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<float, 16, 64, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
@ -167,7 +297,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<float, 8, 128, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else {
int BLOCK_DIM_x = 4;
@ -176,7 +306,79 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<float, 4, 256, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
}
}
//------------------
void softmax_kernel(int num_blocks, half *input, half *output, int size,
int dimsize, int stride) {
if (dimsize > 1024 * 128) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 64) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024, 128>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 32) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024, 64>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 16) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024, 32>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024 * 4) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024, 16>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 1024) {
int BLOCK_DIM = 1024;
_blockSoftmaxKernel<half, 1024, 4>
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
} else if (dimsize > 31) {
int BLOCK_DIM_x = 32;
int BLOCK_DIM_y = 32;
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<half, 32, 32, 32>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else if (dimsize > 15) {
int BLOCK_DIM_x = 16;
int BLOCK_DIM_y = 64;
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<half, 16, 64, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else if (dimsize > 7) {
int BLOCK_DIM_x = 8;
int BLOCK_DIM_y = 128;
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<half, 8, 128, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
} else {
int BLOCK_DIM_x = 4;
int BLOCK_DIM_y = 256;
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
_warpSoftmaxKernel<half, 4, 256, 2>
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
}
}
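Both the block-level kernels (one thread block per softmax row, with cub::BlockReduce) and the warp-level kernels (a group of BLOCK_DIM_x threads per row, with WarpAllReduce) above compute the numerically stable softmax, fusing the max and the exponential-sum into a single reduction over DataMaxSum:

\mathrm{softmax}(x)_i = \frac{e^{\,x_i - \max_j x_j}}{\sum_k e^{\,x_k - \max_j x_j}}

The numPerThread template argument bounds the per-thread register buffer: with BLOCK_DIM = 1024, step = (dimsize - dimsize % 1024) / 1024 + 1, so for example dimsize = 4000 gives step = 4, which is why the dimsize > 1024 dispatch branch instantiates numPerThread = 4.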
View File
@ -7,7 +7,8 @@
namespace infini {
class CudaCompute {
template <typename T>
void initComposedTensorMetadata(ComposedTensorMetadata<T> &metadata,
Tensor tensor) const {
int nDims = tensor->getRank();
auto strides = tensor->getStride();
@ -16,10 +17,10 @@ class CudaCompute {
metadata.dimSize[i] = tensor->getDims().at(i);
metadata.stride[i] = strides.at(i);
}
metadata.data = tensor->getRawDataPtr<T *>();
}
template <typename T>
void initElementTensorMetadata(ElementTensorMetadata<T> &metadata,
TensorVec tensors, int idx, int dim,
int &dimBgIdx, int &batchCounter) const {
int nTensors = tensors.size();
@ -27,7 +28,7 @@ class CudaCompute {
++batchCounter) {
auto tensor = tensors.at(idx + batchCounter);
auto dimSize = tensor->getDims()[dim];
metadata.data[batchCounter] = tensor->getRawDataPtr<T *>();
metadata.dimBgNo[batchCounter] = dimBgIdx;
metadata.dimSize[batchCounter] = dimSize;
metadata.nElements[batchCounter] = tensor->size();
@ -36,17 +37,17 @@ class CudaCompute {
}
public:
template <typename T>
void do_compute(Tensor composedTensor, TensorVec elementsTensor, int dim,
int nDims, bool isSplit) const {
IT_ASSERT(nDims <= DIM_MAX_SIZE);
ComposedTensorMetadata<T> composedMetadata;
initComposedTensorMetadata<T>(composedMetadata, composedTensor);
int dimBgNo = 0;
int nElemets = elementsTensor.size();
for (int i = 0; i < nElemets; i += BATCH_SIZE) {
ElementTensorMetadata<T> elemMetadata;
int batchCounter = 0;
initElementTensorMetadata(elemMetadata, elementsTensor, i, dim,
dimBgNo, batchCounter);
@ -74,23 +75,34 @@ class ConcatCuda : private CudaCompute, public CudaKernelWithoutConfig {
}
}
}
if (_op->getDType() == DataType::Float32) {
do_compute<float>(_op->getOutput(), _op->getInputs(),
as<ConcatObj>(_op)->getDim(),
_op->getOutput()->getRank(), false);
} else if (_op->getDType() == DataType::Float16) {
do_compute<half>(_op->getOutput(), _op->getInputs(),
as<ConcatObj>(_op)->getDim(),
_op->getOutput()->getRank(), false);
}
}
};
class SplitCuda : private CudaCompute, public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
if (_op->getDType() == DataType::Float32) {
do_compute<float>(_op->getInputs(0), _op->getOutputs(),
as<SplitObj>(_op)->getDim(),
_op->getInputs(0)->getRank(), true);
} else if (_op->getDType() == DataType::Float16) {
do_compute<half>(_op->getInputs(0), _op->getOutputs(),
as<SplitObj>(_op)->getDim(),
_op->getInputs(0)->getRank(), true);
}
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Concat, ConcatCuda, "Concat_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Split, SplitCuda, "Split_CUDA");
} // namespace infini
View File
@ -1,9 +1,9 @@
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_split_concat.h" #include "cuda/cuda_split_concat.h"
template <typename T>
__host__ __device__ int
elementIdx2ComposedIdx(int elementIndex, int dimBgNo, int dimSize, int dim,
int nDim, ComposedTensorMetadata<T> wholeMeta) {
int offset = 0;
// COMP(x0,...,xk,...,xn-1) = ELMT[xk / d](x0,...,xk % d,...xn-1)
@ -25,10 +25,10 @@ elementIdx2ComposedIdx(int elementIndex, int dimBgNo, int dimSize, int dim,
int oP = (dim == 0) ? (elementIndex + dimBgNo) : elementIndex;
return offset + oP * wholeMeta.stride[0];
}
template <typename T>
__global__ void _split_concat_kernel(ElementTensorMetadata<T> elemMeta,
ComposedTensorMetadata<T> compMeta,
int dim, int nDims, bool isSplit) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nElements = elemMeta.nElements[blockIdx.y];
if (tid >= nElements)
@ -36,10 +36,10 @@ __global__ void _split_concat_kernel(ElementTensorMetadata elemMeta,
auto dimBgNo = elemMeta.dimBgNo[blockIdx.y];
auto dimSize = elemMeta.dimSize[blockIdx.y];
T *elemData = elemMeta.data[blockIdx.y];
int Offset =
elementIdx2ComposedIdx<T>(tid, dimBgNo, dimSize, dim, nDims, compMeta);
// copy data from input to output
// for split:input is composed tensor;for concat:input is element
// tensors.
@ -52,8 +52,22 @@ __global__ void _split_concat_kernel(ElementTensorMetadata elemMeta,
namespace infini {
// TODO: when dim=0, the operation can be executed in-place
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
const ComposedTensorMetadata<float> &compMeta, int dim,
int batchSize, int nDims, bool isSplit) {
dim3 blockSize = dim3(32 * 16);
// gridsize = max_n_elements / blockSize
int max_n_elements =
*std::max_element(eleMeta.nElements, eleMeta.nElements + batchSize);
int gridDimX = (max_n_elements - 1) / (32 * 16) + 1;
// each y is a split among the batch
dim3 gridSize(gridDimX, batchSize);
_split_concat_kernel<<<gridSize, blockSize>>>(eleMeta, compMeta, dim, nDims,
isSplit);
}
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
const ComposedTensorMetadata<half> &compMeta, int dim,
int batchSize, int nDims, bool isSplit) {
dim3 blockSize = dim3(32 * 16);
// gridsize = max_n_elements / blockSize
View File
@ -38,8 +38,9 @@ class TransposeCuda : public CudaKernelWithoutConfig {
outputDims.data[i] = outputShape[i];
}
const int dType = op->getDType().getIndex();
transpose_kernel(dType, inputData, outputData, nDims, size, strides,
outputDims);
}
};
@ -82,15 +83,16 @@ class DepthToSpaceCuda : public CudaKernelWithoutConfig {
for (int i = 0; i < nDims; ++i) {
outputDims.data[i] = transpose[i];
}
const int dType = op->getDType().getIndex();
transpose_kernel(dType, inputData, outputData, nDims, size, strides,
outputDims);
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, TransposeCuda,
"Transpose_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::DepthToSpace, DepthToSpaceCuda,
"DepthToSpace_CUDA");
} // namespace infini
View File
@ -1,12 +1,14 @@
#include "core/common.h" #include "core/common.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include "utils/small_array.h" #include "utils/small_array.h"
constexpr unsigned int num_threads() { return 32 * 4; } constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; } constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); } constexpr int block_work_size() { return thread_work_size() * num_threads(); }
__global__ void _transpose_kernel(float *input, float *output, int nDims, template <class T>
__global__ void _transpose_kernel(void *input, void *output, int nDims,
int size, infini::SmallArray strides,
infini::SmallArray outputShape) {
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
@ -17,21 +19,61 @@ __global__ void _transpose_kernel(float *input, float *output, int nDims,
inputIdx += v % outputShape.data[i] * strides.data[i];
v /= outputShape.data[i];
}
((T *)output)[outputIdx] = ((T *)input)[inputIdx];
}
}
#define CASE(T) \
_transpose_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
input, output, nDims, size, strides, outputShape);
#define SWITCH_DTYPE(DTYPE) \
switch (DTYPE) { \
case 1: \
CASE(1) \
break; \
case 2: \
CASE(2) \
break; \
case 3: \
CASE(3) \
break; \
case 4: \
CASE(4) \
break; \
case 5: \
CASE(5) \
break; \
case 6: \
CASE(6) \
break; \
case 7: \
CASE(7) \
break; \
case 10: \
CASE(10) \
break; \
case 11: \
CASE(11) \
break; \
case 12: \
CASE(12) \
break; \
case 13: \
CASE(13) \
break; \
case 16: \
CASE(16) \
break; \
default: \
IT_TODO_HALT(); \
}
namespace infini {
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
SmallArray strides, SmallArray outputShape) {
int blocksize = block_work_size();
int gridsize = (size + block_work_size() - 1) / block_work_size();
SWITCH_DTYPE(dType)
}
} // namespace infini
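The kernel above derives the input offset by decomposing each output linear index into per-dimension coordinates (mixed radix given by outputShape) and taking their dot product with the strides array prepared by the host-side TransposeCuda / DepthToSpaceCuda kernels, i.e.

\text{inputIdx} = \sum_{i} c_i \cdot \text{strides}[i], \qquad (c_0, \dots, c_{n-1}) = \text{coordinates of outputIdx in outputShape}

Each thread therefore performs a single gather from the permuted input location, and the SWITCH_DTYPE dispatch only selects the element type T used for that one load/store.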
View File
@ -2,6 +2,7 @@
#include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h" #include "cuda/cuda_runtime.h"
#include "cuda/cuda_unary.h" #include "cuda/cuda_unary.h"
#include "cuda/cuda_utility.h"
namespace infini { namespace infini {
@ -12,6 +13,51 @@ class UnaryCuda : public CudaKernelWithoutConfig {
}
};
class CastCuda : public CudaKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *_context) const override {
auto op = as<CastObj>(_op);
size_t num = op->getOutput()->size();
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
if (op->getType() == CastType::Float162Float) {
IT_ASSERT(op->getInDType() == DataType::Float16 &&
op->getOutDType() == DataType::Float32);
cast_kernel<half, float>((half *)inputData, (float *)outputData,
num);
} else if (op->getType() == CastType::Float2Float16) {
IT_ASSERT(op->getInDType() == DataType::Float32 &&
op->getOutDType() == DataType::Float16);
cast_kernel<float, half>((float *)inputData, (half *)outputData,
num);
} else if (op->getType() == CastType::Float2Int32) {
IT_ASSERT(op->getInDType() == DataType::Float32 &&
op->getOutDType() == DataType::Int32);
cast_kernel<float, int32_t>((float *)inputData,
(int32_t *)outputData, num);
} else if (op->getType() == CastType::Float2Int8) {
IT_ASSERT(op->getInDType() == DataType::Float32 &&
op->getOutDType() == DataType::Int8);
cast_kernel<float, int8_t>((float *)inputData, (int8_t *)outputData,
num);
} else if (op->getType() == CastType::Int82Float) {
IT_ASSERT(op->getInDType() == DataType::Int8 &&
op->getOutDType() == DataType::Float32);
cast_kernel<int8_t, float>((int8_t *)inputData, (float *)outputData,
num);
} else if (op->getType() == CastType::Int322Float) {
IT_ASSERT(op->getInDType() == DataType::Int32 &&
op->getOutDType() == DataType::Float32);
cast_kernel<int32_t, float>((int32_t *)inputData,
(float *)outputData, num);
} else {
IT_ASSERT(false);
}
}
};
class ActivationCudnn : public CudaKernelWithoutConfig {
virtual cudnnActivationMode_t getOpType() const = 0;
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
@ -33,17 +79,17 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
while (stride.size() < 4)
stride.push_back(1);
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
// get inputs
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
checkCudnnError(cudnnSetTensorNdDescriptor(
inputDesc, cudnnDataType, dim.size(), dim.data(), stride.data()));
// get outputs
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
checkCudnnError(cudnnSetTensorNdDescriptor(
outputDesc, cudnnDataType, dim.size(), dim.data(), stride.data()));
// get op descriptor
cudnnActivationDescriptor_t activationDesc;
@ -86,16 +132,18 @@ class SoftmaxCudnn : public CudaKernelWithoutConfig {
memcpy(dim_array + (4 - dim.size()), dim.data(),
dim.size() * sizeof(int));
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
// get inputs
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
inputDesc, CUDNN_TENSOR_NCHW, cudnnDataType, dim_array[0],
dim_array[1], dim_array[2], dim_array[3]));
// get outputs
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
checkCudnnError(cudnnSetTensor4dDescriptor(
outputDesc, CUDNN_TENSOR_NCHW, cudnnDataType, dim_array[0],
dim_array[1], dim_array[2], dim_array[3]));
auto [alpha, beta] = getAlphBeta();
@ -130,35 +178,27 @@ class TanhCudnn : public ActivationCudnn {
}
};
REGISTER_KERNEL(Device::CUDA, OpType::Relu, ReluCudnn, "Relu_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, SigmoidCudnn, "Sigmoid_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::HardSigmoid, UnaryCuda,
"Hard_Sigmoid_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::HardSwish, UnaryCuda, "Hard_Swish_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Tanh, TanhCudnn, "Tanh_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Abs, UnaryCuda, "Abs_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, UnaryCuda, "Sqrt_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Gelu, UnaryCuda, "Gelu_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Neg, UnaryCuda, "Neg_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Erf, UnaryCuda, "Erf_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Cast, CastCuda, "Cast_CUDA");
// REGISTER_KERNEL(Device::CUDA, OpType::Softmax, UnaryCuda, "Softmax_CUDA");
// REGISTER_KERNEL(Device::CUDA, OpType::Relu, UnaryCuda,
//                 "Relu_CUDA");
// REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, UnaryCuda,
//                 "Sigmoid_CUDA");
// REGISTER_KERNEL(Device::CUDA, OpType::Tanh, UnaryCuda,
//                 "Tanh_CUDA");
// REGISTER_KERNEL(Device::CUDA, OpType::Abs, UnaryCuda,
//                 "Abs_CUDA");
}; // namespace infini
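For quick reference, the CastCuda kernel registered above currently covers these conversions (each branch asserts the corresponding input/output dtypes before launching cast_kernel<IN, OUT>): Float16 -> Float32, Float32 -> Float16, Float32 -> Int32, Float32 -> Int8, Int8 -> Float32, and Int32 -> Float32; any other CastType falls into the final IT_ASSERT(false).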
View File
@ -1,6 +1,8 @@
#include "core/common.h" #include "core/common.h"
#include "core/constants.h" #include "core/constants.h"
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_unary.h"
#include <cub/cub.cuh>
#include <math.h>
using infini::E_CONSTANT;
@ -8,15 +10,16 @@ constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
template <typename T>
__global__ void _softmax_kernel1(T *input, T *output, size_t n) {
float sum = 0.0f;
for (size_t i = 0; i < n; ++i) {
sum += pow(E_CONSTANT, input[i]);
}
*output = sum;
}
template <typename T>
__global__ void _softmax_kernel2(T *input, T *output, size_t n) {
float sum = *output;
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
@ -24,32 +27,32 @@ __global__ void _softmax_kernel2(float *input, float *output, size_t n) {
output[i] = pow(E_CONSTANT, input[i]) / sum;
}
}
template <typename T>
__global__ void _relu_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = max(input[i], float(0));
}
}
template <typename T>
__global__ void _sigmoid_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = 1 / (1 + pow(E_CONSTANT, -input[i]));
}
}
template <typename T>
__global__ void _hard_sigmoid_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = max(0.0f, min(1.0f, 0.2f * input[i] + 0.5f));
}
}
template <typename T>
__global__ void _hard_swish_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@ -57,8 +60,8 @@ __global__ void _hard_swish_kernel(float *input, float *output, size_t n) {
input[i] * max(0.f, min(1.f, (1.f / 6.f) * input[i] + 0.5f));
}
}
template <typename T>
__global__ void _tanh_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@ -66,8 +69,8 @@ __global__ void _tanh_kernel(float *input, float *output, size_t n) {
(pow(E_CONSTANT, input[i]) + pow(E_CONSTANT, -input[i]));
}
}
template <typename T>
__global__ void _abs_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@ -83,7 +86,24 @@ __global__ void _sqrt_kernel(float *input, float *output, size_t n) {
}
}
__global__ void _sqrt_kernel(half *input, half *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = hsqrt(input[i]);
}
}
__global__ void _sqrt_kernel(int8_t *input, int8_t *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
output[i] = __fsqrt_rn(static_cast<float>(input[i]));
}
}
template <typename T>
__global__ void _gelu_kernel(T *input, T *output, size_t n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
@ -91,8 +111,8 @@ __global__ void _gelu_kernel(float *input, float *output, size_t n) {
output[i] = 0.5 * x * (1 + erf(x / sqrt(2.0f)));
}
}
template <typename T>
__global__ void _erf_kernel(T *input, T *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) {
@ -109,72 +129,191 @@ __global__ void _neg_kernel(T *input, T *output, size_t n) {
}
}
template <typename INPUT, typename OUTPUT>
__global__ void _cast_kernel(INPUT *input, OUTPUT *output, size_t n) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
cub::CastOp<OUTPUT> _CastOp;
output[index] = _CastOp(input[index]);
}
}
namespace infini {
template <typename T> void softmax_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_softmax_kernel1<T><<<1, 1>>>(input, output, num);
_softmax_kernel2<T><<<gridsize, blocksize>>>(input, output, num);
}
template <typename T> void relu_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_relu_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_sigmoid_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void hard_sigmoid_kernel(float *input, float *output, size_t num) { template <typename T>
void hard_sigmoid_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_hard_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num); _hard_sigmoid_kernel<T><<<gridsize, blocksize>>>(input, output, num);
} }
void hard_swish_kernel(float *input, float *output, size_t num) { template <typename T> void hard_swish_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_hard_swish_kernel<<<gridsize, blocksize>>>(input, output, num); _hard_swish_kernel<T><<<gridsize, blocksize>>>(input, output, num);
} }
void tanh_kernel(float *input, float *output, size_t num) { template <typename T> void tanh_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_tanh_kernel<<<gridsize, blocksize>>>(input, output, num); _tanh_kernel<T><<<gridsize, blocksize>>>(input, output, num);
} }
void abs_kernel(float *input, float *output, size_t num) { template <typename T> void abs_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_abs_kernel<<<gridsize, blocksize>>>(input, output, num); _abs_kernel<T><<<gridsize, blocksize>>>(input, output, num);
} }
void sqrt_kernel(float *input, float *output, size_t num) { template <typename T> void sqrt_kernel(T *input, T *output, size_t num) {
int blocksize = block_work_size(); int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size(); int gridsize = (num + block_work_size() - 1) / block_work_size();
_sqrt_kernel<<<gridsize, blocksize>>>(input, output, num); _sqrt_kernel<<<gridsize, blocksize>>>((T *)input, (T *)output, num);
} }
void gelu_kernel(float *input, float *output, size_t num) {
template <typename T> void gelu_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _gelu_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
template <typename T> void erf_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _erf_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
template <typename T> void neg_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _neg_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void unary_kernel(const Operator &_op) {
auto op = as<UnaryObj>(_op);
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
size_t num = op->getOutput()->size();
if (op->getOpType() == OpType::Softmax) {
if (_op->getDType() == DataType::Float32) {
softmax_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Relu) {
if (_op->getDType() == DataType::Float32) {
relu_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Sigmoid) {
if (_op->getDType() == DataType::Float32) {
sigmoid_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::HardSigmoid) {
if (_op->getDType() == DataType::Float32) {
hard_sigmoid_kernel<float>((float *)inputData, (float *)outputData,
num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::HardSwish) {
if (_op->getDType() == DataType::Float32) {
hard_swish_kernel<float>((float *)inputData, (float *)outputData,
num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Tanh) {
if (_op->getDType() == DataType::Float32) {
tanh_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Abs) {
if (_op->getDType() == DataType::Float32) {
abs_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Sqrt) {
if (_op->getDType() == DataType::Float32) {
sqrt_kernel<float>((float *)inputData, (float *)outputData, num);
} else if (_op->getDType() == DataType::Float16) {
sqrt_kernel<half>((half *)inputData, (half *)outputData, num);
} else if (_op->getDType() == DataType::Int8) {
sqrt_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Gelu) {
if (_op->getDType() == DataType::Float32) {
gelu_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Neg) {
if (_op->getDType() == DataType::Float32) {
neg_kernel<float>((float *)inputData, (float *)outputData, num);
} else if (_op->getDType() == DataType::Float16) {
neg_kernel<half>((half *)inputData, (half *)outputData, num);
} else if (_op->getDType() == DataType::Int8) {
neg_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData, num);
} else {
IT_TODO_HALT();
}
} else if (op->getOpType() == OpType::Erf) {
if (_op->getDType() == DataType::Float32) {
erf_kernel<float>((float *)inputData, (float *)outputData, num);
} else {
IT_TODO_HALT();
}
} else
IT_TODO_HALT();
}
template <typename INPUT, typename OUTPUT>
void cast_kernel(INPUT *input, OUTPUT *output, size_t num) {
int blocksize = block_work_size();
int gridsize = (num + block_work_size() - 1) / block_work_size();
_cast_kernel<INPUT, OUTPUT><<<gridsize, blocksize>>>(input, output, num);
}
template void cast_kernel<float, half>(float *input, half *output, size_t num);
template void cast_kernel<half, float>(half *input, float *output, size_t num);
template void cast_kernel<float, int32_t>(float *input, int32_t *output,
size_t num);
template void cast_kernel<float, int8_t>(float *input, int8_t *output,
size_t num);
template void cast_kernel<int8_t, float>(int8_t *input, float *output,
size_t num);
template void cast_kernel<int32_t, float>(int32_t *input, float *output,
size_t num);
}; // namespace infini
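All of the launchers above share one pattern: a grid-stride templated kernel plus a launch whose grid size is rounded up from the element count. A minimal standalone sketch of that pattern (illustrative only; BLOCK and _square_kernel are made-up names, the repository sizes blocks with block_work_size()):

#include <cuda_runtime.h>

constexpr int BLOCK = 256; // assumed block size for the sketch

template <typename T>
__global__ void _square_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x; // grid-stride loop handles n > grid
    for (size_t i = index; i < n; i += stride)
        output[i] = input[i] * input[i];
}

template <typename T> void square_kernel(T *input, T *output, size_t num) {
    int gridsize = (num + BLOCK - 1) / BLOCK; // round up so every element is covered
    _square_kernel<T><<<gridsize, BLOCK>>>(input, output, num);
}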

View File

@@ -1,6 +1,7 @@
#include "operators/where.h" #include "operators/where.h"
#include "cuda/cuda_kernel_wihtout_config.h" #include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h" #include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "cuda/cuda_where.h" #include "cuda/cuda_where.h"
#include "utils/broadcast_shape.h" #include "utils/broadcast_shape.h"
@ -35,15 +36,14 @@ class WhereCuda : public CudaKernelWithoutConfig {
broadcastShape(opInputXShape, inputXShape, nDims, xSize); broadcastShape(opInputXShape, inputXShape, nDims, xSize);
broadcastShape(opInputYShape, inputYShape, nDims, ySize); broadcastShape(opInputYShape, inputYShape, nDims, ySize);
broadcastShape(opConditionShape, conditionShape, nDims, cSize); broadcastShape(opConditionShape, conditionShape, nDims, cSize);
const int dTypeIndex = op->getDType().getIndex();
whereKernel((float *)inputXData, (float *)inputYData, whereKernel(dTypeIndex, inputXData, inputYData,
(uint8_t *)conditionData, (float *)outputData, nDims, (uint8_t *)conditionData, outputData, nDims, outputsize,
outputsize, inputXShape, inputYShape, conditionShape, inputXShape, inputYShape, conditionShape, outputShape,
outputShape, xSize, ySize, cSize); xSize, ySize, cSize);
} }
}; };
REGISTER_KERNEL(Device::CUDA, OpType::Where, DataType::Float32, WhereCuda, REGISTER_KERNEL(Device::CUDA, OpType::Where, WhereCuda, "Where_CUDA");
"Where_CUDA_Float32");
}; // namespace infini }; // namespace infini

View File

@@ -1,4 +1,5 @@
#include "cuda/cuda_common.h" #include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include "utils/small_array.h" #include "utils/small_array.h"
__device__ int inferIndex(infini::SmallArray inputShape, __device__ int inferIndex(infini::SmallArray inputShape,
@ -17,13 +18,12 @@ __device__ int inferIndex(infini::SmallArray inputShape,
} }
return inputIdx; return inputIdx;
} }
__global__ void _whereKernel(const float *inputX, const float *inputY, template <typename T>
const uint8_t *condition, float *output, int nDims, __global__ void
int outputsize, infini::SmallArray inputXShape, _whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
infini::SmallArray inputYShape, int nDims, int outputsize, infini::SmallArray inputXShape,
infini::SmallArray conditionShape, infini::SmallArray inputYShape, infini::SmallArray conditionShape,
infini::SmallArray outputShape, int xSize, infini::SmallArray outputShape, int xSize, int ySize, int cSize) {
int ySize, int cSize) {
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x; int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (outputIdx < outputsize) { if (outputIdx < outputsize) {
@ -35,14 +35,61 @@ __global__ void _whereKernel(const float *inputX, const float *inputY,
int inputYIdx = int inputYIdx =
inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx); inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);
output[outputIdx] = ((T *)output)[outputIdx] = condition[conditionIdx]
condition[conditionIdx] ? inputX[inputXIdx] : inputY[inputYIdx]; ? ((T *)inputX)[inputXIdx]
: ((T *)inputY)[inputYIdx];
} }
} }
#define CASE(T) \
_whereKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
inputX, inputY, condition, output, nDims, outputsize, inputXShape, \
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
#define SWITCH_DTYPE(DTYPE) \
switch (DTYPE) { \
case 1: \
CASE(1) \
break; \
case 2: \
CASE(2) \
break; \
case 3: \
CASE(3) \
break; \
case 4: \
CASE(4) \
break; \
case 5: \
CASE(5) \
break; \
case 6: \
CASE(6) \
break; \
case 7: \
CASE(7) \
break; \
case 10: \
CASE(10) \
break; \
case 11: \
CASE(11) \
break; \
case 12: \
CASE(12) \
break; \
case 13: \
CASE(13) \
break; \
case 16: \
CASE(16) \
break; \
default: \
IT_TODO_HALT(); \
}
namespace infini {

void whereKernel(int dTypeIndex, void *inputX, void *inputY,
                 const uint8_t *condition, void *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize) {
@@ -61,8 +108,8 @@ void whereKernel(const float *inputX, const float *inputY,
        blocksize = 32;
    }
    int gridsize = (outputsize + blocksize - 1) / blocksize;

    SWITCH_DTYPE(dTypeIndex)
}
} // namespace infini
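The CASE/SWITCH_DTYPE macros above map a runtime dtype index onto a compile-time template parameter. A simplified host-side sketch of the same dispatch idea (the index-to-type pairs and function names here are illustrative assumptions, not the repository's DataType table):

#include <cstdint>
#include <cstdio>

template <typename T>
void whereRef(const void *x, const void *y, const uint8_t *cond, void *out, int n) {
    // CPU stand-in for _whereKernel<T><<<...>>>(...)
    auto xt = static_cast<const T *>(x);
    auto yt = static_cast<const T *>(y);
    auto ot = static_cast<T *>(out);
    for (int i = 0; i < n; ++i)
        ot[i] = cond[i] ? xt[i] : yt[i];
}

void whereDispatch(int dtypeIndex, const void *x, const void *y,
                   const uint8_t *cond, void *out, int n) {
    switch (dtypeIndex) { // index values below are assumptions for the example
    case 1: whereRef<float>(x, y, cond, out, n); break;
    case 3: whereRef<int8_t>(x, y, cond, out, n); break;
    case 6: whereRef<int32_t>(x, y, cond, out, n); break;
    default: std::printf("unsupported dtype index %d\n", dtypeIndex); break;
    }
}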

View File

@@ -0,0 +1,53 @@
#include "operators/dequantize_linear.h"
#include "utils/operator_utils.h"
namespace infini {
DequantizeLinearObj::DequantizeLinearObj(GraphObj *graph, Tensor inputX,
Tensor inputScale, Tensor output,
[[maybe_unused]] Tensor inputZeroPoint,
int axis)
: OperatorObj(OpType::DequantizeLinear,
inputZeroPoint ? TensorVec{inputX, inputScale, inputZeroPoint}
: TensorVec{inputX, inputScale},
{output}),
axis(axis) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
DequantizeLinearObj::inferShape(const TensorVec &inputs) {
return {{inputs[0]->getDims()}}; // x.shape = output.shape = inputs[0].shape
}
vector<DataType>
DequantizeLinearObj::inferDataType(const TensorVec &inputs) const {
IT_ASSERT(inputs.size() == 2 || inputs.size() == 3);
return {
inputs[1]->getDType()}; // scale.dtype = output.dtype = inputs[1].dtype
}
std::string DequantizeLinearObj::toString() const {
std::ostringstream os;
os << "DequantizeLinear[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "inputX=" << inputs[0]->getGuid() << ",";
os << "inputScale=" << inputs[1]->getGuid() << ",";
// os << "inputZeroPoint=" << inputs[2]->getGuid() << ",";
os << "axis=" << axis << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> DequantizeLinearObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.emplace(ret.begin(), type.underlying());
return ret;
}
vector<int> DequantizeLinearObj::getOpAttrVector() const {
return {type.underlying(), axis};
}
} // namespace infini
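For context, DequantizeLinear computes y = (x - zero_point) * scale elementwise, with scale and zero_point broadcast along axis (ONNX semantics). A host-side reference sketch under simplified layout assumptions (axisSize and innerSize stand in for the real broadcasting):

#include <cstdint>
#include <vector>

// y[i] = (x[i] - zeroPoint[c]) * scale[c], where c indexes the quantization axis
std::vector<float> dequantizeLinearRef(const std::vector<uint8_t> &x,
                                       const std::vector<float> &scale,
                                       const std::vector<uint8_t> &zeroPoint,
                                       size_t axisSize, size_t innerSize) {
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        size_t c = (i / innerSize) % axisSize;
        float zp = zeroPoint.empty() ? 0.0f : float(zeroPoint[c]);
        y[i] = (float(x[i]) - zp) * scale[c];
    }
    return y;
}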

View File

@@ -0,0 +1,46 @@
#include "operators/dynamic_quantize_linear.h"
#include "utils/operator_utils.h"
namespace infini {
DynamicQuantizeLinearObj::DynamicQuantizeLinearObj(
GraphObj *graph, Tensor input, std::optional<TensorVec> outputs)
: OperatorObj(OpType::DynamicQuantizeLinear, TensorVec{input},
((!outputs) ? TensorVec(3, nullptr) : std::move(*outputs))) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) {
return {{inputs[0]->getDims(), {}, {}}};
}
vector<DataType>
DynamicQuantizeLinearObj::inferDataType(const TensorVec &inputs) const {
IT_ASSERT(inputs.size() == 1);
return {DataType(2), DataType(1), DataType(2)};
}
std::string DynamicQuantizeLinearObj::toString() const {
std::ostringstream os;
os << "DynamicQuantizeLinear[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=";
for (auto output : outputs)
os << output->getGuid() << ",";
os << ")";
return os.str();
}
vector<int> DynamicQuantizeLinearObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.emplace(ret.begin(), type.underlying());
return ret;
}
vector<int> DynamicQuantizeLinearObj::getOpAttrVector() const {
return {type.underlying()};
}
} // namespace infini
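DynamicQuantizeLinear derives the uint8 scale and zero point from the input's own range (adjusted to include 0) and then quantizes, which matches the three outputs declared above. A host-side sketch of that arithmetic (simplified; no guard for an all-zero range):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void dynamicQuantizeRef(const std::vector<float> &x, std::vector<uint8_t> &y,
                        float &scale, uint8_t &zeroPoint) {
    float lo = std::min(0.0f, *std::min_element(x.begin(), x.end()));
    float hi = std::max(0.0f, *std::max_element(x.begin(), x.end()));
    scale = (hi - lo) / 255.0f;                      // uint8 range is 0..255
    float zp = std::nearbyint(std::clamp(-lo / scale, 0.0f, 255.0f));
    zeroPoint = static_cast<uint8_t>(zp);
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        float q = std::nearbyint(x[i] / scale) + zp; // quantize, then shift
        y[i] = static_cast<uint8_t>(std::clamp(q, 0.0f, 255.0f));
    }
}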

View File

@@ -27,10 +27,7 @@ optional<vector<Shape>> LayerNormObj::inferShape(const TensorVec &inputs) {

vector<DataType> LayerNormObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 2 || inputs.size() == 3);
IT_ASSERT(inputs[1]->getDType() == DataType::Float32);
if (inputs.size() == 3) {
IT_ASSERT(inputs[2]->getDType() == DataType::Float32);
}
    return {inputs[0]->getDType()};
}

View File

@@ -0,0 +1,75 @@
#include "operators/matmul_integer.h"
#include "utils/operator_utils.h"
#include <numeric>
namespace infini {
MatmulIntegerObj::MatmulIntegerObj(GraphObj *graph, Tensor A, Tensor B,
Tensor C,
[[maybe_unused]] Tensor a_zero_point,
[[maybe_unused]] Tensor b_zero_point)
: OperatorObj(OpType::MatMulInteger,
a_zero_point ? (b_zero_point ? TensorVec{A, B, a_zero_point,
b_zero_point}
: TensorVec{A, B, a_zero_point})
: TensorVec{A, B},
{C}),
b(1) {
IT_ASSERT(checkValid(graph));
}
string MatmulIntegerObj::toString() const {
std::ostringstream os;
os << "MatmulInteger(A=" << inputs[0]->getGuid()
<< ",B=" << inputs[1]->getGuid() << ",C=" << outputs[0]->getGuid()
<< ",bmnk=[" << b << "," << m << "," << n << "," << k << "])";
return os.str();
}
optional<vector<Shape>> MatmulIntegerObj::inferShape(const TensorVec &inputs) {
auto A = inputs[0], B = inputs[1];
auto shapeA = A->getDims();
auto shapeB = B->getDims();
int rankA = A->getRank();
int rankB = B->getRank();
Shape shapeA1(shapeA.begin(), shapeA.begin() + (rankA - 2));
Shape shapeB1(shapeB.begin(), shapeB.begin() + (rankB - 2));
Shape ret = infer_broadcast(shapeA1, shapeB1);
if (ret.empty()) {
b = 1;
} else {
b = std::accumulate(ret.begin(), ret.end(), 1, std::multiplies<int>());
}
IT_ASSERT(*(shapeA.rbegin()) == *(shapeB.rbegin() + 1));
m = *(shapeA.rbegin() + 1);
n = *(shapeB.rbegin());
k = *(shapeA.rbegin());
ret.emplace_back(m);
ret.emplace_back(n);
return {{ret}};
}
vector<DataType>
MatmulIntegerObj::inferDataType(const TensorVec &inputs) const {
for (auto &input : inputs) {
IT_ASSERT(input->getDType() == DataType::Int8 ||
input->getDType() == DataType::UInt8);
}
if (inputs.size() >= 3) {
IT_ASSERT(inputs[0]->getDType() == inputs[2]->getDType());
}
if (inputs.size() == 4) {
IT_ASSERT(inputs[1]->getDType() == inputs[3]->getDType());
}
return vector(numOutputs(), DataType::Int32);
}
vector<int> MatmulIntegerObj::getWorkloadVector() const {
return {type.underlying(), b, m, n, k};
}
vector<int> MatmulIntegerObj::getOpAttrVector() const {
return {type.underlying()};
}
} // namespace infini

View File

@@ -187,4 +187,42 @@ TEST(ConcatToIdentity, Cuda) {
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}));
}
//----------
TEST(ConcatFp16, CudaHigh) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto t1 = gCpu->addTensor({2, 2, 3, 1, 2}, DataType::Float16);
auto t2 = gCpu->addTensor({2, 2, 1, 1, 2}, DataType::Float16);
auto t3 = gCpu->addTensor({2, 2, 2, 1, 2}, DataType::Float16);
gCpu->dataMalloc();
t1->setData(ValGenerator<2>());
t2->setData(ValGenerator<1>());
t3->setData(ValGenerator<4>());
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto t1Gpu = gCuda->cloneTensor(t1);
auto t2Gpu = gCuda->cloneTensor(t2);
auto t3Gpu = gCuda->cloneTensor(t3);
auto op =
gCuda->addOp<ConcatObj>(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2);
gCuda->dataMalloc();
t1Gpu->setData(ValGenerator<2>());
t2Gpu->setData(ValGenerator<1>());
t3Gpu->setData(ValGenerator<4>());
cudaRuntime->run(gCuda);
// cudaPrintTensor(op->getOutput());
// copy output from CUDA to CPU
auto oCpu = gCpu->cloneTensor(op->getOutput());
EXPECT_TRUE(oCpu->equalData(vector<float>{
2., 2., 2., 2., 2., 2., 1., 1., 4., 4., 4., 4., 2., 2., 2., 2.,
2., 2., 1., 1., 4., 4., 4., 4., 2., 2., 2., 2., 2., 2., 1., 1.,
4., 4., 4., 4., 2., 2., 2., 2., 2., 2., 1., 1., 4., 4., 4., 4.}));
}
} // namespace infini

View File

@@ -160,8 +160,8 @@ TEST(cuDNN_ConvTransposed, tune) {
    bool tune = true;
    cuda->run(gCuda, tune);
    // check record
    auto kernelAttrs =
        KernelAttrs{Device::CUDA, conv->getOpType().underlying()};
    auto perfKey = PerfEngine::Key{kernelAttrs, conv->getOpPerfKey()};
    std::optional<PerfRecord> perfData =
        PerfEngine::getInstance().getPerfData(perfKey);

View File

@@ -0,0 +1,219 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dequantize_linear.h"
#include "test.h"
namespace infini {
void test_dequantizeLinearFp32(
const Shape &inputXShape, const vector<uint8_t> &inputXData,
const Shape &inputScaleShape, const vector<float> &inputScaleData, int axis,
const vector<float> &ExpectData,
const std::optional<Shape> &zeroPointShape = std::nullopt,
const std::optional<std::vector<uint8_t>> &inputZeroPointData =
std::nullopt) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
Shape inputZeroPointShape = *zeroPointShape;
auto inputZeroPoint =
gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
gCpu->dataMalloc();
inputZeroPoint->copyin(*inputZeroPointData); //
inputX->copyin(inputXData);
inputScale->copyin(inputScaleData); //
// inputX->printData();
// inputZeroPoint->printData();
// inputScale->printData();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputScaleGpu = gCuda->cloneTensor(inputScale);
auto op = gCuda->addOp<DequantizeLinearObj>(
inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
axis); // DequantizeLinearObj
gCuda->dataMalloc();
inputZeroPointGpu->copyin(*inputZeroPointData);
// gCpu->cloneTensor(inputZeroPointGpu)->printData();
inputXGpu->copyin(inputXData);
inputScaleGpu->copyin(inputScaleData);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
} else {
auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
gCpu->dataMalloc();
inputX->copyin(inputXData);
inputScale->copyin(inputScaleData); //
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputScaleGpu = gCuda->cloneTensor(inputScale);
auto op = gCuda->addOp<DequantizeLinearObj>(
inputXGpu, inputScaleGpu, nullptr, nullptr,
axis); // DequantizeLinearObj
gCuda->dataMalloc();
inputXGpu->copyin(inputXData);
inputScaleGpu->copyin(inputScaleData);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
}
void test_dequantizeLinearFp16(
const Shape &inputXShape, const vector<uint8_t> &inputXData,
const Shape &inputScaleShape,
const std::function<void(void *, size_t, DataType)> &generator, int axis,
const vector<float> &ExpectData,
const std::optional<Shape> &zeroPointShape = std::nullopt,
const std::optional<std::vector<uint8_t>> &inputZeroPointData =
std::nullopt) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
Shape inputZeroPointShape = *zeroPointShape;
auto inputZeroPoint =
gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
gCpu->dataMalloc();
inputZeroPoint->copyin(*inputZeroPointData); //
// inputZeroPoint->printData();
inputX->copyin(inputXData);
inputScale->setData(generator);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputScaleGpu = gCuda->cloneTensor(inputScale);
// gCpu->cloneTensor(inputZeroPointGpu)->printData();
auto op = gCuda->addOp<DequantizeLinearObj>(
inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
axis); // DequantizeLinearObj
gCuda->dataMalloc();
inputZeroPointGpu->copyin(*inputZeroPointData);
// gCpu->cloneTensor(inputZeroPointGpu)->printData();
inputXGpu->copyin(inputXData);
inputScaleGpu->setData(generator);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
} else {
auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
gCpu->dataMalloc();
inputX->copyin(inputXData);
inputScale->setData(generator);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputXGpu = gCuda->cloneTensor(inputX);
auto inputScaleGpu = gCuda->cloneTensor(inputScale);
auto op = gCuda->addOp<DequantizeLinearObj>(
inputXGpu, inputScaleGpu, nullptr, nullptr,
axis); // DequantizeLinearObj
gCuda->dataMalloc();
inputXGpu->copyin(inputXData);
inputScaleGpu->setData(generator);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
}
TEST(CUDA_DequantizeLinearFp32, run) {
test_dequantizeLinearFp32(
Shape{2, 3, 2, 3},
vector<uint8_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
vector<float>{-0.3000000, 0.0000000, 0.3000000, 0.6000000,
0.9000000, 1.2000000, 0.8000000, 1.0000000,
1.2000000, 1.4000000, 1.6000000, 1.8000001,
4.5000000, 5.0000000, 5.5000000, 6.0000000,
6.5000000, 7.0000000, 5.1000004, 5.4000001,
5.7000003, 6.0000000, 6.3000002, 6.6000004,
4.4000001, 4.5999999, 4.8000002, 5.0000000,
5.2000003, 5.4000001, 13.5000000, 14.0000000,
14.5000000, 15.0000000, 15.5000000, 16.0000000},
Shape{3}, vector<uint8_t>{1, 2, 3});
test_dequantizeLinearFp32(
Shape{2, 3, 2, 3},
vector<uint8_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
vector<float>{0.0000000, 0.3000000, 0.6000000, 0.9000000,
1.2000000, 1.5000000, 1.2000000, 1.4000000,
1.6000000, 1.8000001, 2.0000000, 2.2000000,
6.0000000, 6.5000000, 7.0000000, 7.5000000,
8.0000000, 8.5000000, 5.4000001, 5.7000003,
6.0000000, 6.3000002, 6.6000004, 6.9000001,
4.8000002, 5.0000000, 5.2000003, 5.4000001,
5.5999999, 5.8000002, 15.0000000, 15.5000000,
16.0000000, 16.5000000, 17.0000000, 17.5000000});
} // python output
TEST(CUDA_DequantizeLinearFp16, run) {
test_dequantizeLinearFp16(
Shape{2, 3, 2, 3},
vector<uint8_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
Shape{3}, ValGenerator<2>(), 1,
vector<float>{-2., 0., 2., 4., 6., 8., 8., 10., 12.,
14., 16., 18., 18., 20., 22., 24., 26., 28.,
34., 36., 38., 40., 42., 44., 44., 46., 48.,
50., 52., 54., 54., 56., 58., 60., 62., 64.},
Shape{3}, vector<uint8_t>{1, 2, 3});
test_dequantizeLinearFp16(
Shape{2, 3, 2, 3},
vector<uint8_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
Shape{3}, ValGenerator<2>(), 1,
vector<float>{0., 2., 4., 6., 8., 10., 12., 14., 16.,
18., 20., 22., 24., 26., 28., 30., 32., 34.,
36., 38., 40., 42., 44., 46., 48., 50., 52.,
54., 56., 58., 60., 62., 64., 66., 68., 70.});
} // python output
} // namespace infini

View File

@@ -0,0 +1,74 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dynamic_quantize_linear.h"
#include "test.h"
namespace infini {
void test_dynamicquantizeLinearFp32(
const Shape &inputShape, const vector<float> &inputData,
const vector<uint8_t> &outputYData, const vector<float> &outputYScaleData,
const vector<uint8_t> &outputYZeroPointData) {
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph gCpu = make_ref<GraphObj>(runtime);
auto input = gCpu->addTensor(inputShape, DataType::Float32);
gCpu->dataMalloc();
input->copyin(inputData);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = gCuda->cloneTensor(input);
auto op = gCuda->addOp<DynamicQuantizeLinearObj>(
inputGpu,
std::nullopt); // DynamicQuantizeLinear
gCuda->dataMalloc();
inputGpu->copyin(inputData);
cudaRuntime->run(gCuda);
EXPECT_EQ(op->getOutputs().size(), (size_t)3);
auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
auto o2Cpu = gCpu->cloneTensor(op->getOutput(2));
EXPECT_TRUE(o0Cpu->equalData(outputYData));
EXPECT_TRUE(o1Cpu->equalData(outputYScaleData));
EXPECT_TRUE(o2Cpu->equalData(outputYZeroPointData));
}
TEST(CUDA_DynamicquantizeLinearFp32, run) {
test_dynamicquantizeLinearFp32(
Shape{2, 3, 2, 3},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
9., 10., 11., 12., 13., 14., 15., 16., 17.,
18., 19., 20., 21., 22., 23., 24., 25., 26.,
27., 28., 29., 30., 31., 32., 33., 34., 35.},
vector<uint8_t>{0, 7, 15, 22, 29, 36, 44, 51, 58,
66, 73, 80, 87, 95, 102, 109, 117, 124,
131, 138, 146, 153, 160, 168, 175, 182, 189,
197, 204, 211, 219, 226, 233, 240, 248, 255},
vector<float>{0.1372549}, vector<uint8_t>{0});
test_dynamicquantizeLinearFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
vector<uint8_t>{0, 11, 22, 33, 44, 55, 67, 78,
89, 100, 111, 122, 133, 144, 155, 166,
177, 188, 200, 211, 222, 233, 244, 255},
vector<float>{0.0901961}, vector<uint8_t>{0});
} // python output
} // namespace infini

View File

@@ -8,7 +8,7 @@
namespace infini {

void test_layernormFp32(
    const Shape &inputShape, const vector<float> &inputData,
    const Shape &scaleShape, const vector<float> &scaleData, float eps,
    int axis, int stash_type, const vector<float> &ExpectData,
@@ -77,9 +77,78 @@ void test_layernorm(
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    }
}
void test_layernormFp16(
const Shape &inputShape,
const std::function<void(void *, size_t, DataType)> &generator,
const Shape &scaleShape, float eps, int axis, int stash_type,
const vector<float> &ExpectData,
const std::optional<Shape> &bShape = std::nullopt) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);
if (bShape.has_value()) {
Shape biasShape = *bShape;
auto bias = gCpu->addTensor(biasShape, DataType::Float16);
auto input = gCpu->addTensor(inputShape, DataType::Float16);
auto scale = gCpu->addTensor(scaleShape, DataType::Float16);
gCpu->dataMalloc();
bias->setData(generator);
// bias->printData();
input->setData(generator);
scale->setData(generator);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto biasGpu = gCuda->cloneTensor(bias);
auto inputGpu = gCuda->cloneTensor(input);
auto scaleGpu = gCuda->cloneTensor(scale);
// gCpu->cloneTensor(biasGpu)->printData();
auto op =
gCuda->addOp<LayerNormObj>(inputGpu, scaleGpu, nullptr, biasGpu,
eps, axis, stash_type); // LayernormObj
gCuda->dataMalloc();
biasGpu->setData(generator);
// gCpu->cloneTensor(biasGpu)->printData();
inputGpu->setData(generator);
scaleGpu->setData(generator);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
} else {
auto input = gCpu->addTensor(inputShape, DataType::Float16);
auto scale = gCpu->addTensor(scaleShape, DataType::Float16);
gCpu->dataMalloc();
input->setData(generator);
scale->setData(generator);
auto cudaRuntime = make_ref<CudaRuntimeObj>();
Graph gCuda = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = gCuda->cloneTensor(input);
auto scaleGpu = gCuda->cloneTensor(scale);
auto op =
gCuda->addOp<LayerNormObj>(inputGpu, scaleGpu, nullptr, nullptr,
eps, axis, stash_type); // LayernormObj
gCuda->dataMalloc();
inputGpu->setData(generator);
scaleGpu->setData(generator);
cudaRuntime->run(gCuda);
auto oCpu =
gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
oCpu->printData(); //->printData
EXPECT_TRUE(oCpu->equalData(ExpectData));
}
}
TEST(CUDA_LayernormFp32, run) {
test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
                      9., 10., 11., 12., 13., 14., 15., 16., 17.,
@@ -94,7 +163,7 @@ TEST(CUDA_Layernorm, run) {
            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678},
        Shape{3}, vector<float>{0, 0, 0});
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
                      9., 10., 11., 12., 13., 14., 15., 16., 17.,
@@ -109,7 +178,7 @@ TEST(CUDA_Layernorm, run) {
            -0.0674207, 0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679,
            -0.0674207, 0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679},
        Shape{3}, vector<float>{0.3, 0.2, 0.5});
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
                      9., 10., 11., 12., 13., 14., 15., 16., 17.,
@@ -124,7 +193,7 @@ TEST(CUDA_Layernorm, run) {
            -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207,
            -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207},
        Shape{3}, vector<float>{0.3, 0.2, 0.5});
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0., 1., 2., 3., 4., 5., 6., 7., 8.,
                      9., 10., 11., 12., 13., 14., 15., 16., 17.,
@@ -141,6 +210,15 @@ TEST(CUDA_Layernorm, run) {
            0.0000000, 0.6123678, -0.3674207, 0.0000000,
            0.6123678, -0.3674207, 0.0000000, 0.6123678});
} // python output
TEST(CUDA_LayernormFp16, run) {
test_layernormFp16(Shape{2, 3, 2, 3}, ValGenerator<2>(), Shape{3}, 1e-5, 3,
1, vector<float>{2., 2., 2., 2., 2., 2., 2., 2., 2.,
2., 2., 2., 2., 2., 2., 2., 2., 2.,
2., 2., 2., 2., 2., 2., 2., 2., 2.,
2., 2., 2., 2., 2., 2., 2., 2., 2.},
Shape{3});
} // python output
} // namespace infini
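The all-2 expectation in the FP16 test is what the LayerNorm formula predicts: with a constant input the normalized term (x - mean) / sqrt(var + eps) is 0, so the output collapses to the bias, and ValGenerator<2>() sets the bias to 2 as well. A reference sketch over the last axis (simplified layout):

#include <cmath>
#include <vector>

// y = (x - mean) / sqrt(var + eps) * scale + bias, per row of the last axis
std::vector<float> layerNormRef(const std::vector<float> &x,
                                const std::vector<float> &scale,
                                const std::vector<float> &bias,
                                size_t lastDim, float eps = 1e-5f) {
    std::vector<float> y(x.size());
    for (size_t r = 0; r < x.size() / lastDim; ++r) {
        const float *xr = &x[r * lastDim];
        float mean = 0.f, var = 0.f;
        for (size_t i = 0; i < lastDim; ++i) mean += xr[i];
        mean /= lastDim;
        for (size_t i = 0; i < lastDim; ++i) var += (xr[i] - mean) * (xr[i] - mean);
        var /= lastDim;
        for (size_t i = 0; i < lastDim; ++i)
            y[r * lastDim + i] =
                (xr[i] - mean) / std::sqrt(var + eps) * scale[i] + bias[i];
    }
    return y;
}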

View File

@@ -0,0 +1,68 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/matmul_integer.h"
#include "test.h"
namespace infini {
using ExpectOutput = vector<int32_t>;
TEST(cuBLAS_MatmulInteger, ZeroPoint1) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
auto gCuda = make_ref<GraphObj>(cudaRuntime);
auto ACuda = gCuda->addTensor({1, 4}, DataType::UInt8);
auto BCuda = gCuda->addTensor({4, 12}, DataType::UInt8);
auto AZeroPointCuda = gCuda->addTensor({}, DataType::UInt8);
auto BZeroPointCuda = gCuda->addTensor({}, DataType::UInt8);
auto op = gCuda->addOp<MatmulIntegerObj>(ACuda, BCuda, nullptr,
AZeroPointCuda, BZeroPointCuda);
// allocate CUDA memory
gCuda->dataMalloc();
// ACuda->copyin(vector<uint8_t>{11, 7, 3, 10, 6, 2, 9, 5, 1, 8, 4, 0});
ACuda->copyin(vector<uint8_t>{11, 7, 3, 10});
// BCuda->copyin(vector<uint8_t>({1, 4, 2, 5, 3, 6,}));
BCuda->copyin(vector<uint8_t>(48, 1));
AZeroPointCuda->copyin(vector<uint8_t>{12});
BZeroPointCuda->copyin(vector<uint8_t>{0});
cudaRuntime->run(gCuda);
auto result = op->getOutput()->clone(NativeCpuRuntimeObj::getInstance());
// ExpectOutput ans = {
// -38, -83, -44, -98, -50, -113, -56, -128,
// };
ExpectOutput ans = {-17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17};
EXPECT_TRUE(result->equalData(ans));
}
TEST(cuBLAS_MatmulInteger, ZeroPoint2) {
auto cudaRuntime = make_ref<CudaRuntimeObj>();
auto gCuda = make_ref<GraphObj>(cudaRuntime);
auto ACuda = gCuda->addTensor({2, 3, 1, 4}, DataType::UInt8);
auto BCuda = gCuda->addTensor({2, 3, 4, 12}, DataType::UInt8);
auto AZeroPointCuda = gCuda->addTensor({2, 3, 1, 1}, DataType::UInt8);
auto BZeroPointCuda = gCuda->addTensor({2, 3, 1, 12}, DataType::UInt8);
auto op = gCuda->addOp<MatmulIntegerObj>(ACuda, BCuda, nullptr,
AZeroPointCuda, BZeroPointCuda);
// allocate CUDA memory
gCuda->dataMalloc();
ACuda->copyin(vector<uint8_t>{11, 7, 3, 10, 11, 7, 3, 10, 11, 7, 3, 10,
11, 7, 3, 10, 11, 7, 3, 10, 11, 7, 3, 10});
BCuda->copyin(vector<uint8_t>(288, 1));
AZeroPointCuda->copyin(vector<uint8_t>(6, 12));
BZeroPointCuda->copyin(vector<uint8_t>(72, 0));
cudaRuntime->run(gCuda);
auto result = op->getOutput()->clone(NativeCpuRuntimeObj::getInstance());
ExpectOutput ans = {-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
-17, -17, -17, -17, -17, -17};
EXPECT_TRUE(result->equalData(ans));
}
}; // namespace infini
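The repeated -17 in both tests matches MatMulInteger's definition, C[i][j] = sum_k (A[i][k] - aZp) * (B[k][j] - bZp) accumulated in int32: the A row {11, 7, 3, 10} shifted by aZp = 12 gives -1 - 5 - 9 - 2 = -17 against all-ones B. A reference sketch with scalar zero points (the operator also accepts per-row/per-column zero points):

#include <cstdint>
#include <vector>

std::vector<int32_t> matmulIntegerRef(const std::vector<uint8_t> &A,
                                      const std::vector<uint8_t> &B, int m,
                                      int k, int n, uint8_t aZp, uint8_t bZp) {
    std::vector<int32_t> C(size_t(m) * n, 0);
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            for (int p = 0; p < k; ++p)
                C[size_t(i) * n + j] += (int32_t(A[size_t(i) * k + p]) - aZp) *
                                        (int32_t(B[size_t(p) * n + j]) - bZp);
    return C;
}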

View File

@@ -8,130 +8,127 @@
#include <cmath>

namespace infini {

void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
                      int axis, const vector<float> &ExpectData) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);
    auto input = gCpu->addTensor(inputShape, DataType::Float32);
    gCpu->dataMalloc();
    input->copyin(inputData);

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = gCuda->cloneTensor(input);

    auto op = gCuda->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
    gCuda->dataMalloc();
    inputGpu->copyin(inputData);
    cudaRuntime->run(gCuda);

    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
    oCpu->printData(); //->printData
    EXPECT_TRUE(oCpu->equalData(ExpectData));
}

void test_softmaxFp16(
    const Shape &inputShape,
    const std::function<void(void *, size_t, DataType)> &generator, int axis,
    const vector<float> &ExpectData) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);
    auto input = gCpu->addTensor(inputShape, DataType::Float16);
    gCpu->dataMalloc();
    input->setData(generator);

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = gCuda->cloneTensor(input);

    auto op = gCuda->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
    gCuda->dataMalloc();
    inputGpu->setData(generator);
    cudaRuntime->run(gCuda);

    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
    oCpu->printData(); //->printData
    EXPECT_TRUE(oCpu->equalData(ExpectData));
}
TEST(CUDA_SoftmaxFP32, run) {
test_softmaxFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
test_softmaxFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
1, vector<float>{3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
3.29320435e-04, 1.79802869e-02, 1.79802869e-02,
1.79802869e-02, 1.79802869e-02, 9.81690347e-01,
9.81690347e-01, 9.81690347e-01, 9.81690347e-01,
3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
3.29320435e-04, 1.79802869e-02, 1.79802869e-02,
1.79802869e-02, 1.79802869e-02, 9.81690347e-01,
9.81690347e-01, 9.81690347e-01, 9.81690347e-01});
test_softmaxFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
2, vector<float>{0.11920292, 0.11920292, 0.88079703, 0.88079703,
0.11920292, 0.11920292, 0.88079703, 0.88079703,
0.11920292, 0.11920292, 0.88079703, 0.88079703,
0.11920292, 0.11920292, 0.88079703, 0.88079703,
0.11920292, 0.11920292, 0.88079703, 0.88079703,
0.11920292, 0.11920292, 0.88079703, 0.88079703});
test_softmaxFp32(
Shape{2, 3, 2, 2},
vector<float>{0., 1., 2., 3., 4., 5., 6., 7.,
8., 9., 10., 11., 12., 13., 14., 15.,
16., 17., 18., 19., 20., 21., 22., 23.},
3, vector<float>{0.26894143, 0.73105860, 0.26894143, 0.73105860,
0.26894143, 0.73105860, 0.26894143, 0.73105860,
0.26894143, 0.73105860, 0.26894143, 0.73105860,
0.26894143, 0.73105860, 0.26894143, 0.73105860,
0.26894143, 0.73105860, 0.26894143, 0.73105860,
0.26894143, 0.73105860, 0.26894143, 0.73105860});
} // python output
TEST(CUDA_SoftmaxFP16, run) {
test_softmaxFp16(Shape{2, 3, 2, 2}, ValGenerator<2>(), 0,
vector<float>{0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
0.5000, 0.5000, 0.5000, 0.5000});
test_softmaxFp16(
Shape{2, 3, 2, 2}, ValGenerator<2>(), 1, // data accuracy down
vector<float>{0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
0.333252, 0.333252, 0.333252, 0.333252});
} // python output

TEST(cuDNN_Softmax2, run_axis1) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
cudaPrintTensor(outputGpu);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
0.9820138, 0.9820138, 0.9820138, 0.9820138}));
}
TEST(cuDNN_Softmax2, run_axis2) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
cudaPrintTensor(outputGpu);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
0.1192029, 0.1192029, 0.8807971, 0.8807971}));
}
TEST(cuDNN_Softmax2, run_axis3) {
// Runtime
Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
auto cudaRuntime = make_ref<CudaRuntimeObj>();
// Build input data on CPU
Tensor inputCpu =
make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);
// GPU
Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
auto inputGpu = cudaGraph->cloneTensor(inputCpu);
auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
cudaGraph->dataMalloc();
inputGpu->setData(IncrementalGenerator());
cudaRuntime->run(cudaGraph);
auto outputGpu = gpuOp->getOutput();
auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
cudaPrintTensor(outputGpu);
// Check
EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
0.2689414, 0.7310586, 0.2689414, 0.7310586}));
}
} // namespace infini
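The expected pairs around 0.26894 / 0.73106 are exactly softmax over an axis where neighbouring inputs differ by 1: {1/(1+e), e/(1+e)}. A reference sketch for the innermost axis, with the usual max subtraction for numerical stability:

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmaxLastAxisRef(const std::vector<float> &x, size_t lastDim) {
    std::vector<float> y(x.size());
    for (size_t r = 0; r < x.size() / lastDim; ++r) {
        const float *xr = &x[r * lastDim];
        float maxVal = *std::max_element(xr, xr + lastDim);
        float sum = 0.f;
        for (size_t i = 0; i < lastDim; ++i) {
            y[r * lastDim + i] = std::exp(xr[i] - maxVal); // stabilize before exp
            sum += y[r * lastDim + i];
        }
        for (size_t i = 0; i < lastDim; ++i)
            y[r * lastDim + i] /= sum;
    }
    return y;
}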

Some files were not shown because too many files have changed in this diff.