forked from jiuyuan/InfiniTensor

Compare commits: master...support_fp (39 commits)

Author | SHA1
---|---
kilinchange | dc6befb549
zhangyunze | 935b465cf2
panzezhong | a91ed84354
kilinchange | e5ca66db66
panzezhong | 85de28ef1e
YdrMaster | c34946a0d8
xgqdut2016 | ce23b8356f
kilinchange | 7b48b93fb3
kilinchange | 8d901ba7aa
OdinaryWord | 8ae5958b29
kilinchange | 0e75f99e7e
zhangyunze | 97e3377ca5
xgqdut2016 | 9c82936386
kilinchange | 03ed8c4de7
kilinchange | c63ed4326d
xgqdut2016 | f51ce3231a
kilinchange | 9d9e996713
xgqdut2016 | c859e655d3
zhangyunze | e66f1c0421
xgqdut2016 | ff98241db7
OdinaryWord | 046b2d68d8
OdinaryWord | 2af4c1276b
OdinaryWord | db8c3eec15
OdinaryWord | c29dcf1e6d
zhangyunze | 5ed7db1506
zhangyunze | bdb8d8d65f
kilinchange | cbdeb73e86
kilinchange | 5af7f1e753
zhangyunze | ee4ecd27e2
zhangyunze | d5e775397d
kilinchange | 4b02de7e17
xgqdut2016 | e07516ebe9
xgqdut2016 | dd4a90fb5e
xgqdut2016 | fda0a5f982
xgqdut2016 | 8b2e3b8e19
xgqdut2016 | a000cb0304
kilinchange | c587901586
kilinchange | c19256bca6
kilinchange | 4db6699e09
@@ -212,7 +212,7 @@ if(USE_CUDA)
         ${CMAKE_CXX_COMPILER}
         CACHE STRING "Set cuda host compiler path")
     # CMP0104 requires CUDA_ARCHITECTURES
-    set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70;80")
+    set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES 80)
     enable_language(CUDA)
     find_package(CUDAToolkit) # For nvrtc and cuda driver
     target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
@@ -137,11 +137,19 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             place[node.output[0]] = Shard(list(perm).index(plc.dim))

     def shard_node(node: NodeProto):
-        if node.op_type in ["Relu", "Tanh", "Softmax"]:
+        if node.op_type in [
+            "Relu",
+            "Tanh",
+            "Softmax",
+            "Cast",
+            "DequantizeLinear",
+            "DynamicQuantizeLinear",
+            "Sigmoid",
+        ]:
             place[node.output[0]] = place[node.input[0]]
         elif node.op_type in ["Where"]:
             place[node.output[0]] = place[node.input[1]]
-        if node.op_type in {"Add", "Mul", "Div", "Max"}:
+        if node.op_type in {"Add", "Mul", "Div", "Max", "Pow"}:
             shard_binary(node)
         elif node.op_type == "Reshape":
             shard_reshape(node)
@@ -149,7 +157,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             shard_transpose(node)
         elif node.op_type == "Split":
             shard_split(node)
-        elif node.op_type == "MatMul":
+        elif node.op_type in {"MatMul", "MatMulInteger"}:
             assert (
                 place[node.input[0]] == place[node.input[1]]
             ), f"{place[node.input[0]]} != {place[node.input[1]]}"
@@ -173,11 +181,23 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
     for index, node in enumerate(model.graph.node):
         nodes.append(node)
         # linear
-        if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
-            input in data for input in node.input
-        ):
+        if (
+            node.op_type == "MatMul"
+            or node.op_type == "Gemm"
+            or node.op_type == "MatMulInteger"
+        ) and any(input in data for input in node.input):
             # FIXME(constroy): the last MatMul should not be sharded as TP.
-            if node.output[0] in output:
+            if (
+                node.output[0] in output
+                or (
+                    index + 1 < len(model.graph.node)
+                    and model.graph.node[index + 1].output[0] in output
+                )
+                or (
+                    index + 2 < len(model.graph.node)
+                    and model.graph.node[index + 2].output[0] in output
+                )
+            ):
                 continue
             groups = 1
             # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
@@ -202,7 +222,9 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             )
             place[node.output[0]] = Replicate()
             node.output[0] = new_name
-            if len(node.input) > 2:  # split bias to add
+            if (
+                len(node.input) > 2 and node.op_type != "MatMulInteger"
+            ):  # split bias to add
                 prev = nodes[-1]
                 new_name = prev.output[0] + "_no_bias"
                 place[new_name] = place[node.output[0]]
@@ -85,6 +85,8 @@ class GraphHandlerObj {
     Tensor cast(Tensor input, Tensor output, int to);
     Tensor expand(Tensor input, Tensor output, Shape dims);
     Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
+    Tensor dequantizeLinear(Tensor inputX, Tensor inputScale, Tensor output,
+                            Tensor inputZeroPoint, int axis);
     std::vector<int> getDims(Tensor x) { return x->getDims(); }

     Tensor allReduceSum(Tensor input, Tensor output);
@@ -99,6 +101,10 @@ class GraphHandlerObj {
                    int outputType, Tensor input);
     Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
                         std::string mode);
+    TensorVec dynamicQuantizeLinear(Tensor input,
+                                    std::optional<TensorVec> outputs);
+    Tensor matmulInteger(Tensor inputA, Tensor inputB, Tensor output,
+                         Tensor a_zero_point, Tensor b_zero_point);

     //------ modifiers

@@ -29,7 +29,6 @@ class Kernel {
   public:
     Kernel() {}
     virtual ~Kernel() {}

     /**
      * @param op The operator to be executed.
      * @param record The parameters for kernel execution. If extra parameters
@@ -105,8 +104,7 @@ class KernelRegistry {
         IT_ASSERT(it != kernels.end(),
                   "Kernel not found for key {" +
                       to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
-                      ", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
-                      std::get<2>(kernelAttrs).toString() + "}");
+                      ", " + std::to_string(std::get<1>(kernelAttrs)) + "}");
         return std::get<0>(it->second);
     }
     const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@@ -131,15 +129,16 @@ class CpuKernelWithoutConfig : public Kernel {

 } // namespace infini

-#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt) \
+#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt) \
     namespace infini { \
     static const bool _CAT(_register_kernel_, cnt) = \
-        KernelRegistry::getInstance().registerKernel( \
-            KernelAttrs{device, opType, dataType}, new kernel(), name); \
+        KernelRegistry::getInstance().registerKernel(KernelAttrs{device, \
+                                                                 opType}, \
+                                                     new kernel(), name); \
     }

-#define REGISTER_KERNEL(device, opType, dataType, kernel, name) \
-    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, kernel, name) \
+    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)

 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt) \
     namespace infini { \
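Note: with DataType dropped from KernelAttrs, a kernel is registered once per (device, op type) and selects the element type at run time inside compute(). The snippet below is an illustrative sketch only, not part of this diff; it mirrors the doCompute/switch dispatch the CPU kernels later in this diff use together with the new 4-argument REGISTER_KERNEL (assumes the usual kernel headers and <algorithm> are included).

// Sketch: one kernel class serving several dtypes under the new registration.
class NaiveRelu : public CpuKernelWithoutConfig {
    template <typename T>
    void doCompute(const Operator &_op, const RuntimeObj *context) const {
        auto op = as<UnaryObj>(_op);
        T *in = op->getInputs(0)->getRawDataPtr<T *>();
        T *out = op->getOutput()->getRawDataPtr<T *>();
        for (size_t i = 0, n = op->getOutput()->size(); i < n; ++i)
            out[i] = std::max(T(0), in[i]);
    }
    void compute(const Operator &_op,
                 const RuntimeObj *context) const override {
        switch (_op->getDType().getIndex()) { // dtype decided at run time
        case 1:  doCompute<float>(_op, context); break;    // DataType::Float32
        case 12: doCompute<uint32_t>(_op, context); break; // DataType::UInt32
        default: IT_TODO_HALT();
        }
    }
};
REGISTER_KERNEL(Device::CPU, OpType::Relu, NaiveRelu, "ReluNaive_CPU");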
@@ -4,7 +4,7 @@
 #include "core/tensor.h"

 namespace infini {
-using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
+using KernelAttrs = std::tuple<Device, OpType::underlying_t>;

 struct OpPerfKey {
     HashType hash;
@@ -90,6 +90,8 @@ class OperatorObj : public Object {
     OpType getOpType() const { return type; }
     // HACK: set correct data type
     DataType getDType() const { return getInputs(0)->getDType(); }
+    DataType getInDType() const { return getInputs(0)->getDType(); }
+    DataType getOutDType() const { return getOutput()->getDType(); }
     virtual int numInputs() const = 0;
     virtual int numOutputs() const = 0;

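Note: getDType() keeps its old meaning (the first input's type); the new getInDType()/getOutDType() pair matters for operators whose input and output element types differ, such as Cast, MatMulInteger or DequantizeLinear. A hedged sketch, assuming the output type of DequantizeLinear follows its scale tensor; x_u8, scale_f32 and zp_u8 are illustrative tensors, not names from this diff:

// Sketch: input and output dtypes of a mixed-precision op now have
// distinct accessors.
auto dq = g->addOp<DequantizeLinearObj>(x_u8, scale_f32, nullptr,
                                        zp_u8, /*axis=*/1);
IT_ASSERT(dq->getInDType() == DataType::UInt8);    // type of input 0
IT_ASSERT(dq->getOutDType() == DataType::Float32); // assumed: follows scale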
@@ -42,8 +42,16 @@ class TensorObj : public TensorBaseObj {
     bool isOutput() const { return tensorType == TensorType::output; }
     bool isOthers() const { return tensorType == TensorType::others; }
     void setWeight() { tensorType = TensorType::weight; }
-    void setInput() { tensorType = TensorType::input; }
-    void setOutput() { tensorType = TensorType::output; }
+    void setInput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::input;
+        }
+    }
+    void setOutput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::output;
+        }
+    }
     string tensorTypeToString() const {
         switch (tensorType) {
         case TensorType::weight:
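Note: the new guards make setInput()/setOutput() no-ops for tensors already tagged as weights, so a front end that also lists a weight among the graph inputs can no longer downgrade it; this interacts with the weight-only allocation branch added to GraphObj::dataMalloc later in this diff. A small sketch, assuming the usual GraphObj::addTensor helper:

Tensor w = g->addTensor(Shape{4, 4}, DataType::Float32);
w->setWeight();
w->setInput();            // ignored now: w keeps TensorType::weight
IT_ASSERT(w->isWeight()); // still true after the call above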
@@ -0,0 +1,17 @@
+#pragma once
+#include "operators/dequantize_linear.h"
+
+namespace infini {
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
+                            float *output, const int dimsize, const int stride,
+                            const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const uint8_t *inputZeroPoint, const int size);
+void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
+                            half *output, const int dimsize, const int stride,
+                            const int size);
+}; // namespace infini
@@ -0,0 +1,7 @@
+#pragma once
+#include "operators/dynamic_quantize_linear.h"
+
+namespace infini {
+void dynamicQuantizeLinearKernel(float *input, uint8_t *outputY, float *yScale,
+                                 uint8_t *yZeroPoint, int size);
+}; // namespace infini
@@ -1,13 +1,16 @@
 #pragma once

 namespace infini {
-void div_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void add_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void pow_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void less_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
-                 int b0, int b1, int b2, int b3, int c0, int c1, int c2,
-                 int c3);
+void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                 int c2, int c3);
 }; // namespace infini
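Note: the element-wise launchers now take the element type as a leading runtime index instead of being float-only; presumably the index is DataType::getIndex(), matching the DT_CUDA table added to the CUDA utility header further down. A hedged calling sketch (the caller shown here is illustrative; a, b, c are the 4-D padded shapes of the two inputs and the output):

void *aData = op->getInputs(0)->getRawDataPtr<void *>();
void *bData = op->getInputs(1)->getRawDataPtr<void *>();
void *cData = op->getOutput()->getRawDataPtr<void *>();
add_kernel(op->getDType().getIndex(), aData, bData, cData,
           a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3],
           c[0], c[1], c[2], c[3]);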
@@ -3,7 +3,8 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
-void expandKernel(float *input, float *output, int nDims, int outputsize,
-                  SmallArray inputShape, SmallArray outputShape);
+void expandKernel(int dType, void *input, void *output, int nDims,
+                  int outputsize, SmallArray inputShape,
+                  SmallArray outputShape);

 }; // namespace infini
@@ -8,4 +8,10 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
 void LaynormKernel(const float *input, const float *scale, const float eps,
                    int size, int scaleSize, const int dimsize, const int stride,
                    float *output);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output, const half *bias, int biasSize);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output);
 }; // namespace infini
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace infini {
+void subA_kernel(int dType, void *a, void *b, int size, int k, int delta);
+void subB_kernel(int dType, void *a, void *b, int size, int k, int n,
+                 int delta);
+}; // namespace infini
@@ -3,4 +3,6 @@
 namespace infini {
 void softmax_kernel(int num_blocks, float *input, float *output, int size,
                     int dimsize, int stride);
-}
+void softmax_kernel(int num_blocks, half *input, half *output, int size,
+                    int dimsize, int stride);
+} // namespace infini
@@ -8,8 +8,8 @@ const int DIM_MAX_SIZE = 8;
 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-struct ElementTensorMetadata {
-    float *data[BATCH_SIZE];
+template <typename T> struct ElementTensorMetadata {
+    T *data[BATCH_SIZE];
     int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                              // the composed tensor.
     int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@@ -20,16 +20,17 @@ struct ElementTensorMetadata {
                    data[i], dimBgNo[i], dimSize[i], nElements[i]);
     }
 };

-struct ComposedTensorMetadata {
+template <typename T> struct ComposedTensorMetadata {
     int dimSize[DIM_MAX_SIZE];
     int stride[DIM_MAX_SIZE];
-    float *data;
+    T *data;
 };

 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata &eleMeta,
-                         const ComposedTensorMetadata &compMeta, int dim,
+void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
+                         const ComposedTensorMetadata<float> &compMeta, int dim,
                          int batchSize, int nDims, bool isSplit);
+void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
+                         const ComposedTensorMetadata<half> &compMeta, int dim,
+                         int batchSize, int nDims, bool isSplit);

 } // namespace infini
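Note: templating the metadata lets the same split/concat launcher move fp16 data. A sketch of filling half-precision metadata for one output piece; the pointers and sizes here are illustrative placeholders, not values from this diff:

ElementTensorMetadata<half> eleMeta;
ComposedTensorMetadata<half> compMeta;
eleMeta.data[0] = outPtr0;   // raw device pointer of the first output piece
eleMeta.dimBgNo[0] = 0;      // where this piece starts along `dim`
eleMeta.dimSize[0] = 2;      // its extent along `dim`
eleMeta.nElements[0] = 2 * 16;
compMeta.data = inPtr;       // the composed (input) tensor
// ... fill compMeta.dimSize / compMeta.stride per dimension ...
split_concat_kernel(eleMeta, compMeta, /*dim=*/1, /*batchSize=*/1,
                    /*nDims=*/2, /*isSplit=*/true);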
@@ -5,7 +5,7 @@

 namespace infini {

-void transpose_kernel(float *input, float *output, int nDims, int size,
+void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
                       SmallArray strides, SmallArray outputShape);

 }; // namespace infini
@ -3,48 +3,21 @@
|
|||
#include "operators/unary.h"
|
||||
|
||||
namespace infini {
|
||||
void softmax_kernel(float *input, float *output, size_t num);
|
||||
void relu_kernel(float *input, float *output, size_t num);
|
||||
void sigmoid_kernel(float *input, float *output, size_t num);
|
||||
void tanh_kernel(float *input, float *output, size_t num);
|
||||
void abs_kernel(float *input, float *output, size_t num);
|
||||
void sqrt_kernel(float *input, float *output, size_t num);
|
||||
void neg_kernel(float *input, float *output, size_t num);
|
||||
void gelu_kernel(float *input, float *output, size_t num);
|
||||
void erf_kernel(float *input, float *output, size_t num);
|
||||
void hard_sigmoid_kernel(float *input, float *output, size_t num);
|
||||
void hard_swish_kernel(float *input, float *output, size_t num);
|
||||
template <typename T> void softmax_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void relu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void tanh_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void abs_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void neg_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void gelu_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void erf_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
|
||||
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
|
||||
|
||||
void unary_kernel(const Operator &_op) {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
|
||||
float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
|
||||
template <typename INPUT, typename OUTPUT>
|
||||
void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
|
||||
|
||||
size_t num = op->getOutput()->size();
|
||||
if (op->getOpType() == OpType::Softmax)
|
||||
softmax_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Relu)
|
||||
relu_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Sigmoid)
|
||||
sigmoid_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::HardSigmoid)
|
||||
hard_sigmoid_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::HardSwish)
|
||||
hard_swish_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Tanh)
|
||||
tanh_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Abs)
|
||||
abs_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Sqrt)
|
||||
sqrt_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Gelu)
|
||||
gelu_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Neg)
|
||||
neg_kernel(inputData, outputData, num);
|
||||
else if (op->getOpType() == OpType::Erf)
|
||||
erf_kernel(inputData, outputData, num);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
void unary_kernel(const Operator &_op);
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@@ -1,11 +1,29 @@
 #pragma once
 #include "core/tensor.h"
 #include "cuda/cuda_common.h"

 namespace infini {

 void cudaPrintFloat(float *x, int len);

-void cudaPrintTensor(const Tensor &tensor) {
-    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
-}
+void cudaPrintTensor(const Tensor &tensor);

-} // namespace infini
+cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
+cudaDataType cublasDataTypeConvert(DataType);
+
+template <int index> struct DT_CUDA {};
+template <> struct DT_CUDA<0> { using t = bool; };
+template <> struct DT_CUDA<1> { using t = float; };
+template <> struct DT_CUDA<2> { using t = unsigned char; };
+template <> struct DT_CUDA<3> { using t = char; };
+template <> struct DT_CUDA<4> { using t = unsigned short; };
+template <> struct DT_CUDA<5> { using t = short; };
+template <> struct DT_CUDA<6> { using t = int; };
+template <> struct DT_CUDA<7> { using t = long long; };
+template <> struct DT_CUDA<9> { using t = bool; };
+template <> struct DT_CUDA<10> { using t = half; };
+template <> struct DT_CUDA<11> { using t = double; };
+template <> struct DT_CUDA<12> { using t = unsigned int; };
+template <> struct DT_CUDA<13> { using t = unsigned long long; };
+template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
+} // namespace infini
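Note: DT_CUDA maps DataType index values to concrete CUDA element types, so a runtime index can pick a template instantiation. An illustrative sketch only; the copyTyped/copyByIndex helpers are hypothetical and not part of this diff:

// Sketch: turning a runtime dtype index into a compile-time element type.
template <int I> void copyTyped(void *dst, const void *src, size_t n) {
    using T = typename DT_CUDA<I>::t; // e.g. DT_CUDA<10>::t == half
    cudaMemcpy(dst, src, n * sizeof(T), cudaMemcpyDeviceToDevice);
}
void copyByIndex(int dType, void *dst, const void *src, size_t n) {
    switch (dType) {
    case 1:  copyTyped<1>(dst, src, n); break;  // float
    case 10: copyTyped<10>(dst, src, n); break; // half
    case 16: copyTyped<16>(dst, src, n); break; // nv_bfloat16
    default: IT_ASSERT(false, "Unsupported data type index");
    }
}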
@@ -3,10 +3,10 @@
 #include "utils/small_array.h"

 namespace infini {
-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                  int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                  SmallArray conditionShape, SmallArray outputShape, int xSize,
                  int ySize, int cSize);

 }; // namespace infini
@@ -53,7 +53,8 @@ inline void initGatherMetaData(GatherMetaData &metaData,
         metaData.inStride[i] = in->getStride()[i];
     }
 }
-void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
+template <typename T>
+void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);

 void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
                             size_t num);
@ -0,0 +1,44 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief y = (x - x_zero_point) *x_scale
|
||||
*
|
||||
*/
|
||||
class DequantizeLinearObj : public OperatorObj {
|
||||
int axis;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new DequantizeLinear object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param inputX The input tensor X.
|
||||
* @param inputScale The input tensor x_scale.
|
||||
* @param output The output tensor.
|
||||
* @param inputZeroPoint The z_zero_point.
|
||||
*/
|
||||
DequantizeLinearObj(GraphObj *graph, Tensor inputX, Tensor inputScale,
|
||||
Tensor output, Tensor inputZeroPoint = nullptr,
|
||||
int axis = 1);
|
||||
OP_CLONE(DequantizeLinearObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
|
||||
Tensor getZeroPoint() const {
|
||||
return inputs.size() > 2 ? inputs[2] : nullptr;
|
||||
}
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 1; }
|
||||
int getAxis() const { return axis; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief A Function to fuse calculation for Scale, Zero Point and FP32->8Bit
|
||||
* conversion of FP32 Input data.
|
||||
*
|
||||
*/
|
||||
class DynamicQuantizeLinearObj : public OperatorObj {
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new DynamicQuantizeLinear object.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param input The input tensor.
|
||||
* @param outputs The output tensors.
|
||||
*/
|
||||
DynamicQuantizeLinearObj(GraphObj *graph, Tensor input,
|
||||
std::optional<TensorVec> outputs);
|
||||
OP_CLONE(DynamicQuantizeLinearObj);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 3; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,63 @@
|
|||
#pragma once
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
/**
|
||||
* @brief Matrix multiplication.
|
||||
*
|
||||
*/
|
||||
class MatmulIntegerObj : public OperatorObj {
|
||||
private:
|
||||
// Auxiliary attributes which are not a part of operator attributes.
|
||||
int b, m, n, k;
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Matmul operator with batch broadcast and tensor transpose
|
||||
* supports. Only one tensor with singe batch can be broadcasted due to the
|
||||
* BLAS interface restriction. Tranpose indicates whether the last two
|
||||
* dimensions should be transposed before Matmul and does not affect other
|
||||
* leading dimensions.
|
||||
*
|
||||
* Matmul show how operators are defined in InfiniTensor. The constructor of
|
||||
* an operator can create output tensors for the operator or not, which
|
||||
* depends on `graph`.
|
||||
*
|
||||
* @param graph The computation graph that this operator belongs to.
|
||||
* @param A The input tensor.
|
||||
* @param B The input tensor.
|
||||
* @param C C is the output of Matmul. If outputs are going to be created in
|
||||
* the constructor, C should be an empty Ref.
|
||||
* @param a_zero_point Zero point tensor for input 'A'.
|
||||
* @param b_zero_point Zero point tensor for input 'B'.
|
||||
*/
|
||||
MatmulIntegerObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
|
||||
Tensor a_zero_point = nullptr,
|
||||
Tensor b_zero_point = nullptr);
|
||||
OP_CLONE(MatmulIntegerObj);
|
||||
|
||||
std::string toString() const override;
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
|
||||
|
||||
int numInputs() const override { return inputs.size(); }
|
||||
int numOutputs() const override { return 1; }
|
||||
|
||||
Tensor getZeroPointA() const {
|
||||
return inputs.size() > 2 ? inputs[2] : nullptr;
|
||||
}
|
||||
Tensor getZeroPointB() const {
|
||||
return inputs.size() > 3 ? inputs[3] : nullptr;
|
||||
}
|
||||
int getB() const { return b; }
|
||||
int getM() const { return m; }
|
||||
int getN() const { return n; }
|
||||
int getK() const { return k; }
|
||||
auto getBMNK() const { return tuple{b, m, n, k}; }
|
||||
vector<DataType> inferDataType(const TensorVec &inputs) const override;
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@@ -91,6 +91,12 @@ template <int val> class ValGenerator : public DataGenerator {
         fill<uint32_t>(data, size);
     }
     void fill(float *data, size_t size) override { fill<float>(data, size); }
+    void fill_fp16(uint16_t *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            float x = 1.0f * val;
+            data[i] = float_to_fp16(x);
+        }
+    }
 };
 typedef ValGenerator<1> OneGenerator;
 typedef ValGenerator<0> ZeroGenerator;

(One file diff is suppressed because it is too large.)
@ -102,6 +102,18 @@ class TestStringMethods(unittest.TestCase):
|
|||
matmul = make_node("MatMul", ["x", "a"], ["xa"], name="matmul")
|
||||
make_and_import_model(make_graph([matmul], "matmul", [x, a], [xa]))
|
||||
|
||||
def test_matmul_integer(self):
|
||||
A = make_tensor_value_info("A", TensorProto.INT8, [1, 2, 4])
|
||||
B = make_tensor_value_info("B", TensorProto.UINT8, [1, 4, 4])
|
||||
A_ZeroPoint = make_tensor_value_info("A_ZeroPoint", TensorProto.INT8, [1, 2, 1])
|
||||
y = make_tensor_value_info("y", TensorProto.INT32, [1, 2, 4])
|
||||
matmulInteger = make_node(
|
||||
"MatMulInteger", ["A", "B", "A_ZeroPoint"], ["y"], name="matmul_integer"
|
||||
)
|
||||
make_and_import_model(
|
||||
make_graph([matmulInteger], "matmul_integer", [A, B, A_ZeroPoint], [y])
|
||||
)
|
||||
|
||||
def test_gemm(self):
|
||||
a = make_tensor_value_info("a", TensorProto.FLOAT, [1, 2, 3])
|
||||
b = make_tensor_value_info("b", TensorProto.FLOAT, [1, 4, 3])
|
||||
|
|
|
@ -87,48 +87,33 @@ string GraphObj::toString() const {
|
|||
}
|
||||
|
||||
bool GraphObj::topo_sort() {
|
||||
if (this->sorted)
|
||||
if (this->sorted) {
|
||||
return true;
|
||||
|
||||
// std::unordered_set<Tensor> inputs;
|
||||
std::unordered_set<Operator> waiting(this->ops.begin(), this->ops.end());
|
||||
}
|
||||
std::vector<Operator> sorted;
|
||||
|
||||
while (!waiting.empty()) {
|
||||
std::unordered_set<OperatorObj *> flags;
|
||||
sorted.reserve(ops.size());
|
||||
flags.reserve(ops.size());
|
||||
while (sorted.size() < ops.size()) {
|
||||
// Any node is move to sorted in this loop.
|
||||
auto modified = false;
|
||||
// Find head nodes.
|
||||
for (auto it = waiting.begin(); it != waiting.end();) {
|
||||
const auto &this_inputs = (*it)->getInputs();
|
||||
// If none of the input tensors is in waiting list,
|
||||
// this node is a head node.
|
||||
const auto is_head = std::all_of(
|
||||
this_inputs.begin(), this_inputs.end(), [&](const auto &input) {
|
||||
auto src = input->getSource();
|
||||
return src // If the source node is in the waiting
|
||||
// list, means that this node is not the
|
||||
// head node.
|
||||
? waiting.find(src) == waiting.end()
|
||||
// This tensor has no source node,
|
||||
// it must be a input tensor.
|
||||
: (/*inputs.insert(input),*/ true);
|
||||
});
|
||||
// Moves head node to sorted.
|
||||
if (is_head) {
|
||||
for (auto const &op : ops) {
|
||||
if (auto const &inputs = op->getInputs();
|
||||
flags.find(op.get()) == flags.end() &&
|
||||
std::all_of(inputs.begin(), inputs.end(),
|
||||
[&flags](auto const &input) {
|
||||
auto ptr = input->getSource().get();
|
||||
return !ptr || flags.find(ptr) != flags.end();
|
||||
})) {
|
||||
modified = true;
|
||||
sorted.emplace_back(std::move(*it));
|
||||
it = waiting.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
sorted.emplace_back(op);
|
||||
flags.insert(op.get());
|
||||
}
|
||||
}
|
||||
// Waiting list never modifies during a pass,
|
||||
// sorting fails.
|
||||
if (!modified) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Done.
|
||||
this->ops = std::move(sorted);
|
||||
return this->sorted = true;
|
||||
}
|
||||
|
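Note: the rewritten GraphObj::topo_sort above replaces the waiting set with a flags set of already-emitted operators and repeated passes over ops; a full pass that emits nothing means there is a cycle and sorting fails. A standalone sketch of the same idea with toy types (not the InfiniTensor API):

#include <unordered_set>
#include <vector>

struct Node { std::vector<const Node *> producers; };

// Move any node whose producers are all already emitted; fail if a pass
// makes no progress (i.e. the graph has a cycle).
bool topoSort(std::vector<const Node *> &nodes) {
    std::unordered_set<const Node *> emitted;
    std::vector<const Node *> sorted;
    sorted.reserve(nodes.size());
    while (sorted.size() < nodes.size()) {
        bool modified = false;
        for (auto *n : nodes) {
            if (emitted.count(n))
                continue;
            bool ready = true;
            for (auto *p : n->producers)
                if (!emitted.count(p)) { ready = false; break; }
            if (ready) {
                sorted.push_back(n);
                emitted.insert(n);
                modified = true;
            }
        }
        if (!modified)
            return false; // no head node found in a full pass: cycle
    }
    nodes = std::move(sorted);
    return true;
}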
@ -182,7 +167,10 @@ void GraphObj::dataMalloc(bool useNaiveAllocator, size_t memPoolSize) {
|
|||
// note: behavior may not match running in non-naive mode, and it may
|
||||
// not reproduce the bug
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->dataMalloc();
|
||||
if (!tensor->isWeight() ||
|
||||
(tensor->isWeight() && !weightAllocated)) {
|
||||
tensor->dataMalloc();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -6,11 +6,14 @@
|
|||
#include "operators/broadcast.h"
|
||||
#include "operators/concat.h"
|
||||
#include "operators/conv.h"
|
||||
#include "operators/dequantize_linear.h"
|
||||
#include "operators/dynamic_quantize_linear.h"
|
||||
#include "operators/element_wise.h"
|
||||
#include "operators/expand.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/layer_norm.h"
|
||||
#include "operators/matmul.h"
|
||||
#include "operators/matmul_integer.h"
|
||||
#include "operators/pad.h"
|
||||
#include "operators/pooling.h"
|
||||
#include "operators/recv.h"
|
||||
|
@ -145,6 +148,23 @@ Tensor GraphHandlerObj::avgPool(Tensor input, Tensor output, int kh, int kw,
|
|||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::matmulInteger(Tensor inputA, Tensor inputB,
|
||||
Tensor output, Tensor a_zero_point,
|
||||
Tensor b_zero_point) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<MatmulIntegerObj>(
|
||||
std::move(inputA), std::move(inputB), output,
|
||||
std::move(a_zero_point), std::move(b_zero_point));
|
||||
return output;
|
||||
} else {
|
||||
return g
|
||||
->addOp<MatmulIntegerObj>(std::move(inputA), std::move(inputB),
|
||||
output, std::move(a_zero_point),
|
||||
std::move(b_zero_point))
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
// see operators/element_wise.h
|
||||
#define DEFINE_ELEMENT_WISE_METHOD(name, obj) \
|
||||
Tensor GraphHandlerObj::name(Tensor a, Tensor b, Tensor c) { \
|
||||
|
@ -506,6 +526,36 @@ Tensor GraphHandlerObj::where(Tensor inputX, Tensor inputY, Tensor condition,
|
|||
}
|
||||
}
|
||||
|
||||
TensorVec
|
||||
GraphHandlerObj::dynamicQuantizeLinear(Tensor input,
|
||||
std::optional<TensorVec> outputs) {
|
||||
if (outputs) {
|
||||
g->addOpWithOutputs<DynamicQuantizeLinearObj>(std::move(input),
|
||||
outputs);
|
||||
return *outputs;
|
||||
} else {
|
||||
return g->addOp<DynamicQuantizeLinearObj>(std::move(input), outputs)
|
||||
->getOutputs();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::dequantizeLinear(Tensor inputX, Tensor inputScale,
|
||||
Tensor output, Tensor inputZeroPoint,
|
||||
int axis) {
|
||||
if (output) {
|
||||
g->addOpWithOutputs<DequantizeLinearObj>(
|
||||
std::move(inputX), std::move(inputScale), output,
|
||||
std::move(inputZeroPoint), axis);
|
||||
return output;
|
||||
} else {
|
||||
return g
|
||||
->addOp<DequantizeLinearObj>(std::move(inputX),
|
||||
std::move(inputScale), output,
|
||||
std::move(inputZeroPoint), axis)
|
||||
->getOutput();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphHandlerObj::depthToSpace(Tensor input, Tensor output, int blocksize,
|
||||
std::string mode) {
|
||||
if (output) {
|
||||
|
|
|
@ -58,12 +58,13 @@ HashType OperatorObj::hash() const {
|
|||
|
||||
bool OperatorObj::checkValid(GraphObj *graph) {
|
||||
auto optShapes = inferShape();
|
||||
if (!optShapes) // shape inference failed
|
||||
if (!optShapes) { // shape inference failed
|
||||
return false;
|
||||
|
||||
}
|
||||
const vector<Shape> &shapes = *optShapes;
|
||||
if (shapes.size() != outputs.size())
|
||||
if (shapes.size() != outputs.size()) {
|
||||
return false;
|
||||
}
|
||||
if (graph) { // if graph != nullptr, outputs should be created
|
||||
auto dataTypes = inferDataType();
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
|
|
|
@ -17,8 +17,7 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -66,8 +65,7 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
|
||||
for (auto &op : graph->getOperators()) {
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
|
|
@ -25,8 +25,7 @@ void CudaRuntimeObj::runWithoutSync(const Graph &graph) const {
|
|||
auto &perfEngine = PerfEngine::getInstance();
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs =
|
||||
KernelAttrs{device, op->getOpType().underlying(), op->getDType()};
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
@ -48,8 +47,7 @@ void CudaRuntimeObj::tune(const Graph &graph, bool profiling = false) const {
|
|||
std::map<OpType, int> opCnt;
|
||||
for (auto &op : graph->getOperators()) {
|
||||
// HACK: set correct data type
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying(),
|
||||
DataType::Float32};
|
||||
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
|
||||
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
|
||||
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
|
||||
auto perfData = perfEngine.getPerfData(perfKey);
|
||||
|
|
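Note: after this change all three runtimes build the registry key without a DataType component, so a single registered kernel per (device, op type) serves every precision and dtype selection happens inside the kernel itself. For reference, the lookup in each run/tune loop reduces to:

// before: KernelAttrs{device, op->getOpType().underlying(), op->getDType()}
auto kernelAttrs = KernelAttrs{device, op->getOpType().underlying()};
Kernel *kernel = kernelRegistry.getKernel(kernelAttrs);
auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};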
|
@ -1,4 +1,6 @@
|
|||
#include "core/data_type.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include <cstdio>
|
||||
|
||||
__global__ void cudaPrintFloatImpl(float *x, int len) {
|
||||
|
@ -18,4 +20,55 @@ void cudaPrintFloat(float *x, int len) {
|
|||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void cudaPrintTensor(const Tensor &tensor) {
|
||||
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
||||
}
|
||||
|
||||
cudnnDataType_t cudnnDataTypeConvert(DataType dataType) {
|
||||
if (dataType == DataType::Float32) {
|
||||
return CUDNN_DATA_FLOAT;
|
||||
}
|
||||
if (dataType == DataType::Double) {
|
||||
return CUDNN_DATA_DOUBLE;
|
||||
}
|
||||
if (dataType == DataType::Float16) {
|
||||
return CUDNN_DATA_HALF;
|
||||
}
|
||||
if (dataType == DataType::Int8) {
|
||||
return CUDNN_DATA_INT8;
|
||||
}
|
||||
if (dataType == DataType::Int32) {
|
||||
return CUDNN_DATA_INT32;
|
||||
}
|
||||
if (dataType == DataType::UInt8) {
|
||||
return CUDNN_DATA_UINT8;
|
||||
}
|
||||
if (dataType == DataType::BFloat16) {
|
||||
return CUDNN_DATA_BFLOAT16;
|
||||
}
|
||||
if (dataType == DataType::Int64) {
|
||||
return CUDNN_DATA_INT64;
|
||||
}
|
||||
if (dataType == DataType::Bool) {
|
||||
return CUDNN_DATA_BOOLEAN;
|
||||
}
|
||||
IT_ASSERT(false, "Unsupported data type");
|
||||
}
|
||||
|
||||
cudaDataType cublasDataTypeConvert(DataType dataType) {
|
||||
switch (dataType.getIndex()) {
|
||||
case 1:
|
||||
return CUDA_R_32F;
|
||||
// case 3:
|
||||
// return CUDA_R_8I;
|
||||
case 10:
|
||||
return CUDA_R_16F;
|
||||
case 11:
|
||||
return CUDA_R_64F;
|
||||
// case 16:
|
||||
// return CUDA_R_16BF;
|
||||
default:
|
||||
IT_ASSERT(false, "MatMul Unsupported data type");
|
||||
}
|
||||
}
|
||||
} // namespace infini
|
||||
|
|
|
@@ -517,6 +517,10 @@ void init_graph_builder(py::module &m) {
         .def("expand", &Handler::expand, policy::move)
         .def("erf", &Handler::erf, policy::move)
         .def("where", &Handler::where, policy::move)
+        .def("dequantizeLinear", &Handler::dequantizeLinear, policy::move)
+        .def("dynamicQuantizeLinear", &Handler::dynamicQuantizeLinear,
+             policy::move)
+        .def("matmulInteger", &Handler::matmulInteger, policy::move)
         .def("topo_sort", &Handler::topo_sort, policy::automatic)
         .def("optimize", &Handler::optimize, policy::automatic)
         .def("operators", &Handler::operators, policy::move)
@ -3,9 +3,9 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveConcat : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<ConcatObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
auto dim = op->getDim();
|
||||
|
@ -41,11 +41,25 @@ template <typename T> class NaiveConcat : public CpuKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::UInt32,
|
||||
NaiveConcat<uint32_t>, "ConcatNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Concat, DataType::Float32,
|
||||
NaiveConcat<float>, "ConcatNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Concat, NaiveConcat, "ConcatNaive_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveConv : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveConv : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<ConvObj>(_op);
|
||||
T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
|
||||
|
@ -50,11 +50,25 @@ template <typename T> class NaiveConv : public CpuKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::UInt32,
|
||||
NaiveConv<uint32_t>, "ConvNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Conv, DataType::Float32, NaiveConv<float>,
|
||||
"ConvNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Conv, NaiveConv, "ConvNaive_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -2,10 +2,45 @@
|
|||
#include "core/kernel.h"
|
||||
|
||||
namespace infini {
|
||||
template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
||||
virtual T doCompute(T val0, T val1) const = 0;
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NativeElementWise : public CpuKernelWithoutConfig {
|
||||
template <typename T> static T addCompute(T val0, T val1) {
|
||||
return val0 + val1;
|
||||
}
|
||||
|
||||
template <typename T> static T subCompute(T val0, T val1) {
|
||||
return val0 - val1;
|
||||
}
|
||||
|
||||
template <typename T> static T mulCompute(T val0, T val1) {
|
||||
return val0 * val1;
|
||||
}
|
||||
|
||||
template <typename T> static T divCompute(T val0, T val1) {
|
||||
return (T)(val0 / val1);
|
||||
}
|
||||
|
||||
template <typename T> static T equalCompute(T val0, T val1) {
|
||||
return (T)(val0 == val1);
|
||||
}
|
||||
|
||||
template <typename T> static T greaterOrEqualCompute(T val0, T val1) {
|
||||
return (T)(val0 >= val1);
|
||||
}
|
||||
|
||||
template <typename T> static T greaterCompute(T val0, T val1) {
|
||||
return (T)(val0 > val1);
|
||||
}
|
||||
|
||||
template <typename T> static T lessOrEqualCompute(T val0, T val1) {
|
||||
return (T)(val0 <= val1);
|
||||
}
|
||||
|
||||
template <typename T> static T lessCompute(T val0, T val1) {
|
||||
return (T)(val0 < val1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
T *inptr0 = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *inptr1 = op->getInputs(1)->getRawDataPtr<T *>();
|
||||
|
@ -22,6 +57,39 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
|||
std::copy(c_output.begin(), c_output.end(), c + (4 - c_output.size()));
|
||||
|
||||
auto n = op->getOutput()->size();
|
||||
T (*_doCompute)(T val0, T val1);
|
||||
switch (op->getOpType().underlying()) {
|
||||
case OpType::Add:
|
||||
_doCompute = addCompute<T>;
|
||||
break;
|
||||
case OpType::Sub:
|
||||
_doCompute = subCompute<T>;
|
||||
break;
|
||||
case OpType::Mul:
|
||||
_doCompute = mulCompute<T>;
|
||||
break;
|
||||
case OpType::Div:
|
||||
_doCompute = divCompute<T>;
|
||||
break;
|
||||
case OpType::Equal:
|
||||
_doCompute = equalCompute<T>;
|
||||
break;
|
||||
case OpType::GreaterOrEqual:
|
||||
_doCompute = greaterOrEqualCompute<T>;
|
||||
break;
|
||||
case OpType::Greater:
|
||||
_doCompute = greaterCompute<T>;
|
||||
break;
|
||||
case OpType::LessOrEqual:
|
||||
_doCompute = lessOrEqualCompute<T>;
|
||||
break;
|
||||
case OpType::Less:
|
||||
_doCompute = lessCompute<T>;
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
int c0_index = i / (c[1] * c[2] * c[3]);
|
||||
int c1_index = (i % (c[1] * c[2] * c[3])) / (c[2] * c[3]);
|
||||
|
@ -37,77 +105,44 @@ template <typename T> class NativeElementWise : public CpuKernelWithoutConfig {
|
|||
int b1_index = c1_index % b[1];
|
||||
int b2_index = c2_index % b[2];
|
||||
int b3_index = c3_index % b[3];
|
||||
outptr[i] = doCompute(
|
||||
outptr[i] = _doCompute(
|
||||
inptr0[a0_index * a[1] * a[2] * a[3] + a1_index * a[2] * a[3] +
|
||||
a2_index * a[3] + a3_index],
|
||||
inptr1[b0_index * b[1] * b[2] * b[3] + b1_index * b[2] * b[3] +
|
||||
b2_index * b[3] + b3_index]);
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveAdd : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return val0 + val1; }
|
||||
};
|
||||
template <typename T> class NaiveSub : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return val0 - val1; }
|
||||
};
|
||||
template <typename T> class NaiveMul : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return val0 * val1; }
|
||||
};
|
||||
template <typename T> class NaiveDiv : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 / val1); }
|
||||
};
|
||||
template <typename T> class NaiveEqual : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 == val1); }
|
||||
};
|
||||
template <typename T> class NaiveGreaterEqual : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 >= val1); }
|
||||
};
|
||||
template <typename T> class NaiveGreaterThan : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 > val1); }
|
||||
};
|
||||
template <typename T> class NaiveLessEqual : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 <= val1); }
|
||||
};
|
||||
template <typename T> class NaiveLessThan : public NativeElementWise<T> {
|
||||
T doCompute(T val0, T val1) const override { return (T)(val0 < val1); }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Add, DataType::UInt32, NaiveAdd<uint32_t>,
|
||||
"addNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Add, DataType::Float32, NaiveAdd<float>,
|
||||
"addNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sub, DataType::UInt32, NaiveSub<uint32_t>,
|
||||
"subNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sub, DataType::Float32, NaiveSub<float>,
|
||||
"subNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Mul, DataType::UInt32, NaiveMul<uint32_t>,
|
||||
"mulNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Mul, DataType::Float32, NaiveMul<float>,
|
||||
"mulNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::UInt32, NaiveDiv<uint32_t>,
|
||||
"divNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Div, DataType::Float32, NaiveDiv<float>,
|
||||
"divNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::UInt32,
|
||||
NaiveEqual<uint32_t>, "equalNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Equal, DataType::Float32,
|
||||
NaiveEqual<float>, "equalNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::UInt32,
|
||||
NaiveGreaterEqual<uint32_t>, "greaterEqualNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, DataType::Float32,
|
||||
NaiveGreaterEqual<float>, "greaterEqualNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::UInt32,
|
||||
NaiveGreaterThan<uint32_t>, "greaterThanNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Greater, DataType::Float32,
|
||||
NaiveGreaterThan<float>, "greaterThanNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::UInt32,
|
||||
NaiveLessEqual<uint32_t>, "lessEqualNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, DataType::Float32,
|
||||
NaiveLessEqual<float>, "lessEqualNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::UInt32,
|
||||
NaiveLessThan<uint32_t>, "lessEqualNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Less, DataType::Float32,
|
||||
NaiveLessThan<float>, "lessEqualNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Add, NativeElementWise, "addNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sub, NativeElementWise, "subNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Mul, NativeElementWise, "mulNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Div, NativeElementWise, "divNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Equal, NativeElementWise,
|
||||
"equalNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::GreaterOrEqual, NativeElementWise,
|
||||
"greaterEqualNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Greater, NativeElementWise,
|
||||
"greaterThanNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::LessOrEqual, NativeElementWise,
|
||||
"lessEqualNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Less, NativeElementWise,
|
||||
"lessEqualNaive_CPU");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveMatmul : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveMatmul : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<MatmulObj>(_op);
|
||||
IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
|
||||
T *A = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
|
@ -23,11 +23,25 @@ template <typename T> class NaiveMatmul : public CpuKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MatMul, DataType::UInt32,
|
||||
NaiveMatmul<uint32_t>, "MatmulNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MatMul, DataType::Float32,
|
||||
NaiveMatmul<float>, "MatmulNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MatMul, NaiveMatmul, "MatmulNaive_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -80,8 +80,8 @@ class MemboundInterpreter : public Kernel {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MemBound, DataType::UInt32,
|
||||
MemboundInterpreter, "MemboundInterpreter_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MemBound, MemboundInterpreter,
|
||||
"MemboundInterpreter_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
||||
|
|
|
@ -2,42 +2,10 @@
|
|||
#include "core/kernel.h"
|
||||
|
||||
namespace infini {
|
||||
template <typename T> class NativePooling : public CpuKernelWithoutConfig {
|
||||
virtual T getPoolingValue(int kh, int kw, int posh, int posw, int ih,
|
||||
int iw, T *inptr) const = 0;
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
auto op = as<PoolingObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
const auto [n, c, ih, iw, kh, kw] = op->getNCHWRS();
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
if (dh != 1 || dw != 1)
|
||||
IT_TODO_HALT(); // To support dailated pooling
|
||||
auto outDim = op->getOutput()->getDims();
|
||||
int oh = outDim[2], ow = outDim[3];
|
||||
for (auto i = 0; i < n; i++) {
|
||||
for (auto j = 0; j < c; j++) {
|
||||
auto inoffset = i * (c * ih * iw) + j * ih * iw;
|
||||
for (auto h = 0; h < oh; h++) {
|
||||
for (auto w = 0; w < ow; w++) {
|
||||
// TODO: verify ceil mode
|
||||
T val =
|
||||
getPoolingValue(kh, kw, h * sh - ph, w * sw - pw,
|
||||
ih, iw, inptr + inoffset);
|
||||
auto outoffset =
|
||||
w + h * ow + j * (oh * ow) + i * (c * oh * ow);
|
||||
outptr[outoffset] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveMaxPool : public NativePooling<T> {
|
||||
T getPoolingValue(int kh, int kw, int posh, int posw, int ih, int iw,
|
||||
T *inptr) const override {
|
||||
class NativePooling : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
static T getMaxPoolingValue(int kh, int kw, int posh, int posw, int ih,
|
||||
int iw, T *inptr) {
|
||||
T maxval = 0;
|
||||
for (auto k = 0; k < kh; k++) {
|
||||
for (auto l = 0; l < kw; l++) {
|
||||
|
@ -53,11 +21,10 @@ template <typename T> class NaiveMaxPool : public NativePooling<T> {
|
|||
}
|
||||
return maxval;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveAvgPool : public NativePooling<T> {
|
||||
T getPoolingValue(int kh, int kw, int posh, int posw, int ih, int iw,
|
||||
T *inptr) const override {
|
||||
template <typename T>
|
||||
static T getAvgPoolingValue(int kh, int kw, int posh, int posw, int ih,
|
||||
int iw, T *inptr) {
|
||||
T sum = 0;
|
||||
for (auto k = 0; k < kh; k++) {
|
||||
for (auto l = 0; l < kw; l++) {
|
||||
|
@ -71,12 +38,70 @@ template <typename T> class NaiveAvgPool : public NativePooling<T> {
|
|||
}
|
||||
return T(sum / (kh * kw));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<PoolingObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
||||
const auto [n, c, ih, iw, kh, kw] = op->getNCHWRS();
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
if (dh != 1 || dw != 1)
|
||||
IT_TODO_HALT(); // To support dailated pooling
|
||||
auto outDim = op->getOutput()->getDims();
|
||||
int oh = outDim[2], ow = outDim[3];
|
||||
|
||||
T(*_doCompute)
|
||||
(int kh, int kw, int posh, int posw, int ih, int iw, T *inptr);
|
||||
switch (op->getOpType().underlying()) {
|
||||
case OpType::MaxPool:
|
||||
_doCompute = getMaxPoolingValue<T>;
|
||||
break;
|
||||
case OpType::AveragePool:
|
||||
_doCompute = getAvgPoolingValue<T>;
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
for (auto i = 0; i < n; i++) {
|
||||
for (auto j = 0; j < c; j++) {
|
||||
auto inoffset = i * (c * ih * iw) + j * ih * iw;
|
||||
for (auto h = 0; h < oh; h++) {
|
||||
for (auto w = 0; w < ow; w++) {
|
||||
// TODO: verify ceil mode
|
||||
T val = _doCompute(kh, kw, h * sh - ph, w * sw - pw, ih,
|
||||
iw, inptr + inoffset);
|
||||
auto outoffset =
|
||||
w + h * ow + j * (oh * ow) + i * (c * oh * ow);
|
||||
outptr[outoffset] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MaxPool, DataType::UInt32,
|
||||
NaiveMaxPool<uint32_t>, "maxPoolNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MaxPool, DataType::Float32,
|
||||
NaiveMaxPool<float>, "maxPoolNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::AveragePool, DataType::Float32,
|
||||
NaiveAvgPool<float>, "AvgPoolNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::MaxPool, NativePooling,
|
||||
"maxPoolNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::AveragePool, NativePooling,
|
||||
"avgPoolNaive_CPU");
|
||||
} // namespace infini
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
namespace infini {
|
||||
|
||||
template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveSplit : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<SplitObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
auto dim = op->getDim();
|
||||
|
@ -40,11 +40,24 @@ template <typename T> class NaiveSplit : public CpuKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::UInt32,
|
||||
NaiveSplit<uint32_t>, "SplitNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Split, DataType::Float32,
|
||||
NaiveSplit<float>, "SplitNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Split, NaiveSplit, "SplitNaive_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -14,9 +14,9 @@ inline Shape idx2Pos(const Shape &shape, size_t idx) {
|
|||
return pos;
|
||||
}
|
||||
|
||||
template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveTranspose : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<TransposeObj>(_op);
|
||||
auto inputs = op->getInputs(), outputs = op->getOutputs();
|
||||
const auto &inDim = inputs[0]->getDims();
|
||||
|
@ -35,11 +35,26 @@ template <typename T> class NaiveTranspose : public CpuKernelWithoutConfig {
|
|||
outPtr[outIdx] = inPtr[inIdx];
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::UInt32,
|
||||
NaiveTranspose<uint32_t>, "TransposeNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Transpose, DataType::Float32,
|
||||
NaiveTranspose<float>, "TransposeNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Transpose, NaiveTranspose,
|
||||
"TransposeNaive_CPU");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -4,25 +4,170 @@
|
|||
#include "operators/softmax.h"
|
||||
|
||||
namespace infini {
|
||||
template <typename T> class NativeUnary : public CpuKernelWithoutConfig {
|
||||
virtual T doCompute(T val) const = 0;
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NativeUnary : public CpuKernelWithoutConfig {
|
||||
template <typename T> static T reluCompute(T val) {
|
||||
return std::max(T(0), val);
|
||||
}
|
||||
|
||||
template <typename T> static T sigmoidCompute(T val) {
|
||||
return 1 / (1 + pow(E_CONSTANT, -val));
|
||||
}
|
||||
|
||||
template <typename T> static T hardSigmoidCompute(T val) {
|
||||
return std::max(T(0), std::min(T(1), T(0.2) * val + T(0.5)));
|
||||
}
|
||||
|
||||
template <typename T> static T hardSwishCompute(T val) {
|
||||
return val *
|
||||
std::max(T(0), std::min(T(1), val * T(1.0 / 6.0) + T(0.5)));
|
||||
}
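// hardSwishCompute above matches the ONNX HardSwish default:
// x * max(0, min(1, x / 6 + 0.5)).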
|
||||
|
||||
template <typename T> static T tanhCompute(T val) {
|
||||
return (pow(E_CONSTANT, val) - pow(E_CONSTANT, -val)) /
|
||||
(pow(E_CONSTANT, val) + pow(E_CONSTANT, -val));
|
||||
}
|
||||
|
||||
template <typename T> static T absCompute(T val) {
|
||||
return val < 0 ? -val : val;
|
||||
}
|
||||
|
||||
template <typename T> static T sqrtCompute(T val) { return std::sqrt(val); }
|
||||
|
||||
template <typename T> static T cosCompute(T val) { return std::cos(val); }
|
||||
|
||||
template <typename T> static T sinCompute(T val) { return std::sin(val); }
|
||||
|
||||
template <typename T> static T tanCompute(T val) { return std::tan(val); }
|
||||
|
||||
template <typename T> static T sinhCompute(T val) { return std::sinh(val); }
|
||||
|
||||
template <typename T> static T coshCompute(T val) { return std::cosh(val); }
|
||||
|
||||
template <typename T> static T geluCompute(T val) {
|
||||
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
|
||||
}
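// geluCompute above is the exact erf-based GELU,
// 0.5 * x * (1 + erf(x / sqrt(2))), rather than the tanh approximation.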
|
||||
|
||||
template <typename T> static T erfCompute(T val) { return std::erf(val); }
|
||||
|
||||
template <typename T> static T aCosCompute(T val) { return std::acos(val); }
|
||||
|
||||
template <typename T> static T aCoshCompute(T val) {
|
||||
return std::acosh(val);
|
||||
}
|
||||
|
||||
template <typename T> static T aSinCompute(T val) { return std::asin(val); }
|
||||
|
||||
template <typename T> static T aSinhCompute(T val) {
|
||||
return std::asinh(val);
|
||||
}
|
||||
template <typename T> static T aTanCompute(T val) { return std::atan(val); }
|
||||
|
||||
template <typename T> static T aTanhCompute(T val) {
|
||||
return std::atanh(val);
|
||||
}
|
||||
template <typename T> static T negCompute(T val) { return -val; }
|
||||
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<UnaryObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
||||
auto outDim = op->getOutput()->getDims();
|
||||
auto n = op->getOutput()->size();
|
||||
|
||||
T (*_doCompute)(T val);
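// Same refactoring as the pooling kernel: the unary op type is resolved once
// into a scalar function pointer and applied element-wise in the loop below.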
|
||||
switch (op->getOpType().underlying()) {
|
||||
case OpType::Relu:
|
||||
_doCompute = reluCompute<T>;
|
||||
break;
|
||||
case OpType::Gelu:
|
||||
_doCompute = geluCompute<T>;
|
||||
break;
|
||||
case OpType::Sigmoid:
|
||||
_doCompute = sigmoidCompute<T>;
|
||||
break;
|
||||
case OpType::HardSigmoid:
|
||||
_doCompute = hardSigmoidCompute<T>;
|
||||
break;
|
||||
case OpType::HardSwish:
|
||||
_doCompute = hardSwishCompute<T>;
|
||||
break;
|
||||
case OpType::Tanh:
|
||||
_doCompute = tanhCompute<T>;
|
||||
break;
|
||||
case OpType::Abs:
|
||||
_doCompute = absCompute<T>;
|
||||
break;
|
||||
case OpType::Sqrt:
|
||||
_doCompute = sqrtCompute<T>;
|
||||
break;
|
||||
case OpType::Erf:
|
||||
_doCompute = erfCompute<T>;
|
||||
break;
|
||||
case OpType::Neg:
|
||||
_doCompute = negCompute<T>;
|
||||
break;
|
||||
case OpType::Cos:
|
||||
_doCompute = cosCompute<T>;
|
||||
break;
|
||||
case OpType::Sin:
|
||||
_doCompute = sinCompute<T>;
|
||||
break;
|
||||
case OpType::Tan:
|
||||
_doCompute = tanCompute<T>;
|
||||
break;
|
||||
case OpType::Sinh:
|
||||
_doCompute = sinhCompute<T>;
|
||||
break;
|
||||
case OpType::Cosh:
|
||||
_doCompute = coshCompute<T>;
|
||||
break;
|
||||
case OpType::Acos:
|
||||
_doCompute = aCosCompute<T>;
|
||||
break;
|
||||
case OpType::Asin:
|
||||
_doCompute = aSinCompute<T>;
|
||||
break;
|
||||
case OpType::Asinh:
|
||||
_doCompute = aSinhCompute<T>;
|
||||
break;
|
||||
case OpType::Atan:
|
||||
_doCompute = aTanCompute<T>;
|
||||
break;
|
||||
case OpType::Atanh:
|
||||
_doCompute = aTanhCompute<T>;
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
||||
for (size_t offset = 0; offset < n; offset++) {
|
||||
outptr[offset] = doCompute(inptr[offset]);
|
||||
outptr[offset] = _doCompute(inptr[offset]);
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
class NaiveSoftmax : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<SoftmaxObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
@ -37,98 +182,28 @@ template <typename T> class NaiveSoftmax : public CpuKernelWithoutConfig {
|
|||
outptr[offset] = pow(E_CONSTANT, inptr[offset]) / sum;
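// Naive softmax: the exponentials are not shifted by the row maximum first,
// so very large inputs can overflow; acceptable for a reference CPU kernel.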
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveRelu : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::max(T(0), val); }
|
||||
};
|
||||
template <typename T> class NaiveSigmoid : public NativeUnary<T> {
|
||||
T doCompute(T val) const override {
|
||||
return 1 / (1 + pow(E_CONSTANT, -val));
|
||||
}
|
||||
};
|
||||
template <typename T> class NaiveHardSigmoid : public NativeUnary<T> {
|
||||
T doCompute(T val) const override {
|
||||
return std::max(T(0), std::min(T(1), T(0.2) * val + T(0.5)));
|
||||
}
|
||||
};
|
||||
template <typename T> class NaiveHardSwish : public NativeUnary<T> {
|
||||
T doCompute(T val) const override {
|
||||
return val *
|
||||
std::max(T(0), std::min(T(1), val * T(1.0 / 6.0) + T(0.5)));
|
||||
}
|
||||
};
|
||||
template <typename T> class NaiveTanh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override {
|
||||
return (pow(E_CONSTANT, val) - pow(E_CONSTANT, -val)) /
|
||||
(pow(E_CONSTANT, val) + pow(E_CONSTANT, -val));
|
||||
}
|
||||
};
|
||||
template <typename T> class NaiveAbs : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return val < 0 ? -val : val; }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveSqrt : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::sqrt(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveCos : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::cos(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveSin : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::sin(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveTan : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::tan(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveSinh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::sinh(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveCosh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::cosh(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveGelu : public NativeUnary<T> {
|
||||
T doCompute(T val) const override {
|
||||
return 0.5 * val * (1 + std::erf(val / std::sqrt(2)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveErf : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::erf(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveACos : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::acos(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveACosh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::acosh(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveASin : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::asin(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveASinh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::asinh(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveATanh : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::atanh(val); }
|
||||
};
|
||||
|
||||
template <typename T> class NaiveNeg : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return -val; }
|
||||
};
|
||||
|
||||
template <typename T> class Clip : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class Clip : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<ClipObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
@ -143,11 +218,28 @@ template <typename T> class Clip : public CpuKernelWithoutConfig {
|
|||
: val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class Log : public CpuKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class Log : public CpuKernelWithoutConfig {
|
||||
template <typename T>
|
||||
void doCompute(const Operator &_op, const RuntimeObj *context) const {
|
||||
auto op = as<LogObj>(_op);
|
||||
T *inptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||
T *outptr = op->getOutput()->getRawDataPtr<T *>();
|
||||
|
@ -176,70 +268,50 @@ template <typename T> class Log : public CpuKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *context) const override {
|
||||
#define CASE(N) \
|
||||
case N: \
|
||||
doCompute<DT<N>::t>(_op, context)
|
||||
|
||||
int dataTypeIdx = _op->getDType().getIndex();
|
||||
switch (dataTypeIdx) {
|
||||
CASE(1); // DataType::Float32
|
||||
break;
|
||||
CASE(12); // DataType::UInt32
|
||||
break;
|
||||
default:
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class NaiveATan : public NativeUnary<T> {
|
||||
T doCompute(T val) const override { return std::atan(val); }
|
||||
};
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Relu, NativeUnary, "reluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Gelu, NativeUnary, "geluNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, NativeUnary, "sigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, NativeUnary,
                "hardSigmoidNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::HardSwish, NativeUnary,
                "hardSwishNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Tanh, NativeUnary, "tanhNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Abs, NativeUnary, "absNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sqrt, NativeUnary, "sqrtNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Erf, NativeUnary, "erfNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Neg, NativeUnary, "negNaive_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Cos, NativeUnary, "Cos_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sin, NativeUnary, "Sin_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Tan, NativeUnary, "Tan_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Sinh, NativeUnary, "Sinh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Cosh, NativeUnary, "Cosh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Acos, NativeUnary, "ACos_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Acosh, NativeUnary, "ACosh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Asin, NativeUnary, "ASin_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Asinh, NativeUnary, "ASinh_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Atan, NativeUnary, "Atan_CPU");
REGISTER_KERNEL(Device::CPU, OpType::Atanh, NativeUnary, "ATanh_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::UInt32,
|
||||
NaiveRelu<uint32_t>, "reluNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Relu, DataType::Float32, NaiveRelu<float>,
|
||||
"reluNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::UInt32, NaiveGelu<float>,
|
||||
"geluNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Gelu, DataType::Float32, NaiveGelu<float>,
|
||||
"geluNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::UInt32,
|
||||
NaiveSigmoid<uint32_t>, "sigmoidNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sigmoid, DataType::Float32,
|
||||
NaiveSigmoid<float>, "sigmoidNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::HardSigmoid, DataType::Float32,
|
||||
NaiveHardSigmoid<float>, "hardSigmoidNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::HardSwish, DataType::Float32,
|
||||
NaiveHardSwish<float>, "hardSwishNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Tanh, DataType::UInt32,
|
||||
NaiveTanh<uint32_t>, "tanhNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Tanh, DataType::Float32, NaiveTanh<float>,
|
||||
"tanhNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Abs, DataType::UInt32, NaiveAbs<uint32_t>,
|
||||
"absNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Abs, DataType::Float32, NaiveAbs<float>,
|
||||
"absNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sqrt, DataType::Float32, NaiveSqrt<float>,
|
||||
"sqrtNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Erf, DataType::Float32, NaiveErf<float>,
|
||||
"erfNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Neg, DataType::Float32, NaiveNeg<float>,
|
||||
"negNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::UInt32,
|
||||
NaiveSoftmax<uint32_t>, "softmaxNaive_CPU_uint32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Softmax, DataType::Float32,
|
||||
NaiveSoftmax<float>, "softmaxNaive_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Clip, DataType::Float32, Clip<float>,
|
||||
"Clip_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Atan, DataType::Float32, NaiveATan<float>,
|
||||
"Atan_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Log, DataType::Float32, Log<float>,
|
||||
"Log_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Cos, DataType::Float32, NaiveCos<float>,
|
||||
"Cos_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sin, DataType::Float32, NaiveSin<float>,
|
||||
"Sin_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Tan, DataType::Float32, NaiveTan<float>,
|
||||
"Tan_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Sinh, DataType::Float32, NaiveSinh<float>,
|
||||
"Sinh_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Cosh, DataType::Float32, NaiveCosh<float>,
|
||||
"Cosh_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Acos, DataType::Float32, NaiveACos<float>,
|
||||
"ACos_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Acosh, DataType::Float32,
|
||||
NaiveACosh<float>, "ACosh_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Asin, DataType::Float32, NaiveASin<float>,
|
||||
"ASin_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Asinh, DataType::Float32,
|
||||
NaiveASinh<float>, "ASinh_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Atanh, DataType::Float32,
|
||||
NaiveATanh<float>, "ATanh_CPU_float32");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Softmax, NaiveSoftmax, "softmaxNaive_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Clip, Clip, "Clip_CPU");
|
||||
REGISTER_KERNEL(Device::CPU, OpType::Log, Log, "Log_CPU");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -54,7 +54,6 @@ class G2BMMCudnn : public CudaKernelWithoutConfig {
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::G2BMM, DataType::Float32, G2BMMCudnn,
                "G2BMM_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::G2BMM, G2BMMCudnn, "G2BMM_cuDNN_CUDA");

} // namespace infini
|
|
|
@ -55,7 +55,6 @@ class GBMMCudnn : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::GBMM, DataType::Float32, GBMMCudnn,
|
||||
"GBMM_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::GBMM, GBMMCudnn, "GBMM_cuDNN_CUDA");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -39,8 +39,8 @@ class AllGatherNCCL : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllGather, DataType::Float32,
|
||||
AllGatherNCCL, "AllGather_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllGather, AllGatherNCCL,
|
||||
"AllGather_NCCL_CUDA");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -13,15 +13,20 @@ class AllReduceNCCL : public CudaKernelWithoutConfig {
|
|||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
void *input = op->getInputs(0)->getRawDataPtr<void *>();
|
||||
void *output = op->getOutput()->getRawDataPtr<void *>();
|
||||
IT_ASSERT(op->getDType() == DataType::Float32);
|
||||
ncclDataType_t ncclType = ncclFloat;
|
||||
if (op->getDType() == DataType::Float16) {
|
||||
ncclType = ncclFloat16;
|
||||
} else if (op->getDType() == DataType::Int8) {
|
||||
ncclType = ncclInt8;
|
||||
}
|
||||
size_t count = op->getInputs(0)->size();
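// count is an element count, not a byte count; the element type is conveyed
// separately through ncclType above.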
|
||||
|
||||
ncclComm_t comm =
|
||||
dynamic_cast<NcclCommunicatorObj &>(context->getCommunicator())
|
||||
.getNcclComm();
|
||||
// TODO: Using default stream 0 for now.
|
||||
checkNcclError(ncclAllReduce(input, output, count, ncclFloat,
|
||||
getRedOp(), comm, 0));
|
||||
checkNcclError(
|
||||
ncclAllReduce(input, output, count, ncclType, getRedOp(), comm, 0));
|
||||
}
|
||||
|
||||
virtual ncclRedOp_t getRedOp() const = 0;
|
||||
|
@ -43,16 +48,16 @@ class AllReduceAvgNCCL : public AllReduceNCCL {
|
|||
ncclRedOp_t getRedOp() const override { return ncclAvg; }
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceSum, DataType::Float32,
|
||||
AllReduceSumNCCL, "AllReduce_Sum_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceProd, DataType::Float32,
|
||||
AllReduceProdNCCL, "AllReduce_Prod_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMin, DataType::Float32,
|
||||
AllReduceMinNCCL, "AllReduce_Min_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMax, DataType::Float32,
|
||||
AllReduceMaxNCCL, "AllReduce_Max_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceAvg, DataType::Float32,
|
||||
AllReduceAvgNCCL, "AllReduce_Avg_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceSum, AllReduceSumNCCL,
|
||||
"AllReduce_Sum_NCCL_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceProd, AllReduceProdNCCL,
|
||||
"AllReduce_Prod_NCCL_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMin, AllReduceMinNCCL,
|
||||
"AllReduce_Min_NCCL_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceMax, AllReduceMaxNCCL,
|
||||
"AllReduce_Max_NCCL_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AllReduceAvg, AllReduceAvgNCCL,
|
||||
"AllReduce_Avg_NCCL_CUDA");
|
||||
|
||||
} // namespace infini
|
||||
#endif
|
||||
|
|
|
@ -47,6 +47,6 @@ class AttentionKVCacheCuda : private AttentionKVCacheCompute,
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AttentionKVCache, DataType::Float32,
|
||||
AttentionKVCacheCuda, "AttentionKVCache_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::AttentionKVCache, AttentionKVCacheCuda,
|
||||
"AttentionKVCache_CUDA");
|
||||
} // namespace infini
|
||||
|
|
|
@ -59,6 +59,6 @@ class BatchNormCudnn : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::BatchNormalization, DataType::Float32,
|
||||
BatchNormCudnn, "BatchNorm_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::BatchNormalization, BatchNormCudnn,
|
||||
"BatchNorm_cuDNN_CUDA");
|
||||
} // namespace infini
|
||||
|
|
|
@ -25,8 +25,8 @@ class BroadcastNCCL : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Broadcast, DataType::Float32,
|
||||
BroadcastNCCL, "Broadcast_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Broadcast, BroadcastNCCL,
|
||||
"Broadcast_NCCL_CUDA");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -21,7 +21,6 @@ class ClipCuda : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Clip, DataType::Float32, ClipCuda,
|
||||
"Clip_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Clip, ClipCuda, "Clip_CUDA");
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
#include "operators/conv.h"
|
||||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include <chrono>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
|
||||
namespace infini {
|
||||
|
||||
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
|
||||
|
@ -56,8 +58,11 @@ class convCudnn : public Kernel {
|
|||
const ConvCuDnnPerfRecord &record) const {
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
if (op->getInputs().size() > 2) // Bias is not supported yet
|
||||
// Bias is not supported yet
|
||||
if (op->getInputs().size() > 2) {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
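// cudnnDataTypeConvert maps the operator's dtype to the matching
// cudnnDataType_t, so the descriptors below are no longer hard-coded to
// CUDNN_DATA_FLOAT and the same kernel path can presumably serve float16 too.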
|
||||
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
|
@ -72,27 +77,26 @@ class convCudnn : public Kernel {
|
|||
cudnnTensorDescriptor_t inDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, channels, h, w));
|
||||
inDesc, CUDNN_TENSOR_NCHW, cudnnDataType, n, channels, h, w));
|
||||
|
||||
// get kernels
|
||||
cudnnFilterDescriptor_t knDesc;
|
||||
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
|
||||
checkCudnnError(cudnnSetFilter4dDescriptor(knDesc, CUDNN_DATA_FLOAT,
|
||||
CUDNN_TENSOR_NCHW, f,
|
||||
channelsPerGrp, r, s));
|
||||
checkCudnnError(cudnnSetFilter4dDescriptor(
|
||||
knDesc, cudnnDataType, CUDNN_TENSOR_NCHW, f, channelsPerGrp, r, s));
|
||||
// get bias
|
||||
cudnnTensorDescriptor_t biasDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, f, 1, 1));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(biasDesc, CUDNN_TENSOR_NCHW,
|
||||
cudnnDataType, 1, f, 1, 1));
|
||||
|
||||
// get convlution descriptor
|
||||
// get convolution descriptor
|
||||
cudnnConvolutionDescriptor_t convDesc;
|
||||
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
|
||||
// TODO: CUDNN_CONVOLUTION is a tunable argument
|
||||
checkCudnnError(cudnnSetConvolution2dDescriptor(
|
||||
convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode],
|
||||
CUDNN_DATA_FLOAT));
|
||||
cudnnDataType));
|
||||
if (g > 1) {
|
||||
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
|
||||
}
|
||||
|
@ -120,14 +124,14 @@ class convCudnn : public Kernel {
|
|||
assert(false);
|
||||
}
|
||||
|
||||
// get output descriptor
|
||||
int outn, outc, outh, outw;
|
||||
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
|
||||
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
|
||||
cudnnTensorDescriptor_t outDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
|
||||
CUDNN_DATA_FLOAT, outn, outc,
|
||||
outh, outw));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
outDesc, CUDNN_TENSOR_NCHW, cudnnDataType, outn, outc, outh, outw));
|
||||
IT_ASSERT((vector{outn, outc, outh, outw}) ==
|
||||
op->getOutput()->getDims(),
|
||||
"cuDNN output shape mismatches with OP output shape");
|
||||
|
@ -151,55 +155,9 @@ class convCudnn : public Kernel {
|
|||
inData, knDesc, knData, convDesc,
|
||||
ALGOS[record->algo], wsData, wsSize,
|
||||
&beta, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
return false;
|
||||
// TODO:
|
||||
// // bias
|
||||
// if (bias != nullptr) {
|
||||
// auto sz = op.getOutputs()[0]->size();
|
||||
// // TODO: element wise
|
||||
// t += sz * 2 / 400;
|
||||
// }
|
||||
// // act
|
||||
// if (act != None) {
|
||||
// stat = cudnnActivationForward(cudnnHandle(), actDesc,
|
||||
// &alpha, inDesc, inData,
|
||||
// &beta, outDesc, outData);
|
||||
// checkCudaError(cudaDeviceSynchronize());
|
||||
// end = ch::high_resolution_clock::now();
|
||||
// if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
// durtime = INFINITY;
|
||||
// break;
|
||||
// }
|
||||
// t +=
|
||||
// ch::duration_cast<ch::duration<double>>(end -
|
||||
// beg).count() * 1000; // ms
|
||||
// }
|
||||
|
||||
// best = ConvResult{durtime, ALGOS[i], wsSize, false};
|
||||
|
||||
// // w/ bias & act
|
||||
// for (int j = 0; j < rounds + warmupRounds; ++j) {
|
||||
// cudnnStatus_t stat;
|
||||
// if (j == warmupRounds) {
|
||||
// checkCudaError(cudaDeviceSynchronize());
|
||||
// beg = ch::high_resolution_clock::now();
|
||||
// }
|
||||
// stat = cudnnConvolutionBiasActivationForward(
|
||||
// cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
|
||||
// convDesc, ALGOS[i], wsData, wsSize, &beta, outDesc,
|
||||
// outData, biasDesc, biasData, actDesc, outDesc, outData);
|
||||
// if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
// // checkCudnnError(stat);
|
||||
// // Do not checkCudnnError since not all algorithms are
|
||||
// // supported
|
||||
// durtime_fuse = INFINITY;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
|
||||
// Destroying handles in CUDA does not require a sync, but cuDNN does not
// state whether a sync is required before destruction.
|
||||
}
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
|
@ -238,10 +196,12 @@ class convCudnn : public Kernel {
|
|||
stat = cudnnGetConvolutionForwardWorkspaceSize(
|
||||
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
|
||||
ALGOS[record.algo], &record.workspaceSize);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
continue;
|
||||
if (record.workspaceSize > context->getWorkspaceSize())
|
||||
}
|
||||
if (record.workspaceSize > context->getWorkspaceSize()) {
|
||||
continue;
|
||||
}
|
||||
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
|
||||
|
@ -249,8 +209,9 @@ class convCudnn : public Kernel {
|
|||
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
|
||||
knData, convDesc, ALGOS[record.algo], wsData,
|
||||
record.workspaceSize, &beta, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
record.time = timeit(
|
||||
[&]() {
|
||||
cudnnConvolutionForward(context->cudnnHandle(), &alpha,
|
||||
|
@ -263,8 +224,9 @@ class convCudnn : public Kernel {
|
|||
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
|
||||
|
||||
// Update the tune result
|
||||
if (ret.time > record.time)
|
||||
if (ret.time > record.time) {
|
||||
ret = record;
|
||||
}
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
|
@ -291,8 +253,7 @@ class convCudnn : public Kernel {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float32, convCudnn,
|
||||
"Conv_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Conv, convCudnn, "Conv_cuDNN_CUDA");
|
||||
|
||||
REGISTER_CONSTRUCTOR(1, ConvCuDnnPerfRecordObj::from_json);
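// Registers the perf-record deserializer under type id 1, matching the
// j["type"] = 1 tag written by ConvCuDnnPerfRecordObj::to_json.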
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,261 +0,0 @@
|
|||
#include "core/kernel.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "operators/conv.h"
|
||||
#include <chrono>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
|
||||
namespace infini {
|
||||
|
||||
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
|
||||
int algo = 0; // cudnnConvolutionFwdAlgo_t
|
||||
int mode = 1;
|
||||
size_t workspaceSize = 100000;
|
||||
bool fuseAct = false;
|
||||
void to_json(json &j) override {
|
||||
j["type"] = 1;
|
||||
j["data"] = std::make_tuple(algo, mode, fuseAct, time, workspaceSize);
|
||||
}
|
||||
static PerfRecord from_json(const json &j) {
|
||||
ConvCuDnnPerfRecordObj tmp;
|
||||
auto [Algo, Mode, FuseAct, Time, WorkspaceSize] =
|
||||
j["data"].get<tuple<int, int, bool, double, size_t>>();
|
||||
tmp.algo = Algo;
|
||||
tmp.mode = Mode;
|
||||
tmp.fuseAct = FuseAct;
|
||||
tmp.time = Time;
|
||||
tmp.workspaceSize = WorkspaceSize;
|
||||
return make_ref<ConvCuDnnPerfRecordObj>(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
using ConvCuDnnPerfRecord = Ref<ConvCuDnnPerfRecordObj>;
|
||||
|
||||
class convCudnnFP16 : public Kernel {
|
||||
|
||||
static constexpr int N_ALGO = 8;
|
||||
static constexpr int N_MODE = 2;
|
||||
static constexpr cudnnConvolutionFwdAlgo_t ALGOS[8] = {
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED};
|
||||
|
||||
static constexpr cudnnConvolutionMode_t MODES[2] = {
|
||||
CUDNN_CONVOLUTION, CUDNN_CROSS_CORRELATION};
|
||||
|
||||
std::tuple<void *, void *, void *, cudnnTensorDescriptor_t,
|
||||
cudnnFilterDescriptor_t, cudnnTensorDescriptor_t,
|
||||
cudnnConvolutionDescriptor_t, cudnnActivationDescriptor_t,
|
||||
cudnnTensorDescriptor_t>
|
||||
createCuDNNDescriptor(const Ref<ConvObj> &op,
|
||||
const ConvCuDnnPerfRecord &record) const {
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
// Bias is not supported yet
|
||||
if (op->getInputs().size() > 2) {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||
const int cpg = op->getChannelPerGroup();
|
||||
const int g = c / cpg;
|
||||
const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||
|
||||
int channelsPerGrp = cpg, channels = c;
|
||||
|
||||
// get inputs
|
||||
cudnnTensorDescriptor_t inDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(inDesc, CUDNN_TENSOR_NCHW,
|
||||
CUDNN_DATA_HALF, n, channels,
|
||||
h, w)); /*fp16 type*/
|
||||
|
||||
// get kernels
|
||||
cudnnFilterDescriptor_t knDesc;
|
||||
checkCudnnError(cudnnCreateFilterDescriptor(&knDesc));
|
||||
checkCudnnError(cudnnSetFilter4dDescriptor(
|
||||
knDesc, CUDNN_DATA_HALF, /*fp16 type*/
|
||||
CUDNN_TENSOR_NCHW, f, channelsPerGrp, r, s));
|
||||
// get bias
|
||||
cudnnTensorDescriptor_t biasDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&biasDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(biasDesc, CUDNN_TENSOR_NCHW,
|
||||
CUDNN_DATA_HALF, 1, f, 1,
|
||||
1)); /*fp16 type*/
|
||||
|
||||
// get convolution descriptor
|
||||
cudnnConvolutionDescriptor_t convDesc;
|
||||
checkCudnnError(cudnnCreateConvolutionDescriptor(&convDesc));
|
||||
// TODO: CUDNN_CONVOLUTION is a tunable argument
|
||||
checkCudnnError(cudnnSetConvolution2dDescriptor(
|
||||
convDesc, ph, pw, sh, sw, dh, dw, MODES[record->mode],
|
||||
CUDNN_DATA_HALF)); /*fp16 type*/
|
||||
if (g > 1) {
|
||||
checkCudnnError(cudnnSetConvolutionGroupCount(convDesc, g));
|
||||
}
|
||||
|
||||
// get activation descriptor
|
||||
cudnnActivationDescriptor_t actDesc;
|
||||
checkCudnnError(cudnnCreateActivationDescriptor(&actDesc));
|
||||
// NOT_PROPAGATE_NAN is required by
// cudnnConvolutionBiasActivationForward
|
||||
switch (op->getAct()) {
|
||||
case ActType::Relu:
|
||||
checkCudnnError(cudnnSetActivationDescriptor(
|
||||
actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0));
|
||||
break;
|
||||
case ActType::Sigmoid:
|
||||
checkCudnnError(cudnnSetActivationDescriptor(
|
||||
actDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0));
|
||||
break;
|
||||
case ActType::None:
|
||||
checkCudnnError(
|
||||
cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_IDENTITY,
|
||||
CUDNN_NOT_PROPAGATE_NAN, 0));
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
|
||||
// get output descriptor
|
||||
int outn, outc, outh, outw;
|
||||
checkCudnnError(cudnnGetConvolution2dForwardOutputDim(
|
||||
convDesc, inDesc, knDesc, &outn, &outc, &outh, &outw));
|
||||
cudnnTensorDescriptor_t outDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW,
|
||||
CUDNN_DATA_HALF, outn, outc,
|
||||
outh, outw));
|
||||
IT_ASSERT((vector{outn, outc, outh, outw}) ==
|
||||
op->getOutput()->getDims(),
|
||||
"cuDNN output shape mismatches with OP output shape");
|
||||
|
||||
return tuple(inData, knData, outData, inDesc, knDesc, biasDesc,
|
||||
convDesc, actDesc, outDesc);
|
||||
}
|
||||
|
||||
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
|
||||
const CudaRuntimeObj *context) const {
|
||||
cudnnStatus_t stat;
|
||||
|
||||
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
|
||||
convDesc, actDesc, outDesc] =
|
||||
createCuDNNDescriptor(op, record);
|
||||
size_t wsSize = record->workspaceSize;
|
||||
CudaPtr wsData = context->getWorkspace(wsSize);
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
|
||||
stat = cudnnConvolutionForward(context->cudnnHandle(), &alpha, inDesc,
|
||||
inData, knDesc, knData, convDesc,
|
||||
ALGOS[record->algo], wsData, wsSize,
|
||||
&beta, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
|
||||
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
return true;
|
||||
}
|
||||
|
||||
void compute(const Operator &op, const RuntimeObj *context) const override {
|
||||
auto record = make_ref<ConvCuDnnPerfRecordObj>(); // with parameters in
// default ctor
|
||||
compute(op, record, context);
|
||||
}
|
||||
|
||||
PerfRecord tune(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
ConvCuDnnPerfRecordObj ret;
|
||||
ret.time = std::numeric_limits<double>::max();
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto op = as<ConvObj>(_op);
|
||||
// Both modes have the same performance. Only run cross-correlation.
|
||||
for (int mode = 1; mode < 2; mode++) {
|
||||
// Try every possible algorithm of convolution
|
||||
for (int algo = 0; algo < N_ALGO; algo++) {
|
||||
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
|
||||
auto &record = *recordRef;
|
||||
record.mode = mode;
|
||||
record.algo = algo;
|
||||
cudnnStatus_t stat;
|
||||
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
|
||||
convDesc, actDesc, outDesc] =
|
||||
createCuDNNDescriptor(op, recordRef);
|
||||
|
||||
// get workspace
|
||||
stat = cudnnGetConvolutionForwardWorkspaceSize(
|
||||
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
|
||||
ALGOS[record.algo], &record.workspaceSize);
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
if (record.workspaceSize > context->getWorkspaceSize()) {
|
||||
continue;
|
||||
}
|
||||
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
|
||||
stat = cudnnConvolutionForward(
|
||||
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
|
||||
knData, convDesc, ALGOS[record.algo], wsData,
|
||||
record.workspaceSize, &beta, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
record.time = timeit(
|
||||
[&]() {
|
||||
cudnnConvolutionForward(context->cudnnHandle(), &alpha,
|
||||
inDesc, inData, knDesc, knData,
|
||||
convDesc, ALGOS[record.algo],
|
||||
wsData, record.workspaceSize,
|
||||
&beta, outDesc, outData);
|
||||
},
|
||||
[&]() { context->sync(); });
|
||||
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
|
||||
|
||||
// Update the tune result
|
||||
if (ret.time > record.time) {
|
||||
ret = record;
|
||||
}
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
|
||||
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
}
|
||||
}
|
||||
// printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
|
||||
// ret.mode);
|
||||
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
|
||||
"algorithm "
|
||||
"found");
|
||||
return make_ref<ConvCuDnnPerfRecordObj>(ret);
|
||||
}
|
||||
|
||||
void compute(const Operator &_op, const PerfRecord &_record,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<ConvObj>(_op);
|
||||
auto record = as<ConvCuDnnPerfRecordObj>(_record);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
bool success = cuDNNUnfused(op, record, context);
|
||||
IT_ASSERT(success);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Conv, DataType::Float16, convCudnnFP16,
|
||||
"Conv_cuDNN_CUDA_Float16");
|
||||
|
||||
} // namespace infini
|
|
@ -300,8 +300,9 @@ class convBackwardDataCudnn : public Kernel {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ConvTranspose, DataType::Float32,
|
||||
convBackwardDataCudnn, "ConvTranposed_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ConvTransNHWC, DataType::Float32,
|
||||
convBackwardDataCudnn, "ConvTranposedNHWC_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ConvTranspose, convBackwardDataCudnn,
|
||||
"ConvTranposed_cuDNN_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ConvTransNHWC, convBackwardDataCudnn,
|
||||
"ConvTranposedNHWC_cuDNN_CUDA");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
#include "operators/dequantize_linear.h"
|
||||
#include "cuda/cuda_dequantize_linear.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
class DequantizeLinearCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<DequantizeLinearObj>(_op);
|
||||
|
||||
void *const inputX = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const inputScale = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
|
||||
void *const output = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
const int axis = op->getAxis();
|
||||
const int stride = op->getInputs(0)->getStride().at(axis);
|
||||
|
||||
auto dims = op->getInputs(0)->getDims();
|
||||
int dimsize = dims[op->getAxis()];
|
||||
int size = op->getOutput()->size();
|
||||
|
||||
if (op->getInputs(1)->getDType() == DataType::Float32) {
|
||||
|
||||
if (op->numInputs() == 3) {
|
||||
void *const inputZeroPoint =
|
||||
(op->getInputs(2)->getRawDataPtr<void *>());
|
||||
|
||||
DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
|
||||
(float *)output, dimsize, stride,
|
||||
(uint8_t *)inputZeroPoint, size);
|
||||
} else {
|
||||
DequantizeLinearKernel((uint8_t *)inputX, (float *)inputScale,
|
||||
(float *)output, dimsize, stride, size);
|
||||
}
|
||||
} else if (op->getInputs(1)->getDType() == DataType::Float16) {
|
||||
if (op->numInputs() == 3) {
|
||||
void *const inputZeroPoint =
|
||||
(op->getInputs(2)->getRawDataPtr<void *>());
|
||||
|
||||
DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
|
||||
(half *)output, dimsize, stride,
|
||||
(uint8_t *)inputZeroPoint, size);
|
||||
} else {
|
||||
DequantizeLinearKernel((uint8_t *)inputX, (half *)inputScale,
|
||||
(half *)output, dimsize, stride, size);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::DequantizeLinear, DequantizeLinearCuda,
|
||||
"DequantizeLinear_CUDA");
|
||||
|
||||
}; // namespace infini
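For reference, a minimal host-side sketch of the per-axis semantics these kernels implement (the function and parameter names below are illustrative only, and scale/zeroPoint are assumed to have dimsize entries):

#include <cstdint>
#include <vector>

std::vector<float> dequantizeLinearRef(const std::vector<uint8_t> &x,
                                       const std::vector<float> &scale,
                                       const std::vector<uint8_t> &zeroPoint,
                                       int dimsize, int stride) {
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        // Position of element i along the quantized axis, given its stride.
        int axisIdx = static_cast<int>(i / stride) % dimsize;
        int zp = zeroPoint.empty() ? 0 : zeroPoint[axisIdx];
        y[i] = static_cast<float>(static_cast<int>(x[i]) - zp) * scale[axisIdx];
    }
    return y;
}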
|
|
@ -0,0 +1,345 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
template <typename T, int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__
|
||||
void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
|
||||
T *output, const int dimsize,
|
||||
const int stride,
|
||||
const uint8_t *inputZeroPoint) {
|
||||
// len(scale) = len(bias) = dimsize
|
||||
int tmp = blockIdx.x % stride;
|
||||
int tid = (blockIdx.x - tmp) * dimsize + tmp;
|
||||
int remain = dimsize % BLOCK_DIM;
|
||||
int step = (dimsize - remain) / BLOCK_DIM + 1;
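// Each block owns one length-dimsize slice along the quantized axis. The
// first `remain` threads handle `step` elements each and the remaining
// threads handle `step - 1`, which together cover exactly dimsize elements.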
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (threadIdx.x * step + ind) * stride] -
|
||||
inputZeroPoint[threadIdx.x * step + ind]) *
|
||||
inputScale[threadIdx.x * step + ind];
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
output[tid +
|
||||
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] -
|
||||
inputZeroPoint[remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind]) *
|
||||
inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
|
||||
ind];
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename T, int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__
|
||||
void blockDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
|
||||
T *output, const int dimsize,
|
||||
const int stride) {
|
||||
// len(scale) = len(bias) = dimsize
|
||||
int tmp = blockIdx.x % stride;
|
||||
int tid = (blockIdx.x - tmp) * dimsize + tmp;
|
||||
int remain = dimsize % BLOCK_DIM;
|
||||
int step = (dimsize - remain) / BLOCK_DIM + 1;
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (threadIdx.x * step + ind) * stride]) *
|
||||
inputScale[threadIdx.x * step + ind];
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
output[tid +
|
||||
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride]) *
|
||||
inputScale[remain * step + (threadIdx.x - remain) * (step - 1) +
|
||||
ind];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void
|
||||
warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
|
||||
T *output, const int dimsize, const int otherSize,
|
||||
const int stride, const uint8_t *inputZeroPoint) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
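// 2D block: threadIdx.y selects a slice (otherIdx) and threadIdx.x strides
// along the quantized axis; the otherSize check below guards the tail block.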
|
||||
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
int remain = dimsize % BLOCK_DIM_x;
|
||||
int step = (dimsize - remain) / BLOCK_DIM_x + 1;
|
||||
if (otherIdx < otherSize) {
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (threadIdx.x * step + ind) * stride] -
|
||||
inputZeroPoint[threadIdx.x * step + ind]) *
|
||||
inputScale[threadIdx.x * step + ind];
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
output[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid +
|
||||
(remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] -
|
||||
inputZeroPoint[remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) +
|
||||
ind]) *
|
||||
inputScale[remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void
|
||||
warpDequantizeLinearKernel(const uint8_t *inputX, const T *inputScale,
|
||||
T *output, const int dimsize, const int otherSize,
|
||||
const int stride) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
int remain = dimsize % BLOCK_DIM_x;
|
||||
int step = (dimsize - remain) / BLOCK_DIM_x + 1;
|
||||
if (otherIdx < otherSize) {
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid + (threadIdx.x * step + ind) * stride]) *
|
||||
inputScale[threadIdx.x * step + ind];
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
output[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
static_cast<T>(
|
||||
inputX[tid +
|
||||
(remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride]) *
|
||||
inputScale[remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
namespace infini {
|
||||
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
|
||||
float *output, const int dimsize, const int stride,
|
||||
const uint8_t *inputZeroPoint, const int size) {
|
||||
|
||||
int num_block = size / dimsize;
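// Launch heuristic: axes longer than 1024 get one 1024-thread block per
// slice; shorter axes use 2D blocks where BLOCK_DIM_x covers the axis and
// BLOCK_DIM_y batches several slices per block, as the branches below show.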
|
||||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
inputX, inputScale, output, dimsize, stride, inputZeroPoint);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 32, 32>
|
||||
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
|
||||
num_block, stride, inputZeroPoint);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 16, 64>
|
||||
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
|
||||
num_block, stride, inputZeroPoint);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 8, 128>
|
||||
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
|
||||
num_block, stride, inputZeroPoint);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 4, 256>
|
||||
<<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
|
||||
num_block, stride, inputZeroPoint);
|
||||
}
|
||||
}
|
||||
|
||||
void DequantizeLinearKernel(const uint8_t *inputX, const float *inputScale,
|
||||
float *output, const int dimsize, const int stride,
|
||||
const int size) {
|
||||
int num_block = size / dimsize;
|
||||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockDequantizeLinearKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
inputX, inputScale, output, dimsize, stride);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 32, 32><<<grid_dim, block_dim>>>(
|
||||
inputX, inputScale, output, dimsize, num_block, stride);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 16, 64><<<grid_dim, block_dim>>>(
|
||||
inputX, inputScale, output, dimsize, num_block, stride);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 8, 128><<<grid_dim, block_dim>>>(
|
||||
inputX, inputScale, output, dimsize, num_block, stride);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpDequantizeLinearKernel<float, 4, 256><<<grid_dim, block_dim>>>(
|
||||
inputX, inputScale, output, dimsize, num_block, stride);
|
||||
}
|
||||
}
|
||||
//-------------
|
||||
void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
                            half *output, const int dimsize, const int stride,
                            const uint8_t *inputZeroPoint, const int size) {
    int num_block = size / dimsize;
    if (dimsize > 1024) {
        int BLOCK_DIM = 1024;

        blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
            inputX, inputScale, output, dimsize, stride, inputZeroPoint);
    } else if (dimsize > 31) {
        int BLOCK_DIM_x = 32;
        int BLOCK_DIM_y = 32;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 32, 32>
            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
                                      num_block, stride, inputZeroPoint);
    } else if (dimsize > 15) {
        int BLOCK_DIM_x = 16;
        int BLOCK_DIM_y = 64;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 16, 64>
            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
                                      num_block, stride, inputZeroPoint);
    } else if (dimsize > 7) {
        int BLOCK_DIM_x = 8;
        int BLOCK_DIM_y = 128;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 8, 128>
            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
                                      num_block, stride, inputZeroPoint);
    } else {
        int BLOCK_DIM_x = 4;
        int BLOCK_DIM_y = 256;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 4, 256>
            <<<grid_dim, block_dim>>>(inputX, inputScale, output, dimsize,
                                      num_block, stride, inputZeroPoint);
    }
}

void DequantizeLinearKernel(const uint8_t *inputX, const half *inputScale,
                            half *output, const int dimsize, const int stride,
                            const int size) {
    int num_block = size / dimsize;
    if (dimsize > 1024) {
        int BLOCK_DIM = 1024;

        blockDequantizeLinearKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
            inputX, inputScale, output, dimsize, stride);
    } else if (dimsize > 31) {
        int BLOCK_DIM_x = 32;
        int BLOCK_DIM_y = 32;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 32, 32><<<grid_dim, block_dim>>>(
            inputX, inputScale, output, dimsize, num_block, stride);
    } else if (dimsize > 15) {
        int BLOCK_DIM_x = 16;
        int BLOCK_DIM_y = 64;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 16, 64><<<grid_dim, block_dim>>>(
            inputX, inputScale, output, dimsize, num_block, stride);
    } else if (dimsize > 7) {
        int BLOCK_DIM_x = 8;
        int BLOCK_DIM_y = 128;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 8, 128><<<grid_dim, block_dim>>>(
            inputX, inputScale, output, dimsize, num_block, stride);
    } else {
        int BLOCK_DIM_x = 4;
        int BLOCK_DIM_y = 256;
        int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
        dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);

        warpDequantizeLinearKernel<half, 4, 256><<<grid_dim, block_dim>>>(
            inputX, inputScale, output, dimsize, num_block, stride);
    }
}

} // namespace infini
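For reference, the launchers above implement the ONNX DequantizeLinear formula y = (x - x_zero_point) * x_scale; the block and warp variants only differ in how threads are mapped onto each slice of length dimsize. A minimal per-tensor CPU sketch of the same arithmetic (hypothetical helper, not part of the kernels; the per-axis case indexes scale and zero point along the quantization axis instead):

#include <cstdint>
#include <vector>

// Hypothetical per-tensor reference: y[i] = (x[i] - zeroPoint) * scale.
std::vector<float> dequantizeLinearRef(const std::vector<uint8_t> &x,
                                       float scale, uint8_t zeroPoint) {
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = (float(x[i]) - float(zeroPoint)) * scale;
    return y;
}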
@ -0,0 +1,31 @@
#include "operators/dynamic_quantize_linear.h"
#include "cuda/cuda_dynamic_quantize_linear.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"

namespace infini {

class DynamicQuantizeLinearCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<DynamicQuantizeLinearObj>(_op);

        void *const input = (op->getInputs(0)->getRawDataPtr<void *>());

        void *const outputY = (op->getOutput(0)->getRawDataPtr<void *>());
        void *const outputYScale = (op->getOutput(1)->getRawDataPtr<void *>());
        void *const outputYZeroPoint =
            (op->getOutput(2)->getRawDataPtr<void *>());

        int size = op->getInputs(0)->size();

        dynamicQuantizeLinearKernel((float *)input, (uint8_t *)outputY,
                                    (float *)outputYScale,
                                    (uint8_t *)outputYZeroPoint, size);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::DynamicQuantizeLinear,
                DynamicQuantizeLinearCuda, "DynamicQuantizeLinear_CUDA");

}; // namespace infini
@ -0,0 +1,182 @@
#include <cub/cub.cuh>

#include "cuda/cuda_common.h"

__device__ float _saturate(float x) {
    return x < 0.f ? 0.f : (x > 255.0f ? 255.0f : x);
}

template <class T>
__device__ __forceinline__ static T max___(T a, T b) noexcept {
    return a > b ? a : b;
}

template <class T>
__device__ __forceinline__ static T min___(T a, T b) noexcept {
    return a < b ? a : b;
}

template <int BLOCK_DIM>
__launch_bounds__(BLOCK_DIM) __global__
    void _dynamicQuantizeLinearKernel(float *input, uint8_t *outputY,
                                      float *yScale, uint8_t *yZeroPoint,
                                      int size) {
    int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
    float maxData = -__FLT_MAX__;
    float minData = __FLT_MAX__;
    int remain = size % BLOCK_DIM;
    int step = (size - remain) / BLOCK_DIM + 1;

    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            maxData = max___(maxData, input[threadIdx.x * step + ind]);
        }
    } else {
        for (int ind = 0; ind < step - 1; ind++) {
            maxData = max___(maxData,
                             input[remain * step +
                                   (threadIdx.x - remain) * (step - 1) + ind]);
        }
    }

    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            minData = min___(minData, input[threadIdx.x * step + ind]);
        }
    } else {
        for (int ind = 0; ind < step - 1; ind++) {
            minData = min___(minData,
                             input[remain * step +
                                   (threadIdx.x - remain) * (step - 1) + ind]);
        }
    }

    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storageMax;

    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());

    __shared__ typename BlockReduce::TempStorage temp_storageMin;

    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
    //-----
    int qmax = 255;
    int qmin = 0;
    __shared__ float _yZeroPoint;
    if (threadIdx.x == 0) {
        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
        float intermediate_zero_point = qmin - blockMin / yScale[0];
        _yZeroPoint = round(_saturate(intermediate_zero_point));
        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
    }
    __syncthreads();

    if (i < size) {
        outputY[i] = static_cast<uint8_t>(
            _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
    }
}
//----------

template <int BLOCK_DIM, int numPerThread>
__launch_bounds__(BLOCK_DIM) __global__
    void _dynamicQuantizeLinearKernel(float *input, uint8_t *outputY,
                                      float *yScale, uint8_t *yZeroPoint,
                                      int size) {
    int i = threadIdx.x + blockIdx.x * BLOCK_DIM;
    float maxData = -__FLT_MAX__;
    float minData = __FLT_MAX__;
    int remain = size % BLOCK_DIM;
    int step = (size - remain) / BLOCK_DIM + 1;
    float dataPerThread[numPerThread];
    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            dataPerThread[ind] = input[threadIdx.x * step + ind];
            maxData = max___(maxData, dataPerThread[ind]);
        }
    } else {
        for (int ind = 0; ind < step - 1; ind++) {
            dataPerThread[ind] =
                input[remain * step + (threadIdx.x - remain) * (step - 1) +
                      ind];
            maxData = max___(maxData, dataPerThread[ind]);
        }
    }

    if (threadIdx.x < remain) {
        for (int ind = 0; ind < step; ind++) {
            minData = min___(minData, dataPerThread[ind]);
        }
    } else {
        for (int ind = 0; ind < step - 1; ind++) {
            minData = min___(minData, dataPerThread[ind]);
        }
    }

    typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storageMax;

    float blockMax = BlockReduce(temp_storageMax).Reduce(maxData, cub::Max());

    __shared__ typename BlockReduce::TempStorage temp_storageMin;

    float blockMin = BlockReduce(temp_storageMin).Reduce(minData, cub::Min());
    //-----
    int qmax = 255;
    int qmin = 0;
    __shared__ float _yZeroPoint;
    if (threadIdx.x == 0) {
        yScale[0] = (max(0.0f, blockMax) - min(0.0f, blockMin)) / (qmax - qmin);
        float intermediate_zero_point = qmin - blockMin / yScale[0];
        _yZeroPoint = round(_saturate(intermediate_zero_point));
        yZeroPoint[0] = static_cast<uint8_t>(_yZeroPoint);
    }
    __syncthreads();

    if (i < size) {
        outputY[i] = static_cast<uint8_t>(
            _saturate(round(input[i] / yScale[0]) + _yZeroPoint));
    }
}

namespace infini {
void dynamicQuantizeLinearKernel(float *input, uint8_t *outputY, float *yScale,
                                 uint8_t *yZeroPoint, int size) {
    if (size > 1024 * 128) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else if (size > 1024 * 64) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 128><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else if (size > 1024 * 32) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 64><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else if (size > 1024 * 16) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 32><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else if (size > 1024 * 4) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 16><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else if (size > 1024) {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 4><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    } else {
        int BLOCK_DIM = 1024;
        int num_blocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;
        _dynamicQuantizeLinearKernel<1024, 1><<<num_blocks, BLOCK_DIM>>>(
            input, outputY, yScale, yZeroPoint, size);
    }
}
} // namespace infini
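The block reduction above follows the ONNX DynamicQuantizeLinear recipe: the range [min(x, 0), max(x, 0)] is mapped onto uint8, the zero point is the saturated and rounded offset, and every element is quantized with that single scale/zero-point pair. A single-threaded CPU sketch of the same math, useful for checking the kernel output (hypothetical helper; assumes a non-empty input with a non-degenerate range):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for DynamicQuantizeLinear (uint8, qmin = 0, qmax = 255).
void dynamicQuantizeLinearRef(const std::vector<float> &x,
                              std::vector<uint8_t> &y, float &yScale,
                              uint8_t &yZeroPoint) {
    float xmin = *std::min_element(x.begin(), x.end());
    float xmax = *std::max_element(x.begin(), x.end());
    yScale = (std::max(0.0f, xmax) - std::min(0.0f, xmin)) / 255.0f;
    // Same order as the kernel: saturate first, then round.
    float zp = std::round(std::clamp(-xmin / yScale, 0.0f, 255.0f));
    yZeroPoint = static_cast<uint8_t>(zp);
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = static_cast<uint8_t>(
            std::clamp(std::round(x[i] / yScale) + zp, 0.0f, 255.0f));
}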
@ -2,6 +2,7 @@
#include "cuda/cuda_element_wise.h"
#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"

namespace infini {
class ElementWiseCudnn : public CudaKernelWithoutConfig {
@ -44,22 +45,21 @@ class ElementWiseCudnn : public CudaKernelWithoutConfig {
        std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));

+       auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
        // get inputs
        checkCudnnError(cudnnCreateTensorDescriptor(&aDesc));
-       checkCudnnError(cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW,
-                                                  CUDNN_DATA_FLOAT, a[0], a[1],
-                                                  a[2], a[3]));
+       checkCudnnError(cudnnSetTensor4dDescriptor(
+           aDesc, CUDNN_TENSOR_NCHW, cudnnDataType, a[0], a[1], a[2], a[3]));

        checkCudnnError(cudnnCreateTensorDescriptor(&bDesc));
-       checkCudnnError(cudnnSetTensor4dDescriptor(bDesc, CUDNN_TENSOR_NCHW,
-                                                  CUDNN_DATA_FLOAT, b[0], b[1],
-                                                  b[2], b[3]));
+       checkCudnnError(cudnnSetTensor4dDescriptor(
+           bDesc, CUDNN_TENSOR_NCHW, cudnnDataType, b[0], b[1], b[2], b[3]));

        // get outputs
        checkCudnnError(cudnnCreateTensorDescriptor(&cDesc));
-       checkCudnnError(cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW,
-                                                  CUDNN_DATA_FLOAT, c[0], c[1],
-                                                  c[2], c[3]));
+       checkCudnnError(cudnnSetTensor4dDescriptor(
+           cDesc, CUDNN_TENSOR_NCHW, cudnnDataType, c[0], c[1], c[2], c[3]));

        // get op descriptor
        cudnnOpTensorDescriptor_t opDesc;
@ -127,40 +127,33 @@ class ElementWiseCuda : public CudaKernelWithoutConfig {
        std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
        std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));

-       if (op->getOpType() == OpType::Div)
-           div_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
-                      b[2], b[3], c[0], c[1], c[2], c[3]);
-       else if (op->getOpType() == OpType::Pow)
-           pow_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
-                      b[2], b[3], c[0], c[1], c[2], c[3]);
-       else if (op->getOpType() == OpType::Add) {
-           add_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
-                      b[2], b[3], c[0], c[1], c[2], c[3]);
+       const int dType = _op->getDType().getIndex();
+       if (op->getOpType() == OpType::Div) {
+           div_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
+                      b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
+       } else if (op->getOpType() == OpType::Add) {
+           add_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
+                      b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
+       } else if (op->getOpType() == OpType::Pow) {
+           pow_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3], b[0],
+                      b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
        } else if (op->getOpType() == OpType::Less) {
-           less_kernel(aData, bData, cData, a[0], a[1], a[2], a[3], b[0], b[1],
-                       b[2], b[3], c[0], c[1], c[2], c[3]);
-       } else
+           less_kernel(dType, aData, bData, cData, a[0], a[1], a[2], a[3],
+                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
+       } else {
            IT_TODO_HALT();
+       }
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::Add, DataType::Float32, AddCudnn,
-                "Add_cuDNN_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Sub, DataType::Float32, SubCudnn,
-                "Sub_cuDNN_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Mul, DataType::Float32, MulCudnn,
-                "Mul_cuDNN_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Min, DataType::Float32, MinCudnn,
-                "Min_cuDNN_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Max, DataType::Float32, MaxCudnn,
-                "Max_cuDNN_CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::Add, AddCudnn, "Add_cuDNN_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Sub, SubCudnn, "Sub_cuDNN_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Mul, MulCudnn, "Mul_cuDNN_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Min, MinCudnn, "Min_cuDNN_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Max, MaxCudnn, "Max_cuDNN_CUDA");

+REGISTER_KERNEL(Device::CUDA, OpType::Div, ElementWiseCuda, "Div_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Pow, ElementWiseCuda, "Pow_CUDA");
+REGISTER_KERNEL(Device::CUDA, OpType::Less, ElementWiseCuda, "Less_CUDA");

-REGISTER_KERNEL(Device::CUDA, OpType::Div, DataType::Float32, ElementWiseCuda,
-                "Div_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Add, DataType::Int64, ElementWiseCuda,
-                "Add_CUDA_Int64");
-REGISTER_KERNEL(Device::CUDA, OpType::Pow, DataType::Float32, ElementWiseCuda,
-                "Pow__CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::Less, DataType::Int64, ElementWiseCuda,
-                "Less__CUDA_Int64");
}; // namespace infini
@ -1,4 +1,5 @@
#include "cuda/cuda_common.h"
+#include "cuda/cuda_utility.h"
#include <math.h>

constexpr unsigned int num_threads() { return 32 * 4; }
@ -129,44 +130,113 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
|
|||
}
|
||||
}
|
||||
|
||||
#define CASE(OP, T) \
|
||||
_##OP##_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
|
||||
#define SWITCH_DTYPE(OP, DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
case 1: \
|
||||
CASE(OP, 1) \
|
||||
break; \
|
||||
case 2: \
|
||||
CASE(OP, 2) \
|
||||
break; \
|
||||
case 3: \
|
||||
CASE(OP, 3) \
|
||||
break; \
|
||||
case 4: \
|
||||
CASE(OP, 4) \
|
||||
break; \
|
||||
case 5: \
|
||||
CASE(OP, 5) \
|
||||
break; \
|
||||
case 6: \
|
||||
CASE(OP, 6) \
|
||||
break; \
|
||||
case 7: \
|
||||
CASE(OP, 7) \
|
||||
break; \
|
||||
case 10: \
|
||||
CASE(OP, 10) \
|
||||
break; \
|
||||
case 11: \
|
||||
CASE(OP, 11) \
|
||||
break; \
|
||||
case 12: \
|
||||
CASE(OP, 12) \
|
||||
break; \
|
||||
case 13: \
|
||||
CASE(OP, 13) \
|
||||
break; \
|
||||
case 16: \
|
||||
CASE(OP, 16) \
|
||||
break; \
|
||||
default: \
|
||||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
||||
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_div_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1,
|
||||
b2, b3, c0, c1, c2, c3);
|
||||
SWITCH_DTYPE(div, dType)
|
||||
}
|
||||
void add_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
||||
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_add_kernel<int64_t><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
SWITCH_DTYPE(add, dType)
|
||||
}
|
||||
void pow_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
||||
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
int blocksize = block_work_size();
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0, b1,
|
||||
b2, b3, c0, c1, c2, c3);
|
||||
if (dType == 1) {
|
||||
_pow_kernel<float><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
} else if (dType == 3) {
|
||||
_pow_kernel<int8_t><<<gridsize, blocksize>>>(
|
||||
a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
} else if (dType == 10) {
|
||||
int a_size = a0 * a1 * a2 * a3;
|
||||
int b_size = b0 * b1 * b2 * b3;
|
||||
int c_size = c0 * c1 * c2 * c3;
|
||||
vector<float> a_float(a_size);
|
||||
vector<float> b_float(b_size);
|
||||
vector<float> c_float(c_size);
|
||||
for (int i = 0; i < a_size; ++i) {
|
||||
a_float[i] = __half2float(((half *)a)[i]);
|
||||
}
|
||||
for (int i = 0; i < b_size; ++i) {
|
||||
b_float[i] = __half2float(((half *)b)[i]);
|
||||
}
|
||||
_pow_kernel<float><<<gridsize, blocksize>>>(
|
||||
a_float.data(), b_float.data(), c_float.data(), a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
for (int i = 0; i < c_size; ++i) {
|
||||
((half *)c)[i] = __float2half(c_float[i]);
|
||||
}
|
||||
} else {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
void less_kernel(void *a, void *b, void *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
void less_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
|
||||
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||
int c3) {
|
||||
int blocksize = block_work_size();
|
||||
int num = c0 * c1 * c2 * c3;
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_less_kernel<int64_t><<<gridsize, blocksize>>>(a, b, c, a0, a1, a2, a3, b0,
|
||||
b1, b2, b3, c0, c1, c2, c3);
|
||||
SWITCH_DTYPE(less, dType)
|
||||
}
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
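The CASE/SWITCH_DTYPE macros introduced in the hunk above turn the runtime dtype index carried in dType into a concrete template instantiation of each element-wise kernel. A stripped-down, standalone sketch of that dispatch pattern (hypothetical names; the real macros go through DT_CUDA<T>::t and cover every index listed above, which follow the ONNX element-type numbering, e.g. 1 for float32 and 7 for int64):

#include <cstdint>
#include <stdexcept>

// Hypothetical illustration of index -> template dispatch for a binary op.
template <class T>
void addTyped(const void *a, const void *b, void *c, int n) {
    const T *x = static_cast<const T *>(a);
    const T *y = static_cast<const T *>(b);
    T *z = static_cast<T *>(c);
    for (int i = 0; i < n; ++i)
        z[i] = x[i] + y[i];
}

void addDispatch(int dType, const void *a, const void *b, void *c, int n) {
    switch (dType) {
    case 1: // float32
        addTyped<float>(a, b, c, n);
        break;
    case 7: // int64
        addTyped<int64_t>(a, b, c, n);
        break;
    default: // each remaining index gets its own case, as in SWITCH_DTYPE
        throw std::runtime_error("unsupported dtype index");
    }
}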
@ -25,12 +25,12 @@ class ExpandCuda : public CudaKernelWithoutConfig {
            inputShape.data[i] = in_Shape[i];
            outputsize *= out_Shape[i];
        }
-       expandKernel((float *)inputData, (float *)outputData, nDims, outputsize,
+       const int dType = op->getDType().getIndex();
+       expandKernel(dType, inputData, outputData, nDims, outputsize,
                     inputShape, outputShape);
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::Expand, DataType::Float32, ExpandCuda,
-                "Expand_CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::Expand, ExpandCuda, "Expand_CUDA");

}; // namespace infini
@ -1,12 +1,14 @@
#include "core/common.h"
#include "cuda/cuda_common.h"
+#include "cuda/cuda_utility.h"
#include "utils/small_array.h"

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }

-__global__ void _expandKernel(float *input, float *output, int nDims,
+template <class T>
+__global__ void _expandKernel(void *input, void *output, int nDims,
                              int outputsize, infini::SmallArray inputShape,
                              infini::SmallArray outputShape) {
@ -33,17 +35,64 @@ __global__ void _expandKernel(float *input, float *output, int nDims,
            temp *= inputShape.data[i];
            v = v / outputShape.data[i];
        }
-       output[outputIdx] = input[inputIdx];
+       ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
    }
}

namespace infini {
-void expandKernel(float *input, float *output, int nDims, int outputsize,
-                  SmallArray inputShape, SmallArray outputShape) {

+#define CASE(T)                                                                \
+    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>(                     \
+        input, output, nDims, outputsize, inputShape, outputShape);
+
+#define SWITCH_DTYPE(DTYPE)                                                    \
+    switch (DTYPE) {                                                           \
+    case 1:                                                                    \
+        CASE(1)                                                                \
+        break;                                                                 \
+    case 2:                                                                    \
+        CASE(2)                                                                \
+        break;                                                                 \
+    case 3:                                                                    \
+        CASE(3)                                                                \
+        break;                                                                 \
+    case 4:                                                                    \
+        CASE(4)                                                                \
+        break;                                                                 \
+    case 5:                                                                    \
+        CASE(5)                                                                \
+        break;                                                                 \
+    case 6:                                                                    \
+        CASE(6)                                                                \
+        break;                                                                 \
+    case 7:                                                                    \
+        CASE(7)                                                                \
+        break;                                                                 \
+    case 10:                                                                   \
+        CASE(10)                                                               \
+        break;                                                                 \
+    case 11:                                                                   \
+        CASE(11)                                                               \
+        break;                                                                 \
+    case 12:                                                                   \
+        CASE(12)                                                               \
+        break;                                                                 \
+    case 13:                                                                   \
+        CASE(13)                                                               \
+        break;                                                                 \
+    case 16:                                                                   \
+        CASE(16)                                                               \
+        break;                                                                 \
+    default:                                                                   \
+        IT_TODO_HALT();                                                        \
+    }
+
+void expandKernel(int dType, void *input, void *output, int nDims,
+                  int outputsize, SmallArray inputShape,
+                  SmallArray outputShape) {
    int blocksize = block_work_size();
    int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
-   _expandKernel<<<gridsize, blocksize>>>(input, output, nDims, outputsize,
-                                          inputShape, outputShape);
+   SWITCH_DTYPE(dType)
}

} // namespace infini
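The loop at the top of _expandKernel decodes the flat output index one dimension at a time and folds it back into an input index, so dimensions of size 1 in the input are read repeatedly. A host-side sketch of that index mapping (hypothetical helper; assumes row-major layout and that every input dimension either matches the output or equals 1):

#include <vector>

// Hypothetical reference for the output-index -> input-index mapping used by
// _expandKernel: size-1 input dimensions are broadcast.
int expandInputIndex(int outputIdx, const std::vector<int> &inShape,
                     const std::vector<int> &outShape) {
    int inputIdx = 0;
    int temp = 1; // running stride over the input shape
    int v = outputIdx;
    for (int i = (int)outShape.size() - 1; i >= 0; --i) {
        int coord = v % outShape[i]; // coordinate along dimension i
        if (inShape[i] != 1)         // size-1 dimensions collapse to 0
            inputIdx += coord * temp;
        temp *= inShape[i];
        v /= outShape[i];
    }
    return inputIdx;
}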
@ -22,6 +22,5 @@ class ExtendCuda : public CudaKernelWithoutConfig {
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::Extend, DataType::Float32, ExtendCuda,
-                "Extend_CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::Extend, ExtendCuda, "Extend_CUDA");
} // namespace infini
@ -15,12 +15,26 @@ class GatherCuda : public CudaKernelWithoutConfig {
        GatherMetaData metaData;
        initGatherMetaData(metaData, op);

-       auto inData = input->getRawDataPtr<float *>();
-       auto outData = op->getOutput()->getRawDataPtr<float *>();
-       gather_kernel(inData, outData, metaData, op->getOutput()->size());
+       void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
+       void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
+
+       if (op->getDType() == DataType::Float32) {
+           gather_kernel<float>((float *)inputData, (float *)outputData,
+                                metaData, op->getOutput()->size());
+       } else if (op->getDType() == DataType::Float16) {
+           gather_kernel<half>((half *)inputData, (half *)outputData, metaData,
+                               op->getOutput()->size());
+       } else if (op->getDType() == DataType::Int8) {
+           gather_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData,
+                                 metaData, op->getOutput()->size());
+       } else if (op->getDType() == DataType::UInt8) {
+           gather_kernel<uint8_t>((uint8_t *)inputData, (uint8_t *)outputData,
+                                  metaData, op->getOutput()->size());
+       } else {
+           IT_ASSERT(false);
+       }
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::Gather, DataType::Float32, GatherCuda,
-                "Gather_CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::Gather, GatherCuda, "Gather_CUDA");
} // namespace infini
@ -28,27 +28,34 @@ __device__ T gatheredOffset2Offset(int gOffset,
    return offset;
}

-template <typename T>
-__global__ void _gather_kernel(float *in, float *out,
+template <typename dataT, typename T>
+__global__ void _gather_kernel(dataT *in, dataT *out,
                               infini::GatherMetaData metaData, size_t num) {
    T tid = threadIdx.x + blockIdx.x * blockDim.x;
-   int stride = blockDim.x * gridDim.x;
-   while (tid < num) {
+   if (tid < num) {
        T offset = gatheredOffset2Offset<T>(tid, metaData);
        out[tid] = in[offset];
-       tid += stride;
    }
}

namespace infini {
-void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num) {
+template <typename T>
+void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num) {
    int blockSize = 32 * 16;
    int gridSize = (num + blockSize - 1) / blockSize;
    if (metaData.indexType == DataType::Int64) {
-       _gather_kernel<int64_t>
+       _gather_kernel<T, int64_t>
            <<<gridSize, blockSize>>>(in, out, metaData, num);
    } else {
-       _gather_kernel<int><<<gridSize, blockSize>>>(in, out, metaData, num);
+       _gather_kernel<T, int><<<gridSize, blockSize>>>(in, out, metaData, num);
    }
}
+template void gather_kernel<float>(float *in, float *out,
+                                   GatherMetaData metaData, size_t num);
+template void gather_kernel<half>(half *in, half *out, GatherMetaData metaData,
+                                  size_t num);
+template void gather_kernel<int8_t>(int8_t *in, int8_t *out,
+                                    GatherMetaData metaData, size_t num);
+template void gather_kernel<uint8_t>(uint8_t *in, uint8_t *out,
+                                     GatherMetaData metaData, size_t num);
} // namespace infini
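The explicit instantiations above fix the element types a caller can pick (float, half, int8_t, uint8_t); the index type is still chosen at runtime from metaData.indexType. Semantically the kernel copies input slices selected by the index tensor; a one-axis CPU sketch of that behaviour (hypothetical helper, axis 0 only, int64 indices):

#include <cstdint>
#include <vector>

// Hypothetical axis-0 gather reference: output row j is input row indices[j].
template <class T>
std::vector<T> gatherAxis0Ref(const std::vector<T> &in, int rowLen,
                              const std::vector<int64_t> &indices) {
    std::vector<T> out(indices.size() * rowLen);
    for (size_t j = 0; j < indices.size(); ++j)
        for (int k = 0; k < rowLen; ++k)
            out[j * rowLen + k] = in[indices[j] * rowLen + k];
    return out;
}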
@ -21,8 +21,7 @@ class GatherElementsCuda : public CudaKernelWithoutConfig {
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::GatherElements, DataType::Float32,
-                GatherElementsCuda, "GatherELements_CUDA_Float32");
-REGISTER_KERNEL(Device::CUDA, OpType::GatherElements, DataType::Int32,
-                GatherElementsCuda, "GatherElements_CUDA_Int32");
+REGISTER_KERNEL(Device::CUDA, OpType::GatherElements, GatherElementsCuda,
+                "GatherELements_CUDA");

} // namespace infini
@ -24,22 +24,39 @@ class LayerNormCuda : public CudaKernelWithoutConfig {
        int dimsize = dims[op->getAxis()];
        int size = op->getOutput(0)->size();
        int scaleSize = op->getInputs(1)->size();
-       if (op->numInputs() == 3) {
-           void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
-           int biasSize = op->getInputs(2)->size();
-           // printf("kernel bias:true:%d\n", 1);
-           LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
-                         scaleSize, dimsize, stride, (float *)outputData,
-                         (float *)biasData, biasSize);
-       } else {
-           // printf("kernel bias:false:%d\n", 0);
-           LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
-                         scaleSize, dimsize, stride, (float *)outputData);
+       if (op->getDType() == DataType::Float32) {
+           if (op->numInputs() == 3) {
+               void *const biasData =
+                   (op->getInputs(2)->getRawDataPtr<void *>());
+               int biasSize = op->getInputs(2)->size();
+               // printf("kernel bias:true:%d\n", 1);
+               LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
+                             scaleSize, dimsize, stride, (float *)outputData,
+                             (float *)biasData, biasSize);
+           } else {
+               // printf("kernel bias:false:%d\n", 0);
+               LaynormKernel((float *)inputData, (float *)scaleData, eps, size,
+                             scaleSize, dimsize, stride, (float *)outputData);
+           }
+       } else if (op->getDType() == DataType::Float16) {
+           if (op->numInputs() == 3) {
+               void *const biasData =
+                   (op->getInputs(2)->getRawDataPtr<void *>());
+               int biasSize = op->getInputs(2)->size();
+               // printf("kernel bias:true:%d\n", 1);
+               LaynormKernel((half *)inputData, (half *)scaleData, eps, size,
+                             scaleSize, dimsize, stride, (half *)outputData,
+                             (half *)biasData, biasSize);
+           } else {
+               // printf("kernel bias:false:%d\n", 0);
+               LaynormKernel((half *)inputData, (half *)scaleData, eps, size,
+                             scaleSize, dimsize, stride, (half *)outputData);
+           }
+       }
    }
};

-REGISTER_KERNEL(Device::CUDA, OpType::LayerNormalization, DataType::Float32,
-                LayerNormCuda, "LayerNorm_CUDA_Float32");
+REGISTER_KERNEL(Device::CUDA, OpType::LayerNormalization, LayerNormCuda,
+                "LayerNorm_CUDA");

}; // namespace infini
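Both the float and half branches above end up in LaynormKernel, which normalizes every slice of length dimsize to zero mean and unit variance and then applies the scale and the optional bias. A single-slice CPU sketch of that computation (hypothetical helper; uses per-element scale and bias, while the kernels also accept the scalar case where scaleSize or biasSize is 1):

#include <cmath>
#include <vector>

// Hypothetical single-slice LayerNormalization reference:
// y = (x - mean) / sqrt(var + eps) * scale + bias.
std::vector<float> layerNormRef(const std::vector<float> &x,
                                const std::vector<float> &scale,
                                const std::vector<float> &bias, float eps) {
    const int n = (int)x.size();
    float mu = 0.f;
    for (float v : x)
        mu += v;
    mu /= n;
    float sigma2 = 0.f;
    for (float v : x)
        sigma2 += (v - mu) * (v - mu);
    sigma2 /= n;
    std::vector<float> y(n);
    for (int i = 0; i < n; ++i)
        y[i] = scale[i] * (x[i] - mu) / std::sqrt(sigma2 + eps) + bias[i];
    return y;
}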
@ -1,43 +1,41 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
template <int BLOCK_DIM>
|
||||
template <typename T, int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__
|
||||
void blockLaynormKernel(const float *input, const float *scale,
|
||||
const int dimsize, const int stride, float *output,
|
||||
const float eps, int scaleSize, const float *bias,
|
||||
int biasSize) {
|
||||
void blockLaynormKernel(const T *input, const T *scale, const int dimsize,
|
||||
const int stride, T *output, const T eps,
|
||||
int scaleSize, const T *bias, int biasSize) {
|
||||
// len(scale) = len(bias) = dimsize
|
||||
int tmp = blockIdx.x % stride;
|
||||
int tid = (blockIdx.x - tmp) * dimsize + tmp;
|
||||
float muPartial = 0.0f;
|
||||
T muPartial = 0.0f;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
|
||||
}
|
||||
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
|
||||
typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ float mu;
|
||||
float muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
|
||||
__shared__ T mu;
|
||||
T muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
mu = muBlock / dimsize;
|
||||
mu = muBlock * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float sigma2Partial = 0.0f;
|
||||
T sigma2Partial = 0.0f;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
sigma2Partial +=
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu);
|
||||
}
|
||||
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
|
||||
typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
|
||||
|
||||
__shared__ float sigma2;
|
||||
float sigma2Block =
|
||||
BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
|
||||
__shared__ T sigma2;
|
||||
T sigma2Block = BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
sigma2 = sigma2Block / dimsize;
|
||||
sigma2 = sigma2Block * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
}
|
||||
__syncthreads();
|
||||
if (biasSize == dimsize) {
|
||||
|
@ -47,8 +45,9 @@ __launch_bounds__(BLOCK_DIM) __global__
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[threadIdx.x + ph * BLOCK_DIM] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
mu) /
|
||||
sqrt(sigma2 + eps) +
|
||||
mu) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
|
||||
bias[threadIdx.x + ph * BLOCK_DIM];
|
||||
}
|
||||
} else {
|
||||
|
@ -57,8 +56,9 @@ __launch_bounds__(BLOCK_DIM) __global__
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[0] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
mu) /
|
||||
sqrt(sigma2 + eps) +
|
||||
mu) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
|
||||
bias[threadIdx.x + ph * BLOCK_DIM];
|
||||
}
|
||||
}
|
||||
|
@ -69,8 +69,9 @@ __launch_bounds__(BLOCK_DIM) __global__
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[threadIdx.x + ph * BLOCK_DIM] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
mu) /
|
||||
sqrt(sigma2 + eps) +
|
||||
mu) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
|
||||
bias[0];
|
||||
}
|
||||
} else {
|
||||
|
@ -79,50 +80,50 @@ __launch_bounds__(BLOCK_DIM) __global__
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[0] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
mu) /
|
||||
sqrt(sigma2 + eps) +
|
||||
mu) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(sigma2 + eps)))) +
|
||||
bias[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//-----------------
|
||||
template <int BLOCK_DIM>
|
||||
template <typename T, int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__
|
||||
void blockLaynormKernel(const float *input, const float *scale,
|
||||
const int dimsize, const int stride, float *output,
|
||||
const float eps, int scaleSize) {
|
||||
void blockLaynormKernel(const T *input, const T *scale, const int dimsize,
|
||||
const int stride, T *output, const T eps,
|
||||
int scaleSize) {
|
||||
// len(scale) = len(bias) = dimsize
|
||||
int tmp = blockIdx.x % stride;
|
||||
int tid = (blockIdx.x - tmp) * dimsize + tmp;
|
||||
float muPartial = 0.0f;
|
||||
T muPartial = 0.0f;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
|
||||
}
|
||||
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
|
||||
typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ float mu;
|
||||
float muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
|
||||
__shared__ T mu;
|
||||
T muBlock = BlockReduce(temp_storage).Reduce(muPartial, cub::Sum());
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
mu = muBlock / dimsize;
|
||||
mu = muBlock * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float sigma2Partial = 0.0f;
|
||||
T sigma2Partial = 0.0f;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
sigma2Partial +=
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu);
|
||||
}
|
||||
typedef cub::BlockReduce<float, BLOCK_DIM> BlockReduce;
|
||||
typedef cub::BlockReduce<T, BLOCK_DIM> BlockReduce;
|
||||
|
||||
__shared__ float sigma2;
|
||||
float sigma2Block =
|
||||
BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
|
||||
__shared__ T sigma2;
|
||||
T sigma2Block = BlockReduce(temp_storage).Reduce(sigma2Partial, cub::Sum());
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
sigma2 = sigma2Block / dimsize;
|
||||
sigma2 = sigma2Block * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
}
|
||||
__syncthreads();
|
||||
if (scaleSize == dimsize) {
|
||||
|
@ -130,16 +131,18 @@ __launch_bounds__(BLOCK_DIM) __global__
|
|||
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[threadIdx.x + ph * BLOCK_DIM] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) /
|
||||
sqrt(sigma2 + eps);
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
|
||||
static_cast<T>(
|
||||
__fdividef(1.0F, sqrt(static_cast<float>(sigma2 + eps))));
|
||||
}
|
||||
} else {
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
scale[0] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) /
|
||||
sqrt(sigma2 + eps);
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] - mu) *
|
||||
static_cast<T>(
|
||||
__fdividef(1.0F, sqrt(static_cast<float>(sigma2 + eps))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -158,33 +161,33 @@ __inline__ __device__ T WarpAllReduce(T val) {
|
|||
}
|
||||
return val;
|
||||
}
|
||||
template <int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void warpLaynormKernel(const float *input, const float *scale,
|
||||
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void warpLaynormKernel(const T *input, const T *scale,
|
||||
const int dimsize, const int stride,
|
||||
float *output, const float eps, int scaleSize,
|
||||
int otherSize, const float *bias,
|
||||
int biasSize) {
|
||||
T *output, const T eps, int scaleSize,
|
||||
int otherSize, const T *bias, int biasSize) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
if (otherIdx < otherSize) {
|
||||
|
||||
__shared__ float muTotal[BLOCK_DIM_y];
|
||||
__shared__ float sigma2Total[BLOCK_DIM_y];
|
||||
__shared__ T muTotal[BLOCK_DIM_y];
|
||||
__shared__ T sigma2Total[BLOCK_DIM_y];
|
||||
|
||||
float muPartial = 0.0f;
|
||||
T muPartial = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
|
||||
}
|
||||
|
||||
muPartial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(muPartial);
|
||||
muPartial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(muPartial);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
muTotal[threadIdx.y] = muPartial / dimsize;
|
||||
muTotal[threadIdx.y] =
|
||||
muPartial * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
|
||||
//--------------------------------------------
|
||||
float sigma2Partial = 0.0f;
|
||||
T sigma2Partial = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
sigma2Partial +=
|
||||
|
@ -194,10 +197,11 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
muTotal[threadIdx.y]);
|
||||
}
|
||||
|
||||
sigma2Partial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sigma2Partial);
|
||||
sigma2Partial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(sigma2Partial);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
sigma2Total[threadIdx.y] = sigma2Partial / dimsize;
|
||||
sigma2Total[threadIdx.y] =
|
||||
sigma2Partial * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
|
||||
//--------------------------------------------
|
||||
if (biasSize == dimsize) {
|
||||
|
@ -209,8 +213,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
scale[threadIdx.x + ph * BLOCK_DIM_x] *
|
||||
(input[tid +
|
||||
(threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps) +
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps)))) +
|
||||
bias[threadIdx.x + ph * BLOCK_DIM_x];
|
||||
}
|
||||
} else {
|
||||
|
@ -221,8 +227,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
scale[0] *
|
||||
(input[tid +
|
||||
(threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps) +
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps)))) +
|
||||
bias[threadIdx.x + ph * BLOCK_DIM_x];
|
||||
}
|
||||
}
|
||||
|
@ -235,8 +243,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
scale[threadIdx.x + ph * BLOCK_DIM_x] *
|
||||
(input[tid +
|
||||
(threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps) +
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps)))) +
|
||||
bias[0];
|
||||
}
|
||||
} else {
|
||||
|
@ -247,40 +257,43 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
scale[0] *
|
||||
(input[tid +
|
||||
(threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps) +
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(__fdividef(
|
||||
1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps)))) +
|
||||
bias[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
template <int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void warpLaynormKernel(const float *input, const float *scale,
|
||||
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void warpLaynormKernel(const T *input, const T *scale,
|
||||
const int dimsize, const int stride,
|
||||
float *output, const float eps, int scaleSize,
|
||||
T *output, const T eps, int scaleSize,
|
||||
int otherSize) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
if (otherIdx < otherSize) {
|
||||
|
||||
__shared__ float muTotal[BLOCK_DIM_y];
|
||||
__shared__ float sigma2Total[BLOCK_DIM_y];
|
||||
__shared__ T muTotal[BLOCK_DIM_y];
|
||||
__shared__ T sigma2Total[BLOCK_DIM_y];
|
||||
|
||||
float muPartial = 0.0f;
|
||||
T muPartial = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
muPartial += input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
|
||||
}
|
||||
|
||||
muPartial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(muPartial);
|
||||
muPartial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(muPartial);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
muTotal[threadIdx.y] = muPartial / dimsize;
|
||||
muTotal[threadIdx.y] =
|
||||
muPartial * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
|
||||
//--------------------------------------------
|
||||
float sigma2Partial = 0.0f;
|
||||
T sigma2Partial = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
sigma2Partial +=
|
||||
|
@ -290,10 +303,11 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
muTotal[threadIdx.y]);
|
||||
}
|
||||
|
||||
sigma2Partial = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sigma2Partial);
|
||||
sigma2Partial = WarpAllReduce<SumOp, T, BLOCK_DIM_x>(sigma2Partial);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
sigma2Total[threadIdx.y] = sigma2Partial / dimsize;
|
||||
sigma2Total[threadIdx.y] =
|
||||
sigma2Partial * static_cast<T>(__fdividef(1.0F, dimsize));
|
||||
|
||||
//--------------------------------------------
|
||||
if (scaleSize == dimsize) {
|
||||
|
@ -302,8 +316,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
|
||||
scale[threadIdx.x + ph * BLOCK_DIM_x] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps);
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(
|
||||
__fdividef(1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps))));
|
||||
}
|
||||
} else {
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
|
@ -311,8 +327,10 @@ __global__ void warpLaynormKernel(const float *input, const float *scale,
|
|||
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
|
||||
scale[0] *
|
||||
(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
muTotal[threadIdx.y]) /
|
||||
sqrt(sigma2Total[threadIdx.y] + eps);
|
||||
muTotal[threadIdx.y]) *
|
||||
static_cast<T>(
|
||||
__fdividef(1.0F, sqrt(static_cast<float>(
|
||||
sigma2Total[threadIdx.y] + eps))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -325,7 +343,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<1024>
|
||||
blockLaynormKernel<float, 1024>
|
||||
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
|
||||
eps, scaleSize, bias, biasSize);
|
||||
} else if (dimsize > 31) {
|
||||
|
@ -335,7 +353,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<32, 32><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 15) {
|
||||
|
@ -345,7 +363,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<16, 64><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 7) {
|
||||
|
@ -355,7 +373,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<8, 128><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else {
|
||||
|
@ -365,7 +383,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<4, 256><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
}
|
||||
|
@ -378,7 +396,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<1024><<<num_block, BLOCK_DIM>>>(
|
||||
blockLaynormKernel<float, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
|
@ -387,7 +405,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<32, 32><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
|
@ -396,7 +414,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<16, 64><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
|
@ -405,7 +423,7 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<8, 128><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
|
@ -414,7 +432,108 @@ void LaynormKernel(const float *input, const float *scale, const float eps,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<4, 256><<<grid_dim, block_dim>>>(
|
||||
warpLaynormKernel<float, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
}
|
||||
}
|
||||
//-----------------
|
||||
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||
int size, int scaleSize, const int dimsize, const int stride,
|
||||
half *output, const half *bias, int biasSize) {
|
||||
int num_block = size / dimsize;
|
||||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<half, 1024>
|
||||
<<<num_block, BLOCK_DIM>>>(input, scale, dimsize, stride, output,
|
||||
eps, scaleSize, bias, biasSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block,
|
||||
bias, biasSize);
|
||||
}
|
||||
}
|
||||
|
||||
void LaynormKernel(const half *input, const half *scale, const half eps,
|
||||
int size, int scaleSize, const int dimsize, const int stride,
|
||||
half *output) {
|
||||
int num_block = size / dimsize;
|
||||
if (dimsize > 1024) {
|
||||
int BLOCK_DIM = 1024;
|
||||
|
||||
blockLaynormKernel<half, 1024><<<num_block, BLOCK_DIM>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 32, 32><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 16, 64><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 8, 128><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_block + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
warpLaynormKernel<half, 4, 256><<<grid_dim, block_dim>>>(
|
||||
input, scale, dimsize, stride, output, eps, scaleSize, num_block);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@
#include "core/kernel.h"
#include "cuda/cuda_expand.h"
#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
#include "utils/small_array.h"

namespace infini {
@ -48,11 +49,12 @@ class matmulCublas : public Kernel {
|
|||
auto opB = op->getTransB() ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
const int lda = op->getTransA() ? m : k, ldb = op->getTransB() ? k : n,
|
||||
ldc = n;
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
if (op->numInputs() == 2) { // no bias
|
||||
beta = 0.f;
|
||||
} else { // broadcast bias to output
|
||||
beta = 1.f;
|
||||
float alpha_naive = 1.f, beta_naive = 0.f;
|
||||
auto dataType = op->getDType();
|
||||
auto cuDataType = cublasDataTypeConvert(dataType);
|
||||
IT_ASSERT(cuDataType != CUDA_R_8I, "matmul don't support int8 dtype.");
|
||||
if (op->numInputs() == 3) { // have bias
|
||||
beta_naive = 1.f;
|
||||
auto inC = op->getInputs(2);
|
||||
auto out = op->getOutput();
|
||||
SmallArray inputShape, outputShape;
|
||||
|
@ -69,8 +71,9 @@ class matmulCublas : public Kernel {
|
|||
if (i >= offset)
|
||||
inputShape.data[i] = inC->getDims()[i - offset];
|
||||
}
|
||||
expandKernel(inC->getRawDataPtr<float *>(),
|
||||
out->getRawDataPtr<float *>(), nDims, outputsize,
|
||||
const int dType = dataType.getIndex();
|
||||
expandKernel(dType, inC->getRawDataPtr<void *>(),
|
||||
out->getRawDataPtr<void *>(), nDims, outputsize,
|
||||
inputShape, outputShape);
|
||||
}
|
||||
// TODO:use compute type
|
||||
|
@ -89,16 +92,38 @@ class matmulCublas : public Kernel {
|
|||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: n * k;
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
|
||||
CUDA_R_32F, ldb, strideB, inAData, CUDA_R_32F, lda, strideA,
|
||||
&beta, outData, CUDA_R_32F, ldc, m * n, b, CUDA_R_32F,
|
||||
(cublasGemmAlgo_t)record->algo);
|
||||
if (dataType == DataType::Float16) {
|
||||
half alpha_half = static_cast<half>(alpha_naive);
|
||||
half beta_half = static_cast<half>(beta_naive);
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha_half,
|
||||
inBData, cuDataType, ldb, strideB, inAData, cuDataType, lda,
|
||||
strideA, &beta_half, outData, cuDataType, ldc, m * n, b,
|
||||
cuDataType, (cublasGemmAlgo_t)record->algo);
|
||||
|
||||
} else {
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha_naive,
|
||||
inBData, cuDataType, ldb, strideB, inAData, cuDataType, lda,
|
||||
strideA, &beta_naive, outData, cuDataType, ldc, m * n, b,
|
||||
cuDataType, (cublasGemmAlgo_t)record->algo);
|
||||
}
|
||||
} else {
|
||||
stat = cublasGemmEx(
|
||||
context->cublasHandle(), opB, opA, n, m, k, &alpha, inBData,
|
||||
CUDA_R_32F, ldb, inAData, CUDA_R_32F, lda, &beta, outData,
|
||||
CUDA_R_32F, ldc, CUDA_R_32F, (cublasGemmAlgo_t)record->algo);
|
||||
if (dataType == DataType::Float16) {
|
||||
half alpha_half = static_cast<half>(alpha_naive);
|
||||
half beta_half = static_cast<half>(beta_naive);
|
||||
stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k,
|
||||
&alpha_half, inBData, cuDataType, ldb,
|
||||
inAData, cuDataType, lda, &beta_half,
|
||||
outData, cuDataType, ldc, cuDataType,
|
||||
(cublasGemmAlgo_t)record->algo);
|
||||
} else {
|
||||
stat = cublasGemmEx(context->cublasHandle(), opB, opA, n, m, k,
|
||||
&alpha_naive, inBData, cuDataType, ldb,
|
||||
inAData, cuDataType, lda, &beta_naive,
|
||||
outData, cuDataType, ldc, cuDataType,
|
||||
(cublasGemmAlgo_t)record->algo);
|
||||
}
|
||||
}
|
||||
// if (stat != CUBLAS_STATUS_SUCCESS)
|
||||
// cout << cublasGetErrorString(stat);
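The zero strides computed above are what let a single cublasGemmStridedBatchedEx call broadcast a 2-D (or batch-1) operand across the whole batch. A plain CPU sketch of that broadcast semantics for row-major A[b?,m,k] x B[b?,k,n], ignoring transposes, the bias expand and the fp16 scalar handling, is:

#include <cstddef>

// Reference for strided-batched matmul with stride-0 broadcast (no transposes).
// strideA/strideB follow the kernel above: 0 broadcasts the operand, otherwise
// m*k (A) or k*n (B) steps to the next batch.
void batched_matmul_reference(const float *A, const float *B, float *C, int b,
                              int m, int n, int k, long long strideA,
                              long long strideB) {
    for (int batch = 0; batch < b; ++batch) {
        const float *a = A + batch * strideA;
        const float *bmat = B + batch * strideB;
        float *c = C + static_cast<long long>(batch) * m * n;
        for (int i = 0; i < m; ++i)
            for (int j = 0; j < n; ++j) {
                float acc = 0.0f;
                for (int p = 0; p < k; ++p)
                    acc += a[i * k + p] * bmat[p * n + j];
                c[i * n + j] = acc;
            }
    }
}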
|
@ -140,8 +165,9 @@ class matmulCublas : public Kernel {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MatMul, DataType::Float32, matmulCublas,
|
||||
"Matmul_cuBLAS_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MatMul, matmulCublas,
|
||||
"Matmul_cuBLAS_CUDA");
|
||||
|
||||
REGISTER_CONSTRUCTOR(2, MatmulCublasPerfRecordObj::from_json);
|
||||
|
||||
}; // namespace infini
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
#include "operators/matmul_integer.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_matmul_integer.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "utils/small_array.h"
|
||||
#include <thrust/transform.h>
|
||||
|
||||
namespace infini {
|
||||
|
||||
class matmulIntegerCublas : public CudaKernelWithoutConfig {
|
||||
bool do_compute(const Operator &_op, const RuntimeObj *_context) const {
|
||||
auto op = as<MatmulIntegerObj>(_op);
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
void *const inAData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const inBData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
const auto [b, m, n, k] = op->getBMNK();
|
||||
if (op->numInputs() >= 3) { // have a_zero_point
|
||||
int aZeroSize = op->getInputs(2)->size();
|
||||
int aSize = op->getInputs(0)->size();
|
||||
void *const aZeroPointData =
|
||||
(op->getInputs(2)->getRawDataPtr<void *>());
|
||||
if (op->getInputs(0)->getDType() == DataType::Int8) {
|
||||
if (aZeroSize > 1) {
|
||||
subA_kernel(DataType::Int8.getIndex(), inAData,
|
||||
aZeroPointData, aSize, k, 1);
|
||||
} else {
|
||||
subA_kernel(DataType::Int8.getIndex(), inAData,
|
||||
aZeroPointData, aSize, k, 0);
|
||||
}
|
||||
}
|
||||
if (op->getInputs(0)->getDType() == DataType::UInt8) {
|
||||
if (aZeroSize > 1) {
|
||||
subA_kernel(DataType::UInt8.getIndex(), inAData,
|
||||
aZeroPointData, aSize, k, 1);
|
||||
} else {
|
||||
subA_kernel(DataType::UInt8.getIndex(), inAData,
|
||||
aZeroPointData, aSize, k, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op->numInputs() == 4) { // have b_zero_point
|
||||
int bZeroSize = op->getInputs(3)->size();
|
||||
int bSize = op->getInputs(1)->size();
|
||||
void *const bZeroPointData =
|
||||
(op->getInputs(3)->getRawDataPtr<void *>());
|
||||
if (op->getInputs(1)->getDType() == DataType::Int8) {
|
||||
if (bZeroSize > 1) {
|
||||
subB_kernel(DataType::Int8.getIndex(), inBData,
|
||||
bZeroPointData, bSize, k, n, 1);
|
||||
} else {
|
||||
subB_kernel(DataType::Int8.getIndex(), inBData,
|
||||
bZeroPointData, bSize, k, n, 0);
|
||||
}
|
||||
}
|
||||
if (op->getInputs(1)->getDType() == DataType::UInt8) {
|
||||
if (bZeroSize > 1) {
|
||||
subB_kernel(DataType::UInt8.getIndex(), inBData,
|
||||
bZeroPointData, bSize, k, n, 1);
|
||||
} else {
|
||||
subB_kernel(DataType::UInt8.getIndex(), inBData,
|
||||
bZeroPointData, bSize, k, n, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
int lda = k, ldb = n, ldc = n;
|
||||
int32_t alpha = 1, beta = 0;
|
||||
|
||||
// TODO:use compute type
|
||||
cublasStatus_t stat;
|
||||
if (b > 1) {
|
||||
// Support batch broadcast with zero stride
|
||||
int dimA = op->getInputs(0)->getRank();
|
||||
int dimB = op->getInputs(1)->getRank();
|
||||
long long strideA =
|
||||
(dimA == 2 ||
|
||||
(dimA == 3 && op->getInputs(0)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: m * k;
|
||||
long long strideB =
|
||||
(dimB == 2 ||
|
||||
(dimB == 3 && op->getInputs(1)->getDims()[0] == 1))
|
||||
? 0 // Broadcast the batch dimension if batch size is 1
|
||||
: n * k;
|
||||
stat = cublasGemmStridedBatchedEx(
|
||||
context->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
|
||||
&alpha, inBData, CUDA_R_8I, ldb, strideB, inAData, CUDA_R_8I,
|
||||
lda, strideA, &beta, outData, CUDA_R_32I, ldc, m * n, b,
|
||||
CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
|
||||
} else {
|
||||
stat = cublasGemmEx(
|
||||
context->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
|
||||
&alpha, inBData, CUDA_R_8I, ldb, inAData, CUDA_R_8I, lda, &beta,
|
||||
outData, CUDA_R_32I, ldc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
|
||||
}
|
||||
return (stat == CUBLAS_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
IT_ASSERT(do_compute(_op, _context));
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::MatMulInteger, matmulIntegerCublas,
|
||||
"MatmulInteger_cuBLAS_CUDA");
|
||||
|
||||
}; // namespace infini
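For reference, the semantics this kernel targets (ONNX MatMulInteger) subtract the optional zero points and accumulate in int32; subA_kernel/subB_kernel do the subtraction in place before the CUDA_R_8I x CUDA_R_8I -> CUDA_R_32I GEMM. A CPU sketch for the 2-D, per-tensor zero-point case (per-row/per-column zero points and batching omitted):

#include <cstdint>

// Y[M,N] = (A[M,K] - a_zp) * (B[K,N] - b_zp), accumulated in int32.
void matmul_integer_reference(const int8_t *A, const int8_t *B, int32_t *Y,
                              int M, int N, int K, int8_t a_zp, int8_t b_zp) {
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) {
            int32_t acc = 0;
            for (int p = 0; p < K; ++p)
                acc += (static_cast<int32_t>(A[i * K + p]) - a_zp) *
                       (static_cast<int32_t>(B[p * N + j]) - b_zp);
            Y[i * N + j] = acc;
        }
}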
@ -0,0 +1,75 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
|
||||
constexpr unsigned int num_threads() { return 32 * 4; }
|
||||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _subA_kernel(void *a, void *b, int size, int k, int delta) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < size; i += stride) {
|
||||
int j = delta * (i - i % k) / k;
|
||||
((int8_t *)a)[i] = ((int8_t *)a)[i] - ((int8_t *)b)[j];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _subA_u8_kernel(void *a, void *b, int size, int k, int delta) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < size; i += stride) {
|
||||
int j = delta * (i - i % k) / k;
|
||||
auto aData = static_cast<int16_t>(((uint8_t *)a)[i]);
|
||||
auto bData = static_cast<int16_t>(((uint8_t *)b)[j]);
|
||||
((int8_t *)a)[i] = static_cast<int8_t>(aData - bData);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _subB_kernel(void *a, void *b, int size, int k, int n,
|
||||
int delta) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < size; i += stride) {
|
||||
int j = delta * ((i / k) + (i % n));
|
||||
((int8_t *)a)[i] = ((int8_t *)a)[i] - ((int8_t *)b)[j];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _subB_u8_kernel(void *a, void *b, int size, int k, int n,
|
||||
int delta) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < size; i += stride) {
|
||||
int j = delta * ((i / k) + (i % n));
|
||||
auto aData = static_cast<int16_t>(((uint8_t *)a)[i]);
|
||||
auto bData = static_cast<int16_t>(((uint8_t *)b)[j]);
|
||||
((int8_t *)a)[i] = static_cast<int8_t>(aData - bData);
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void subA_kernel(int dType, void *a, void *b, int size, int k, int delta) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (size + block_work_size() - 1) / block_work_size();
|
||||
if (dType == 3) {
|
||||
_subA_kernel<<<gridsize, blocksize>>>(a, b, size, k, delta);
|
||||
} else if (dType == 2) {
|
||||
_subA_u8_kernel<<<gridsize, blocksize>>>(a, b, size, k, delta);
|
||||
} else {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
void subB_kernel(int dType, void *a, void *b, int size, int k, int n,
|
||||
int delta) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (size + block_work_size() - 1) / block_work_size();
|
||||
if (dType == 3) {
|
||||
_subB_kernel<<<gridsize, blocksize>>>(a, b, size, k, n, delta);
|
||||
} else if (dType == 2) {
|
||||
_subB_u8_kernel<<<gridsize, blocksize>>>(a, b, size, k, n, delta);
|
||||
} else {
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
}
|
||||
}; // namespace infini
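The dType checks against 3 (Int8) and 2 (UInt8) above, together with the SWITCH_DTYPE tables with cases 1-7, 10-13 and 16 used by the pad/slice and transpose kernels later in this diff, suggest that DataType::getIndex() follows the ONNX TensorProto element-type codes. That mapping is inferred from this diff, not stated in it; verify against core/data_type.h before relying on it:

// Assumed DataType index values (ONNX TensorProto codes) -- an inference only.
enum class AssumedDTypeIndex : int {
    Float32 = 1,
    UInt8 = 2,   // handled by _subA_u8_kernel / _subB_u8_kernel above
    Int8 = 3,    // handled by _subA_kernel / _subB_kernel above
    UInt16 = 4,
    Int16 = 5,
    Int32 = 6,
    Int64 = 7,
    Float16 = 10,
    Double = 11,
    UInt32 = 12,
    UInt64 = 13,
    BFloat16 = 16,
};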
@ -229,9 +229,8 @@ class MemboundTVMExtractSource : public Kernel {
|
|||
    }
};

// REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
//                 MemboundTVMExtractSource,
//                 "Memobund_TVM_Ansor_extract_source");
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, MemboundTVMExtractSource,
                "Memobund_TVM_Ansor_extract_source");
}; // namespace infini

#endif
@ -216,9 +216,9 @@ class MemboundTVMPackedFunction : public Kernel {
|
|||
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::MemBound, DataType::Float32,
                MemboundTVMPackedFunction,
REGISTER_KERNEL(Device::CUDA, OpType::MemBound, MemboundTVMPackedFunction,
                "Memobund_TVM_Ansor_packed_funciton");

}; // namespace infini

#endif
@ -39,10 +39,8 @@ class SliceCuda : private PadSliceCudaCompute, public CudaKernelWithoutConfig {
|
|||
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Slice, DataType::Float32, SliceCuda,
                "Slice__CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Slice, DataType::Int64, SliceCuda,
                "Slice__CUDA_Int64");
REGISTER_KERNEL(Device::CUDA, OpType::Pad, DataType::Float32, PadCuda,
                "Pad__CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Slice, SliceCuda, "Slice__CUDA");

REGISTER_KERNEL(Device::CUDA, OpType::Pad, PadCuda, "Pad__CUDA");

} // namespace infini
@ -1,6 +1,7 @@
|
|||
#include "core/data_type.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_pad_slice.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
|
||||
__device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
|
||||
TransMetaData metaData,
|
||||
|
@ -21,39 +22,83 @@ __device__ int WholeTensorOffset2PartTensorOffset(int wholeOffset,
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void _pad_slice_kernel(T *part, T *whole, TransMetaData metaData,
|
||||
int nDims, int num, bool isPad) {
|
||||
__global__ void _pad_slice_kernel(void *part, void *whole,
|
||||
TransMetaData metaData, int nDims, int num,
|
||||
bool isPad) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid >= num)
|
||||
if (tid >= num) {
|
||||
return;
|
||||
}
|
||||
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
while (tid < num) {
|
||||
int offset = WholeTensorOffset2PartTensorOffset(tid, metaData, nDims);
|
||||
if (isPad)
|
||||
if (offset < 0)
|
||||
whole[tid] = 0;
|
||||
else
|
||||
whole[tid] = part[offset];
|
||||
else
|
||||
part[offset] = whole[tid];
|
||||
if (isPad) {
|
||||
if (offset < 0) {
|
||||
((T *)whole)[tid] = 0;
|
||||
} else {
|
||||
((T *)whole)[tid] = ((T *)part)[offset];
|
||||
}
|
||||
} else if (offset >= 0) {
|
||||
((T *)part)[offset] = ((T *)whole)[tid];
|
||||
}
|
||||
tid += stride;
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
#define CASE(T) \
|
||||
_pad_slice_kernel<DT_CUDA<T>::t><<<gridSize, blockSize>>>( \
|
||||
partData, wholeData, metadata, nDims, num, isPad);
|
||||
|
||||
#define SWITCH_DTYPE(DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
case 1: \
|
||||
CASE(1) \
|
||||
break; \
|
||||
case 2: \
|
||||
CASE(2) \
|
||||
break; \
|
||||
case 3: \
|
||||
CASE(3) \
|
||||
break; \
|
||||
case 4: \
|
||||
CASE(4) \
|
||||
break; \
|
||||
case 5: \
|
||||
CASE(5) \
|
||||
break; \
|
||||
case 6: \
|
||||
CASE(6) \
|
||||
break; \
|
||||
case 7: \
|
||||
CASE(7) \
|
||||
break; \
|
||||
case 10: \
|
||||
CASE(10) \
|
||||
break; \
|
||||
case 11: \
|
||||
CASE(11) \
|
||||
break; \
|
||||
case 12: \
|
||||
CASE(12) \
|
||||
break; \
|
||||
case 13: \
|
||||
CASE(13) \
|
||||
break; \
|
||||
case 16: \
|
||||
CASE(16) \
|
||||
break; \
|
||||
default: \
|
||||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
void pad_slice_kernel(void *partData, void *wholeData,
|
||||
const TransMetaData &metadata, int nDims, int num,
|
||||
bool isPad) {
|
||||
int blockSize = 32 * 16;
|
||||
int gridSize = (num + blockSize - 1) / blockSize;
|
||||
if (metadata.DType == DataType::Int64.getIndex()) {
|
||||
_pad_slice_kernel<int64_t>
|
||||
<<<gridSize, blockSize>>>((int64_t *)partData, (int64_t *)wholeData,
|
||||
metadata, nDims, num, isPad);
|
||||
} else if (metadata.DType == DataType::Float32.getIndex()) {
|
||||
_pad_slice_kernel<float><<<gridSize, blockSize>>>(
|
||||
(float *)partData, (float *)wholeData, metadata, nDims, num, isPad);
|
||||
}
|
||||
int dType = metadata.DType;
|
||||
SWITCH_DTYPE(dType)
|
||||
}
|
||||
} // namespace infini
@ -76,8 +76,9 @@ class avgPoolCudnn : public poolingCudnn {
|
|||
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::MaxPool, DataType::Float32, maxPoolCudnn,
                "MaxPool_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::AveragePool, DataType::Float32,
                avgPoolCudnn, "AvgPool_cuDNN_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::MaxPool, maxPoolCudnn,
                "MaxPool_cuDNN_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::AveragePool, avgPoolCudnn,
                "AvgPool_cuDNN_CUDA");

}; // namespace infini
@ -40,8 +40,7 @@ class RecvNCCL : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Recv, DataType::Float32, RecvNCCL,
|
||||
"Recv_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Recv, RecvNCCL, "Recv_NCCL_CUDA");
|
||||
} // namespace infini
|
||||
|
||||
#endif
@ -1,6 +1,7 @@
|
|||
#include "operators/reduce.h"
|
||||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
|
||||
namespace infini {
|
||||
class ReduceCudnnBase : public CudaKernelWithoutConfig {
|
||||
|
@ -46,12 +47,12 @@ class ReduceCudnnBase : public CudaKernelWithoutConfig {
|
|||
checkCudnnError(cudnnCreateTensorDescriptor(&inDesc));
|
||||
cudnnTensorDescriptor_t outDesc;
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outDesc));
|
||||
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
|
||||
if (nInDims > 3) {
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
inDesc, CUDNN_DATA_FLOAT, nInDims, inDimArray, inStrideArray));
|
||||
checkCudnnError(
|
||||
cudnnSetTensorNdDescriptor(outDesc, CUDNN_DATA_FLOAT, nInDims,
|
||||
outDimArray, outStrideArray));
|
||||
inDesc, cudnnDataType, nInDims, inDimArray, inStrideArray));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
outDesc, cudnnDataType, nInDims, outDimArray, outStrideArray));
|
||||
} else {
|
||||
int idims[4] = {1, 1, 1, 1}, odims[4] = {1, 1, 1, 1};
|
||||
for (int i = 0; i < nInDims; ++i) {
|
||||
|
@ -62,11 +63,11 @@ class ReduceCudnnBase : public CudaKernelWithoutConfig {
|
|||
}
|
||||
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, idims[0], idims[1],
|
||||
inDesc, CUDNN_TENSOR_NCHW, cudnnDataType, idims[0], idims[1],
|
||||
idims[2], idims[3]));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
outDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, odims[0],
|
||||
odims[1], odims[2], odims[3]));
|
||||
outDesc, CUDNN_TENSOR_NCHW, cudnnDataType, odims[0], odims[1],
|
||||
odims[2], odims[3]));
|
||||
}
|
||||
|
||||
// get reduce descriptor
|
||||
|
@ -120,8 +121,9 @@ class ReduceSumCudnn : public ReduceCudnnBase {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ReduceMean, DataType::Float32,
|
||||
ReduceMeanCudnn, "ReduceMean_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ReduceSum, DataType::Float32,
|
||||
ReduceSumCudnn, "ReduceSum_cuDNN_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ReduceMean, ReduceMeanCudnn,
|
||||
"ReduceMean_cuDNN_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::ReduceSum, ReduceSumCudnn,
|
||||
"ReduceSum_cuDNN_CUDA");
|
||||
|
||||
}; // namespace infini
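The reduce descriptors above now take their element type from cudnnDataTypeConvert(op->getDType()) instead of a hard-coded CUDNN_DATA_FLOAT, which is what enables the non-fp32 paths. The helper itself is defined elsewhere in the repository; a plausible sketch of such a conversion, using the dtype indices assumed earlier, is below (the exact supported set is an assumption, and the real cudnnDataTypeConvert may differ).

#include <cudnn.h>
#include <stdexcept>

// Hypothetical DataType-index -> cudnnDataType_t mapping (sketch only).
inline cudnnDataType_t cudnnDataTypeConvertSketch(int dtypeIndex) {
    switch (dtypeIndex) {
    case 1:  return CUDNN_DATA_FLOAT;  // Float32
    case 3:  return CUDNN_DATA_INT8;   // Int8
    case 6:  return CUDNN_DATA_INT32;  // Int32
    case 10: return CUDNN_DATA_HALF;   // Float16
    case 11: return CUDNN_DATA_DOUBLE; // Double
    default: throw std::runtime_error("dtype not supported by this sketch");
    }
}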
@ -11,15 +11,10 @@ class CopyCuda : public CudaKernelWithoutConfig {
|
|||
    }
};
// reshape/flatten/identity all act as copying from input to output.
REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Float32, CopyCuda,
                "Reshape_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Int64, CopyCuda,
                "Reshape_CUDA_Int64");
REGISTER_KERNEL(Device::CUDA, OpType::Reshape, DataType::Int32, CopyCuda,
                "Reshape_CUDA_Int32");
REGISTER_KERNEL(Device::CUDA, OpType::Flatten, DataType::Float32, CopyCuda,
                "Flatten_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Identity, DataType::Float32, CopyCuda,
                "Identity_CUDA_Float32");

REGISTER_KERNEL(Device::CUDA, OpType::Reshape, CopyCuda, "Reshape_CUDA");

REGISTER_KERNEL(Device::CUDA, OpType::Flatten, CopyCuda, "Flatten_CUDA");
REGISTER_KERNEL(Device::CUDA, OpType::Identity, CopyCuda, "Identity_CUDA");

} // namespace infini
@ -48,7 +48,6 @@ class ResizeCuda : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Resize, DataType::Float32, ResizeCuda,
|
||||
"Resize_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Resize, ResizeCuda, "Resize_CUDA");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -36,8 +36,7 @@ class SendNCCL : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Send, DataType::Float32, SendNCCL,
|
||||
"Send_NCCL_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Send, SendNCCL, "Send_NCCL_CUDA");
|
||||
} // namespace infini
|
||||
|
||||
#endif
|
||||
|
|
|
@ -20,11 +20,15 @@ class SoftmaxCuda : public CudaKernelWithoutConfig {
|
|||
        int stride = op->getInputs(0)->getStride().at(op->getAxis());

        int num_blocks = size / dimsize;
        softmax_kernel(num_blocks, (float *)input, (float *)output, size,
                       dimsize, stride);
        if (op->getDType() == DataType::Float32) {
            softmax_kernel(num_blocks, (float *)input, (float *)output, size,
                           dimsize, stride);
        } else if (op->getDType() == DataType::Float16) {
            softmax_kernel(num_blocks, (half *)input, (half *)output, size,
                           dimsize, stride);
        }
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, SoftmaxCuda,
                "Softmax_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Softmax, SoftmaxCuda, "Softmax_CUDA");
} // namespace infini
@ -1,6 +1,5 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
struct __align__(8) DataMaxSum { // update the global max and sum, store the
|
||||
// output at max_tmp and sum_tmp
|
||||
float max_tmp; // store max
|
||||
|
@ -16,9 +15,9 @@ __device__ __forceinline__ DataMaxSum reduce_dms_op(DataMaxSum a,
|
|||
|
||||
return bigger;
|
||||
}
|
||||
template <int BLOCK_DIM>
|
||||
template <typename T, int BLOCK_DIM>
|
||||
__launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
|
||||
float *__restrict input, float *__restrict output, int size, int dimsize,
|
||||
T *__restrict input, T *__restrict output, int size, int dimsize,
|
||||
int stride) { // if set axis = 1, inputShape=[I,J,K,S]
|
||||
// tid = i(JKS) + j(KS) + k(S) + s
|
||||
|
||||
|
@ -33,15 +32,33 @@ __launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
|
|||
dms_partial.max_tmp = -__FLT_MAX__;
|
||||
dms_partial.sum_tmp = 0.0f;
|
||||
DataMaxSum dms_input;
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
int remain = dimsize % BLOCK_DIM;
|
||||
int step = (dimsize - remain) / BLOCK_DIM + 1; // step <= numPerThread
|
||||
|
||||
dms_input.max_tmp =
|
||||
input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride];
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
dms_input.max_tmp =
|
||||
input[tid + (threadIdx.x * step + ind) * stride];
|
||||
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial = reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial =
|
||||
reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
dms_input.max_tmp =
|
||||
input[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride];
|
||||
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial =
|
||||
reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
}
|
||||
}
|
||||
|
||||
typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ DataMaxSum dms_total;
|
||||
|
@ -53,12 +70,102 @@ __launch_bounds__(BLOCK_DIM) __global__ void _blockSoftmaxKernel(
|
|||
}
|
||||
__syncthreads();
|
||||
//-----------------
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM < dimsize; ph++) {
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] =
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM) * stride] -
|
||||
dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
__expf(static_cast<float>(
|
||||
input[tid + (threadIdx.x * step + ind) * stride]) -
|
||||
dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
|
||||
output[tid +
|
||||
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
__expf(static_cast<float>(
|
||||
input[tid +
|
||||
(remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride]) -
|
||||
dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int BLOCK_DIM, int numPerThread>
|
||||
__global__ void
|
||||
_blockSoftmaxKernel(T *__restrict input, T *__restrict output, int size,
|
||||
int dimsize,
|
||||
int stride) { // if set axis = 1, inputShape=[I,J,K,S]
|
||||
// tid = i(JKS) + j(KS) + k(S) + s
|
||||
|
||||
// blockDim.x = size/dimsize = IKS
|
||||
// blockIdx.x = i(KS) + k(S) + s,blockIdx.x%stride = k(S) + s
|
||||
|
||||
int tid =
|
||||
blockIdx.x % stride + (blockIdx.x - blockIdx.x % stride) *
|
||||
dimsize; // now, tid = i(JKS) + k(S) + s;
|
||||
int remain = dimsize % BLOCK_DIM;
|
||||
int step = (dimsize - remain) / BLOCK_DIM + 1; // step <= numPerThread
|
||||
float dataPerThread[numPerThread];
|
||||
|
||||
DataMaxSum dms_partial;
|
||||
dms_partial.max_tmp = -__FLT_MAX__;
|
||||
dms_partial.sum_tmp = 0.0f;
|
||||
DataMaxSum dms_input;
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
dataPerThread[ind] =
|
||||
input[tid + (threadIdx.x * step + ind) * stride];
|
||||
dms_input.max_tmp = dataPerThread[ind];
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial =
|
||||
reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
dataPerThread[ind] =
|
||||
input[tid + (remain * step +
|
||||
(threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride];
|
||||
dms_input.max_tmp = dataPerThread[ind];
|
||||
dms_input.sum_tmp = 1.0f;
|
||||
dms_partial =
|
||||
reduce_dms_op(dms_partial,
|
||||
dms_input); // reduce the data to one block
|
||||
}
|
||||
}
|
||||
|
||||
typedef cub::BlockReduce<DataMaxSum, BLOCK_DIM> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
__shared__ DataMaxSum dms_total;
|
||||
DataMaxSum dms_block =
|
||||
BlockReduce(temp_storage).Reduce(dms_partial, reduce_dms_op);
|
||||
if (threadIdx.x ==
|
||||
0) { // must set threadIdx.x = 0 write the output to memory
|
||||
dms_total = dms_block;
|
||||
}
|
||||
__syncthreads();
|
||||
//-----------------
|
||||
if (threadIdx.x < remain) {
|
||||
for (int ind = 0; ind < step; ind++) {
|
||||
output[tid + (threadIdx.x * step + ind) * stride] =
|
||||
__expf(dataPerThread[ind] - dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
}
|
||||
} else {
|
||||
for (int ind = 0; ind < step - 1; ind++) {
|
||||
output[tid +
|
||||
(remain * step + (threadIdx.x - remain) * (step - 1) + ind) *
|
||||
stride] =
|
||||
__expf(dataPerThread[ind] - dms_total.max_tmp) *
|
||||
__fdividef(1.0F, dms_total.sum_tmp);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -81,14 +188,14 @@ __inline__ __device__ T WarpAllReduce(T val) {
|
|||
}
|
||||
return val;
|
||||
}
|
||||
template <int BLOCK_DIM_x, int BLOCK_DIM_y>
|
||||
__global__ void _warpSoftmaxKernel(float *__restrict input,
|
||||
float *__restrict output, int size,
|
||||
int dimsize, int stride) {
|
||||
|
||||
template <typename T, int BLOCK_DIM_x, int BLOCK_DIM_y, int numPerThreadx>
|
||||
__global__ void _warpSoftmaxKernel(T *__restrict input, T *__restrict output,
|
||||
int size, int dimsize, int stride) {
|
||||
int otherIdx = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
int otherSize = size / dimsize;
|
||||
int tid = otherIdx % stride + (otherIdx - otherIdx % stride) * dimsize;
|
||||
|
||||
float dataPerThreadx[numPerThreadx];
|
||||
if (otherIdx < otherSize) {
|
||||
|
||||
__shared__ float max_total[BLOCK_DIM_y];
|
||||
|
@ -96,9 +203,9 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
|
|||
float max_data = -__FLT_MAX__;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
max_data =
|
||||
max(max_data,
|
||||
input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride]);
|
||||
dataPerThreadx[ph] =
|
||||
input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride];
|
||||
max_data = max(max_data, dataPerThreadx[ph]);
|
||||
}
|
||||
|
||||
max_data = WarpAllReduce<MaxOp, float, BLOCK_DIM_x>(max_data);
|
||||
|
@ -110,9 +217,9 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
|
|||
float sum_data = 0.0f;
|
||||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
sum_data +=
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
max_total[threadIdx.y]);
|
||||
dataPerThreadx[ph] =
|
||||
__expf(dataPerThreadx[ph] - max_total[threadIdx.y]);
|
||||
sum_data += dataPerThreadx[ph];
|
||||
}
|
||||
|
||||
sum_data = WarpAllReduce<SumOp, float, BLOCK_DIM_x>(sum_data);
|
||||
|
@ -124,9 +231,7 @@ __global__ void _warpSoftmaxKernel(float *__restrict input,
|
|||
|
||||
for (int ph = 0; threadIdx.x + ph * BLOCK_DIM_x < dimsize; ph++) {
|
||||
output[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] =
|
||||
__expf(input[tid + (threadIdx.x + ph * BLOCK_DIM_x) * stride] -
|
||||
max_total[threadIdx.y]) *
|
||||
__fdividef(1.0F, sum_total[threadIdx.y]);
|
||||
dataPerThreadx[ph] * __fdividef(1.0F, sum_total[threadIdx.y]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -137,10 +242,35 @@ namespace infini {
|
|||
void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
||||
int dimsize, int stride) {
|
||||
|
||||
if (dimsize > 1024) {
|
||||
if (dimsize > 1024 * 128) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<1024>
|
||||
_blockSoftmaxKernel<float, 1024>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 64) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<float, 1024, 128>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 32) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<float, 1024, 64>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 16) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<float, 1024, 32>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 4) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<float, 1024, 16>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<float, 1024, 4>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
|
@ -149,7 +279,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<32, 32>
|
||||
_warpSoftmaxKernel<float, 32, 32, 32>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
|
@ -158,7 +288,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<16, 64>
|
||||
_warpSoftmaxKernel<float, 16, 64, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
|
@ -167,7 +297,7 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<8, 128>
|
||||
_warpSoftmaxKernel<float, 8, 128, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
|
@ -176,7 +306,79 @@ void softmax_kernel(int num_blocks, float *input, float *output, int size,
|
|||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<4, 256>
|
||||
_warpSoftmaxKernel<float, 4, 256, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
}
|
||||
}
|
||||
//------------------
|
||||
void softmax_kernel(int num_blocks, half *input, half *output, int size,
|
||||
int dimsize, int stride) {
|
||||
|
||||
if (dimsize > 1024 * 128) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 64) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024, 128>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 32) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024, 64>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 16) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024, 32>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024 * 4) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024, 16>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 1024) {
|
||||
|
||||
int BLOCK_DIM = 1024;
|
||||
_blockSoftmaxKernel<half, 1024, 4>
|
||||
<<<num_blocks, BLOCK_DIM>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 31) {
|
||||
int BLOCK_DIM_x = 32;
|
||||
int BLOCK_DIM_y = 32;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<half, 32, 32, 32>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 15) {
|
||||
int BLOCK_DIM_x = 16;
|
||||
int BLOCK_DIM_y = 64;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<half, 16, 64, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else if (dimsize > 7) {
|
||||
int BLOCK_DIM_x = 8;
|
||||
int BLOCK_DIM_y = 128;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<half, 8, 128, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
} else {
|
||||
int BLOCK_DIM_x = 4;
|
||||
int BLOCK_DIM_y = 256;
|
||||
int num_block_x = (num_blocks + BLOCK_DIM_y - 1) / BLOCK_DIM_y;
|
||||
dim3 block_dim(BLOCK_DIM_x, BLOCK_DIM_y, 1);
|
||||
dim3 grid_dim(num_block_x, 1, 1);
|
||||
|
||||
_warpSoftmaxKernel<half, 4, 256, 2>
|
||||
<<<grid_dim, block_dim>>>(input, output, size, dimsize, stride);
|
||||
}
|
||||
}
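Both the float and half entry points above compute a numerically stable softmax along a strided axis: subtract the row max, exponentiate, divide by the row sum (the half path still accumulates max and sum in float). A compact CPU reference of that computation, using the same flat indexing as the kernels, is given here only to illustrate the intended math:

#include <algorithm>
#include <cmath>

// CPU reference for softmax along the reduced axis. Row r starts at
//   base = r % stride + (r - r % stride) * dimsize
// and its j-th element lives at base + j * stride, mirroring the kernels.
void softmax_axis_reference(const float *x, float *y, int size, int dimsize,
                            int stride) {
    int rows = size / dimsize;
    for (int r = 0; r < rows; ++r) {
        int base = r % stride + (r - r % stride) * dimsize;
        float maxv = x[base];
        for (int j = 1; j < dimsize; ++j)
            maxv = std::max(maxv, x[base + j * stride]);
        float sum = 0.0f;
        for (int j = 0; j < dimsize; ++j)
            sum += std::exp(x[base + j * stride] - maxv);
        for (int j = 0; j < dimsize; ++j)
            y[base + j * stride] = std::exp(x[base + j * stride] - maxv) / sum;
    }
}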
@ -7,7 +7,8 @@
|
|||
namespace infini {
|
||||
|
||||
class CudaCompute {
|
||||
void initComposedTensorMetadata(ComposedTensorMetadata &metadata,
|
||||
template <typename T>
|
||||
void initComposedTensorMetadata(ComposedTensorMetadata<T> &metadata,
|
||||
Tensor tensor) const {
|
||||
int nDims = tensor->getRank();
|
||||
auto strides = tensor->getStride();
|
||||
|
@ -16,10 +17,10 @@ class CudaCompute {
|
|||
metadata.dimSize[i] = tensor->getDims().at(i);
|
||||
metadata.stride[i] = strides.at(i);
|
||||
}
|
||||
metadata.data = tensor->getRawDataPtr<float *>();
|
||||
metadata.data = tensor->getRawDataPtr<T *>();
|
||||
}
|
||||
|
||||
void initElementTensorMetadata(ElementTensorMetadata &metadata,
|
||||
template <typename T>
|
||||
void initElementTensorMetadata(ElementTensorMetadata<T> &metadata,
|
||||
TensorVec tensors, int idx, int dim,
|
||||
int &dimBgIdx, int &batchCounter) const {
|
||||
int nTensors = tensors.size();
|
||||
|
@ -27,7 +28,7 @@ class CudaCompute {
|
|||
++batchCounter) {
|
||||
auto tensor = tensors.at(idx + batchCounter);
|
||||
auto dimSize = tensor->getDims()[dim];
|
||||
metadata.data[batchCounter] = tensor->getRawDataPtr<float *>();
|
||||
metadata.data[batchCounter] = tensor->getRawDataPtr<T *>();
|
||||
metadata.dimBgNo[batchCounter] = dimBgIdx;
|
||||
metadata.dimSize[batchCounter] = dimSize;
|
||||
metadata.nElements[batchCounter] = tensor->size();
|
||||
|
@ -36,17 +37,17 @@ class CudaCompute {
|
|||
}
|
||||
|
||||
public:
|
||||
template <typename T>
|
||||
void do_compute(Tensor composedTensor, TensorVec elementsTensor, int dim,
|
||||
int nDims, bool isSplit) const {
|
||||
IT_ASSERT(nDims <= DIM_MAX_SIZE);
|
||||
|
||||
ComposedTensorMetadata composedMetadata;
|
||||
initComposedTensorMetadata(composedMetadata, composedTensor);
|
||||
ComposedTensorMetadata<T> composedMetadata;
|
||||
initComposedTensorMetadata<T>(composedMetadata, composedTensor);
|
||||
|
||||
int dimBgNo = 0;
|
||||
int nElemets = elementsTensor.size();
|
||||
for (int i = 0; i < nElemets; i += BATCH_SIZE) {
|
||||
ElementTensorMetadata elemMetadata;
|
||||
ElementTensorMetadata<T> elemMetadata;
|
||||
int batchCounter = 0;
|
||||
initElementTensorMetadata(elemMetadata, elementsTensor, i, dim,
|
||||
dimBgNo, batchCounter);
|
||||
|
@ -74,23 +75,34 @@ class ConcatCuda : private CudaCompute, public CudaKernelWithoutConfig {
|
|||
}
|
||||
}
|
||||
}
|
||||
do_compute(_op->getOutput(), _op->getInputs(),
|
||||
as<ConcatObj>(_op)->getDim(), _op->getOutput()->getRank(),
|
||||
false);
|
||||
if (_op->getDType() == DataType::Float32) {
|
||||
do_compute<float>(_op->getOutput(), _op->getInputs(),
|
||||
as<ConcatObj>(_op)->getDim(),
|
||||
_op->getOutput()->getRank(), false);
|
||||
} else if (_op->getDType() == DataType::Float16) {
|
||||
do_compute<half>(_op->getOutput(), _op->getInputs(),
|
||||
as<ConcatObj>(_op)->getDim(),
|
||||
_op->getOutput()->getRank(), false);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SplitCuda : private CudaCompute, public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
do_compute(_op->getInputs(0), _op->getOutputs(),
|
||||
as<SplitObj>(_op)->getDim(), _op->getInputs(0)->getRank(),
|
||||
true);
|
||||
if (_op->getDType() == DataType::Float32) {
|
||||
do_compute<float>(_op->getInputs(0), _op->getOutputs(),
|
||||
as<SplitObj>(_op)->getDim(),
|
||||
_op->getInputs(0)->getRank(), true);
|
||||
} else if (_op->getDType() == DataType::Float16) {
|
||||
do_compute<half>(_op->getInputs(0), _op->getOutputs(),
|
||||
as<SplitObj>(_op)->getDim(),
|
||||
_op->getInputs(0)->getRank(), true);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Concat, DataType::Float32, ConcatCuda,
|
||||
"Concat_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Split, DataType::Float32, SplitCuda,
|
||||
"Split_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Concat, ConcatCuda, "Concat_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Split, SplitCuda, "Split_CUDA");
|
||||
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_split_concat.h"
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ int
|
||||
elementIdx2ComposedIdx(int elementIndex, int dimBgNo, int dimSize, int dim,
|
||||
int nDim, ComposedTensorMetadata wholeMeta) {
|
||||
int nDim, ComposedTensorMetadata<T> wholeMeta) {
|
||||
int offset = 0;
|
||||
|
||||
// COMP(x0,...,xk,...,xn-1) = ELMT[xk / d](x0,...,xk % d,...xn-1)
|
||||
|
@ -25,10 +25,10 @@ elementIdx2ComposedIdx(int elementIndex, int dimBgNo, int dimSize, int dim,
|
|||
int oP = (dim == 0) ? (elementIndex + dimBgNo) : elementIndex;
|
||||
return offset + oP * wholeMeta.stride[0];
|
||||
}
|
||||
|
||||
__global__ void _split_concat_kernel(ElementTensorMetadata elemMeta,
|
||||
ComposedTensorMetadata compMeta, int dim,
|
||||
int nDims, bool isSplit) {
|
||||
template <typename T>
|
||||
__global__ void _split_concat_kernel(ElementTensorMetadata<T> elemMeta,
|
||||
ComposedTensorMetadata<T> compMeta,
|
||||
int dim, int nDims, bool isSplit) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int nElements = elemMeta.nElements[blockIdx.y];
|
||||
if (tid >= nElements)
|
||||
|
@ -36,10 +36,10 @@ __global__ void _split_concat_kernel(ElementTensorMetadata elemMeta,
|
|||
|
||||
auto dimBgNo = elemMeta.dimBgNo[blockIdx.y];
|
||||
auto dimSize = elemMeta.dimSize[blockIdx.y];
|
||||
float *elemData = elemMeta.data[blockIdx.y];
|
||||
T *elemData = elemMeta.data[blockIdx.y];
|
||||
|
||||
int Offset =
|
||||
elementIdx2ComposedIdx(tid, dimBgNo, dimSize, dim, nDims, compMeta);
|
||||
elementIdx2ComposedIdx<T>(tid, dimBgNo, dimSize, dim, nDims, compMeta);
|
||||
// copy data from input to output
|
||||
// for split:input is composed tensor;for concat:input is element
|
||||
// tensors.
|
||||
|
@ -52,8 +52,22 @@ __global__ void _split_concat_kernel(ElementTensorMetadata elemMeta,
|
|||
namespace infini {
|
||||
|
||||
// TODO: when dim=0, the operation can be executed in-place
|
||||
void split_concat_kernel(const ElementTensorMetadata &eleMeta,
|
||||
const ComposedTensorMetadata &compMeta, int dim,
|
||||
void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
|
||||
const ComposedTensorMetadata<float> &compMeta, int dim,
|
||||
int batchSize, int nDims, bool isSplit) {
|
||||
dim3 blockSize = dim3(32 * 16);
|
||||
// gridsize = max_n_elements / blockSize
|
||||
int max_n_elements =
|
||||
*std::max_element(eleMeta.nElements, eleMeta.nElements + batchSize);
|
||||
int gridDimX = (max_n_elements - 1) / (32 * 16) + 1;
|
||||
// each y is a split among the batch
|
||||
dim3 gridSize(gridDimX, batchSize);
|
||||
|
||||
_split_concat_kernel<<<gridSize, blockSize>>>(eleMeta, compMeta, dim, nDims,
|
||||
isSplit);
|
||||
}
|
||||
void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
|
||||
const ComposedTensorMetadata<half> &compMeta, int dim,
|
||||
int batchSize, int nDims, bool isSplit) {
|
||||
dim3 blockSize = dim3(32 * 16);
|
||||
// gridsize = max_n_elements / blockSize
|
||||
|
|
|
@ -38,8 +38,9 @@ class TransposeCuda : public CudaKernelWithoutConfig {
|
|||
outputDims.data[i] = outputShape[i];
|
||||
}
|
||||
|
||||
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
|
||||
strides, outputDims);
|
||||
const int dType = op->getDType().getIndex();
|
||||
transpose_kernel(dType, inputData, outputData, nDims, size, strides,
|
||||
outputDims);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -82,15 +83,16 @@ class DepthToSpaceCuda : public CudaKernelWithoutConfig {
|
|||
for (int i = 0; i < nDims; ++i) {
|
||||
outputDims.data[i] = transpose[i];
|
||||
}
|
||||
|
||||
transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
|
||||
strides, outputDims);
|
||||
const int dType = op->getDType().getIndex();
|
||||
transpose_kernel(dType, inputData, outputData, nDims, size, strides,
|
||||
outputDims);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
|
||||
TransposeCuda, "Transpose_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Transpose, TransposeCuda,
|
||||
"Transpose_CUDA");
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::DepthToSpace, DepthToSpaceCuda,
|
||||
"DepthToSpace_CUDA");
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::DepthToSpace, DataType::Float32,
|
||||
DepthToSpaceCuda, "DepthToSpace_CUDA_Float32");
|
||||
} // namespace infini
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
#include "core/common.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
#include "utils/small_array.h"
|
||||
|
||||
constexpr unsigned int num_threads() { return 32 * 4; }
|
||||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _transpose_kernel(float *input, float *output, int nDims,
|
||||
template <class T>
|
||||
__global__ void _transpose_kernel(void *input, void *output, int nDims,
|
||||
int size, infini::SmallArray strides,
|
||||
infini::SmallArray outputShape) {
|
||||
int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
@ -17,21 +19,61 @@ __global__ void _transpose_kernel(float *input, float *output, int nDims,
|
|||
inputIdx += v % outputShape.data[i] * strides.data[i];
|
||||
v /= outputShape.data[i];
|
||||
}
|
||||
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
|
||||
output[outputIdx] = __ldg(input + inputIdx);
|
||||
#else
|
||||
output[outputIdx] = input[inputIdx];
|
||||
#endif
|
||||
((T *)output)[outputIdx] = ((T *)input)[inputIdx];
|
||||
}
|
||||
}
|
||||
#define CASE(T) \
|
||||
_transpose_kernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
|
||||
input, output, nDims, size, strides, outputShape);
|
||||
|
||||
#define SWITCH_DTYPE(DTYPE) \
|
||||
switch (DTYPE) { \
|
||||
case 1: \
|
||||
CASE(1) \
|
||||
break; \
|
||||
case 2: \
|
||||
CASE(2) \
|
||||
break; \
|
||||
case 3: \
|
||||
CASE(3) \
|
||||
break; \
|
||||
case 4: \
|
||||
CASE(4) \
|
||||
break; \
|
||||
case 5: \
|
||||
CASE(5) \
|
||||
break; \
|
||||
case 6: \
|
||||
CASE(6) \
|
||||
break; \
|
||||
case 7: \
|
||||
CASE(7) \
|
||||
break; \
|
||||
case 10: \
|
||||
CASE(10) \
|
||||
break; \
|
||||
case 11: \
|
||||
CASE(11) \
|
||||
break; \
|
||||
case 12: \
|
||||
CASE(12) \
|
||||
break; \
|
||||
case 13: \
|
||||
CASE(13) \
|
||||
break; \
|
||||
case 16: \
|
||||
CASE(16) \
|
||||
break; \
|
||||
default: \
|
||||
IT_TODO_HALT(); \
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void transpose_kernel(float *input, float *output, int nDims, int size,
|
||||
void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
|
||||
SmallArray strides, SmallArray outputShape) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (size + block_work_size() - 1) / block_work_size();
|
||||
_transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
|
||||
strides, outputShape);
|
||||
SWITCH_DTYPE(dType)
|
||||
}
|
||||
|
||||
} // namespace infini
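The kernel recovers each input element by peeling output coordinates off the flat output index and weighting them with the (permuted) input strides; the SWITCH_DTYPE table above only selects which element type that copy runs over. The same address computation on the CPU, kept as a readable reference (float shown; the loop direction is assumed, since the kernel's loop header sits outside this hunk):

#include <vector>

// CPU mirror of the per-element index math in _transpose_kernel: decompose the
// flat output index along outputShape and accumulate input offsets with the
// permuted strides.
void transpose_reference(const float *input, float *output, int nDims, int size,
                         const std::vector<int> &strides,
                         const std::vector<int> &outputShape) {
    for (int outputIdx = 0; outputIdx < size; ++outputIdx) {
        int inputIdx = 0;
        int v = outputIdx;
        for (int i = nDims - 1; i >= 0; --i) {
            inputIdx += v % outputShape[i] * strides[i];
            v /= outputShape[i];
        }
        output[outputIdx] = input[inputIdx];
    }
}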
@ -2,6 +2,7 @@
|
|||
#include "cuda/cuda_kernel_wihtout_config.h"
|
||||
#include "cuda/cuda_runtime.h"
|
||||
#include "cuda/cuda_unary.h"
|
||||
#include "cuda/cuda_utility.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
|
@ -12,6 +13,51 @@ class UnaryCuda : public CudaKernelWithoutConfig {
|
|||
}
|
||||
};
|
||||
|
||||
class CastCuda : public CudaKernelWithoutConfig {
|
||||
void compute(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
auto op = as<CastObj>(_op);
|
||||
|
||||
size_t num = op->getOutput()->size();
|
||||
void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
if (op->getType() == CastType::Float162Float) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Float16 &&
|
||||
op->getOutDType() == DataType::Float32);
|
||||
cast_kernel<half, float>((half *)inputData, (float *)outputData,
|
||||
num);
|
||||
} else if (op->getType() == CastType::Float2Float16) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Float32 &&
|
||||
op->getOutDType() == DataType::Float16);
|
||||
cast_kernel<float, half>((float *)inputData, (half *)outputData,
|
||||
num);
|
||||
} else if (op->getType() == CastType::Float2Int32) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Float32 &&
|
||||
op->getOutDType() == DataType::Int32);
|
||||
cast_kernel<float, int32_t>((float *)inputData,
|
||||
(int32_t *)outputData, num);
|
||||
} else if (op->getType() == CastType::Float2Int8) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Float32 &&
|
||||
op->getOutDType() == DataType::Int8);
|
||||
cast_kernel<float, int8_t>((float *)inputData, (int8_t *)outputData,
|
||||
num);
|
||||
} else if (op->getType() == CastType::Int82Float) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Int8 &&
|
||||
op->getOutDType() == DataType::Float32);
|
||||
cast_kernel<int8_t, float>((int8_t *)inputData, (float *)outputData,
|
||||
num);
|
||||
} else if (op->getType() == CastType::Int322Float) {
|
||||
IT_ASSERT(op->getInDType() == DataType::Int32 &&
|
||||
op->getOutDType() == DataType::Float32);
|
||||
cast_kernel<int32_t, float>((int32_t *)inputData,
|
||||
(float *)outputData, num);
|
||||
} else {
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class ActivationCudnn : public CudaKernelWithoutConfig {
|
||||
virtual cudnnActivationMode_t getOpType() const = 0;
|
||||
virtual tuple<float, float> getAlphBeta() const { return {1.f, 0.f}; }
|
||||
|
@ -33,17 +79,17 @@ class ActivationCudnn : public CudaKernelWithoutConfig {
|
|||
while (stride.size() < 4)
|
||||
stride.push_back(1);
|
||||
|
||||
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
|
||||
|
||||
// get inputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(inputDesc, CUDNN_DATA_FLOAT,
|
||||
dim.size(), dim.data(),
|
||||
stride.data()));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
inputDesc, cudnnDataType, dim.size(), dim.data(), stride.data()));
|
||||
|
||||
// get outputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(outputDesc, CUDNN_DATA_FLOAT,
|
||||
dim.size(), dim.data(),
|
||||
stride.data()));
|
||||
checkCudnnError(cudnnSetTensorNdDescriptor(
|
||||
outputDesc, cudnnDataType, dim.size(), dim.data(), stride.data()));
|
||||
|
||||
// get op descriptor
|
||||
cudnnActivationDescriptor_t activationDesc;
|
||||
|
@ -86,16 +132,18 @@ class SoftmaxCudnn : public CudaKernelWithoutConfig {
|
|||
memcpy(dim_array + (4 - dim.size()), dim.data(),
|
||||
dim.size() * sizeof(int));
|
||||
|
||||
auto cudnnDataType = cudnnDataTypeConvert(op->getDType());
|
||||
|
||||
// get inputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&inputDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
inputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
|
||||
inputDesc, CUDNN_TENSOR_NCHW, cudnnDataType, dim_array[0],
|
||||
dim_array[1], dim_array[2], dim_array[3]));
|
||||
|
||||
// get outputs
|
||||
checkCudnnError(cudnnCreateTensorDescriptor(&outputDesc));
|
||||
checkCudnnError(cudnnSetTensor4dDescriptor(
|
||||
outputDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dim_array[0],
|
||||
outputDesc, CUDNN_TENSOR_NCHW, cudnnDataType, dim_array[0],
|
||||
dim_array[1], dim_array[2], dim_array[3]));
|
||||
|
||||
auto [alpha, beta] = getAlphBeta();
|
||||
|
@ -130,35 +178,27 @@ class TanhCudnn : public ActivationCudnn {
|
|||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Relu, DataType::Float32, ReluCudnn,
|
||||
"Relu_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, DataType::Float32, SigmoidCudnn,
|
||||
"Sigmoid_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::HardSigmoid, DataType::Float32, UnaryCuda,
|
||||
"Hard_Sigmoid_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::HardSwish, DataType::Float32, UnaryCuda,
|
||||
"Hard_Swish_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Tanh, DataType::Float32, TanhCudnn,
|
||||
"Tanh_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda,
|
||||
"Abs_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, DataType::Float32, UnaryCuda,
|
||||
"Sqrt_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Gelu, DataType::Float32, UnaryCuda,
|
||||
"Gelu_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Neg, DataType::Float32, UnaryCuda,
|
||||
"Neg_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Erf, DataType::Float32, UnaryCuda,
|
||||
"Erf_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Relu, ReluCudnn, "Relu_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, SigmoidCudnn, "Sigmoid_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::HardSigmoid, UnaryCuda,
|
||||
"Hard_Sigmoid_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::HardSwish, UnaryCuda, "Hard_Swish_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Tanh, TanhCudnn, "Tanh_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Abs, UnaryCuda, "Abs_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Sqrt, UnaryCuda, "Sqrt_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Gelu, UnaryCuda, "Gelu_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Neg, UnaryCuda, "Neg_CUDA");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Erf, UnaryCuda, "Erf_CUDA");
|
||||
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Softmax, DataType::Float32, UnaryCuda,
|
||||
// "Softmax_CUDA_Float32");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Relu, DataType::Float32, UnaryCuda,
|
||||
// "Relu_CUDA_Float32");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, DataType::Float32, UnaryCuda,
|
||||
// "Sigmoid_CUDA_Float32");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Tanh, DataType::Float32, UnaryCuda,
|
||||
// "Tanh_CUDA_Float32");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Abs, DataType::Float32, UnaryCuda,
|
||||
// "Abs_CUDA_Float32");
|
||||
REGISTER_KERNEL(Device::CUDA, OpType::Cast, CastCuda, "Cast_CUDA");
|
||||
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Softmax, UnaryCuda, "Softmax_CUDA");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Relu, UnaryCuda,
|
||||
// "Relu_CUDA");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Sigmoid, UnaryCuda,
|
||||
// "Sigmoid_CUDA");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Tanh, UnaryCuda,
|
||||
// "Tanh_CUDA");
|
||||
// REGISTER_KERNEL(Device::CUDA, OpType::Abs, UnaryCuda,
|
||||
// "Abs_CUDA");
|
||||
}; // namespace infini
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
#include "core/common.h"
|
||||
#include "core/constants.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include "cuda/cuda_unary.h"
|
||||
#include <cub/cub.cuh>
|
||||
#include <math.h>
|
||||
|
||||
using infini::E_CONSTANT;
|
||||
|
@ -8,15 +10,16 @@ constexpr unsigned int num_threads() { return 32 * 4; }
|
|||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _softmax_kernel1(float *input, float *output, size_t n) {
|
||||
template <typename T>
|
||||
__global__ void _softmax_kernel1(T *input, T *output, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        sum += pow(E_CONSTANT, input[i]);
    }
    *output = sum;
}

__global__ void _softmax_kernel2(float *input, float *output, size_t n) {
template <typename T>
__global__ void _softmax_kernel2(T *input, T *output, size_t n) {
    float sum = *output;
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
@@ -24,32 +27,32 @@ __global__ void _softmax_kernel2(float *input, float *output, size_t n) {
        output[i] = pow(E_CONSTANT, input[i]) / sum;
    }
}

__global__ void _relu_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _relu_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        output[i] = max(input[i], float(0));
    }
}

__global__ void _sigmoid_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _sigmoid_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        output[i] = 1 / (1 + pow(E_CONSTANT, -input[i]));
    }
}

__global__ void _hard_sigmoid_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _hard_sigmoid_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        output[i] = max(0.0f, min(1.0f, 0.2f * input[i] + 0.5f));
    }
}

__global__ void _hard_swish_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _hard_swish_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
@@ -57,8 +60,8 @@ __global__ void _hard_swish_kernel(float *input, float *output, size_t n) {
            input[i] * max(0.f, min(1.f, (1.f / 6.f) * input[i] + 0.5f));
    }
}

__global__ void _tanh_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _tanh_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
@@ -66,8 +69,8 @@ __global__ void _tanh_kernel(float *input, float *output, size_t n) {
                    (pow(E_CONSTANT, input[i]) + pow(E_CONSTANT, -input[i]));
    }
}

__global__ void _abs_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _abs_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
@@ -83,7 +86,24 @@ __global__ void _sqrt_kernel(float *input, float *output, size_t n) {
    }
}

__global__ void _gelu_kernel(float *input, float *output, size_t n) {
__global__ void _sqrt_kernel(half *input, half *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        output[i] = hsqrt(input[i]);
    }
}

__global__ void _sqrt_kernel(int8_t *input, int8_t *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i += stride) {
        output[i] = __fsqrt_rn(static_cast<float>(input[i]));
    }
}

template <typename T>
__global__ void _gelu_kernel(T *input, T *output, size_t n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
@@ -91,8 +111,8 @@ __global__ void _gelu_kernel(float *input, float *output, size_t n) {
        output[i] = 0.5 * x * (1 + erf(x / sqrt(2.0f)));
    }
}

__global__ void _erf_kernel(float *input, float *output, size_t n) {
template <typename T>
__global__ void _erf_kernel(T *input, T *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
@@ -109,72 +129,191 @@ __global__ void _neg_kernel(T *input, T *output, size_t n) {
    }
}

template <typename INPUT, typename OUTPUT>
__global__ void _cast_kernel(INPUT *input, OUTPUT *output, size_t n) {
    size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n) {
        cub::CastOp<OUTPUT> _CastOp;
        output[index] = _CastOp(input[index]);
    }
}

namespace infini {
void softmax_kernel(float *input, float *output, size_t num) {
template <typename T> void softmax_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _softmax_kernel1<<<1, 1>>>(input, output, num);
    _softmax_kernel2<<<gridsize, blocksize>>>(input, output, num);
    _softmax_kernel1<T><<<1, 1>>>(input, output, num);
    _softmax_kernel2<T><<<gridsize, blocksize>>>(input, output, num);
}
void relu_kernel(float *input, float *output, size_t num) {
template <typename T> void relu_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _relu_kernel<<<gridsize, blocksize>>>(input, output, num);
    _relu_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void sigmoid_kernel(float *input, float *output, size_t num) {
template <typename T> void sigmoid_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
    _sigmoid_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void hard_sigmoid_kernel(float *input, float *output, size_t num) {
template <typename T>
void hard_sigmoid_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _hard_sigmoid_kernel<<<gridsize, blocksize>>>(input, output, num);
    _hard_sigmoid_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void hard_swish_kernel(float *input, float *output, size_t num) {
template <typename T> void hard_swish_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _hard_swish_kernel<<<gridsize, blocksize>>>(input, output, num);
    _hard_swish_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void tanh_kernel(float *input, float *output, size_t num) {
template <typename T> void tanh_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _tanh_kernel<<<gridsize, blocksize>>>(input, output, num);
    _tanh_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void abs_kernel(float *input, float *output, size_t num) {
template <typename T> void abs_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _abs_kernel<<<gridsize, blocksize>>>(input, output, num);
    _abs_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void sqrt_kernel(float *input, float *output, size_t num) {
template <typename T> void sqrt_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _sqrt_kernel<<<gridsize, blocksize>>>(input, output, num);
    _sqrt_kernel<<<gridsize, blocksize>>>((T *)input, (T *)output, num);
}
void gelu_kernel(float *input, float *output, size_t num) {

template <typename T> void gelu_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _gelu_kernel<<<gridsize, blocksize>>>(input, output, num);
    _gelu_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void erf_kernel(float *input, float *output, size_t num) {
template <typename T> void erf_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _erf_kernel<<<gridsize, blocksize>>>(input, output, num);
    _erf_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}
void neg_kernel(float *input, float *output, size_t num) {
template <typename T> void neg_kernel(T *input, T *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _neg_kernel<<<gridsize, blocksize>>>(input, output, num);
    _neg_kernel<T><<<gridsize, blocksize>>>(input, output, num);
}

void unary_kernel(const Operator &_op) {
    auto op = as<UnaryObj>(_op);
    void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
    void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

    size_t num = op->getOutput()->size();
    if (op->getOpType() == OpType::Softmax) {
        if (_op->getDType() == DataType::Float32) {
            softmax_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Relu) {
        if (_op->getDType() == DataType::Float32) {
            relu_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Sigmoid) {
        if (_op->getDType() == DataType::Float32) {
            sigmoid_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::HardSigmoid) {
        if (_op->getDType() == DataType::Float32) {
            hard_sigmoid_kernel<float>((float *)inputData, (float *)outputData,
                                       num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::HardSwish) {
        if (_op->getDType() == DataType::Float32) {
            hard_swish_kernel<float>((float *)inputData, (float *)outputData,
                                     num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Tanh) {
        if (_op->getDType() == DataType::Float32) {
            tanh_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Abs) {
        if (_op->getDType() == DataType::Float32) {
            abs_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Sqrt) {
        if (_op->getDType() == DataType::Float32) {
            sqrt_kernel<float>((float *)inputData, (float *)outputData, num);
        } else if (_op->getDType() == DataType::Float16) {
            sqrt_kernel<half>((half *)inputData, (half *)outputData, num);
        } else if (_op->getDType() == DataType::Int8) {
            sqrt_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Gelu) {
        if (_op->getDType() == DataType::Float32) {
            gelu_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Neg) {
        if (_op->getDType() == DataType::Float32) {
            neg_kernel<float>((float *)inputData, (float *)outputData, num);
        } else if (_op->getDType() == DataType::Float16) {
            neg_kernel<half>((half *)inputData, (half *)outputData, num);
        } else if (_op->getDType() == DataType::Int8) {
            neg_kernel<int8_t>((int8_t *)inputData, (int8_t *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else if (op->getOpType() == OpType::Erf) {
        if (_op->getDType() == DataType::Float32) {
            erf_kernel<float>((float *)inputData, (float *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
    } else
        IT_TODO_HALT();
}

template <typename INPUT, typename OUTPUT>
void cast_kernel(INPUT *input, OUTPUT *output, size_t num) {
    int blocksize = block_work_size();
    int gridsize = (num + block_work_size() - 1) / block_work_size();
    _cast_kernel<INPUT, OUTPUT><<<gridsize, blocksize>>>(input, output, num);
}

template void cast_kernel<float, half>(float *input, half *output, size_t num);
template void cast_kernel<half, float>(half *input, float *output, size_t num);
template void cast_kernel<float, int32_t>(float *input, int32_t *output,
                                          size_t num);
template void cast_kernel<float, int8_t>(float *input, int8_t *output,
                                         size_t num);
template void cast_kernel<int8_t, float>(int8_t *input, float *output,
                                         size_t num);
template void cast_kernel<int32_t, float>(int32_t *input, float *output,
                                          size_t num);

}; // namespace infini
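Editor's note: every kernel in this file follows the same grid-stride convention, so one standalone illustration is enough to read the rest. The snippet below is a sketch, not part of the patch; the kernel name is hypothetical and "relu" is used only because it is the simplest body.

// Standalone sketch of the grid-stride pattern used throughout this file.
#include <cstddef>

template <typename T>
__global__ void grid_stride_relu(const T *in, T *out, size_t n) {
    size_t index = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    // Each thread handles elements index, index + stride, index + 2*stride, ...
    for (size_t i = index; i < n; i += stride)
        out[i] = in[i] > T(0) ? in[i] : T(0);
}

// Launch geometry mirrors the wrappers above: ceil-divide the element count
// by the block size, e.g.
//     grid_stride_relu<float><<<(n + 255) / 256, 256>>>(d_in, d_out, n);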
@@ -1,6 +1,7 @@
#include "operators/where.h"

#include "cuda/cuda_kernel_wihtout_config.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "cuda/cuda_where.h"
#include "utils/broadcast_shape.h"

@@ -35,15 +36,14 @@ class WhereCuda : public CudaKernelWithoutConfig {
        broadcastShape(opInputXShape, inputXShape, nDims, xSize);
        broadcastShape(opInputYShape, inputYShape, nDims, ySize);
        broadcastShape(opConditionShape, conditionShape, nDims, cSize);

        whereKernel((float *)inputXData, (float *)inputYData,
                    (uint8_t *)conditionData, (float *)outputData, nDims,
                    outputsize, inputXShape, inputYShape, conditionShape,
                    outputShape, xSize, ySize, cSize);
        const int dTypeIndex = op->getDType().getIndex();
        whereKernel(dTypeIndex, inputXData, inputYData,
                    (uint8_t *)conditionData, outputData, nDims, outputsize,
                    inputXShape, inputYShape, conditionShape, outputShape,
                    xSize, ySize, cSize);
    }
};

REGISTER_KERNEL(Device::CUDA, OpType::Where, DataType::Float32, WhereCuda,
                "Where_CUDA_Float32");
REGISTER_KERNEL(Device::CUDA, OpType::Where, WhereCuda, "Where_CUDA");

}; // namespace infini
@@ -1,4 +1,5 @@
#include "cuda/cuda_common.h"
#include "cuda/cuda_utility.h"
#include "utils/small_array.h"

__device__ int inferIndex(infini::SmallArray inputShape,
@@ -17,13 +18,12 @@ __device__ int inferIndex(infini::SmallArray inputShape,
    }
    return inputIdx;
}
__global__ void _whereKernel(const float *inputX, const float *inputY,
                             const uint8_t *condition, float *output, int nDims,
                             int outputsize, infini::SmallArray inputXShape,
                             infini::SmallArray inputYShape,
                             infini::SmallArray conditionShape,
                             infini::SmallArray outputShape, int xSize,
                             int ySize, int cSize) {
template <typename T>
__global__ void
_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
             int nDims, int outputsize, infini::SmallArray inputXShape,
             infini::SmallArray inputYShape, infini::SmallArray conditionShape,
             infini::SmallArray outputShape, int xSize, int ySize, int cSize) {

    int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputIdx < outputsize) {
@@ -35,14 +35,61 @@ __global__ void _whereKernel(const float *inputX, const float *inputY,
        int inputYIdx =
            inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);

        output[outputIdx] =
            condition[conditionIdx] ? inputX[inputXIdx] : inputY[inputYIdx];
        ((T *)output)[outputIdx] = condition[conditionIdx]
                                       ? ((T *)inputX)[inputXIdx]
                                       : ((T *)inputY)[inputYIdx];
    }
}
#define CASE(T) \
    _whereKernel<DT_CUDA<T>::t><<<gridsize, blocksize>>>( \
        inputX, inputY, condition, output, nDims, outputsize, inputXShape, \
        inputYShape, conditionShape, outputShape, xSize, ySize, cSize);

#define SWITCH_DTYPE(DTYPE) \
    switch (DTYPE) { \
    case 1: \
        CASE(1) \
        break; \
    case 2: \
        CASE(2) \
        break; \
    case 3: \
        CASE(3) \
        break; \
    case 4: \
        CASE(4) \
        break; \
    case 5: \
        CASE(5) \
        break; \
    case 6: \
        CASE(6) \
        break; \
    case 7: \
        CASE(7) \
        break; \
    case 10: \
        CASE(10) \
        break; \
    case 11: \
        CASE(11) \
        break; \
    case 12: \
        CASE(12) \
        break; \
    case 13: \
        CASE(13) \
        break; \
    case 16: \
        CASE(16) \
        break; \
    default: \
        IT_TODO_HALT(); \
    }
namespace infini {
void whereKernel(const float *inputX, const float *inputY,
                 const uint8_t *condition, float *output, int nDims,

void whereKernel(int dTypeIndex, void *inputX, void *inputY,
                 const uint8_t *condition, void *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize) {
@@ -61,8 +108,8 @@ void whereKernel(const float *inputX, const float *inputY,
        blocksize = 32;
    }
    int gridsize = (outputsize + blocksize - 1) / blocksize;
    _whereKernel<<<gridsize, blocksize>>>(
        inputX, inputY, condition, output, nDims, outputsize, inputXShape,
        inputYShape, conditionShape, outputShape, xSize, ySize, cSize);

    SWITCH_DTYPE(dTypeIndex)
}

} // namespace infini
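Editor's note: the `SWITCH_DTYPE`/`CASE` pair above turns the runtime dtype index carried by `whereKernel(dTypeIndex, ...)` into a compile-time template argument through `DT_CUDA<i>::t`. The standalone analogue below sketches that pattern only; the type table and function names are illustrative, not the project's.

#include <cstdint>
#include <cstdio>

// Stand-in for DT_CUDA<i>::t: map a dtype index to a C++ type.
template <int I> struct IndexToType;   // primary template left undefined
template <> struct IndexToType<1>  { using t = float; };
template <> struct IndexToType<10> { using t = uint8_t; };

template <typename T> void runTyped(const void *x) {
    // Reinterpret the erased pointer with the recovered element type.
    std::printf("first element: %f\n", double(((const T *)x)[0]));
}

// Runtime index -> compile-time type, the same shape as SWITCH_DTYPE/CASE.
void dispatch(int dtypeIndex, const void *x) {
    switch (dtypeIndex) {
    case 1:  runTyped<IndexToType<1>::t>(x);  break;
    case 10: runTyped<IndexToType<10>::t>(x); break;
    default: /* unsupported dtype */          break;
    }
}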
@@ -0,0 +1,53 @@
#include "operators/dequantize_linear.h"
#include "utils/operator_utils.h"

namespace infini {

DequantizeLinearObj::DequantizeLinearObj(GraphObj *graph, Tensor inputX,
                                         Tensor inputScale, Tensor output,
                                         [[maybe_unused]] Tensor inputZeroPoint,
                                         int axis)
    : OperatorObj(OpType::DequantizeLinear,
                  inputZeroPoint ? TensorVec{inputX, inputScale, inputZeroPoint}
                                 : TensorVec{inputX, inputScale},
                  {output}),
      axis(axis) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
DequantizeLinearObj::inferShape(const TensorVec &inputs) {
    return {{inputs[0]->getDims()}}; // x.shape = output.shape = inputs[0].shape
}
vector<DataType>
DequantizeLinearObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 2 || inputs.size() == 3);

    return {
        inputs[1]->getDType()}; // scale.dtype = output.dtype = inputs[1].dtype
}

std::string DequantizeLinearObj::toString() const {
    std::ostringstream os;
    os << "DequantizeLinear[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "inputX=" << inputs[0]->getGuid() << ",";
    os << "inputScale=" << inputs[1]->getGuid() << ",";
    // os << "inputZeroPoint=" << inputs[2]->getGuid() << ",";
    os << "axis=" << axis << ",";
    os << "output=" << outputs[0]->getGuid() << ")";
    return os.str();
}

vector<int> DequantizeLinearObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
    ret.emplace(ret.begin(), type.underlying());
    return ret;
}

vector<int> DequantizeLinearObj::getOpAttrVector() const {
    return {type.underlying(), axis};
}

} // namespace infini
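Editor's note: the expected outputs in the DequantizeLinear unit test further down follow the usual definition y = (x - zero_point) * scale, with scale and zero_point broadcast along `axis`. The CPU sketch below is a hypothetical reference helper (not part of the operator class), written only to make that formula concrete; e.g. for the test's first element, (0 - 1) * 0.3 = -0.3.

#include <cstdint>
#include <vector>

// Reference: element i in channel c along the quantization axis maps to
//   y[i] = (int(x[i]) - zeroPoint[c]) * scale[c]
std::vector<float> dequantizeRef(const std::vector<uint8_t> &x,
                                 const std::vector<float> &scale,
                                 const std::vector<uint8_t> &zeroPoint,
                                 size_t channels, size_t innerSize) {
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        size_t c = (i / innerSize) % channels; // channel index along `axis`
        int zp = zeroPoint.empty() ? 0 : int(zeroPoint[c]);
        y[i] = (int(x[i]) - zp) * scale[c];
    }
    return y;
}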
@@ -0,0 +1,46 @@
#include "operators/dynamic_quantize_linear.h"
#include "utils/operator_utils.h"

namespace infini {
DynamicQuantizeLinearObj::DynamicQuantizeLinearObj(
    GraphObj *graph, Tensor input, std::optional<TensorVec> outputs)
    : OperatorObj(OpType::DynamicQuantizeLinear, TensorVec{input},
                  ((!outputs) ? TensorVec(3, nullptr) : std::move(*outputs))) {
    IT_ASSERT(checkValid(graph));
}

optional<vector<Shape>>
DynamicQuantizeLinearObj::inferShape(const TensorVec &inputs) {
    return {{inputs[0]->getDims(), {}, {}}};
}

vector<DataType>
DynamicQuantizeLinearObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 1);
    return {DataType(2), DataType(1), DataType(2)};
}

std::string DynamicQuantizeLinearObj::toString() const {
    std::ostringstream os;
    os << "DynamicQuantizeLinear[" << getGuid() << "]";
    os << "(";
    os << vecToString(inputs[0]->getDims()) << ",";
    os << "input=" << inputs[0]->getGuid() << ",";
    os << "output=";
    for (auto output : outputs)
        os << output->getGuid() << ",";
    os << ")";
    return os.str();
}

vector<int> DynamicQuantizeLinearObj::getWorkloadVector() const {
    vector<int> ret = inputs[0]->getDims();
    ret.emplace(ret.begin(), type.underlying());
    return ret;
}

vector<int> DynamicQuantizeLinearObj::getOpAttrVector() const {
    return {type.underlying()};
}

} // namespace infini
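Editor's note: the three outputs declared by `inferDataType` (`DataType(2)`, `DataType(1)`, `DataType(2)`) read as a UInt8 quantized tensor, a Float32 scale and a UInt8 zero point, which is the ONNX DynamicQuantizeLinear contract. The sketch below is a hypothetical scalar reference of that standard formula, included only to explain where the expected values in the later test come from (e.g. scale = 35/255 for inputs 0..35).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference DynamicQuantizeLinear over a whole tensor (uint8 range 0..255).
void dynamicQuantizeRef(const std::vector<float> &x, std::vector<uint8_t> &y,
                        float &scale, uint8_t &zeroPoint) {
    float lo = std::min(0.0f, *std::min_element(x.begin(), x.end()));
    float hi = std::max(0.0f, *std::max_element(x.begin(), x.end()));
    scale = (hi - lo) / 255.0f; // quantized range spans 255 steps
    float zp = std::round(std::clamp(-lo / scale, 0.0f, 255.0f));
    zeroPoint = (uint8_t)zp;
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = (uint8_t)std::clamp(std::round(x[i] / scale + zp), 0.0f, 255.0f);
}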
@@ -27,10 +27,7 @@ optional<vector<Shape>> LayerNormObj::inferShape(const TensorVec &inputs) {

vector<DataType> LayerNormObj::inferDataType(const TensorVec &inputs) const {
    IT_ASSERT(inputs.size() == 2 || inputs.size() == 3);
    IT_ASSERT(inputs[1]->getDType() == DataType::Float32);
    if (inputs.size() == 3) {
        IT_ASSERT(inputs[2]->getDType() == DataType::Float32);
    }

    return {inputs[0]->getDType()};
}

@@ -0,0 +1,75 @@
#include "operators/matmul_integer.h"
#include "utils/operator_utils.h"
#include <numeric>

namespace infini {

MatmulIntegerObj::MatmulIntegerObj(GraphObj *graph, Tensor A, Tensor B,
                                   Tensor C,
                                   [[maybe_unused]] Tensor a_zero_point,
                                   [[maybe_unused]] Tensor b_zero_point)
    : OperatorObj(OpType::MatMulInteger,
                  a_zero_point ? (b_zero_point ? TensorVec{A, B, a_zero_point,
                                                           b_zero_point}
                                               : TensorVec{A, B, a_zero_point})
                               : TensorVec{A, B},
                  {C}),
      b(1) {
    IT_ASSERT(checkValid(graph));
}

string MatmulIntegerObj::toString() const {
    std::ostringstream os;
    os << "MatmulInteger(A=" << inputs[0]->getGuid()
       << ",B=" << inputs[1]->getGuid() << ",C=" << outputs[0]->getGuid()
       << ",bmnk=[" << b << "," << m << "," << n << "," << k << "])";
    return os.str();
}

optional<vector<Shape>> MatmulIntegerObj::inferShape(const TensorVec &inputs) {
    auto A = inputs[0], B = inputs[1];
    auto shapeA = A->getDims();
    auto shapeB = B->getDims();
    int rankA = A->getRank();
    int rankB = B->getRank();
    Shape shapeA1(shapeA.begin(), shapeA.begin() + (rankA - 2));
    Shape shapeB1(shapeB.begin(), shapeB.begin() + (rankB - 2));
    Shape ret = infer_broadcast(shapeA1, shapeB1);
    if (ret.empty()) {
        b = 1;
    } else {
        b = std::accumulate(ret.begin(), ret.end(), 1, std::multiplies<int>());
    }
    IT_ASSERT(*(shapeA.rbegin()) == *(shapeB.rbegin() + 1));
    m = *(shapeA.rbegin() + 1);
    n = *(shapeB.rbegin());
    k = *(shapeA.rbegin());
    ret.emplace_back(m);
    ret.emplace_back(n);
    return {{ret}};
}

vector<DataType>
MatmulIntegerObj::inferDataType(const TensorVec &inputs) const {
    for (auto &input : inputs) {
        IT_ASSERT(input->getDType() == DataType::Int8 ||
                  input->getDType() == DataType::UInt8);
    }
    if (inputs.size() >= 3) {
        IT_ASSERT(inputs[0]->getDType() == inputs[2]->getDType());
    }
    if (inputs.size() == 4) {
        IT_ASSERT(inputs[1]->getDType() == inputs[3]->getDType());
    }
    return vector(numOutputs(), DataType::Int32);
}

vector<int> MatmulIntegerObj::getWorkloadVector() const {
    return {type.underlying(), b, m, n, k};
}

vector<int> MatmulIntegerObj::getOpAttrVector() const {
    return {type.underlying()};
}

} // namespace infini
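Editor's note: MatMulInteger multiplies the zero-point-shifted operands with int32 accumulation, Y[i][j] = sum_k (A[i][k] - a_zp) * (B[k][j] - b_zp), which is consistent with `inferDataType` returning Int32 above and with the cuBLAS test later in this diff. The naive single-slice reference below is a hypothetical helper (scalar zero points only), given purely as a sketch of that formula.

#include <cstdint>
#include <vector>

// Y (m x n, int32) = (A - aZp) * (B - bZp), with A of shape m x k and
// B of shape k x n, both uint8, row-major.
std::vector<int32_t> matmulIntegerRef(const std::vector<uint8_t> &A,
                                      const std::vector<uint8_t> &B,
                                      int m, int k, int n, int aZp, int bZp) {
    std::vector<int32_t> Y(size_t(m) * n, 0);
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            for (int kk = 0; kk < k; ++kk)
                Y[i * n + j] += (int32_t(A[i * k + kk]) - aZp) *
                                (int32_t(B[kk * n + j]) - bZp);
    return Y;
}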
@@ -187,4 +187,42 @@ TEST(ConcatToIdentity, Cuda) {
    EXPECT_TRUE(
        oCpu->equalData(vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}));
}
//----------
TEST(ConcatFp16, CudaHigh) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto t1 = gCpu->addTensor({2, 2, 3, 1, 2}, DataType::Float16);
    auto t2 = gCpu->addTensor({2, 2, 1, 1, 2}, DataType::Float16);
    auto t3 = gCpu->addTensor({2, 2, 2, 1, 2}, DataType::Float16);
    gCpu->dataMalloc();
    t1->setData(ValGenerator<2>());
    t2->setData(ValGenerator<1>());
    t3->setData(ValGenerator<4>());

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto t1Gpu = gCuda->cloneTensor(t1);
    auto t2Gpu = gCuda->cloneTensor(t2);
    auto t3Gpu = gCuda->cloneTensor(t3);

    auto op =
        gCuda->addOp<ConcatObj>(TensorVec{t1Gpu, t2Gpu, t3Gpu}, nullptr, 2);
    gCuda->dataMalloc();
    t1Gpu->setData(ValGenerator<2>());
    t2Gpu->setData(ValGenerator<1>());
    t3Gpu->setData(ValGenerator<4>());

    cudaRuntime->run(gCuda);

    // cudaPrintTensor(op->getOutput());
    // copy output from CUDA to CPU
    auto oCpu = gCpu->cloneTensor(op->getOutput());
    EXPECT_TRUE(oCpu->equalData(vector<float>{
        2., 2., 2., 2., 2., 2., 1., 1., 4., 4., 4., 4., 2., 2., 2., 2.,
        2., 2., 1., 1., 4., 4., 4., 4., 2., 2., 2., 2., 2., 2., 1., 1.,
        4., 4., 4., 4., 2., 2., 2., 2., 2., 2., 1., 1., 4., 4., 4., 4.}));
}

} // namespace infini
@@ -160,8 +160,8 @@ TEST(cuDNN_ConvTransposed, tune) {
    bool tune = true;
    cuda->run(gCuda, tune);
    // check record
    auto kernelAttrs = KernelAttrs{Device::CUDA, conv->getOpType().underlying(),
                                   DataType::Float32};
    auto kernelAttrs =
        KernelAttrs{Device::CUDA, conv->getOpType().underlying()};
    auto perfKey = PerfEngine::Key{kernelAttrs, conv->getOpPerfKey()};
    std::optional<PerfRecord> perfData =
        PerfEngine::getInstance().getPerfData(perfKey);
@@ -0,0 +1,219 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dequantize_linear.h"

#include "test.h"

namespace infini {

void test_dequantizeLinearFp32(
    const Shape &inputXShape, const vector<uint8_t> &inputXData,
    const Shape &inputScaleShape, const vector<float> &inputScaleData, int axis,
    const vector<float> &ExpectData,
    const std::optional<Shape> &zeroPointShape = std::nullopt,
    const std::optional<std::vector<uint8_t>> &inputZeroPointData =
        std::nullopt) {

    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
        Shape inputZeroPointShape = *zeroPointShape;

        auto inputZeroPoint =
            gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
        gCpu->dataMalloc();
        inputZeroPoint->copyin(*inputZeroPointData); //
        inputX->copyin(inputXData);
        inputScale->copyin(inputScaleData); //

        // inputX->printData();
        // inputZeroPoint->printData();
        // inputScale->printData();

        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
        auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
        auto inputXGpu = gCuda->cloneTensor(inputX);
        auto inputScaleGpu = gCuda->cloneTensor(inputScale);

        auto op = gCuda->addOp<DequantizeLinearObj>(
            inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
            axis); // DequantizeLinearObj
        gCuda->dataMalloc();
        inputZeroPointGpu->copyin(*inputZeroPointData);
        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
        inputXGpu->copyin(inputXData);
        inputScaleGpu->copyin(inputScaleData);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    } else {

        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float32);
        gCpu->dataMalloc();

        inputX->copyin(inputXData);
        inputScale->copyin(inputScaleData); //
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);

        auto inputXGpu = gCuda->cloneTensor(inputX);
        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
        auto op = gCuda->addOp<DequantizeLinearObj>(
            inputXGpu, inputScaleGpu, nullptr, nullptr,
            axis); // DequantizeLinearObj
        gCuda->dataMalloc();

        inputXGpu->copyin(inputXData);
        inputScaleGpu->copyin(inputScaleData);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    }
}
void test_dequantizeLinearFp16(
    const Shape &inputXShape, const vector<uint8_t> &inputXData,
    const Shape &inputScaleShape,
    const std::function<void(void *, size_t, DataType)> &generator, int axis,
    const vector<float> &ExpectData,
    const std::optional<Shape> &zeroPointShape = std::nullopt,
    const std::optional<std::vector<uint8_t>> &inputZeroPointData =
        std::nullopt) {

    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    if (zeroPointShape.has_value() && inputZeroPointData.has_value()) {
        Shape inputZeroPointShape = *zeroPointShape;

        auto inputZeroPoint =
            gCpu->addTensor(inputZeroPointShape, DataType::UInt8);
        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
        gCpu->dataMalloc();
        inputZeroPoint->copyin(*inputZeroPointData); //
        // inputZeroPoint->printData();
        inputX->copyin(inputXData);
        inputScale->setData(generator);
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
        auto inputZeroPointGpu = gCuda->cloneTensor(inputZeroPoint);
        auto inputXGpu = gCuda->cloneTensor(inputX);
        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
        auto op = gCuda->addOp<DequantizeLinearObj>(
            inputXGpu, inputScaleGpu, nullptr, inputZeroPointGpu,
            axis); // DequantizeLinearObj
        gCuda->dataMalloc();
        inputZeroPointGpu->copyin(*inputZeroPointData);
        // gCpu->cloneTensor(inputZeroPointGpu)->printData();
        inputXGpu->copyin(inputXData);
        inputScaleGpu->setData(generator);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    } else {

        auto inputX = gCpu->addTensor(inputXShape, DataType::UInt8);
        auto inputScale = gCpu->addTensor(inputScaleShape, DataType::Float16);
        gCpu->dataMalloc();

        inputX->copyin(inputXData);
        inputScale->setData(generator);
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);

        auto inputXGpu = gCuda->cloneTensor(inputX);
        auto inputScaleGpu = gCuda->cloneTensor(inputScale);
        auto op = gCuda->addOp<DequantizeLinearObj>(
            inputXGpu, inputScaleGpu, nullptr, nullptr,
            axis); // DequantizeLinearObj
        gCuda->dataMalloc();

        inputXGpu->copyin(inputXData);
        inputScaleGpu->setData(generator);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    }
}

TEST(CUDA_DequantizeLinearFp32, run) {

    test_dequantizeLinearFp32(
        Shape{2, 3, 2, 3},
        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
        Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
        vector<float>{-0.3000000, 0.0000000,  0.3000000,  0.6000000,
                      0.9000000,  1.2000000,  0.8000000,  1.0000000,
                      1.2000000,  1.4000000,  1.6000000,  1.8000001,
                      4.5000000,  5.0000000,  5.5000000,  6.0000000,
                      6.5000000,  7.0000000,  5.1000004,  5.4000001,
                      5.7000003,  6.0000000,  6.3000002,  6.6000004,
                      4.4000001,  4.5999999,  4.8000002,  5.0000000,
                      5.2000003,  5.4000001,  13.5000000, 14.0000000,
                      14.5000000, 15.0000000, 15.5000000, 16.0000000},
        Shape{3}, vector<uint8_t>{1, 2, 3});
    test_dequantizeLinearFp32(
        Shape{2, 3, 2, 3},
        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
        Shape{3}, vector<float>{0.3, 0.2, 0.5}, 1,
        vector<float>{0.0000000,  0.3000000,  0.6000000,  0.9000000,
                      1.2000000,  1.5000000,  1.2000000,  1.4000000,
                      1.6000000,  1.8000001,  2.0000000,  2.2000000,
                      6.0000000,  6.5000000,  7.0000000,  7.5000000,
                      8.0000000,  8.5000000,  5.4000001,  5.7000003,
                      6.0000000,  6.3000002,  6.6000004,  6.9000001,
                      4.8000002,  5.0000000,  5.2000003,  5.4000001,
                      5.5999999,  5.8000002,  15.0000000, 15.5000000,
                      16.0000000, 16.5000000, 17.0000000, 17.5000000});

} // python output
TEST(CUDA_DequantizeLinearFp16, run) {
    test_dequantizeLinearFp16(
        Shape{2, 3, 2, 3},
        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
        Shape{3}, ValGenerator<2>(), 1,
        vector<float>{-2., 0.,  2.,  4.,  6.,  8.,  8.,  10., 12.,
                      14., 16., 18., 18., 20., 22., 24., 26., 28.,
                      34., 36., 38., 40., 42., 44., 44., 46., 48.,
                      50., 52., 54., 54., 56., 58., 60., 62., 64.},
        Shape{3}, vector<uint8_t>{1, 2, 3});
    test_dequantizeLinearFp16(
        Shape{2, 3, 2, 3},
        vector<uint8_t>{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                        24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
        Shape{3}, ValGenerator<2>(), 1,
        vector<float>{0.,  2.,  4.,  6.,  8.,  10., 12., 14., 16.,
                      18., 20., 22., 24., 26., 28., 30., 32., 34.,
                      36., 38., 40., 42., 44., 46., 48., 50., 52.,
                      54., 56., 58., 60., 62., 64., 66., 68., 70.});

} // python output

} // namespace infini
@@ -0,0 +1,74 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/dynamic_quantize_linear.h"

#include "test.h"

namespace infini {

void test_dynamicquantizeLinearFp32(
    const Shape &inputShape, const vector<float> &inputData,
    const vector<uint8_t> &outputYData, const vector<float> &outputYScaleData,
    const vector<uint8_t> &outputYZeroPointData) {

    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor(inputShape, DataType::Float32);

    gCpu->dataMalloc();

    input->copyin(inputData);

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    auto inputGpu = gCuda->cloneTensor(input);

    auto op = gCuda->addOp<DynamicQuantizeLinearObj>(
        inputGpu,
        std::nullopt); // DynamicQuantizeLinear
    gCuda->dataMalloc();

    inputGpu->copyin(inputData);

    cudaRuntime->run(gCuda);

    EXPECT_EQ(op->getOutputs().size(), (size_t)3);
    auto o0Cpu = gCpu->cloneTensor(op->getOutput(0));
    auto o1Cpu = gCpu->cloneTensor(op->getOutput(1));
    auto o2Cpu = gCpu->cloneTensor(op->getOutput(2));

    EXPECT_TRUE(o0Cpu->equalData(outputYData));
    EXPECT_TRUE(o1Cpu->equalData(outputYScaleData));
    EXPECT_TRUE(o2Cpu->equalData(outputYZeroPointData));
}

TEST(CUDA_DynamicquantizeLinearFp32, run) {

    test_dynamicquantizeLinearFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
                      18., 19., 20., 21., 22., 23., 24., 25., 26.,
                      27., 28., 29., 30., 31., 32., 33., 34., 35.},
        vector<uint8_t>{0,   7,   15,  22,  29,  36,  44,  51,  58,
                        66,  73,  80,  87,  95,  102, 109, 117, 124,
                        131, 138, 146, 153, 160, 168, 175, 182, 189,
                        197, 204, 211, 219, 226, 233, 240, 248, 255},
        vector<float>{0.1372549}, vector<uint8_t>{0});
    test_dynamicquantizeLinearFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
                      8.,  9.,  10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        vector<uint8_t>{0,   11,  22,  33,  44,  55,  67,  78,
                        89,  100, 111, 122, 133, 144, 155, 166,
                        177, 188, 200, 211, 222, 233, 244, 255},
        vector<float>{0.0901961}, vector<uint8_t>{0});

} // python output

} // namespace infini
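Editor's note: a quick sanity check of the scale constants above, assuming the standard DynamicQuantizeLinear formula scale = (max(x, 0) - min(x, 0)) / 255 over the whole tensor. The tiny program below is not part of the test file.

#include <cstdio>
int main() {
    std::printf("%.7f\n", 35.0f / 255.0f); // 0.1372549, inputs 0..35
    std::printf("%.7f\n", 23.0f / 255.0f); // 0.0901961, inputs 0..23
    return 0;
}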
@@ -8,7 +8,7 @@

namespace infini {

void test_layernorm(
void test_layernormFp32(
    const Shape &inputShape, const vector<float> &inputData,
    const Shape &scaleShape, const vector<float> &scaleData, float eps,
    int axis, int stash_type, const vector<float> &ExpectData,
@@ -77,9 +77,78 @@ void test_layernorm(
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    }
}
void test_layernormFp16(
    const Shape &inputShape,
    const std::function<void(void *, size_t, DataType)> &generator,
    const Shape &scaleShape, float eps, int axis, int stash_type,
    const vector<float> &ExpectData,
    const std::optional<Shape> &bShape = std::nullopt) {

TEST(CUDA_Layernorm, run) {
    test_layernorm(
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    if (bShape.has_value()) {
        Shape biasShape = *bShape;

        auto bias = gCpu->addTensor(biasShape, DataType::Float16);
        auto input = gCpu->addTensor(inputShape, DataType::Float16);
        auto scale = gCpu->addTensor(scaleShape, DataType::Float16);
        gCpu->dataMalloc();
        bias->setData(generator);
        // bias->printData();
        input->setData(generator);
        scale->setData(generator);
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);
        auto biasGpu = gCuda->cloneTensor(bias);
        auto inputGpu = gCuda->cloneTensor(input);
        auto scaleGpu = gCuda->cloneTensor(scale);
        // gCpu->cloneTensor(biasGpu)->printData();
        auto op =
            gCuda->addOp<LayerNormObj>(inputGpu, scaleGpu, nullptr, biasGpu,
                                       eps, axis, stash_type); // LayernormObj
        gCuda->dataMalloc();
        biasGpu->setData(generator);
        // gCpu->cloneTensor(biasGpu)->printData();
        inputGpu->setData(generator);
        scaleGpu->setData(generator);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    } else {

        auto input = gCpu->addTensor(inputShape, DataType::Float16);
        auto scale = gCpu->addTensor(scaleShape, DataType::Float16);
        gCpu->dataMalloc();

        input->setData(generator);
        scale->setData(generator);
        auto cudaRuntime = make_ref<CudaRuntimeObj>();
        Graph gCuda = make_ref<GraphObj>(cudaRuntime);

        auto inputGpu = gCuda->cloneTensor(input);
        auto scaleGpu = gCuda->cloneTensor(scale);
        auto op =
            gCuda->addOp<LayerNormObj>(inputGpu, scaleGpu, nullptr, nullptr,
                                       eps, axis, stash_type); // LayernormObj
        gCuda->dataMalloc();

        inputGpu->setData(generator);
        scaleGpu->setData(generator);
        cudaRuntime->run(gCuda);

        auto oCpu =
            gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
        oCpu->printData(); //->printData
        EXPECT_TRUE(oCpu->equalData(ExpectData));
    }
}

TEST(CUDA_LayernormFp32, run) {
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
@@ -94,7 +163,7 @@ TEST(CUDA_Layernorm, run) {
            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678,
            -0.3674207, 0.0000000, 0.6123678, -0.3674207, 0.0000000, 0.6123678},
        Shape{3}, vector<float>{0, 0, 0});
    test_layernorm(
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
@@ -109,7 +178,7 @@ TEST(CUDA_Layernorm, run) {
            -0.0674207, 0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679,
            -0.0674207, 0.2000000, 1.1123679, -0.0674207, 0.2000000, 1.1123679},
        Shape{3}, vector<float>{0.3, 0.2, 0.5});
    test_layernorm(
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
@@ -124,7 +193,7 @@ TEST(CUDA_Layernorm, run) {
            -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207,
            -0.0674207, 0.2000000, 0.8674207, -0.0674207, 0.2000000, 0.8674207},
        Shape{3}, vector<float>{0.3, 0.2, 0.5});
    test_layernorm(
    test_layernormFp32(
        Shape{2, 3, 2, 3},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
                      9.,  10., 11., 12., 13., 14., 15., 16., 17.,
@@ -141,6 +210,15 @@ TEST(CUDA_Layernorm, run) {
                      0.0000000, 0.6123678, -0.3674207, 0.0000000,
                      0.6123678, -0.3674207, 0.0000000, 0.6123678});

} // python output
TEST(CUDA_LayernormFp16, run) {
    test_layernormFp16(Shape{2, 3, 2, 3}, ValGenerator<2>(), Shape{3}, 1e-5, 3,
                       1, vector<float>{2., 2., 2., 2., 2., 2., 2., 2., 2.,
                                        2., 2., 2., 2., 2., 2., 2., 2., 2.,
                                        2., 2., 2., 2., 2., 2., 2., 2., 2.,
                                        2., 2., 2., 2., 2., 2., 2., 2., 2.},
                       Shape{3});

} // python output

} // namespace infini
@@ -0,0 +1,68 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "operators/matmul_integer.h"

#include "test.h"

namespace infini {
using ExpectOutput = vector<int32_t>;

TEST(cuBLAS_MatmulInteger, ZeroPoint1) {
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    auto gCuda = make_ref<GraphObj>(cudaRuntime);
    auto ACuda = gCuda->addTensor({1, 4}, DataType::UInt8);
    auto BCuda = gCuda->addTensor({4, 12}, DataType::UInt8);
    auto AZeroPointCuda = gCuda->addTensor({}, DataType::UInt8);
    auto BZeroPointCuda = gCuda->addTensor({}, DataType::UInt8);
    auto op = gCuda->addOp<MatmulIntegerObj>(ACuda, BCuda, nullptr,
                                             AZeroPointCuda, BZeroPointCuda);

    // allocate CUDA memory
    gCuda->dataMalloc();
    // ACuda->copyin(vector<uint8_t>{11, 7, 3, 10, 6, 2, 9, 5, 1, 8, 4, 0});
    ACuda->copyin(vector<uint8_t>{11, 7, 3, 10});
    // BCuda->copyin(vector<uint8_t>({1, 4, 2, 5, 3, 6,}));
    BCuda->copyin(vector<uint8_t>(48, 1));
    AZeroPointCuda->copyin(vector<uint8_t>{12});
    BZeroPointCuda->copyin(vector<uint8_t>{0});
    cudaRuntime->run(gCuda);
    auto result = op->getOutput()->clone(NativeCpuRuntimeObj::getInstance());
    // ExpectOutput ans = {
    //     -38, -83, -44, -98, -50, -113, -56, -128,
    // };
    ExpectOutput ans = {-17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17};
    EXPECT_TRUE(result->equalData(ans));
}

TEST(cuBLAS_MatmulInteger, ZeroPoint2) {
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    auto gCuda = make_ref<GraphObj>(cudaRuntime);
    auto ACuda = gCuda->addTensor({2, 3, 1, 4}, DataType::UInt8);
    auto BCuda = gCuda->addTensor({2, 3, 4, 12}, DataType::UInt8);
    auto AZeroPointCuda = gCuda->addTensor({2, 3, 1, 1}, DataType::UInt8);
    auto BZeroPointCuda = gCuda->addTensor({2, 3, 1, 12}, DataType::UInt8);
    auto op = gCuda->addOp<MatmulIntegerObj>(ACuda, BCuda, nullptr,
                                             AZeroPointCuda, BZeroPointCuda);

    // allocate CUDA memory
    gCuda->dataMalloc();
    ACuda->copyin(vector<uint8_t>{11, 7, 3, 10, 11, 7, 3, 10, 11, 7, 3, 10,
                                  11, 7, 3, 10, 11, 7, 3, 10, 11, 7, 3, 10});
    BCuda->copyin(vector<uint8_t>(288, 1));
    AZeroPointCuda->copyin(vector<uint8_t>(6, 12));
    BZeroPointCuda->copyin(vector<uint8_t>(72, 0));
    cudaRuntime->run(gCuda);
    auto result = op->getOutput()->clone(NativeCpuRuntimeObj::getInstance());
    ExpectOutput ans = {-17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17,
                        -17, -17, -17, -17, -17, -17};
    EXPECT_TRUE(result->equalData(ans));
}

}; // namespace infini
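Editor's note: the expected value -17 in both tests follows directly from the zero points: with every B element equal to 1 and bZeroPoint = 0, each output element is the sum of (A[k] - 12) over the row {11, 7, 3, 10}. A minimal check (not part of the test file):

#include <cstdio>
int main() {
    int a[4] = {11, 7, 3, 10};
    int acc = 0;
    for (int v : a)
        acc += (v - 12) * (1 - 0); // (A - aZeroPoint) * (B - bZeroPoint)
    std::printf("%d\n", acc);      // prints -17, matching ExpectOutput
    return 0;
}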
@@ -8,130 +8,127 @@
#include <cmath>
namespace infini {

TEST(cuDNN_Softmax, run_axis1) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
void test_softmaxFp32(const Shape &inputShape, const vector<float> &inputData,
                      int axis, const vector<float> &ExpectData) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor(inputShape, DataType::Float32);

    gCpu->dataMalloc();

    input->copyin(inputData);

    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
    auto inputGpu = gCuda->cloneTensor(input);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
    cudaGraph->dataMalloc();
    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    cudaPrintTensor(outputGpu);
    // Check
    EXPECT_TRUE(outputGpu2Cpu->equalData(
        vector<float>{0.032058604, 0.08714432, 0.23688284, 0.6439143,
                      0.032058604, 0.08714432, 0.23688284, 0.6439143}));
    auto op = gCuda->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
    gCuda->dataMalloc();

    inputGpu->copyin(inputData);

    cudaRuntime->run(gCuda);

    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
    oCpu->printData(); //->printData
    EXPECT_TRUE(oCpu->equalData(ExpectData));
}
void test_softmaxFp16(
    const Shape &inputShape,
    const std::function<void(void *, size_t, DataType)> &generator, int axis,
    const vector<float> &ExpectData) {
    Runtime runtime = NativeCpuRuntimeObj::getInstance();
    Graph gCpu = make_ref<GraphObj>(runtime);

    auto input = gCpu->addTensor(inputShape, DataType::Float16);

    gCpu->dataMalloc();

    input->setData(generator);

TEST(cuDNN_Softmax, run_axis0) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
    Graph gCuda = make_ref<GraphObj>(cudaRuntime);

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{2, 4}, DataType::Float32, cpuRuntime);
    auto inputGpu = gCuda->cloneTensor(input);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 0);
    cudaGraph->dataMalloc();
    inputGpu->copyin(vector<float>{0, 1, 2, 3, 10000, 10001, 10002, 10003});
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    cudaPrintTensor(outputGpu);
    // Check
    EXPECT_TRUE(
        outputGpu2Cpu->equalData(vector<float>{0., 0., 0., 0., 1, 1, 1, 1}));
    auto op = gCuda->addOp<SoftmaxObj>(inputGpu, nullptr, axis);
    gCuda->dataMalloc();

    inputGpu->setData(generator);

    cudaRuntime->run(gCuda);

    auto oCpu = gCpu->cloneTensor(op->getOutput()); // move Data from gpu to cpu
    oCpu->printData(); //->printData
    EXPECT_TRUE(oCpu->equalData(ExpectData));
}
TEST(CUDA_SoftmaxFP32, run) {
    test_softmaxFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
                      8.,  9.,  10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        0, vector<float>{6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
                         6.14417422e-06, 6.14417422e-06, 6.14417422e-06,
                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01,
                         9.99993801e-01, 9.99993801e-01, 9.99993801e-01});
    test_softmaxFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
                      8.,  9.,  10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        1, vector<float>{3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
                         3.29320435e-04, 1.79802869e-02, 1.79802869e-02,
                         1.79802869e-02, 1.79802869e-02, 9.81690347e-01,
                         9.81690347e-01, 9.81690347e-01, 9.81690347e-01,
                         3.29320435e-04, 3.29320435e-04, 3.29320435e-04,
                         3.29320435e-04, 1.79802869e-02, 1.79802869e-02,
                         1.79802869e-02, 1.79802869e-02, 9.81690347e-01,
                         9.81690347e-01, 9.81690347e-01, 9.81690347e-01});
    test_softmaxFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
                      8.,  9.,  10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        2, vector<float>{0.11920292, 0.11920292, 0.88079703, 0.88079703,
                         0.11920292, 0.11920292, 0.88079703, 0.88079703,
                         0.11920292, 0.11920292, 0.88079703, 0.88079703,
                         0.11920292, 0.11920292, 0.88079703, 0.88079703,
                         0.11920292, 0.11920292, 0.88079703, 0.88079703,
                         0.11920292, 0.11920292, 0.88079703, 0.88079703});
    test_softmaxFp32(
        Shape{2, 3, 2, 2},
        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
                      8.,  9.,  10., 11., 12., 13., 14., 15.,
                      16., 17., 18., 19., 20., 21., 22., 23.},
        3, vector<float>{0.26894143, 0.73105860, 0.26894143, 0.73105860,
                         0.26894143, 0.73105860, 0.26894143, 0.73105860,
                         0.26894143, 0.73105860, 0.26894143, 0.73105860,
                         0.26894143, 0.73105860, 0.26894143, 0.73105860,
                         0.26894143, 0.73105860, 0.26894143, 0.73105860,
                         0.26894143, 0.73105860, 0.26894143, 0.73105860});
} // python output
TEST(CUDA_SoftmaxFP16, run) {
    test_softmaxFp16(Shape{2, 3, 2, 2}, ValGenerator<2>(), 0,
                     vector<float>{0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
                                   0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
                                   0.5000, 0.5000, 0.5000, 0.5000});
    test_softmaxFp16(
        Shape{2, 3, 2, 2}, ValGenerator<2>(), 1, // data accuracy down
        vector<float>{0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
                      0.333252, 0.333252, 0.333252, 0.333252, 0.333252,
                      0.333252, 0.333252, 0.333252, 0.333252});

TEST(cuDNN_Softmax2, run_axis1) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();
} // python output

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 1);
    cudaGraph->dataMalloc();
    inputGpu->setData(IncrementalGenerator());
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    cudaPrintTensor(outputGpu);
    // Check
    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
        0.0179862, 0.0179862, 0.0179862, 0.0179862, 0.9820138, 0.9820138,
        0.9820138, 0.9820138, 0.0179862, 0.0179862, 0.0179862, 0.0179862,
        0.9820138, 0.9820138, 0.9820138, 0.9820138}));
}

TEST(cuDNN_Softmax2, run_axis2) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 2);
    cudaGraph->dataMalloc();
    inputGpu->setData(IncrementalGenerator());
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    cudaPrintTensor(outputGpu);
    // Check
    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
        0.1192029, 0.1192029, 0.8807971, 0.8807971, 0.1192029, 0.1192029,
        0.8807971, 0.8807971, 0.1192029, 0.1192029, 0.8807971, 0.8807971,
        0.1192029, 0.1192029, 0.8807971, 0.8807971}));
}

TEST(cuDNN_Softmax2, run_axis3) {
    // Runtime
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    auto cudaRuntime = make_ref<CudaRuntimeObj>();

    // Build input data on CPU
    Tensor inputCpu =
        make_ref<TensorObj>(Shape{2, 2, 2, 2}, DataType::Float32, cpuRuntime);

    // GPU
    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
    auto gpuOp = cudaGraph->addOp<SoftmaxObj>(inputGpu, nullptr, 3);
    cudaGraph->dataMalloc();
    inputGpu->setData(IncrementalGenerator());
    cudaRuntime->run(cudaGraph);
    auto outputGpu = gpuOp->getOutput();
    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
    cudaPrintTensor(outputGpu);
    // Check
    EXPECT_TRUE(outputGpu2Cpu->equalData(vector<float>{
        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
        0.2689414, 0.7310586, 0.2689414, 0.7310586, 0.2689414, 0.7310586,
        0.2689414, 0.7310586, 0.2689414, 0.7310586}));
}
} // namespace infini