InfiniTensor/include/cuda/gather.h

62 lines
1.8 KiB
C
Raw Permalink Normal View History

#pragma once
#include "core/data_type.h"
#include "core/operator.h"
#include "operators/gather.h"
#include <cstring>
#include <stdexcept>
namespace infini {
// Plain-data description of one gather operation. It is filled by
// initGatherMetaData() and taken by value by gather_kernel() and
// gather_elements_kernel().
//
// The shape and stride arrays have a fixed capacity of 4 entries.
// initGatherMetaData() zeroes the whole struct and then writes only the
// leading entries given by the matching rank field (outNDim, idxNDim or
// inNDim); the remaining entries stay 0.
struct GatherMetaData {
// Raw data pointer of the index tensor (untyped; see indexType)
void *indexValue;
// Element type of the index tensor
DataType indexType;
// Element type of the input tensor (the existing note says the output
// shares it)
DataType dataType;
// Axis of the gather operation, as reported by the operator's getAxis()
int axis;
// Rank of the input tensor
int inNDim;
// Rank of the output tensor
int outNDim;
// Rank of the index tensor
int idxNDim;
// Shape of the output tensor; first outNDim entries are used
int outDim[4];
// Shape of the index tensor; first idxNDim entries are used
int idxDim[4];
// Strides of the index tensor; first idxNDim entries are used
int idxStride[4];
// Strides of the input tensor; first inNDim entries are used
int inStride[4];
};
/// Populates `metaData` with the gather parameters of `_op`.
///
/// The structure is zeroed first and then filled from the operator's first
/// input (data), second input (indices) and output tensor: index data
/// pointer, element types, axis, ranks, output shape, index shape/strides and
/// input strides.
///
/// @param metaData Destination; every field is overwritten.
/// @param _op      Operator to read from; it is converted with
///                 `as<GatherBaseObj>`.
/// @throws std::invalid_argument if the input, index or output rank exceeds
///         the capacity of the fixed-size arrays in GatherMetaData. Without
///         this guard the copy loops below would write past the end of those
///         arrays. In that case `metaData` holds the scalar fields only.
inline void initGatherMetaData(GatherMetaData &metaData,
                               const Ref<OperatorObj> &_op) {
    // Capacity of the shape/stride arrays, derived from the struct so the
    // guard follows any future change of the array extents.
    constexpr int maxRank =
        static_cast<int>(sizeof(GatherMetaData::outDim) / sizeof(int));
    static_assert(sizeof(GatherMetaData::idxDim) ==
                          sizeof(GatherMetaData::outDim) &&
                      sizeof(GatherMetaData::idxStride) ==
                          sizeof(GatherMetaData::outDim) &&
                      sizeof(GatherMetaData::inStride) ==
                          sizeof(GatherMetaData::outDim),
                  "GatherMetaData shape/stride arrays must share one extent");

    memset(&metaData, 0, sizeof(metaData));

    auto op = as<GatherBaseObj>(_op);
    Ref<TensorObj> in = op->getInputs(0);
    Ref<TensorObj> index = op->getInputs(1);
    Ref<TensorObj> out = op->getOutput();

    metaData.indexValue = index->getRawDataPtr<void *>();
    metaData.indexType = index->getDType();
    metaData.dataType = in->getDType();
    metaData.axis = op->getAxis();
    metaData.inNDim = in->getRank();
    metaData.outNDim = out->getRank();
    metaData.idxNDim = index->getRank();

    // Reject ranks that do not fit before touching any fixed-size array.
    if (metaData.inNDim > maxRank || metaData.outNDim > maxRank ||
        metaData.idxNDim > maxRank) {
        throw std::invalid_argument(
            "initGatherMetaData: tensor rank exceeds GatherMetaData capacity");
    }

    // Query each shape/stride container once instead of once per iteration;
    // `const auto &` binds to either a returned reference or a temporary.
    const auto &outShape = out->getDims();
    for (int i = 0; i < metaData.outNDim; ++i)
        metaData.outDim[i] = outShape[i];

    const auto &idxShape = index->getDims();
    const auto &idxStrides = index->getStride();
    for (int i = 0; i < metaData.idxNDim; ++i) {
        metaData.idxDim[i] = idxShape[i];
        metaData.idxStride[i] = idxStrides[i];
    }

    const auto &inStrides = in->getStride();
    for (int i = 0; i < metaData.inNDim; ++i)
        metaData.inStride[i] = inStrides[i];
}
// Commit history note (2024-01-15 11:02:13 +08:00):
// Modify kernel registration & support fp16 (#205)
// * - Remove dataType from the kernel registration.
// * - support fp16 for conv
// * - cpu kernel: adapt the new registration mechanism
// * modified all register kernel
// * add where fp16
// * add layernorm fp16
// * add split_concat fp16
// * - element_wise support fp16
// * feat: support transpose fp16
// * feat: support sliceOp fp16
// * - unary support fp16
// * - feat: support reduceOp fp16
// * feat: support matmulOp/expandOp fp16
// * feat: support powOp int8
// * add cuda cast & support half-precision for gather
// * style: fix style
// * feat:support int8 for gather
// * style:fix style
// * modified test_cuda_conv_transposed
// * fix: fix dist code to support fp16
// * fix(graph.cc): fix topo_sort
// * fix: fix recv and send kernel registration
// * feat: add field tensors for stub
// * refactor(frontend): 先排序后构图 [sort first, then build the graph]
//   Signed-off-by: YdrMaster <ydrml@hotmail.com>
// * fix: 为中间结果提供tensor到node的mapping [provide a tensor-to-node
//   mapping for intermediate results]
// * fix (slice): add guard for area out of range
// * fix: fix matmul fp16
// * fix: fix re-dataMalloc for weight tensor and use of naive allocator
// * feat: add dataType filter for cuda kernel
// * feat: bang kernel adapt the new registration mechanism
// * fix: fix some error on mlu
// * feat: intelcpu kernel adapt the new registration mechanism
// * feat: modify kernel registration on kunlun
// * fix intelcpu compiler bug
// * feat: bang reshape support all dataType
// * fix: fix bang reduce
// * fix(all_reduce.cc): fix as reviewer suggested
// * fix: fix style and restore unary test codes
// ---------
// Signed-off-by: YdrMaster <ydrml@hotmail.com>
// Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
// Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
// Co-authored-by: zhangyunze <z13785159769@163.com>
// Co-authored-by: OdinaryWord <sx-hz@163.com>
// Co-authored-by: YdrMaster <ydrml@hotmail.com>
// Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
// Declaration of the typed gather entry point; the definition is not in this
// header.
//   T        - element type of the `in` and `out` buffers.
//   in, out  - input and output buffers of T.
//   metaData - gather description, passed by value (see GatherMetaData).
//   num      - element count; presumably the number of output elements
//              (TODO confirm against the definition).
template <typename T>
void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
// Declaration of the gather-elements entry point; the definition is not in
// this header.
//   in, out  - untyped input and output buffers; presumably interpreted
//              according to metaData.dataType (TODO confirm against the
//              definition).
//   metaData - gather description, passed by value (see GatherMetaData).
//   num      - element count; presumably the number of output elements
//              (TODO confirm against the definition).
void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
size_t num);
} // namespace infini