diff --git a/include/cuda/cuda_transpose.h b/include/cuda/cuda_transpose.h
new file mode 100644
index 00000000..b168cf0e
--- /dev/null
+++ b/include/cuda/cuda_transpose.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "operators/transpose.h"
+#include "utils/small_array.h"
+
+namespace infini {
+
+void transpose_kernel(float *input, float *output, int nDims, int size,
+                      SmallArray strides, SmallArray outputShape);
+
+}; // namespace infini
diff --git a/include/utils/small_array.h b/include/utils/small_array.h
new file mode 100644
index 00000000..d0e29a09
--- /dev/null
+++ b/include/utils/small_array.h
@@ -0,0 +1,8 @@
+namespace infini {
+
+#define SMALL_ARRAY_SIZE 8
+struct SmallArray {
+    int data[SMALL_ARRAY_SIZE];
+};
+
+} // namespace infini
diff --git a/src/kernels/cuda/transpose.cc b/src/kernels/cuda/transpose.cc
new file mode 100644
index 00000000..37f97cd9
--- /dev/null
+++ b/src/kernels/cuda/transpose.cc
@@ -0,0 +1,49 @@
+#include "operators/transpose.h"
+#include "cuda/cuda_kernel_wihtout_config.h"
+#include "cuda/cuda_runtime.h"
+#include "cuda/cuda_transpose.h"
+
+namespace infini {
+
+class TransposeCuda : public CudaKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<TransposeObj>(_op);
+
+        auto input = op->getInputs(0);
+        auto output = op->getOutput();
+        void *const inputData = input->getRawDataPtr<void *>();
+        void *const outputData = output->getRawDataPtr<void *>();
+        const auto &inputShape = input->getDims();
+        const auto &outputShape = output->getDims();
+
+        const auto &perm = op->getPermute();
+        int size = input->size();
+        int nDims = input->getDims().size();
+
+        // Compute strides
+        SmallArray strides, buffer;
+        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+        int curStride = 1;
+        for (int i = nDims - 1; i >= 0; --i) {
+            buffer.data[i] = curStride;
+            curStride *= inputShape[i];
+        }
+        for (int i = 0; i < nDims; ++i) {
+            strides.data[i] = buffer.data[perm[i]];
+        }
+
+        SmallArray outputDims;
+        for (int i = 0; i < nDims; ++i) {
+            outputDims.data[i] = outputShape[i];
+        }
+
+        transpose_kernel((float *)inputData, (float *)outputData, nDims, size,
+                         strides, outputDims);
+    }
+};
+
+REGISTER_KERNEL(Device::CUDA, OpType::Transpose, DataType::Float32,
+                TransposeCuda, "Transpose_CUDA_Float32");
+
+} // namespace infini
diff --git a/src/kernels/cuda/transpose.cu b/src/kernels/cuda/transpose.cu
new file mode 100644
index 00000000..f753217c
--- /dev/null
+++ b/src/kernels/cuda/transpose.cu
@@ -0,0 +1,37 @@
+#include "core/common.h"
+#include "cuda/cuda_common.h"
+#include "utils/small_array.h"
+
+constexpr unsigned int num_threads() { return 32 * 4; }
+constexpr int thread_work_size() { return 4; }
+constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+
+__global__ void _transpose_kernel(float *input, float *output, int nDims,
+                                  int size, infini::SmallArray strides,
+                                  infini::SmallArray outputShape) {
+    int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (outputIdx < size) {
+        int inputIdx = 0;
+        int v = outputIdx;
+        for (int i = nDims - 1; i >= 0; --i) {
+            inputIdx += v % outputShape.data[i] * strides.data[i];
+            v /= outputShape.data[i];
+        }
+#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
+        output[outputIdx] = __ldg(input + inputIdx);
+#else
+        output[outputIdx] = input[inputIdx];
+#endif
+    }
+}
+
+namespace infini {
+void transpose_kernel(float *input, float *output, int nDims, int size,
+                      SmallArray strides, SmallArray outputShape) {
+    int blocksize = block_work_size();
+    int gridsize = (size + block_work_size() - 1) / block_work_size();
+    _transpose_kernel<<<gridsize, blocksize>>>(input, output, nDims, size,
+                                               strides, outputShape);
+}
+
+} // namespace infini
diff --git a/test/kernels/cuda/test_cuda_transpose.cc b/test/kernels/cuda/test_cuda_transpose.cc
new file mode 100644
index 00000000..2f4eb8b3
--- /dev/null
+++ b/test/kernels/cuda/test_cuda_transpose.cc
@@ -0,0 +1,46 @@
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "cuda/cuda_runtime.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testTranspose(
+    const std::function<void(void *, size_t, DataType)> &generator,
+    const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
+    auto cudaRuntime = make_ref<CudaRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph cudaGraph = make_ref<GraphObj>(cudaRuntime);
+    auto inputGpu = cudaGraph->cloneTensor(inputCpu);
+    vector<int> permute = {0, 2, 1, 3};
+    auto gpuOp = cudaGraph->addOp<T>(inputGpu, nullptr, permute);
+    cudaGraph->dataMalloc();
+    inputGpu->copyData(inputCpu);
+    cudaRuntime->run(cudaGraph);
+    auto outputGpu = gpuOp->getOutput();
+    auto oCpu = outputGpu->clone(cpuRuntime);
+    // Check
+    // inputCpu->printData();
+    // oCpu->printData();
+    EXPECT_TRUE(oCpu->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
+                                              4, 5, 6, 7, 16, 17, 18, 19,
+                                              8, 9, 10, 11, 20, 21, 22, 23}));
+}
+
+TEST(cuda_Transpose, run) {
+    testTranspose<TransposeObj>(IncrementalGenerator(), Shape{1, 2, 3, 4});
+}
+
+} // namespace infini