Accelerate llama (#219)

* [feature] add cudagraph support * modify code to pass the cuda_all_reduce test * modify rope op * support rmsnorm * add fp16 support to silu cuda op * fix bugs in rmsnorm op * uncomment simplify in onnx.py --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com>
2024-04-01 08:46:05 +08:00 · 2024-04-01 08:46:05 +08:00 · a98573990b
parent 54a35772fb
commit a98573990b
13 changed files with 254 additions and 10 deletions
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -37,6 +37,7 @@ class GraphHandlerObj {
                              float momentum, float eps, bool training);
    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
                              Tensor bias, float eps, int axis, int stash_type);
    Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
                   int ph, int pw, int sh, int sw, int ceilMode);
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@ -156,8 +156,9 @@ struct OpType {
        Resize,
        ReverseSequence,
        RoiAlign,
-        RoPE,  // Fusion
+        RoPE,    // Fusion
-        Round, // Unary
+        Round,   // Unary
        RMSNorm, // Fusion
        STFT,
        Scan,
        Scatter,
--- a/include/cuda/cuda_rmsnorm.h
+++ b/include/cuda/cuda_rmsnorm.h
@ -0,0 +1,10 @@
 #pragma once
 #include "operators/rms_norm.h"
 namespace infini {
 /**
 * @brief Host-side entry point that launches the RMSNorm CUDA kernel.
 *
 * @param dType Integer data-type index used to select the kernel
 *        instantiation (the caller passes op->getDType().getIndex()).
 * @param input Pointer to the input data.
 * @param weight Pointer to the weight data; the CUDA operator wrapper asserts
 *        that it holds hidden_size elements.
 * @param output Pointer to the output data.
 * @param num_tokens Number of rows to normalize (input element count divided
 *        by hidden_size in the caller).
 * @param hidden_size Length of the last input dimension.
 */
 void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
                     int num_tokens, int hidden_size);
 }; // namespace infini
--- a/include/operators/rms_norm.h
+++ b/include/operators/rms_norm.h
@ -0,0 +1,34 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 * @brief Fused RMSNorm Operator
 *
 * Takes two inputs (the tensor to normalize and a weight tensor) and produces
 * one output whose inferred shape equals the shape of the first input.
 */
 class RMSNormObj : public OperatorObj {
    // The constructor does not assign `dim`, so give it a defined default
    // instead of leaving getDim() to read an indeterminate value. -1 is used
    // to denote the last axis, which is the axis the CUDA kernel reduces
    // over. NOTE(review): confirm this is the intended meaning of `dim`.
    int dim = -1;
  public:
    /**
     * @brief Construct a new RMSNorm object.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor.
     * @param weight The weight tensor multiplied into the normalized input.
     * @param output The output tensor.
     */
    RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
    OP_CLONE(RMSNormObj);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
    /// Returns the stored `dim` attribute (-1 unless assigned elsewhere).
    int getDim() const { return dim; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@ -285,6 +285,12 @@ class OnnxStub:
                    axis,
                    stash_type,
                )
            elif node.op_type == "RMSNorm":
                tensors[node.output[0]] = self.handler.RMSNorm(
                    tensors[node.input[0]],
                    tensors[node.input[1]],
                    tensors.get(node.output[0]),
                )
            elif node.op_type == "MaxPool":
                attributes = _parse_attribute(
                    node,
--- a/src/core/graph_handler.cc
+++ b/src/core/graph_handler.cc
@ -18,6 +18,7 @@
 #include "operators/reduce.h"
 #include "operators/reshape.h"
 #include "operators/resize.h"
 #include "operators/rms_norm.h"
 #include "operators/rope.h"
 #include "operators/send.h"
 #include "operators/slice.h"
@ -124,6 +125,17 @@ Tensor GraphHandlerObj::layerNormalization(Tensor input, Tensor scale,
    }
 }
 // Adds an RMSNorm operator to the graph and returns its output tensor.
 // When the caller supplies `output`, the operator is attached to it and the
 // same tensor is returned; otherwise the output reported by the newly added
 // operator is returned.
 Tensor GraphHandlerObj::rmsNorm(Tensor input, Tensor weight, Tensor output) {
    if (!output) {
        return g->addOp<RMSNormObj>(std::move(input), std::move(weight), output)
            ->getOutput();
    }
    g->addOpWithOutputs<RMSNormObj>(std::move(input), std::move(weight),
                                    output);
    return output;
 }
 Tensor GraphHandlerObj::maxPool(Tensor input, Tensor output, int kh, int kw,
                                int dh, int dw, int ph, int pw, int sh, int sw,
                                int ceilMode) {
--- a/src/ffi/ffi_infinitensor.cc
+++ b/src/ffi/ffi_infinitensor.cc
@ -506,6 +506,7 @@ void init_graph_builder(py::module &m) {
        .def("matmul", &Handler::matmul, policy::move)
        .def("batchNormalization", &Handler::batchNormalization, policy::move)
        .def("layerNormalization", &Handler::layerNormalization, policy::move)
        .def("RMSNorm", &Handler::rmsNorm, policy::move)
        .def("maxPool", &Handler::maxPool, policy::move)
        .def("avgPool", &Handler::avgPool, policy::move)
        .def("add", &Handler::add, policy::move)
--- a/src/kernels/cuda/rms_norm.cc
+++ b/src/kernels/cuda/rms_norm.cc
@ -0,0 +1,34 @@
 #include "operators/rms_norm.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_rmsnorm.h"
 #include "cuda/cuda_runtime.h"
 namespace infini {
 // CUDA kernel wrapper for RMSNormObj: extracts raw device pointers and the
 // (num_tokens, hidden_size) view of the input, then calls rmsnorm_kernel.
 class RMSNormCuda : public CudaKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<RMSNormObj>(_op);

        auto input = op->getInputs(0);
        auto weight = op->getInputs(1);
        auto output = op->getOutput();
        void *const inputData = input->getRawDataPtr<void *>();
        void *const weightData = weight->getRawDataPtr<void *>();
        void *const outputData = output->getRawDataPtr<void *>();

        const auto &inputShape = input->getDims();
        const int nDims = static_cast<int>(inputShape.size());
        // A rank-0 input has no last dimension to normalize over; indexing
        // inputShape[nDims - 1] would be out of bounds.
        IT_ASSERT(nDims >= 1);
        // The last dimension is the normalized one; all leading dimensions
        // are flattened into rows ("tokens").
        const int hidden_size = inputShape[nDims - 1];
        // Guard the division below (and a zero-sized kernel launch).
        IT_ASSERT(hidden_size > 0);
        const int num_tokens = input->size() / hidden_size;
        IT_ASSERT(hidden_size == (int)weight->size());

        const int dType = op->getDType().getIndex();
        rmsnorm_kernel(dType, inputData, weightData, outputData, num_tokens,
                       hidden_size);
    }
 };
 REGISTER_KERNEL(Device::CUDA, OpType::RMSNorm, RMSNormCuda, "RMSNorm_CUDA");
 } // namespace infini
--- a/src/kernels/cuda/rms_norm.cu
+++ b/src/kernels/cuda/rms_norm.cu
@ -0,0 +1,112 @@
 #include "core/common.h"
 #include "cuda/cuda_common.h"
 #include "cuda/cuda_utility.h"
 #include "utils/small_array.h"
 /* Sum `val` across a warp with XOR shuffles at lane distances 16, 8, 4, 2, 1.
    The shuffle mask names all 32 lanes, so every lane of the warp is expected
    to take part in the call. */
 template<class T>
 __inline__ __device__ T warpReduceSum(T val) {
   constexpr unsigned int kAllLanes = 0xffffffffu;
 #pragma unroll
   for (int offset = 16; offset >= 1; offset /= 2) {
     val = val + __shfl_xor_sync(kAllLanes, val, offset);
   }
   return val;
 }
 /* Calculate the sum of all elements in a block.
    Each warp reduces its own values, lane 0 of every warp stores that partial
    sum in shared memory, and the partial sums are then reduced again with a
    warp-level reduction. The only caller in this file reads the result on
    thread 0. */
 template<class T>
 __inline__ __device__ T blockReduceSum(T val) {
   static __shared__ T warpSums[32];
   const int laneId = threadIdx.x % 32;
   const int warpId = threadIdx.x / 32;

   const T warpTotal = warpReduceSum<T>(val);
   if (laneId == 0) {
     warpSums[warpId] = warpTotal;
   }
   __syncthreads();

   // Compare against blockDim.x / 32.f (not blockDim.x >> 5) so that a
   // trailing partial warp is still counted when blockDim.x is not a
   // multiple of 32.
   T partial = (T)(0.0f);
   if (threadIdx.x < (blockDim.x / 32.f)) {
     partial = warpSums[laneId];
   }
   return warpReduceSum<T>(partial);
 }
 /* RMSNorm over rows of length hidden_size: block `blockIdx.x` handles one row.
    Pass 1 accumulates the sum of squares in float and reduces it across the
    block; thread 0 publishes rsqrt(mean_square + 1e-5) through shared memory.
    Pass 2 scales every element by that factor and multiplies by weight[idx].
    `num_tokens` is not read inside the kernel. */
 template <class T>
 __global__ void _rmsnorm_kernel(void *in, void *weight, void *out, int num_tokens, int hidden_size) {
    // 64-bit row offset: blockIdx.x * hidden_size would otherwise be evaluated
    // in 32-bit unsigned arithmetic and wrap for tensors with >= 2^32 elements.
    const size_t rowOffset =
        static_cast<size_t>(blockIdx.x) * static_cast<size_t>(hidden_size);
    T *rowIn = static_cast<T *>(in) + rowOffset;
    T *rowOut = static_cast<T *>(out) + rowOffset;
    T *gain = static_cast<T *>(weight);

    __shared__ float s_variance;
    float variance = 0.0f;

    for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
        const float x = rowIn[idx];
        variance += x * x;
    }
    variance = blockReduceSum<float>(variance);
    if (threadIdx.x == 0) {
        s_variance = rsqrtf(variance / hidden_size + 0.00001f);
    }
    // Make s_variance visible to every thread of the block before it is used.
    __syncthreads();

    for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
        const float x = rowIn[idx];
        rowOut[idx] = ((T)(x * s_variance)) * gain[idx];
    }
 }
 // CASE(T): expands to a launch of _rmsnorm_kernel instantiated with
 // DT_CUDA<T>::t on the stream returned by CUDAStream::getCurrentStream().
 // The expansion refers to gridsize, blocksize, input, weight, output,
 // num_tokens and hidden_size, so those names must exist at the use site.
 #define CASE(T)                                                                \
    _rmsnorm_kernel<DT_CUDA<T>::t>                                             \
        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>           \
        (input, weight, output, num_tokens, hidden_size);
 // SWITCH_DTYPE(DTYPE): dispatches on the integer DTYPE to the matching CASE.
 // Handled values are 1-7, 10-13 and 16; any other value reaches
 // IT_TODO_HALT(). Presumably these are the project's DataType indices (the
 // caller passes getDType().getIndex()); the mapping from index to C++ type
 // is provided by DT_CUDA, which is defined outside this file.
 #define SWITCH_DTYPE(DTYPE)                                                    \
    switch (DTYPE) {                                                           \
    case 1:                                                                    \
        CASE(1)                                                                \
        break;                                                                 \
    case 2:                                                                    \
        CASE(2)                                                                \
        break;                                                                 \
    case 3:                                                                    \
        CASE(3)                                                                \
        break;                                                                 \
    case 4:                                                                    \
        CASE(4)                                                                \
        break;                                                                 \
    case 5:                                                                    \
        CASE(5)                                                                \
        break;                                                                 \
    case 6:                                                                    \
        CASE(6)                                                                \
        break;                                                                 \
    case 7:                                                                    \
        CASE(7)                                                                \
        break;                                                                 \
    case 10:                                                                   \
        CASE(10)                                                               \
        break;                                                                 \
    case 11:                                                                   \
        CASE(11)                                                               \
        break;                                                                 \
    case 12:                                                                   \
        CASE(12)                                                               \
        break;                                                                 \
    case 13:                                                                   \
        CASE(13)                                                               \
        break;                                                                 \
    case 16:                                                                   \
        CASE(16)                                                               \
        break;                                                                 \
    default:                                                                   \
        IT_TODO_HALT();                                                        \
    }
 namespace infini {
 // Launches _rmsnorm_kernel with one thread block per row (token).
 //   dType       - integer data-type index dispatched by SWITCH_DTYPE.
 //   input       - input data, num_tokens rows of hidden_size elements.
 //   weight      - hidden_size weight elements.
 //   output      - output data, same layout as input.
 //   num_tokens  - number of rows; becomes the grid size.
 //   hidden_size - row length; determines the block size.
 void rmsnorm_kernel(int dType, void *input, void *weight, void *output, 
                     int num_tokens, int hidden_size) {
    // Nothing to normalize; a zero-sized grid or block is not a valid launch.
    if (num_tokens <= 0 || hidden_size <= 0) {
        return;
    }
    constexpr int kWarpSize = 32;
    constexpr int kMaxThreadsPerBlock = 1024;
    // The block-level reduction shuffles with a full 32-lane mask, so every
    // warp in the block must be complete: round the thread count up to a
    // multiple of the warp size. Surplus threads skip both element loops
    // (idx >= hidden_size) and contribute 0 to the reduction.
    const int wanted = std::min(hidden_size, kMaxThreadsPerBlock);
    const int threads = (wanted + kWarpSize - 1) / kWarpSize * kWarpSize;
    dim3 blocksize = dim3(threads);
    dim3 gridsize = dim3(num_tokens);
    SWITCH_DTYPE(dType)
 }
 } // namespace infini
--- a/src/kernels/cuda/rope.cc
+++ b/src/kernels/cuda/rope.cc
@ -22,7 +22,7 @@ class RoPECuda : public CudaKernelWithoutConfig {
        IT_ASSERT(nDims == 3 && pos->getDims().size() == 2);
        IT_ASSERT(inputShape[1] == pos->getDims()[1]);
        int dim_model = inputShape[2];
-        int dim_head = dim_model / 32;
+        int dim_head = 128;
        int hidden_stride = dim_model * inputShape[1];
        int pos_stride = inputShape[1];
--- a/src/kernels/cuda/rope.cu
+++ b/src/kernels/cuda/rope.cu
@ -3,11 +3,6 @@
 #include "cuda/cuda_utility.h"
 #include "utils/small_array.h"
 // Compile-time constant 32 * 4.
 constexpr unsigned int num_threads() {
     constexpr unsigned int kCount = 32 * 4;
     return kCount;
 }
 // Compile-time constant 4.
 constexpr int thread_work_size() {
     constexpr int kItems = 4;
     return kItems;
 }
 // Product of thread_work_size() and num_threads(), returned as int.
 constexpr int block_work_size() {
     constexpr auto kTotal = thread_work_size() * num_threads();
     return kTotal;
 }
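 // Originally written for gridDim (batch, seq_len, dim_model / 1024), blockDim (1024, 1, 1);
 // rope_kernel() below now launches gridDim (1, 1, dim_model / 32), blockDim (32, 1, 1).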
 template <class T>
 __global__ void _rope_kernel(int* pos, void *in, void *out, int size, int dim_model,
                             int dim_head, int hidden_stride, int pos_stride) {
@ -86,8 +81,8 @@ __global__ void _rope_kernel(int* pos, void *in, void *out, int size, int dim_mo
 namespace infini {
 // Host-side launcher for the RoPE kernel. It defines `blocksize` and
 // `gridsize` and then expands SWITCH_DTYPE(dType), a macro defined outside
 // this view that presumably launches _rope_kernel with those two variables.
 // In this hunk the launch geometry changes from block (1024,1,1) / grid
 // (1,1,4) to block (32,1,1) / grid (1,1,dim_model/32).
 // NOTE(review): dim_model/32 is an integer division, so a dim_model that is
 // not a multiple of 32 yields a truncated grid -- confirm callers guarantee
 // divisibility.
 void rope_kernel(int dType, int * pos, void *input, void *output, int size,
                 int dim_model, int dim_head, int hidden_stride, int pos_stride) {
-    dim3 blocksize = dim3(1024,1,1);
+    dim3 blocksize = dim3(32,1,1);
-    dim3 gridsize = dim3(1, 1, 4);
+    dim3 gridsize = dim3(1, 1, dim_model/32);
    SWITCH_DTYPE(dType)
 }
--- a/src/kernels/cuda/unary.cu
+++ b/src/kernels/cuda/unary.cu
@ -315,6 +315,8 @@ void unary_kernel(const Operator &_op) {
    } else if (op->getOpType() == OpType::Silu) {
        if (_op->getDType() == DataType::Float32) {
            silu_kernel<float>((float *)inputData, (float *)outputData, num);
        } else if (_op->getDType() == DataType::Float16){
            silu_kernel<half>((half *)inputData, (half *)outputData, num);
        } else {
            IT_TODO_HALT();
        }
--- a/src/operators/rms_norm.cc
+++ b/src/operators/rms_norm.cc
@ -0,0 +1,36 @@
 #include "operators/rms_norm.h"
 namespace infini {
 // Builds an RMSNorm operator whose inputs are {input, weight} and whose single
 // output is `output`, then asserts that checkValid(graph) succeeds.
 // NOTE(review): the `dim` member declared in rms_norm.h is not assigned here.
 RMSNormObj::RMSNormObj(GraphObj *graph, Tensor input, Tensor weight,
                        Tensor output)
    : OperatorObj(OpType::RMSNorm, {input, weight}, {output}) {
    IT_ASSERT(checkValid(graph));
 }
 // The single output has exactly the shape of the first input; the weight
 // tensor (inputs[1]) is not consulted here.
 optional<vector<Shape>> RMSNormObj::inferShape(const TensorVec &inputs) {
    Shape outShape = inputs[0]->getDims();
    return {{outShape}};
 }
 // Renders "<type>[<guid>](<input dims>,input=<guid>,output=<guid>)".
 std::string RMSNormObj::toString() const {
    std::ostringstream repr;
    repr << type.toString() << "[" << getGuid() << "]"
         << "(" << vecToString(inputs[0]->getDims()) << ","
         << "input=" << inputs[0]->getGuid() << ","
         << "output=" << outputs[0]->getGuid() << ")";
    return repr.str();
 }
 // Workload key: the operator type id followed by the output dimensions.
 vector<int> RMSNormObj::getWorkloadVector() const {
    const Shape outDims = outputs[0]->getDims();
    vector<int> workload;
    workload.push_back(type.underlying());
    for (const auto extent : outDims) {
        workload.push_back(extent);
    }
    return workload;
 }
 // Attribute key: only the operator type id.
 vector<int> RMSNormObj::getOpAttrVector() const {
    vector<int> attrs;
    attrs.push_back(type.underlying());
    return attrs;
 }
 }; // namespace infini