diff --git a/src/kernels/cuda/attention_kvcache.cu b/src/kernels/cuda/attention_kvcache.cu index 3499f9f0..e6b39921 100644 --- a/src/kernels/cuda/attention_kvcache.cu +++ b/src/kernels/cuda/attention_kvcache.cu @@ -5,7 +5,6 @@ #define BLOCKSIZE_2 WARP_SIZE*4 #define MAX_PARTITION 1024 -// ASSUME SEQ_LEN OF Q IS 1 template __global__ void _attention_kvcache_kernel_128_1(T* input_k_cache, T* input_v_cache, diff --git a/src/kernels/cuda/rope.cc b/src/kernels/cuda/rope.cc index 3197da1b..1d28260b 100644 --- a/src/kernels/cuda/rope.cc +++ b/src/kernels/cuda/rope.cc @@ -22,7 +22,6 @@ class RoPECuda : public CudaKernelWithoutConfig { IT_ASSERT(inputShape[0] == pos->getDims()[0] && inputShape[1] == pos->getDims()[1]); int position_idx_dtype = op->getInputs()[0]->getDTypeIndex(); - IT_ASSERT(position_idx_dtype == 7); int dim_model = inputShape[2]; int dim_head = 128; // TODO: get dim_head from the framework int pos_stride = inputShape[1]; diff --git a/test/kernels/cuda/test_cuda_rope.cc b/test/kernels/cuda/test_cuda_rope.cc index 8d88bf8e..cb02ca40 100644 --- a/test/kernels/cuda/test_cuda_rope.cc +++ b/test/kernels/cuda/test_cuda_rope.cc @@ -26,11 +26,12 @@ TEST(RoPE, Cuda) { cudaRuntime->run(gCuda); auto oCpu = gCpu->cloneTensor(op->getOutputs()[0]); + oCpu->printData(); EXPECT_TRUE(oCpu->equalData(vector{ - 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, - 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, - 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, - 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, 1.381773, - 1.381773, 1.381773, 1.381773, 1.381773})); + 0.540302, 0.647906, 0.731761, 0.796458, 0.846009, 0.883756, 0.912396, + 0.934062, 0.950415, 0.962739, 0.972014, 0.978989, 0.98423, 0.988167, + 0.991122, 0.99334, 0.995004, 0.996253, 0.99719, 0.997892, 0.998419, + 0.998815, 0.999111, 0.999333, 0.9995, 0.999625, 0.999719, 0.999789, + 0.999842, 0.999881, 0.999911, 0.999933})); } } // namespace infini