InfiniTensor/test/kernels/cuda/test_cuda_attention.cc

#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/attention_kvcache.h"

#include "test.h"

namespace infini {
// End-to-end check of the fused AttentionKVCache operator on the CUDA
// runtime: a single-op graph is built on the device, q/k/v are filled with
// ones, the graph is executed, and the output (cloned into a CPU graph) is
// compared against 128 ones.
TEST(AttentionKVCache, Cuda) {
    // CPU-side graph; it is only used as the destination when cloning the
    // operator output for comparison.
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Graph cpuGraph = make_ref<GraphObj>(cpuRuntime);

    // Device-side graph that owns every tensor and the operator under test.
    auto gpuRuntime = make_ref<CudaRuntimeObj>();
    Graph gpuGraph = make_ref<GraphObj>(gpuRuntime);

    // All five float operands share the same {1, 1, 1, 128} shape.
    auto newFloatTensor = [&gpuGraph]() {
        return gpuGraph->addTensor({1, 1, 1, 128}, DataType::Float32);
    };
    auto kCache = newFloatTensor();
    auto vCache = newFloatTensor();
    auto query = newFloatTensor();
    auto key = newFloatTensor();
    auto value = newFloatTensor();
    auto positionId = gpuGraph->addTensor({1, 1}, DataType::UInt32);

    // The trailing nullptr is passed where the original call passed it;
    // presumably it lets the operator create its own output tensor.
    auto attention = gpuGraph->addOp<AttentionKVCacheObj>(
        kCache, vCache, query, key, value, positionId, nullptr);

    // Storage must be allocated before any tensor can be given data.
    gpuGraph->dataMalloc();

    // NOTE(review): kCache and vCache are never given data before the run;
    // presumably the operator populates them itself -- confirm against the
    // kernel implementation.
    query->setData(OneGenerator());
    key->setData(OneGenerator());
    value->setData(OneGenerator());
    positionId->setData(IncrementalGenerator());

    gpuRuntime->run(gpuGraph);

    // Clone the first output into the CPU graph and compare it with a
    // vector of 128 ones.
    auto hostOutput = cpuGraph->cloneTensor(attention->getOutputs()[0]);
    const vector<float> allOnes(128, 1.0f);
    EXPECT_TRUE(hostOutput->equalData(allOnes));
}

} // namespace infini
[feature] add fused attention_kvcache operator support (#179) * [feature] add fused attention_kvcache operator support * add test to attention_kvcache op * Add space line at EOF --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-11-14 23:44:22 +08:00			`#include "core/graph.h"`
			`#include "core/runtime.h"`
			`#include "cuda/cuda_runtime.h"`
			`#include "cuda/cuda_utility.h"`
			`#include "operators/attention_kvcache.h"`

			`#include "test.h"`

			`namespace infini {`
			`TEST(AttentionKVCache, Cuda) {`
			`Runtime runtime = NativeCpuRuntimeObj::getInstance();`

			`Graph gCpu = make_ref<GraphObj>(runtime);`

			`auto cudaRuntime = make_ref<CudaRuntimeObj>();`
			`Graph gCuda = make_ref<GraphObj>(cudaRuntime);`
[feature] support kvcache with static graph (#209) * [feature] support kvcache with static graph * use workspace to optimize kvcache attention --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2024-01-25 14:20:43 +08:00			`auto input_k_cache_d = gCuda->addTensor({1, 1, 1, 128}, DataType::Float32);`
			`auto input_v_cache_d = gCuda->addTensor({1, 1, 1, 128}, DataType::Float32);`
			`auto input_q_d = gCuda->addTensor({1, 1, 1, 128}, DataType::Float32);`
			`auto input_k_d = gCuda->addTensor({1, 1, 1, 128}, DataType::Float32);`
			`auto input_v_d = gCuda->addTensor({1, 1, 1, 128}, DataType::Float32);`
[feature] add fused attention_kvcache operator support (#179) * [feature] add fused attention_kvcache operator support * add test to attention_kvcache op * Add space line at EOF --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-11-14 23:44:22 +08:00			`auto position_id_d = gCuda->addTensor({1, 1}, DataType::UInt32);`

			`auto op = gCuda->addOp<AttentionKVCacheObj>(`
			`input_k_cache_d, input_v_cache_d, input_q_d, input_k_d, input_v_d,`
			`position_id_d, nullptr);`
			`gCuda->dataMalloc();`

			`input_q_d->setData(OneGenerator());`
			`input_k_d->setData(OneGenerator());`
			`input_v_d->setData(OneGenerator());`
			`position_id_d->setData(IncrementalGenerator());`
			`cudaRuntime->run(gCuda);`

[feature] support kvcache with static graph (#209) * [feature] support kvcache with static graph * use workspace to optimize kvcache attention --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2024-01-25 14:20:43 +08:00			`auto oCpu = gCpu->cloneTensor(op->getOutputs()[0]);`
[feature] add fused attention_kvcache operator support (#179) * [feature] add fused attention_kvcache operator support * add test to attention_kvcache op * Add space line at EOF --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-11-14 23:44:22 +08:00			`EXPECT_TRUE(oCpu->equalData(vector<float>{`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
[feature] support kvcache with static graph (#209) * [feature] support kvcache with static graph * use workspace to optimize kvcache attention --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2024-01-25 14:20:43 +08:00			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}));`
[feature] add fused attention_kvcache operator support (#179) * [feature] add fused attention_kvcache operator support * add test to attention_kvcache op * Add space line at EOF --------- Co-authored-by: Haojie Wang <haojie0429@gmail.com> 2023-11-14 23:44:22 +08:00			`}`

			`} // namespace infini`