#ifdef INFINI_USE_NCCL
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/all_reduce.h"
#include "test.h"
#include <nccl.h>
#include <thread>

static int WORLD_SIZE = 2;

namespace infini {

// Each test below spawns one std::thread per GPU rank. Every rank builds its
// own graph on its own device, contributes its shard of the input, and then
// verifies that the reduced result is identical on all ranks.
template <typename OperatorObj>
void allReduce(const string taskName, int deviceID, vector<float> data,
               vector<float> ans) {
    // Create Runtimes and initiate communication
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Runtime cudaRuntime = make_ref<CudaRuntimeObj>(deviceID);
    cudaRuntime->initComm(taskName, WORLD_SIZE, deviceID);
    // Create Graph and insert allReduce operation
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto input =
        g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
    auto op = g->addOp<OperatorObj>(input, nullptr);
    // Copy data from CPU to GPU
    g->dataMalloc();
    input->copyin(data);
    // Run operation
    cudaRuntime->run(g);
    // Copy output from GPU to CPU
    auto result = op->getOutput()->clone(cpuRuntime);

    EXPECT_TRUE(result->equalData(ans));
}

TEST(CUDA_AllReduce, sum) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {7., 9.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceSumObj>, "test_allreduce_sum",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, prod) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {10., 18.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceProdObj>, "test_allreduce_prod",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, min) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {2., 3.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceMinObj>, "test_allreduce_min",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, max) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {5., 6.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceMaxObj>, "test_allreduce_max",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

TEST(CUDA_AllReduce, avg) {
    vector<float> data[2] = {{2., 3.}, {5., 6.}};
    vector<float> ans = {3.5, 4.5};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(allReduce<AllReduceAvgObj>, "test_allreduce_avg",
                             gpu, data[gpu], ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}

} // namespace infini
#endif