#ifdef INFINI_USE_NCCL
#include "core/graph.h"
#include "core/runtime.h"
#include "cuda/cuda_runtime.h"
#include "cuda/cuda_utility.h"
#include "operators/broadcast.h"
#include "test.h"
#include <nccl.h>
#include <thread>

static int WORLD_SIZE = 2;
static int root = 0;

namespace infini {

void broadcast(const string taskName, int deviceID, vector<float> data,
               vector<float> ans) {
    // Create Runtimes and initiate communication
    Runtime cpuRuntime = NativeCpuRuntimeObj::getInstance();
    Runtime cudaRuntime = make_ref<CudaRuntimeObj>(deviceID);
    cudaRuntime->initComm(taskName, WORLD_SIZE, deviceID);
    // Create Graph and insert broadcast operation
    Graph g = make_ref<GraphObj>(cudaRuntime);
    auto input =
        g->addTensor(Shape{static_cast<int>(data.size())}, DataType::Float32);
    auto op = g->addOp<BroadcastObj>(input, nullptr, root);
    // Copy data from CPU to GPU
    g->dataMalloc();
    // Only the root rank has the input data
    if (deviceID == root) {
        input->copyin(data);
    }
    // Run broadcast operation
    cudaRuntime->run(g);
    // Copy output from GPU to CPU
    auto result = op->getOutput()->clone(cpuRuntime);

    EXPECT_TRUE(result->equalData(ans));
}

TEST(CUDA_Broadcast, run) {
    // Only one device gets the data. Every rank should have the same data
    // after the broadcast.
    vector<float> data = {2., 3., 5., 6.};
    vector<float> ans = {2., 3., 5., 6.};

    std::vector<std::thread> threads;
    for (int gpu = 0; gpu < WORLD_SIZE; ++gpu) {
        threads.emplace_back(broadcast, "test_broadcast", gpu, data, ans);
    }
    for (auto &thread : threads) {
        thread.join();
    }
}
} // namespace infini
#endif