This is an automated email from the ASF dual-hosted git repository.
tlopex pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 5319c9c854 [REFACTOR] Remove unused mscclpp contrib module (#18852)
5319c9c854 is described below
commit 5319c9c85455c8a62a224d3e78aa4d78b2528111
Author: Tianqi Chen <[email protected]>
AuthorDate: Sat Feb 28 10:25:39 2026 -0500
[REFACTOR] Remove unused mscclpp contrib module (#18852)
## Summary
Remove the mscclpp (Microsoft MSCCL++) contrib module. It was added as
an
experimental integration but is not used anywhere in the codebase.
Removes
3rdparty headers, cmake module, runtime source, and build config
options.
---
3rdparty/mscclpp/include/common.h | 107 -------
3rdparty/mscclpp/include/msccl.cuh | 323 --------------------
3rdparty/mscclpp/include/msccl.h | 494 -------------------------------
CMakeLists.txt | 2 -
cmake/config.cmake | 5 -
cmake/modules/LibInfo.cmake | 1 -
cmake/modules/contrib/MSCCLPP.cmake | 49 ---
src/runtime/contrib/mscclpp/allreduce.cu | 183 ------------
src/support/libinfo.cc | 5 -
9 files changed, 1169 deletions(-)
diff --git a/3rdparty/mscclpp/include/common.h
b/3rdparty/mscclpp/include/common.h
deleted file mode 100644
index ccde5a3ef4..0000000000
--- a/3rdparty/mscclpp/include/common.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCL_COMMON_HPP_
-#define MSCCL_COMMON_HPP_
-
-#if defined(__HIP_PLATFORM_AMD__)
-#define WARP_SIZE 64
-#define __syncwarp() __builtin_amdgcn_wave_barrier()
-#else
-#define WARP_SIZE 32
-#endif
-
-constexpr int NRANKS_PER_NODE = 8;
-constexpr int SCRATCH_SIZE = 1024 * 1024 * 70; // 35 thread-blocks * 8 ranks
* 256KB = 70MB
-
-template <typename To, typename From>
-__forceinline__ __device__ To bit_cast(const From& src) {
- static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
-
- union {
- From f;
- To t;
- } u;
- u.f = src;
- return u.t;
-}
-
-template <typename T>
-__forceinline__ __device__ T add_elements(T a, T b) {
- return a + b;
-}
-
-template <>
-__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
- return __hadd2(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) {
- int4 ret;
- ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T,
int>(b.w)));
- ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T,
int>(b.x)));
- ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T,
int>(b.y)));
- ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T,
int>(b.z)));
- return ret;
-}
-
-template <typename T>
-__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) {
- return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) {
- return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) {
- uint2 ret;
- ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T,
int>(b.x)));
- ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T,
int>(b.y)));
- return ret;
-}
-
-template <typename T>
-__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) {
- return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) {
- return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ int add_vectors_helper(int a, int b) {
- return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T,
int>(b)));
-}
-
-template <typename T>
-__forceinline__ __device__ int add_vectors(int a, int b) {
- return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ int add_vectors<__half>(int a, int b) {
- return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b)
{
- return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a),
bit_cast<T, uint32_t>(b)));
-}
-
-template <typename T>
-__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) {
- return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t
b) {
- return add_vectors_helper<__half2>(a, b);
-}
-
-#endif // MSCCL_COMMON_HPP_
diff --git a/3rdparty/mscclpp/include/msccl.cuh
b/3rdparty/mscclpp/include/msccl.cuh
deleted file mode 100644
index 93612126dc..0000000000
--- a/3rdparty/mscclpp/include/msccl.cuh
+++ /dev/null
@@ -1,323 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <mscclpp/concurrency_device.hpp>
-#include <mscclpp/core.hpp>
-#include <mscclpp/sm_channel.hpp>
-#include <mscclpp/sm_channel_device.hpp>
-#include <unordered_map>
-#include <vector>
-
-#include "common.h"
-#include "msccl.h"
-
-#define MSCCL_API extern "C" __attribute__((visibility("default")))
-
-#define CUDACHECK(cmd)
\
- do {
\
- cudaError_t e = cmd;
\
- if (e != cudaSuccess) {
\
- printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__,
cudaGetErrorString(e)); \
- exit(EXIT_FAILURE);
\
- }
\
- } while (0)
-
-#define NUM_CHANNELS_PER_CONNECTION 64
-
-struct channelKey {
- const void* sendbuff;
- const void* recvbuff;
- size_t bytes;
- bool operator==(const channelKey& other) const {
- return sendbuff == other.sendbuff && recvbuff == other.recvbuff && bytes
== other.bytes;
- }
-};
-
-namespace std {
-template <>
-struct hash<channelKey> {
- std::size_t operator()(const channelKey& k) const {
- return std::hash<const void*>()(k.sendbuff) ^ std::hash<const
void*>()(k.recvbuff) ^ std::hash<size_t>()(k.bytes);
- }
-};
-} // namespace std
-
-struct ChannelInfo {
- std::vector<mscclpp::SmChannel> smChannels;
- std::vector<mscclpp::SmChannel> smOutChannels;
- std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>>
smChannelDeviceHandles;
- std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>>
smOutChannelDeviceHandles;
-};
-
-struct mscclComm {
- std::shared_ptr<mscclpp::Communicator> comm;
- std::vector<std::shared_ptr<mscclpp::Connection>> connections;
- std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
-
- std::unordered_map<channelKey, ChannelInfo> channelInfos;
- std::shared_ptr<char> scratchBuff;
- std::vector<mscclpp::RegisteredMemory> remoteScratchRegMemories;
-};
-
-static size_t mscclTypeSize(mscclDataType_t type) {
- switch (type) {
- case mscclInt8:
- case mscclUint8:
- return 1;
- case mscclFloat16:
- return 2;
- case mscclInt32:
- case mscclUint32:
- return 4;
- case mscclInt64:
- case mscclUint64:
- return 8;
- case mscclFloat32:
- return 4;
- case mscclFloat64:
- return 8;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
- case mscclBfloat16:
- return 2;
-#endif // defined(__CUDA_BF16_TYPES_EXIST__)
-#if defined(__CUDA_FP8_TYPES_EXIST__)
- case mscclFp8E4M3:
- case mscclFp8E5M2:
- return 1;
-#endif // defined(__CUDA_FP8_TYPES_EXIST__)
- case mscclNumTypes:
- return 0;
- }
- return 0;
-}
-
-static mscclpp::Transport getTransport(int, int) { return
mscclpp::Transport::CudaIpc; }
-
-static std::vector<mscclpp::RegisteredMemory>
setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
- void* buff,
size_t bytes,
-
mscclpp::TransportFlags transport) {
- std::vector<mscclpp::RegisteredMemory> remoteMemories;
- mscclpp::RegisteredMemory memory = comm->registerMemory(buff, bytes,
transport);
- std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>>
remoteRegMemoryFutures;
- for (int i = 0; i < comm->bootstrap()->getNranks(); i++) {
- if (i == rank) continue;
- remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(i, 0));
- comm->sendMemoryOnSetup(memory, i, 0);
- }
- comm->setup();
- std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(),
std::back_inserter(remoteMemories),
- [](const auto& future) { return future.get(); });
- return remoteMemories;
-}
-
-static std::vector<mscclpp::SmChannel> setupSmChannels(mscclComm_t comm,
- const
std::vector<mscclpp::RegisteredMemory>& remoteMemories,
- void* src) {
- std::vector<mscclpp::SmChannel> channels;
- std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>>&
smSemaphores = comm->smSemaphores;
- size_t nConnections = comm->connections.size();
- for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) {
- for (size_t cid = 0; cid < nConnections; ++cid) {
- if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
- channels.emplace_back(smSemaphores[idx * nConnections + cid],
remoteMemories[cid], src, nullptr);
- }
- }
- }
- return channels;
-}
-
-static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>>
setupSmChannelDeviceHandles(
- const std::vector<mscclpp::SmChannel>& smChannels) {
- std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>>
smChannelDeviceHandles;
- std::transform(smChannels.begin(), smChannels.end(),
std::back_inserter(smChannelDeviceHandles),
- [](const mscclpp::SmChannel& smChannel) { return
mscclpp::deviceHandle(smChannel); });
- std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
-
mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
- mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(),
smChannelDeviceHandles.data(),
- smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
- return ptr;
-}
-
-MSCCL_API mscclResult_t mscclGetVersion(int* version) {
- if (version == nullptr) return mscclInvalidArgument;
- *version = MSCCLPP_VERSION;
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId) {
- if (uniqueId == nullptr) return mscclInvalidArgument;
- if (MSCCLPP_UNIQUE_ID_BYTES != MSCCL_UNIQUE_ID_BYTES) return
mscclInternalError;
- mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId();
- memcpy(uniqueId, &id, sizeof(mscclUniqueId));
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommInitRankConfig(mscclComm_t*, int,
mscclUniqueId, int,
- mscclConfig_t*) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks,
mscclUniqueId commId, int rank) {
- if (comm == nullptr) return mscclInvalidArgument;
- if (nranks < 0 || rank < 0 || rank >= nranks) return mscclInvalidArgument;
- std::shared_ptr<mscclpp::TcpBootstrap> bootstrap =
std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
- mscclpp::UniqueId id;
- memcpy(id.data(), &commId, sizeof(mscclUniqueId));
- bootstrap->initialize(id);
- std::shared_ptr<mscclpp::Communicator> mscclppComm =
std::make_shared<mscclpp::Communicator>(bootstrap);
-
std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>>
connectionFutures;
-
- for (int i = 0; i < mscclppComm->bootstrap()->getNranks(); i++) {
- if (i == rank) continue;
- mscclpp::Transport transport = getTransport(rank, i);
- connectionFutures.push_back(mscclppComm->connectOnSetup(i, 0, transport));
- }
- mscclppComm->setup();
-
- std::vector<std::shared_ptr<mscclpp::Connection>> connections;
- std::transform(connectionFutures.begin(), connectionFutures.end(),
std::back_inserter(connections),
- [](const auto& future) { return future.get(); });
-
- std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
- for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) {
- for (size_t cid = 0; cid < connections.size(); ++cid) {
- if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
- smSemaphores.emplace_back(
-
std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*(mscclppComm),
connections[cid]));
- }
- }
- }
- mscclppComm->setup();
-
- mscclComm* commPtr = new mscclComm();
- commPtr->comm = mscclppComm;
- commPtr->connections = std::move(connections);
- commPtr->smSemaphores = std::move(smSemaphores);
- commPtr->scratchBuff = mscclpp::allocExtSharedCuda<char>(SCRATCH_SIZE);
- commPtr->remoteScratchRegMemories =
- setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(),
SCRATCH_SIZE, mscclpp::Transport::CudaIpc);
-
- *comm = commPtr;
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommInitAll(mscclComm_t*, int, const int*) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclCommFinalize(mscclComm_t comm) {
- comm->comm->bootstrap()->barrier();
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommDestroy(mscclComm_t comm) {
- if (comm == nullptr) return mscclInvalidArgument;
- delete comm;
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommAbort(mscclComm_t) { return mscclSuccess; }
-
-MSCCL_API mscclResult_t mscclCommSplit(mscclComm_t, int, int, mscclComm_t*,
mscclConfig_t*) {
- return mscclInternalError;
-}
-
-MSCCL_API const char* mscclGetErrorString(mscclResult_t result) {
- switch (result) {
- case mscclSuccess:
- return "no error";
- case mscclUnhandledCudaError:
- return "unhandled cuda error (run with MSCCL_DEBUG=INFO for details)";
- case mscclSystemError:
- return "unhandled system error (run with MSCCL_DEBUG=INFO for details)";
- case mscclInternalError:
- return "internal error - please report this issue to the MSCCL
developers";
- case mscclInvalidArgument:
- return "invalid argument (run with MSCCL_DEBUG=WARN for details)";
- case mscclInvalidUsage:
- return "invalid usage (run with MSCCL_DEBUG=WARN for details)";
- case mscclRemoteError:
- return "remote process exited or there was a network error";
- case mscclInProgress:
- return "MSCCL operation in progress";
- default:
- return "unknown result code";
- }
-}
-
-MSCCL_API const char* mscclGetLastError(mscclComm_t) { return nullptr; }
-
-MSCCL_API mscclResult_t mscclCommGetAsyncError(mscclComm_t, mscclResult_t*
asyncError) {
- if (asyncError == nullptr) return mscclInvalidArgument;
- *asyncError = mscclSuccess;
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommCount(const mscclComm_t comm, int* count) {
- if (comm == nullptr || count == nullptr) return mscclInvalidArgument;
- *count = comm->comm->bootstrap()->getNranks();
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device)
{
- if (comm == nullptr || device == nullptr) return mscclInvalidArgument;
- *device = comm->comm->bootstrap()->getRank();
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank) {
- if (comm == nullptr || rank == nullptr) return mscclInvalidArgument;
- *rank = comm->comm->bootstrap()->getRank();
- return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff,
size_t sendcount,
- mscclDataType_t datatype, mscclComm_t
comm,
- cudaStream_t stream) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t*, void*,
mscclDataType_t,
- mscclScalarResidence_t,
mscclComm_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRedOpDestroy(mscclRedOp_t, mscclComm_t) { return
mscclInternalError; }
-
-MSCCL_API mscclResult_t mscclReduce(const void*, void*, size_t,
mscclDataType_t, mscclRedOp_t, int,
- mscclComm_t, cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclBcast(void*, size_t, mscclDataType_t, int,
mscclComm_t, cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclBroadcast(const void*, void*, size_t,
mscclDataType_t, int,
- mscclComm_t, cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclReduceScatter(const void*, void*, size_t,
mscclDataType_t,
- mscclRedOp_t, mscclComm_t,
cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclSend(const void*, size_t, mscclDataType_t, int,
mscclComm_t,
- cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRecv(void*, size_t, mscclDataType_t, int,
mscclComm_t, cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclAllToAll(const void*, void*, size_t,
mscclDataType_t, mscclComm_t,
- cudaStream_t) {
- return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclGroupStart() { return mscclSuccess; }
-
-MSCCL_API mscclResult_t mscclGroupEnd() { return mscclSuccess; }
diff --git a/3rdparty/mscclpp/include/msccl.h b/3rdparty/mscclpp/include/msccl.h
deleted file mode 100644
index 12e4e7222b..0000000000
--- a/3rdparty/mscclpp/include/msccl.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT
License.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef MSCCL_H_
-#define MSCCL_H_
-
-#include <mscclpp/gpu.hpp>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <limits.h>
-/* Opaque handle to communicator */
-typedef struct mscclComm* mscclComm_t;
-#define MSCCL_COMM_NULL NULL
-
-#define MSCCL_UNIQUE_ID_BYTES 128
-typedef struct {
- char internal[MSCCL_UNIQUE_ID_BYTES];
-} mscclUniqueId;
-
-/* Error type */
-typedef enum {
- mscclSuccess = 0,
- mscclUnhandledCudaError = 1,
- mscclSystemError = 2,
- mscclInternalError = 3,
- mscclInvalidArgument = 4,
- mscclInvalidUsage = 5,
- mscclRemoteError = 6,
- mscclInProgress = 7,
- mscclNumResults = 8
-} mscclResult_t;
-
-#define MSCCL_CONFIG_UNDEF_INT INT_MIN
-#define MSCCL_CONFIG_UNDEF_PTR NULL
-#define MSCCL_SPLIT_NOCOLOR -1
-
-/* Communicator configuration. Users can assign value to attributes to specify
the
- * behavior of a communicator. */
-typedef struct mscclConfig_v21700 {
- /* attributes that users should never touch. */
- size_t size;
- unsigned int magic;
- unsigned int version;
- /* attributes that users are able to customize. */
- int blocking;
- int cgaClusterSize;
- int minCTAs;
- int maxCTAs;
- const char* netName;
- int splitShare;
-} mscclConfig_t;
-
-/* Config initializer must be assigned to initialize config structure when it
is created.
- * Not initialized config will result in MSCCL error. */
-#define MSCCL_CONFIG_INITIALIZER
\
- {
\
- sizeof(mscclConfig_t), /* size */
\
- 0xcafebeef, /* magic */
\
- MSCCL_VERSION(MSCCL_MAJOR, MSCCL_MINOR, MSCCL_PATCH), /* version */
\
- MSCCL_CONFIG_UNDEF_INT, /* blocking */
\
- MSCCL_CONFIG_UNDEF_INT, /*
cgaClusterSize */ \
- MSCCL_CONFIG_UNDEF_INT, /* minCTAs */
\
- MSCCL_CONFIG_UNDEF_INT, /* maxCTAs */
\
- MSCCL_CONFIG_UNDEF_PTR, /* netName */
\
- MSCCL_CONFIG_UNDEF_INT /* splitShare */
\
- }
-
-/* Return the MSCCL_VERSION_CODE of the MSCCL library in the supplied integer.
- * This integer is coded with the MAJOR, MINOR and PATCH level of the
- * MSCCL library
- */
-mscclResult_t mscclGetVersion(int* version);
-mscclResult_t pmscclGetVersion(int* version);
-
-/* Generates an Id to be used in mscclCommInitRank. mscclGetUniqueId should be
- * called once and the Id should be distributed to all ranks in the
- * communicator before calling mscclCommInitRank. */
-mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId);
-mscclResult_t pmscclGetUniqueId(mscclUniqueId* uniqueId);
-
-/* Create a new communicator (multi thread/process version) with a
configuration
- * set by users. */
-mscclResult_t mscclCommInitRankConfig(mscclComm_t* comm, int nranks,
mscclUniqueId commId, int rank,
- mscclConfig_t* config);
-mscclResult_t pmscclCommInitRankConfig(mscclComm_t* comm, int nranks,
mscclUniqueId commId,
- int rank, mscclConfig_t* config);
-
-/* Creates a new communicator (multi thread/process version).
- * rank must be between 0 and nranks-1 and unique within a communicator clique.
- * Each rank is associated to a CUDA device, which has to be set before calling
- * mscclCommInitRank.
- * mscclCommInitRank implicitly syncronizes with other ranks, so it must be
- * called by different threads/processes or use mscclGroupStart/mscclGroupEnd.
*/
-mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId
commId, int rank);
-mscclResult_t pmscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId
commId, int rank);
-
-/* Creates a clique of communicators (single process version).
- * This is a convenience function to create a single-process communicator
clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(mscclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator.
*/
-mscclResult_t mscclCommInitAll(mscclComm_t* comm, int ndev, const int*
devlist);
-mscclResult_t pmscclCommInitAll(mscclComm_t* comm, int ndev, const int*
devlist);
-
-/* Finalize a communicator. mscclCommFinalize flushes all issued
communications,
- * and marks communicator state as mscclInProgress. The state will change to
mscclSuccess
- * when the communicator is globally quiescent and related resources are
freed; then,
- * calling mscclCommDestroy can locally free the rest of the resources (e.g.
communicator
- * itself) without blocking. */
-mscclResult_t mscclCommFinalize(mscclComm_t comm);
-mscclResult_t pmscclCommFinalize(mscclComm_t comm);
-
-/* Frees local resources associated with communicator object. */
-mscclResult_t mscclCommDestroy(mscclComm_t comm);
-mscclResult_t pmscclCommDestroy(mscclComm_t comm);
-
-/* Frees resources associated with communicator object and aborts any
operations
- * that might still be running on the device. */
-mscclResult_t mscclCommAbort(mscclComm_t comm);
-mscclResult_t pmscclCommAbort(mscclComm_t comm);
-
-/* Creates one or more communicators from an existing one.
- * Ranks with the same color will end up in the same communicator.
- * Within the new communicator, key will be used to order ranks.
- * MSCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any
group
- * and will therefore return a NULL communicator.
- * If config is NULL, the new communicator will inherit the original
communicator's
- * configuration*/
-mscclResult_t mscclCommSplit(mscclComm_t comm, int color, int key,
mscclComm_t* newcomm,
- mscclConfig_t* config);
-mscclResult_t pmscclCommSplit(mscclComm_t comm, int color, int key,
mscclComm_t* newcomm,
- mscclConfig_t* config);
-
-/* Returns a string for each error code. */
-const char* mscclGetErrorString(mscclResult_t result);
-const char* pmscclGetErrorString(mscclResult_t result);
-
-/* Returns a human-readable message of the last error that occurred.
- * comm is currently unused and can be set to NULL
- */
-const char* mscclGetLastError(mscclComm_t comm);
-const char* pmscclGetLastError(mscclComm_t comm);
-
-/* Checks whether the comm has encountered any asynchronous errors */
-mscclResult_t mscclCommGetAsyncError(mscclComm_t comm, mscclResult_t*
asyncError);
-mscclResult_t pmscclCommGetAsyncError(mscclComm_t comm, mscclResult_t*
asyncError);
-
-/* Gets the number of ranks in the communicator clique. */
-mscclResult_t mscclCommCount(const mscclComm_t comm, int* count);
-mscclResult_t pmscclCommCount(const mscclComm_t comm, int* count);
-
-/* Returns the cuda device number associated with the communicator. */
-mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device);
-mscclResult_t pmscclCommCuDevice(const mscclComm_t comm, int* device);
-
-/* Returns the user-ordered "rank" associated with the communicator. */
-mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank);
-mscclResult_t pmscclCommUserRank(const mscclComm_t comm, int* rank);
-
-/* Reduction operation selector */
-typedef enum { mscclNumOps_dummy = 5 } mscclRedOp_dummy_t;
-typedef enum {
- mscclSum = 0,
- mscclProd = 1,
- mscclMax = 2,
- mscclMin = 3,
- mscclAvg = 4,
- /* mscclNumOps: The number of built-in mscclRedOp_t values. Also
- * serves as the least possible value for dynamic mscclRedOp_t's
- * as constructed by mscclRedOpCreate*** functions. */
- mscclNumOps = 5,
- /* mscclMaxRedOp: The largest valid value for mscclRedOp_t.
- * It is defined to be the largest signed value (since compilers
- * are permitted to use signed enums) that won't grow
- * sizeof(mscclRedOp_t) when compared to previous MSCCL versions to
- * maintain ABI compatibility. */
- mscclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(mscclRedOp_dummy_t))
-} mscclRedOp_t;
-
-/* Data types */
-typedef enum {
- mscclInt8 = 0,
- mscclChar = 0,
- mscclUint8 = 1,
- mscclInt32 = 2,
- mscclInt = 2,
- mscclUint32 = 3,
- mscclInt64 = 4,
- mscclUint64 = 5,
- mscclFloat16 = 6,
- mscclHalf = 6,
- mscclFloat32 = 7,
- mscclFloat = 7,
- mscclFloat64 = 8,
- mscclDouble = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__)
- mscclBfloat16 = 9,
- mscclFp8E4M3 = 10,
- mscclFp8E5M2 = 11,
- mscclNumTypes = 12
-#elif defined(__CUDA_BF16_TYPES_EXIST__)
- mscclBfloat16 = 9,
- mscclNumTypes = 10
-#else
- mscclNumTypes = 9
-#endif
-} mscclDataType_t;
-
-/* mscclScalarResidence_t: Location and dereferencing logic for scalar
arguments. */
-typedef enum {
- /* mscclScalarDevice: The scalar is in device-visible memory and will be
- * dereferenced while the collective is running. */
- mscclScalarDevice = 0,
-
- /* mscclScalarHostImmediate: The scalar is in host-visible memory and will be
- * dereferenced before the mscclRedOpCreate***() function returns. */
- mscclScalarHostImmediate = 1
-} mscclScalarResidence_t;
-
-/*
- * mscclRedOpCreatePreMulSum
- *
- * Creates a new reduction operator which pre-multiplies input values by a
given
- * scalar locally before reducing them with peer values via summation. For use
- * only with collectives launched against *comm* and *datatype*. The
- * *residence* argument indicates how/when the memory pointed to by *scalar*
- * will be dereferenced. Upon return, the newly created operator's handle
- * is stored in *op*.
- */
-mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar,
mscclDataType_t datatype,
- mscclScalarResidence_t residence,
mscclComm_t comm);
-mscclResult_t pmscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar,
mscclDataType_t datatype,
- mscclScalarResidence_t residence,
mscclComm_t comm);
-
-/*
- * mscclRedOpDestroy
- *
- * Destroys the reduction operator *op*. The operator must have been created by
- * mscclRedOpCreatePreMul with the matching communicator *comm*. An operator
may be
- * destroyed as soon as the last MSCCL function which is given that operator
returns.
- */
-mscclResult_t mscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm);
-mscclResult_t pmscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm);
-
-/*
- * Collective communication operations
- *
- * Collective communication operations must be called separately for each
- * communicator in a communicator clique.
- *
- * They return when operations have been enqueued on the CUDA stream.
- *
- * Since they may perform inter-CPU synchronization, each call has to be done
- * from a different thread or process, or need to use Group Semantics (see
- * below).
- */
-
-/*
- * Reduce
- *
- * Reduces data arrays of length count in sendbuff into recvbuff using op
- * operation.
- * recvbuff may be NULL on all calls except for root device.
- * root is the rank (not the CUDA device) where data will reside after the
- * operation is complete.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclReduce(const void* sendbuff, void* recvbuff, size_t count,
- mscclDataType_t datatype, mscclRedOp_t op, int root,
mscclComm_t comm,
- cudaStream_t stream);
-mscclResult_t pmscclReduce(const void* sendbuff, void* recvbuff, size_t count,
- mscclDataType_t datatype, mscclRedOp_t op, int
root, mscclComm_t comm,
- cudaStream_t stream);
-
-/*
- * (deprecated) Broadcast (in-place)
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * This operation is implicitly in place.
- */
-mscclResult_t mscclBcast(void* buff, size_t count, mscclDataType_t datatype,
int root,
- mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclBcast(void* buff, size_t count, mscclDataType_t datatype,
int root,
- mscclComm_t comm, cudaStream_t stream);
-
-/*
- * Broadcast
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclBroadcast(const void* sendbuff, void* recvbuff, size_t
count,
- mscclDataType_t datatype, int root, mscclComm_t
comm,
- cudaStream_t stream);
-mscclResult_t pmscclBroadcast(const void* sendbuff, void* recvbuff, size_t
count,
- mscclDataType_t datatype, int root, mscclComm_t
comm,
- cudaStream_t stream);
-
-/*
- * All-Reduce
- *
- * Reduces data arrays of length count in sendbuff using op operation, and
- * leaves identical copies of result on each recvbuff.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff, size_t
count,
- mscclDataType_t datatype, mscclRedOp_t op,
mscclComm_t comm,
- cudaStream_t stream);
-mscclResult_t pmscclAllReduce(const void* sendbuff, void* recvbuff, size_t
count,
- mscclDataType_t datatype, mscclRedOp_t op,
mscclComm_t comm,
- cudaStream_t stream);
-
-/*
- * Reduce-Scatter
- *
- * Reduces data in sendbuff using op operation and leaves reduced result
- * scattered over the devices so that recvbuff on rank i will contain the i-th
- * block of the result.
- * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
- * should have a size of at least nranks*recvcount elements.
- *
- * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
- */
-mscclResult_t mscclReduceScatter(const void* sendbuff, void* recvbuff, size_t
recvcount,
- mscclDataType_t datatype, mscclRedOp_t op,
mscclComm_t comm,
- cudaStream_t stream);
-mscclResult_t pmscclReduceScatter(const void* sendbuff, void* recvbuff, size_t
recvcount,
- mscclDataType_t datatype, mscclRedOp_t op,
mscclComm_t comm,
- cudaStream_t stream);
-
-/*
- * All-Gather
- *
- * Each device gathers sendcount values from other GPUs into recvbuff,
- * receiving data from rank i at offset i*sendcount.
- * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
- * should have a size of at least nranks*sendcount elements.
- *
- * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
- */
-mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff, size_t
sendcount,
- mscclDataType_t datatype, mscclComm_t comm,
cudaStream_t stream);
-mscclResult_t pmscclAllGather(const void* sendbuff, void* recvbuff, size_t
sendcount,
- mscclDataType_t datatype, mscclComm_t comm,
cudaStream_t stream);
-
-/*
- * Send
- *
- * Send data from sendbuff to rank peer.
- *
- * Rank peer needs to call mscclRecv with the same datatype and the same count
from this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv
operations
- * need to progress concurrently to complete, they must be fused within a
mscclGroupStart/
- * mscclGroupEnd section.
- */
-mscclResult_t mscclSend(const void* sendbuff, size_t count, mscclDataType_t
datatype, int peer,
- mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclSend(const void* sendbuff, size_t count, mscclDataType_t
datatype, int peer,
- mscclComm_t comm, cudaStream_t stream);
-
-/*
- * Receive
- *
- * Receive data from rank peer into recvbuff.
- *
- * Rank peer needs to call mscclSend with the same datatype and the same count
to this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv
operations
- * need to progress concurrently to complete, they must be fused within a
mscclGroupStart/
- * mscclGroupEnd section.
- */
-mscclResult_t pmscclRecv(void* recvbuff, size_t count, mscclDataType_t
datatype, int peer,
- mscclComm_t comm, cudaStream_t stream);
-mscclResult_t mscclRecv(void* recvbuff, size_t count, mscclDataType_t
datatype, int peer,
- mscclComm_t comm, cudaStream_t stream);
-
-/* All-To-All
- *
- * Device (i) send (j)th block of data to device (j) and be placed as (i)th
- * block. Each block for sending/receiving has count elements, which means
- * that recvbuff and sendbuff should have a size of nranks*count elements.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
- mscclDataType_t datatype, mscclComm_t comm,
cudaStream_t stream);
-mscclResult_t pmscclAllToAll(const void* sendbuff, void* recvbuff, size_t
count,
- mscclDataType_t datatype, mscclComm_t comm,
cudaStream_t stream);
-/*! @brief Opaque handle to MSCCL algorithm */
-typedef int mscclAlgoHandle_t;
-
-/*! @brief MSCCL Load Algorithm
- *
- * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
- * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
- * scheduler instead of end users.
- */
-mscclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t*
mscclAlgoHandle,
- int rank);
-mscclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t*
mscclAlgoHandle,
- int rank);
-
-/*! @brief MSCCL Run Algorithm
- *
- * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
- * list merges all possible parameters required by different operations as this
- * is a general-purposed API. This API is expected to be called by MSCCL
- * scheduler instead of end users.
- */
-mscclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[],
const size_t sDisPls[],
- void* recvBuff, const size_t recvCounts[], const
size_t rDisPls[],
- size_t count, mscclDataType_t dataType, int root,
int peer,
- mscclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle,
mscclComm_t comm,
- cudaStream_t stream);
-mscclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[],
const size_t sDisPls[],
- void* recvBuff, const size_t recvCounts[], const
size_t rDisPls[],
- size_t count, mscclDataType_t dataType, int root,
int peer,
- mscclRedOp_t op, mscclAlgoHandle_t
mscclAlgoHandle, mscclComm_t comm,
- cudaStream_t stream);
-
-/*! @brief MSCCL Load Algorithm
- *
- * @details Unload MSCCL algorithm previous loaded using its handle. This API
- * is expected to be called by MSCCL scheduler instead of end users.
- */
-mscclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
-mscclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
-
-/*
- * Group semantics
- *
- * When managing multiple GPUs from a single thread, and since MSCCL collective
- * calls may perform inter-CPU synchronization, we need to "group" calls for
- * different ranks/devices into a single call.
- *
- * Grouping MSCCL calls as being part of the same collective operation is done
- * using mscclGroupStart and mscclGroupEnd. mscclGroupStart will enqueue all
- * collective calls until the mscclGroupEnd call, which will wait for all calls
- * to be complete. Note that for collective communication, mscclGroupEnd only
- * guarantees that the operations are enqueued on the streams, not that
- * the operation is effectively done.
- *
- * Both collective communication and mscclCommInitRank can be used in
conjunction
- * of mscclGroupStart/mscclGroupEnd, but not together.
- *
- * Group semantics also allow to fuse multiple operations on the same device
- * to improve performance (for aggregated collective calls), or to permit
- * concurrent progress of multiple send/receive operations.
- */
-
-/*
- * Group Start
- *
- * Start a group call. All calls to MSCCL until mscclGroupEnd will be fused
into
- * a single MSCCL operation. Nothing will be started on the CUDA stream until
- * mscclGroupEnd.
- */
-mscclResult_t mscclGroupStart();
-mscclResult_t pmscclGroupStart();
-
-/*
- * Group End
- *
- * End a group call. Start a fused MSCCL operation consisting of all calls
since
- * mscclGroupStart. Operations on the CUDA stream depending on the MSCCL
operations
- * need to be called after mscclGroupEnd.
- */
-mscclResult_t mscclGroupEnd();
-mscclResult_t pmscclGroupEnd();
-
-#ifdef __cplusplus
-} // end extern "C"
-#endif
-
-#endif // end include guard
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d8df25593..2a269fdc27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,6 @@ endif()
# Alernatively, use cmake -DOPTION=VALUE through command-line.
tvm_option(USE_CUDA "Build with CUDA" OFF)
tvm_option(USE_NCCL "Build with NCCL" OFF)
-tvm_option(USE_MSCCL "Build with MSCCL" OFF)
tvm_option(USE_OPENCL "Build with OpenCL" OFF)
tvm_option(USE_OPENCL_ENABLE_HOST_PTR "Enable OpenCL memory object access to
host" OFF)
tvm_option(USE_OPENCL_GTEST "Path to OpenCL specific gtest version for runtime
cpp tests." /path/to/opencl/gtest)
@@ -446,7 +445,6 @@ include(cmake/modules/contrib/AMX.cmake)
include(cmake/modules/contrib/CUTLASS.cmake)
include(cmake/modules/contrib/Random.cmake)
include(cmake/modules/contrib/Posit.cmake)
-include(cmake/modules/contrib/MSCCLPP.cmake)
include(cmake/modules/contrib/Sort.cmake)
include(cmake/modules/contrib/CoreML.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 6612bf12cc..03067fa83a 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -54,11 +54,6 @@ set(USE_CUDA OFF)
# - /path/to/nccl: use specific path to nccl
set(USE_NCCL OFF)
-# Whether to enable MSCCL support:
-# - ON: enable MSCCL
-# - OFF: disable MSCCL
-set(USE_MSCCL OFF)
-
# Whether to enable NVTX support (must have USE_CUDA enabled):
# - ON: enable NCCL with CMake's auto search
# - OFF: disable NCCL
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index c211d51bde..b023bea469 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -64,7 +64,6 @@ function(add_lib_info src_file)
TVM_INFO_USE_CUDA="${USE_CUDA}"
TVM_INFO_USE_NVTX="${USE_NVTX}"
TVM_INFO_USE_NCCL="${USE_NCCL}"
- TVM_INFO_USE_MSCCL="${USE_MSCCL}"
TVM_INFO_USE_CUDNN="${USE_CUDNN}"
TVM_INFO_USE_CUSTOM_LOGGING="${USE_CUSTOM_LOGGING}"
TVM_INFO_USE_CUTLASS="${USE_CUTLASS}"
diff --git a/cmake/modules/contrib/MSCCLPP.cmake
b/cmake/modules/contrib/MSCCLPP.cmake
deleted file mode 100644
index b63958b69d..0000000000
--- a/cmake/modules/contrib/MSCCLPP.cmake
+++ /dev/null
@@ -1,49 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if(USE_CUDA AND USE_NCCL AND USE_MSCCL)
- include(FetchContent)
- FetchContent_Declare(
- mscclpp
- GIT_REPOSITORY https://github.com/csullivan/mscclpp.git
- GIT_TAG feature/2024-03-19/msccl-nccl-equivalents
- )
- set(USE_CUDA ON)
- set(BYPASS_PEERMEM_CHECK ON)
- set(BUILD_PYTHON_BINDINGS OFF)
- set(BUILD_TESTS OFF)
- FetchContent_MakeAvailable(mscclpp)
-
- tvm_file_glob(GLOB MSCCL_SRCS
- ${PROJECT_SOURCE_DIR}/src/runtime/contrib/mscclpp/*.cu
- )
-
- add_library(msccl SHARED ${MSCCL_SRCS})
- target_link_libraries(msccl PUBLIC mscclpp)
- target_include_directories(msccl PUBLIC
- $<BUILD_INTERFACE:${mscclpp_SOURCE_DIR}/include>
- $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/3rdparty/mscclpp/include>
- $<INSTALL_INTERFACE:include/msccl>
- )
-
- install(TARGETS mscclpp_obj
- EXPORT ${PROJECT_NAME}Targets
- FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include)
- install(TARGETS mscclpp EXPORT ${PROJECT_NAME}Targets DESTINATION
lib${LIB_SUFFIX})
- install(TARGETS msccl EXPORT ${PROJECT_NAME}Targets DESTINATION
lib${LIB_SUFFIX})
- list(APPEND TVM_RUNTIME_LINKER_LIBS msccl)
-endif()
diff --git a/src/runtime/contrib/mscclpp/allreduce.cu
b/src/runtime/contrib/mscclpp/allreduce.cu
deleted file mode 100644
index 147c306bf4..0000000000
--- a/src/runtime/contrib/mscclpp/allreduce.cu
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/ffi/function.h>
-#include <tvm/runtime/tensor.h>
-
-#include "msccl.cuh"
-
-namespace tvm {
-namespace runtime {
-
-template <typename T>
-cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff,
- mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
- mscclpp::DeviceHandle<mscclpp::SmChannel>*
smOutChannels, int rank,
- int nRanksPerNode, int worldSize, size_t nelems,
cudaStream_t stream);
-
-MSCCL_API mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff,
size_t count,
- mscclDataType_t datatype, mscclRedOp_t
op, mscclComm_t comm,
- cudaStream_t stream) {
- size_t bytes = count * mscclTypeSize(datatype);
- if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm ==
nullptr ||
- op != mscclSum || bytes > (1 << 24)) {
- return mscclInvalidArgument;
- }
-
- int rank = comm->comm->bootstrap()->getRank();
- channelKey key{sendbuff, recvbuff, bytes};
- mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels = nullptr;
- mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels = nullptr;
-
- auto it = comm->channelInfos.find(key);
- if (it == comm->channelInfos.end()) {
- // setup smChannels (src: sendbuff, dst: remote scratch buff)
- std::vector<mscclpp::SmChannel> channels =
- setupSmChannels(comm, comm->remoteScratchRegMemories,
const_cast<void*>(sendbuff));
- ChannelInfo channelInfo{channels, {},
setupSmChannelDeviceHandles(channels), nullptr};
- it = comm->channelInfos.emplace(key, channelInfo).first;
-
- // TODO(csullivan): Consider supporting allreduce for larger transfers
- // setup smOutChannels (src: recvbuff, dst: remote recvbuff)
- // if (bytes > (1 << 24)) {
- // std::vector<mscclpp::RegisteredMemory> remoteMemories =
- // setupRemoteMemories(comm->comm, rank, recvbuff, bytes,
mscclpp::Transport::CudaIpc);
- // std::vector<mscclpp::SmChannel> outChannels = setupSmChannels(comm,
remoteMemories,
- // recvbuff); it->second.smOutChannels = outChannels;
it->second.smOutChannelDeviceHandles =
- // setupSmChannelDeviceHandles(outChannels);
- // }
- }
-
- smChannels = it->second.smChannelDeviceHandles.get();
- smOutChannels = it->second.smOutChannelDeviceHandles.get();
-
- switch (datatype) {
- case mscclFloat16:
- CUDACHECK(allreduce(reinterpret_cast<const half*>(sendbuff),
- reinterpret_cast<half*>(comm->scratchBuff.get()),
- reinterpret_cast<half*>(recvbuff), smChannels,
smOutChannels, rank,
- NRANKS_PER_NODE,
comm->comm->bootstrap()->getNranks(), count, stream));
- break;
- case mscclFloat32:
- CUDACHECK(allreduce(reinterpret_cast<const float*>(sendbuff),
- reinterpret_cast<float*>(comm->scratchBuff.get()),
- reinterpret_cast<float*>(recvbuff), smChannels,
smOutChannels,
- comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE,
- comm->comm->bootstrap()->getNranks(), count,
stream));
- break;
- case mscclInt32:
- case mscclUint32:
- CUDACHECK(allreduce(reinterpret_cast<const int*>(sendbuff),
- reinterpret_cast<int*>(comm->scratchBuff.get()),
- reinterpret_cast<int*>(recvbuff), smChannels,
smOutChannels,
- comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE,
- comm->comm->bootstrap()->getNranks(), count,
stream));
- break;
- default:
- return mscclInvalidArgument;
- }
- return mscclSuccess;
-}
-
-template <typename T>
-__global__ void __launch_bounds__(1024, 1)
- allreduce_simple(mscclpp::SmChannelDeviceHandle* smChans, const T* buff,
T* scratch,
- void* resultBuff, int rank, int worldSize, size_t nelems,
- const uint32_t flag) {
- nelems = nelems / (sizeof(int) / sizeof(T));
-
- const int nPeers = worldSize - 1;
- const size_t nPkts = nelems / 2;
- const int nelemsPerRank = nelems / worldSize;
- const int nPktsPerRank = nelemsPerRank / 2;
- const int nBlocksPerPeer = gridDim.x / nPeers;
- const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
- const int peerIdx = blockIdx.x / nBlocksPerPeer;
- const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
- mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx];
- const int tid = threadIdx.x + localBlockIdx * blockDim.x;
-
- size_t scratchOffset = rank * nPktsPerRank * sizeof(mscclpp::LLPacket);
- size_t resultOffset = 2 * nPkts * sizeof(mscclpp::LLPacket);
- size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int);
- const uint2* src = reinterpret_cast<const uint2*>(reinterpret_cast<const
char*>(buff) +
- rank * nelemsPerRank *
sizeof(int));
- uint2* dst = reinterpret_cast<uint2*>(reinterpret_cast<char*>(resultBuff) +
- rank * nelemsPerRank * sizeof(int));
-
- // Step 1. Write to scratch buffer which exposes memory to peers via cuda
IPC memory
- smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
- blockDim.x * nBlocksPerPeer, flag);
-
- // Step 2. Get data from scratch buffer, reduce data, and write result back
to peer scratch
- for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank;
- idx += blockDim.x * gridDim.x) {
- uint2 data = make_uint2(0, 0);
- for (int index = 0; index < nPeers; index++) {
- const int remoteRank = index < rank ? index : index + 1;
- mscclpp::LLPacket* dstPkt =
- reinterpret_cast<mscclpp::LLPacket*>(scratch) + remoteRank *
nPktsPerRank;
- uint2 val = dstPkt[idx].read(flag);
- data = add_vectors<T>(val, data);
- }
- data = add_vectors<T>(data, src[idx]);
- dst[idx] = data;
-
- mscclpp::LLPacket packet;
- packet.data1 = data.x;
- packet.flag1 = flag;
- packet.data2 = data.y;
- packet.flag2 = flag;
- size_t offset = resultOffset / sizeof(mscclpp::LLPacket) + (idx + rank *
nPktsPerRank);
- for (int index = 0; index < nPeers; index++) {
- smChans[index].write(offset, packet);
- }
- }
-
- // Step 3. Update local GPU's final result from peer scratch buffers
- mscclpp::LLPacket* dstPkt =
- reinterpret_cast<mscclpp::LLPacket*>(reinterpret_cast<char*>(scratch) +
resultOffset);
- const int dstOffset = remoteRank * nPktsPerRank;
- uint2* result = reinterpret_cast<uint2*>(reinterpret_cast<char*>(resultBuff)
+
- remoteRank * nelemsPerRank *
sizeof(int));
- for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank;
- idx += blockDim.x * nBlocksPerPeer) {
- uint2 data = dstPkt[idx + dstOffset].read(flag);
- result[idx].x = data.x;
- result[idx].y = data.y;
- }
-}
-
-template <typename T>
-cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff,
- mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
- mscclpp::DeviceHandle<mscclpp::SmChannel>*
smOutChannels, int rank,
- int nRanksPerNode, int worldSize, size_t nelems,
cudaStream_t stream) {
- static uint32_t flag = 1;
- size_t num_bytes = sizeof(T) * nelems;
- ICHECK(num_bytes <= (1 << 24)) << "mscclpp allreduce expects bytes
transfered < " << (1 << 24)
- << " but got num_bytes = " << num_bytes << "
bytes";
- allreduce_simple<<<105, 1024, 0, stream>>>(smChannels, buff, scratch,
resultBuff, rank, worldSize,
- nelems, flag++);
-
- return cudaGetLastError();
-}
-
-} // namespace runtime
-} // namespace tvm
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 782e314869..c7f740b9b4 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -50,10 +50,6 @@
#define TVM_INFO_USE_NCCL "NOT-FOUND"
#endif
-#ifndef TVM_INFO_USE_MSCCLPP
-#define TVM_INFO_USE_MSCCLPP "NOT-FOUND"
-#endif
-
#ifndef TVM_INFO_CUDA_VERSION
#define TVM_INFO_CUDA_VERSION "NOT-FOUND"
#endif
@@ -259,7 +255,6 @@ TVM_DLL ffi::Map<ffi::String, ffi::String> GetLibInfo() {
{"USE_CUDA", TVM_INFO_USE_CUDA},
{"USE_NVTX", TVM_INFO_USE_NVTX},
{"USE_NCCL", TVM_INFO_USE_NCCL},
- {"USE_MSCCL", TVM_INFO_USE_MSCCL},
{"USE_CUDNN", TVM_INFO_USE_CUDNN},
{"USE_CUSTOM_LOGGING", TVM_INFO_USE_CUSTOM_LOGGING},
{"USE_CUTLASS", TVM_INFO_USE_CUTLASS},