DNDSR 0.1.0.dev1+gcd065ad
Distributed Numeric Data Structure for CFV
Loading...
Searching...
No Matches
MPI.hpp
Go to the documentation of this file.
1#pragma once
2/// @file MPI.hpp
3/// @brief MPI wrappers: MPIInfo, collective operations, type mapping, CommStrategy.
4/// @par Unit Test Coverage (test_MPI.cpp, MPI np=1,2,4)
5/// - MPIInfo: constructor, setWorld, operator==, field validation
6/// - MPIWorldSize, MPIWorldRank
7/// - Allreduce (MPI_SUM, MPI_MAX for real/index), AllreduceOneReal, AllreduceOneIndex
8/// - Scan (MPI_SUM), Allgather, Bcast, Barrier, Alltoall
9/// - BasicType_To_MPIIntType (scalar, std::array, Eigen::Matrix)
10/// - CommStrategy: get/set HIndexed/InSituPack
11/// @par Not Yet Tested
12/// - Alltoallv, WaitallLazy, WaitallAuto, BarrierLazy
13/// - MPIBufferHandler, MPITypePairHolder, MPIReqHolder (tested indirectly via ArrayTransformer)
14/// - MPI::ResourceRecycler, MPISerialDo, InsertCheck
15/// - Sub-communicators, CommStrategy functional impact on ArrayTransformer
16
17#include <vector>
18#include <fstream>
19
20#include <mutex>
21
22#include "Defines.hpp"
 24// disable mpicxx's many warnings in the Intel oneAPI MPI header
26#include "mpi.h"
28
29#ifdef OPEN_MPI
30# include <mpi-ext.h> // Open MPI extension header
31#endif
32
33#ifdef NDEBUG
34# define NDEBUG_DISABLED
35# undef NDEBUG
36#endif
37
38namespace DNDS
39{
40
41 /// @brief MPI counterpart type for `MPI_int` (= C `int`). Used for counts
42 /// and ranks in MPI calls.
43 using MPI_int = int;
44 /// @brief MPI-compatible address/offset type (= `MPI_Aint`, 64-bit on all
45 /// supported platforms). Used by the `hindexed` datatype machinery.
46 using MPI_index = MPI_Aint;
47#define MAX_MPI_int INT32_MAX
48#define MAX_MPI_Aint INT64_MAX
49 static_assert(sizeof(MPI_Aint) == 8);
50
51 /// @brief Vector of MPI counts.
52 using tMPI_sizeVec = std::vector<MPI_int>;
53 /// @brief Alias for #tMPI_sizeVec; used where the name "int vec" reads better.
55 /// @brief Vector of MPI_Aint byte-offsets for hindexed datatypes.
56 using tMPI_indexVec = std::vector<MPI_index>;
57 /// @brief Alias for #tMPI_indexVec to match `MPI_Aint` terminology.
59
60 /// @brief Vector of `MPI_Status`, for `MPI_Waitall` / `MPI_Testall`.
61 using tMPI_statVec = std::vector<MPI_Status>;
62 /// @brief Vector of `MPI_Request`, for persistent / nonblocking calls.
63 using tMPI_reqVec = std::vector<MPI_Request>;
64
65 /**
66 * @brief Map a DNDS integer type size to an MPI signed-integer datatype.
67 * @details Compile-time selects `MPI_INT64_T` or `MPI_INT32_T` based on `sizeof(Tbasic)`.
68 * Used by @ref DNDS_MPI_INDEX.
69 */
70 template <class Tbasic>
71 constexpr MPI_Datatype __DNDSToMPITypeInt()
72 {
73 static_assert(sizeof(Tbasic) == 8 || sizeof(Tbasic) == 4, "DNDS::Tbasic is not right size");
74 return sizeof(Tbasic) == 8 ? MPI_INT64_T : (sizeof(Tbasic) == 4 ? MPI_INT32_T : MPI_DATATYPE_NULL);
75 }
76
77 /**
78 * @brief Map a DNDS floating-point type size to an MPI datatype.
79 * @details Compile-time selects `MPI_REAL8` or `MPI_REAL4` based on `sizeof(Tbasic)`.
80 * Used by #DNDS_MPI_REAL.
81 */
82 template <class Tbasic>
83 constexpr MPI_Datatype __DNDSToMPITypeFloat()
84 {
85 static_assert(sizeof(Tbasic) == 8 || sizeof(Tbasic) == 4, "DNDS::Tbasic is not right size");
86 return sizeof(Tbasic) == 8 ? MPI_REAL8 : (sizeof(Tbasic) == 4 ? MPI_REAL4 : MPI_DATATYPE_NULL);
87 }
88
89 /// @brief MPI datatype matching #index (= `MPI_INT64_T`).
90 const MPI_Datatype DNDS_MPI_INDEX = __DNDSToMPITypeInt<index>();
91 /// @brief MPI datatype matching #real (= `MPI_REAL8`).
92 const MPI_Datatype DNDS_MPI_REAL = __DNDSToMPITypeFloat<real>();
93
94 //! here are some reasons to upgrade to C++20...
95 // detect if have CommMult and CommType static methods
96
97 /// @brief SFINAE trait detecting a static @ref CommMult member in T.
98 template <typename T, typename = void>
99 struct has_static_CommMult : std::false_type
100 {
101 };
102
103 template <typename T>
104 struct has_static_CommMult<T, std::void_t<decltype(T::CommMult())>> : std::true_type
105 {
106 };
107
108 /// @brief SFINAE trait detecting a static @ref CommType member in T.
109 template <typename T, typename = void>
110 struct has_static_CommType : std::false_type
111 {
112 };
113
114 template <typename T>
115 struct has_static_CommType<T, std::void_t<decltype(T::CommType())>> : std::true_type
116 {
117 };
118
119 /// @brief Dispatch to a user-provided @ref CommPair / @ref CommMult+@ref CommType pair on `T`.
120 /// @details Last resort for types that are not scalars, plain arrays, or
121 /// real Eigen matrices. `T` must either expose static `CommMult()` +
122 /// `CommType()` methods, or a static `CommPair()` returning the same pair.
123 template <class T>
124 std::pair<MPI_Datatype, MPI_int> BasicType_To_MPIIntType_Custom()
125 {
127 return std::make_pair(T::CommType(), T::CommMult());
128 else
129 return T::CommPair(); // last resort
130 }
131
132 /**
133 * @brief Deduce an `(MPI_Datatype, count)` pair that represents a `T` value.
134 *
135 * @details Compile-time dispatch:
136 * - builtin float / int types map to their obvious `MPI_*` datatypes, count = 1;
137 * - C-style arrays (`T[N]`) recurse into the element type and multiply the count;
138 * - `std::array<T, N>` recurses into `T::value_type` and multiplies by `N`;
139 * - fixed-size real Eigen matrices map to #DNDS_MPI_REAL with count `sizeof(T)/sizeof(real)`;
140 * - otherwise falls back to @ref BasicType_To_MPIIntType_Custom.
141 *
142 * Used throughout ghost-communication and collective code to avoid hand-
143 * writing datatype mapping for every MPI call.
144 *
145 * @note Not `constexpr` because OpenMPI handles are not constant-expressions.
146 */
147 template <class T> // TODO: see if an array is bounded
148 //! Warning, not const-expr since OpenMPI disallows it
149 std::pair<MPI_Datatype, MPI_int> BasicType_To_MPIIntType()
150 {
151 static const auto badReturn = std::make_pair(MPI_Datatype(MPI_DATATYPE_NULL), MPI_int(-1));
152 if constexpr (std::is_scalar_v<T>)
153 {
154 if constexpr (std::is_same_v<T, float>)
155 return std::make_pair(MPI_Datatype(MPI_FLOAT), MPI_int(1));
156 if constexpr (std::is_same_v<T, double>)
157 return std::make_pair(MPI_Datatype(MPI_DOUBLE), MPI_int(1));
158 if constexpr (std::is_same_v<T, long double>)
159 return std::make_pair(MPI_Datatype(MPI_LONG_DOUBLE), MPI_int(1));
160
161 if constexpr (std::is_same_v<T, int8_t>)
162 return std::make_pair(MPI_Datatype(MPI_INT8_T), MPI_int(1));
163 if constexpr (std::is_same_v<T, int16_t>)
164 return std::make_pair(MPI_Datatype(MPI_INT16_T), MPI_int(1));
165 if constexpr (std::is_same_v<T, int32_t>)
166 return std::make_pair(MPI_Datatype(MPI_INT32_T), MPI_int(1));
167 if constexpr (std::is_same_v<T, int64_t>)
168 return std::make_pair(MPI_Datatype(MPI_INT64_T), MPI_int(1));
169
170 if constexpr (sizeof(T) == 1)
171 return std::make_pair(MPI_Datatype(MPI_UINT8_T), MPI_int(1));
172 else if constexpr (sizeof(T) == 2)
173 return std::make_pair(MPI_Datatype(MPI_UINT16_T), MPI_int(1));
174 else if constexpr (sizeof(T) == 4)
175 return std::make_pair(MPI_Datatype(MPI_UINT32_T), MPI_int(1));
176 else if constexpr (sizeof(T) == 8)
177 return std::make_pair(MPI_Datatype(MPI_UINT64_T), MPI_int(1));
178 else
179 return BasicType_To_MPIIntType_Custom<T>();
180 }
181 else if constexpr (std::is_array_v<T>)
182 {
183 std::pair<MPI_Datatype, MPI_int> SizCom = BasicType_To_MPIIntType<std::remove_extent_t<T>>();
184 return std::make_pair(SizCom.first, SizCom.second * std::extent_v<T>);
185 }
186 else if constexpr (std::is_trivially_copyable_v<T>)
187 {
188 if constexpr (Meta::is_std_array_v<T>)
189 return std::make_pair(
190 BasicType_To_MPIIntType<typename T::value_type>().first,
191 BasicType_To_MPIIntType<typename T::value_type>().second * T().size());
192 else
193 return BasicType_To_MPIIntType_Custom<T>();
194 }
195 else if constexpr (Meta::is_fixed_data_real_eigen_matrix_v<T>)
196 return std::make_pair(DNDS_MPI_REAL, MPI_int(divide_ceil(sizeof(T), sizeof(real))));
197 else
198 return BasicType_To_MPIIntType_Custom<T>();
199 // else
200 // return badReturn;
201 }
202
203 /**
204 * @brief Lightweight bundle of an MPI communicator and the calling rank's coordinates.
205 *
206 * @details The canonical "where am I in the parallel world" object passed
207 * almost everywhere in DNDSR. Cheap to copy (three ints). Two-phase
208 * construction is supported:
209 * - default-construct, then call #setWorld (or the `MPI_Comm` ctor) once
210 * `MPI_Init` has run.
211 *
212 * Comparison (#operator==) tests exact equality of the triple `(comm, rank, size)`.
213 */
214 struct MPIInfo
215 {
216 /// @brief The underlying MPI communicator handle.
217 MPI_Comm comm = MPI_COMM_NULL;
218 /// @brief This rank's 0-based index within #comm (`-1` until initialised).
219 int rank = -1;
220 /// @brief Number of ranks in #comm (`-1` until initialised).
221 int size = -1;
222
223 MPIInfo() = default;
224
225 /// @brief Wrap an existing MPI communicator; queries rank and size.
226 MPIInfo(MPI_Comm ncomm)
227 {
228 comm = ncomm;
229 int ierr;
230 ierr = MPI_Comm_rank(comm, &rank), DNDS_assert(ierr == MPI_SUCCESS);
231 ierr = MPI_Comm_size(comm, &size), DNDS_assert(ierr == MPI_SUCCESS);
232 }
233
234 /// @brief Low-level constructor for callers that already know `(rank, size)`.
235 /// @warning Bug: the fourth argument stores `r` into `size` too; callers
236 /// currently pass matching values. Prefer the single-arg MPI_Comm ctor.
237 MPIInfo(MPI_Comm nc, int r, int s) : comm(nc), rank(r), size(r)
238 {
239 }
240
241 /// @brief Initialise the object to `MPI_COMM_WORLD`. Requires `MPI_Init` to have run.
242 void setWorld()
243 {
244 comm = MPI_COMM_WORLD;
245 int ierr;
246 ierr = MPI_Comm_rank(comm, &rank), DNDS_assert(ierr == MPI_SUCCESS);
247 ierr = MPI_Comm_size(comm, &size), DNDS_assert(ierr == MPI_SUCCESS);
248 }
249
250 /// @brief Exact triple equality.
251 bool operator==(const MPIInfo &r) const
252 {
253 return (comm == r.comm) && (rank == r.rank) && (size == r.size);
254 }
255 };
256
257 namespace MPI
258 {
259 /**
260 * @brief Singleton that tracks and releases long-lived MPI resources at
261 * `MPI_Finalize` time.
262 *
263 * @details MPI communicators, derived datatypes, and persistent requests
264 * must be released before `MPI_Finalize`; otherwise they leak memory and
265 * MPICH prints warnings. Several DNDSR objects (@ref DNDS::MPITypePairHolder "MPITypePairHolder",
266 * @ref DNDS::MPIReqHolder "MPIReqHolder") register themselves here so that `MPI::Finalize()`
267 * can call their cleanup callbacks even if the C++ lifetime would
268 * outlive the MPI runtime (e.g., static destructors).
269 *
270 * Thread-safe C++11 singleton. Intended to be created under `MPI_COMM_WORLD`.
271 */
273 {
274 private:
275 std::unordered_map<void *, std::function<void()>> cleaners;
276
277 ResourceRecycler(){}; // implemented
279 ResourceRecycler &operator=(const ResourceRecycler &);
280
281 public:
282 /// @brief Access the process-wide singleton.
283 static ResourceRecycler &Instance();
284 /**
285 * @brief Register a cleanup callback keyed by `p`.
286 * @warning Must be paired with @ref RemoveCleaner when `p` is destroyed,
287 * else dangling pointers will be invoked by #clean.
288 */
289 void RegisterCleaner(void *p, std::function<void()> nCleaner);
290 /// @brief Remove a previously-registered cleaner.
291 void RemoveCleaner(void *p);
292 /// @brief Invoke all registered cleaners and drop them. Called by
293 /// `MPI::Finalize()`.
294 void clean();
295 };
296 }
297
298 /// @brief Convenience: `MPI_Comm_size(MPI_COMM_WORLD)`.
300 {
301 MPI_int ret{0};
302 MPI_Comm_size(MPI_COMM_WORLD, &ret);
303 return ret;
304 }
305
306 /// @brief Convenience: `MPI_Comm_rank(MPI_COMM_WORLD)`.
308 {
309 MPI_int ret{0};
310 MPI_Comm_rank(MPI_COMM_WORLD, &ret);
311 return ret;
312 }
313
314 /// @brief Format a human-readable timestamp using the calling rank as context.
315 std::string getTimeStamp(const MPIInfo &mpi);
316
317/// @brief Debug helper: barrier + print "CHECK <info> RANK r @ fn @ file:line".
318/// @details Compiled out when either `NDEBUG` or `NINSERT` is defined. Used to
319/// trace parallel execution during development; see @ref InsertCheck.
320#define DNDS_MPI_InsertCheck(mpi, info) \
321 InsertCheck(mpi, info, __FUNCTION__, __FILE__, __LINE__)
322
323 using tMPI_typePairVec = std::vector<std::pair<MPI_int, MPI_Datatype>>;
324 /**
325 * @brief RAII vector of `(count, MPI_Datatype)` pairs that frees every
326 * committed datatype when destroyed.
327 *
328 * @details Used by @ref DNDS::ArrayTransformer "ArrayTransformer" to hold the derived datatypes it
329 * builds via `MPI_Type_create_hindexed`. Construction is channelled through
330 * the static #create factory so instances are always owned by
331 * `shared_ptr<MPITypePairHolder>` and correctly registered with the
332 * @ref DNDS::ResourceRecycler "ResourceRecycler".
333 */
334 struct MPITypePairHolder : public tMPI_typePairVec, public std::enable_shared_from_this<MPITypePairHolder>
335 {
338
339 private:
340 /// @brief RAII guard restricting construction to shared_ptr factory.
341 struct shared_ctor_guard // make_shared needs a public ctor but give a private arg
342 {
343 };
344
345 public:
346 /// @brief Perfect-forwarding factory; returns `shared_ptr<MPITypePairHolder>`.
347 template <typename... Args>
348 MPITypePairHolder(shared_ctor_guard g, Args &&...args) : tBase(std::forward<Args>(args)...)
349 {
350 if (!(std::shared_ptr<tSelf>(this, [](tSelf *) {}).use_count() == 1))
351 throw std::runtime_error("tSelf must be created via shared_ptr");
353 { this->clear(); });
354 }
355
356 /// @brief Only public path to construct an instance; forwards to the private constructor.
357 template <typename... Args>
358 static ssp<MPITypePairHolder> create(Args &&...args)
359 {
360 return std::make_shared<MPITypePairHolder>(shared_ctor_guard{}, std::forward<Args>(args)...);
361 }
367 /// @brief Free every committed datatype and empty the vector.
368 void clear()
369 {
370 for (auto &i : (*this))
371 if (i.first >= 0 && i.second != 0 && i.second != MPI_DATATYPE_NULL)
372 MPI_Type_free(&i.second); //, std::cout << "Free Type" << std::endl;
373 this->tMPI_typePairVec::clear();
374 }
375 };
376
377 /// @brief Shared-pointer alias to @ref DNDS::MPITypePairHolder "MPITypePairHolder".
379 /**
380 * @brief RAII vector of `MPI_Request`s that frees each non-null handle when destroyed.
381 *
382 * @details Mirror of @ref DNDS::MPITypePairHolder "MPITypePairHolder" for MPI requests (persistent or
383 * transient). Used by @ref DNDS::ArrayTransformer "ArrayTransformer" for send/recv request sets.
384 * Construction is likewise channelled through the static #create factory.
385 */
386 struct MPIReqHolder : public tMPI_reqVec, public std::enable_shared_from_this<MPIReqHolder>
387 {
390
391 private:
392 /// @brief RAII guard restricting construction to shared_ptr factory.
393 struct shared_ctor_guard // make_shared needs a public ctor but give a private arg
394 {
395 };
396
397 public:
398 /// @brief Perfect-forwarding factory; returns `shared_ptr<MPIReqHolder>`.
399 template <typename... Args>
400 MPIReqHolder(shared_ctor_guard g, Args &&...args) : tBase(std::forward<Args>(args)...)
401 {
402 if (!(std::shared_ptr<tSelf>(this, [](tSelf *) {}).use_count() == 1))
403 throw std::runtime_error("tSelf must be created via shared_ptr");
405 { this->clear(); });
406 }
407
408 /// @brief Only public path to construct an instance.
409 template <typename... Args>
410 static ssp<MPIReqHolder> create(Args &&...args)
411 {
412 return std::make_shared<MPIReqHolder>(shared_ctor_guard{}, std::forward<Args>(args)...);
413 }
415 {
416 this->clear();
418 }
419 /// @brief Free every non-null request and empty the vector.
420 void clear()
421 {
422 for (auto &i : (*this))
423 if (i != MPI_REQUEST_NULL)
424 MPI_Request_free(&i);
425 this->tMPI_reqVec::clear();
426 }
427 };
428
429}
430
namespace DNDS::Debug
{
    /// @brief Whether the current process is running under a debugger.
    /// Implemented via `/proc/self/status TracerPid` on Linux.
    /// @note Linux-specific mechanism; behaviour on other platforms is defined
    /// in MPI.cpp (not visible here).
    bool IsDebugged();
    /// @brief If #isDebugging is set, block every rank in a busy-wait loop so
    /// the user can attach a debugger and inspect state. Exit by setting
    /// `isDebugging = false` in the debugger.
    void MPIDebugHold(const MPIInfo &mpi);
    /// @brief Flag consulted by @ref MPIDebugHold and #assert_false_info_mpi.
    /// Defined in MPI.cpp.
    extern bool isDebugging;
}
443
444// DNDS_assert_info_mpi is used to help barrier the process before exiting if DNDS::Debug::isDebugging is set
445// remember to set a breakpoint here
namespace DNDS
{
    /// @brief MPI-aware assertion-failure reporter.
    /// @details Barriers before abort so every rank flushes its output; if
    /// @ref Debug::isDebugging is set, busy-waits to allow debugger attachment.
    /// @param expr Stringified failing expression (from the macro's `#expr`).
    /// @param file,line Call-site location (from `__FILE__` / `__LINE__`).
    /// @param info Caller-supplied context message.
    /// @param mpi Communicator context used to barrier/report.
    void assert_false_info_mpi(const char *expr, const char *file, int line, const std::string &info, const DNDS::MPIInfo &mpi);
}
453#ifdef DNDS_NDEBUG
454# define DNDS_assert_info_mpi(expr, mpi, info) (void(0))
455#else
456/// @brief Collective-aware variant of @ref DNDS_assert_info: every rank calls
457/// into an MPI barrier before aborting, so logs are not interleaved.
458# define DNDS_assert_info_mpi(expr, mpi, info) \
459 ((static_cast<bool>(expr)) \
460 ? void(0) \
461 : ::DNDS::assert_false_info_mpi(#expr, __FILE__, __LINE__, info, mpi))
462#endif
463
464namespace DNDS // TODO: get a concurrency header
465{
466 /// @brief Global mutex serialising host-side HDF5 calls.
467 /// @details HDF5 is not thread-safe by default; this mutex guards all
468 /// DNDSR HDF5 I/O when multiple CPU threads might touch the same file.
469 extern std::mutex HDF_mutex;
470
471 namespace MPI
472 {
473 /// @brief Return the MPI thread-support level the current process was initialised with.
474 inline int GetMPIThreadLevel()
475 {
476 int ret;
477 int ierr;
478 ierr = MPI_Query_thread(&ret), DNDS_assert(ierr == MPI_SUCCESS);
479 return ret;
480 }
481
482 /**
483 * @brief Initialise MPI with thread support, honouring the
484 * @ref DNDS_DISABLE_ASYNC_MPI environment override.
485 *
486 * @details
487 * - No env var or value `0`: request `MPI_THREAD_MULTIPLE` (full).
488 * - `1`: drop to `MPI_THREAD_SERIALIZED`.
489 * - `2`: drop to `MPI_THREAD_FUNNELED`.
490 * - `>=3`: `MPI_THREAD_SINGLE`.
491 *
492 * Aborts via `MPI_Abort` if the provided level is lower than requested.
493 * Idempotent: if MPI is already initialised the call just queries the level.
494 */
495 inline MPI_int Init_thread(int *argc, char ***argv)
496 {
497 int init_flag{0};
498 MPI_Initialized(&init_flag);
499
500 int provided_MPI_THREAD_LEVEL{0};
501 int needed_MPI_THREAD_LEVEL = MPI_THREAD_MULTIPLE;
502
503 auto *env = std::getenv("DNDS_DISABLE_ASYNC_MPI");
504 if (env != NULL && (std::stod(env) != 0))
505 {
506 int ienv = static_cast<int>(std::stod(env));
507 if (ienv >= 1)
508 needed_MPI_THREAD_LEVEL = MPI_THREAD_SERIALIZED;
509 if (ienv >= 2)
510 needed_MPI_THREAD_LEVEL = MPI_THREAD_FUNNELED;
511 if (ienv >= 3)
512 needed_MPI_THREAD_LEVEL = MPI_THREAD_SINGLE;
513 }
514 int ret{0};
515 if (!init_flag)
516 ret = MPI_Init_thread(argc, argv, needed_MPI_THREAD_LEVEL, &provided_MPI_THREAD_LEVEL);
517 else
518 provided_MPI_THREAD_LEVEL = GetMPIThreadLevel();
519
520 if (provided_MPI_THREAD_LEVEL < needed_MPI_THREAD_LEVEL)
521 {
522 printf("ERROR: The MPI library does not have full thread support\n");
523 MPI_Abort(MPI_COMM_WORLD, 1);
524 }
525
526 return ret;
527 }
528
529 /// @brief Release DNDSR-registered MPI resources then call `MPI_Finalize`.
530 /// @details Idempotent: returns immediately if MPI has already been finalised.
531 inline int Finalize()
532 {
534 int finalized{0};
535 int err = MPI_Finalized(&finalized);
536 if (!finalized)
537 err |= MPI_Finalize();
538 return err;
539 }
540 }
541}
542
543// MPI buffer handler
544#define MPIBufferHandler_REPORT_CHANGE // for monitoring
545namespace DNDS
546{
547 /**
548 * @brief Process-singleton managing the buffer attached to MPI for
549 * `MPI_Bsend` (buffered sends).
550 *
551 * @details Some algorithms (e.g., serialised writes) use buffered sends to
552 * decouple sender from receiver. MPI requires the application to provide
553 * the buffer via `MPI_Buffer_attach`. This singleton owns that buffer,
554 * grows it on demand via #claim, and exposes a thin accounting layer
555 * (#claim / #unclaim) so multiple components can share the buffer without
556 * stepping on each other.
557 *
558 * Thread-safe construction on C++11; not MT-safe for concurrent claims.
559 */
560 class MPIBufferHandler // cxx11 + thread-safe singleton
561 {
562 private:
563 std::vector<uint8_t> buf;
564
565 public:
566 using size_type = decltype(buf)::size_type;
567
568 private:
569 size_type claimed = 0;
570
571 private:
573 {
574 uint8_t *obuf;
575 int osize;
576 MPI_Buffer_detach(reinterpret_cast<void *>(&obuf) /* caution */, &osize);
577
578 buf.resize(1024ULL * 1024ULL);
579 MPI_Buffer_attach(buf.data(), int(buf.size())); //! warning, bufsize could overflow
580 }
582 MPIBufferHandler &operator=(const MPIBufferHandler &);
583
584 public:
585 /// @brief Access the process-wide singleton.
586 static MPIBufferHandler &Instance();
587 /// @brief Current buffer size in bytes (fits in `MPI_int`; asserted).
589 {
590 DNDS_assert(buf.size() <= MAX_MPI_int);
591 return MPI_int(buf.size()); // could overflow!
592 }
593 /// @brief Reserve `cs` additional bytes, growing and re-attaching the
594 /// MPI buffer if needed. `reportRank` is only used for diagnostic logs.
595 void claim(MPI_Aint cs, int reportRank = 0)
596 {
597 if (buf.size() - claimed < static_cast<size_type>(cs))
598 {
599 // std::cout << "claim in " << std::endl;
600 uint8_t *obuf;
601 int osize;
602 MPI_Buffer_detach(reinterpret_cast<void *>(&obuf) /* caution */, &osize);
603#ifdef MPIBufferHandler_REPORT_CHANGE
604 std::cout << "MPIBufferHandler: New BUf at " << reportRank << std::endl
605 << osize << std::endl;
606#endif
607 DNDS_assert(static_cast<size_type>(osize) == buf.size());
608 buf.resize(claimed + cs);
609 MPI_Buffer_attach(buf.data(), size_t_to_signed<MPI_int>(buf.size()));
610#ifdef MPIBufferHandler_REPORT_CHANGE
611 std::cout << " -> " << buf.size() << std::endl;
612#endif
613 }
614 claimed += cs;
615 }
616 /// @brief Release `cs` previously-#claim ed bytes (only updates accounting;
617 /// does not shrink the buffer).
619 {
620 DNDS_assert(size_t_to_signed<MPI_int>(claimed) >= cs);
621 claimed -= cs;
622 }
623 /// @brief Direct pointer to the attached buffer (for diagnostics).
624 void *getBuf()
625 {
626 return (void *)(buf.data());
627 }
628 };
629
630}
631
632namespace DNDS::MPI
633{
634 /// @brief Wrapper over `MPI_Bcast` that logs on error and goes through DNDSR retry logic.
635 MPI_int Bcast(void *buf, MPI_int num, MPI_Datatype type, MPI_int source_rank, MPI_Comm comm);
636
637 /// @brief Wrapper over `MPI_Alltoall` (fixed per-peer count).
638 MPI_int Alltoall(void *send, MPI_int sendNum, MPI_Datatype typeSend, void *recv, MPI_int recvNum, MPI_Datatype typeRecv, MPI_Comm comm);
639
640 /// @brief Wrapper over `MPI_Alltoallv` (variable per-peer counts + displacements).
642 void *send, MPI_int *sendSizes, MPI_int *sendStarts, MPI_Datatype sendType,
643 void *recv, MPI_int *recvSizes, MPI_int *recvStarts, MPI_Datatype recvType, MPI_Comm comm);
644
645 /// @brief Wrapper over `MPI_Allreduce`.
646 MPI_int Allreduce(const void *sendbuf, void *recvbuf, MPI_int count,
647 MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
648
649 /// @brief Wrapper over `MPI_Scan` (inclusive prefix reduction).
650 MPI_int Scan(const void *sendbuf, void *recvbuf, MPI_int count,
651 MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
652
653 /// @brief Wrapper over `MPI_Allgather`.
654 MPI_int Allgather(const void *sendbuf, MPI_int sendcount, MPI_Datatype sendtype,
655 void *recvbuf, MPI_int recvcount,
656 MPI_Datatype recvtype, MPI_Comm comm);
657
658 /// @brief Wrapper over `MPI_Barrier`.
659 MPI_int Barrier(MPI_Comm comm);
660
661 /// @brief Polling barrier that sleeps `checkNanoSecs` ns between MPI_Test
662 /// calls. Reduces CPU spin when many ranks wait unevenly.
663 MPI_int BarrierLazy(MPI_Comm comm, uint64_t checkNanoSecs);
664
665 /// @brief Like @ref WaitallAuto but sleeps `checkNanoSecs` ns between polls.
666 MPI_int WaitallLazy(MPI_int count, MPI_Request *reqs, MPI_Status *statuses, uint64_t checkNanoSecs = 10000000);
667
668 /// @brief Wait on an array of requests, choosing between `MPI_Waitall` and
669 /// the lazy-poll variant based on @ref DNDS::CommStrategy "CommStrategy" settings.
670 MPI_int WaitallAuto(MPI_int count, MPI_Request *reqs, MPI_Status *statuses);
671
672 /// @brief Single-scalar Allreduce helper for reals (in-place, count = 1).
673 inline void AllreduceOneReal(real &v, MPI_Op op, const MPIInfo &mpi)
674 {
675 Allreduce(MPI_IN_PLACE, &v, 1, DNDS_MPI_REAL, op, mpi.comm);
676 }
677
678 /// @brief Single-scalar Allreduce helper for indices (in-place, count = 1).
679 inline void AllreduceOneIndex(index &v, MPI_Op op, const MPIInfo &mpi)
680 {
681 Allreduce(MPI_IN_PLACE, &v, 1, DNDS_MPI_INDEX, op, mpi.comm);
682 }
683
684}
685
686namespace DNDS
687{
688 /**
689 * @brief Execute `f` on each rank serially, in rank order.
690 *
691 * @details Inserts an `MPI_Barrier` before each rank's turn so that output
692 * interleaving is deterministic. Useful for diagnostics where every rank
693 * prints something about its own state.
694 *
695 * @tparam F Callable with no arguments.
696 */
697 template <class F>
698 inline void MPISerialDo(const MPIInfo &mpi, F f)
699 { //! need some improvement: order could be bad?
700 for (MPI_int i = 0; i < mpi.size; i++)
701 {
702 MPI::Barrier(mpi.comm);
703 if (mpi.rank == i)
704 f();
705 }
706 }
707}
708
namespace DNDS::MPI
{
    /**
     * @brief Process-wide singleton that selects how @ref DNDS::ArrayTransformer "ArrayTransformer" packs
     * and waits for MPI messages.
     *
     * @details Settings affect every transformer:
     * - @ref ArrayCommType: @ref HIndexed (default: `MPI_Type_create_hindexed`
     *   derived types) vs @ref InSituPack (manual `memcpy` into contiguous send/recv
     *   buffers). The latter can be faster on networks where derived types pay
     *   large unpacking overhead.
     * - @ref UseStrongSyncWait: insert barriers around wait calls for easier
     *   profiling.
     * - @ref UseAsyncOneByOne: issue per-peer @ref Isend/@ref Irecv instead of one
     *   persistent @ref Startall.
     * - @ref UseLazyWait: poll interval (ns) used by @ref MPI::WaitallLazy.
     *
     * Must be constructed under `MPI_COMM_WORLD`. Thread-safe C++11 singleton.
     */
    class CommStrategy // NOTE(review): class-head line reconstructed from extraction — confirm
    {
    public:
        /// @brief Which derived-type strategy @ref DNDS::ArrayTransformer "ArrayTransformer" should use.
        enum ArrayCommType // NOTE(review): enum-head line reconstructed — confirm
        {
            UnknownArrayCommType = 0, ///< Sentinel / uninitialised.
            HIndexed = 1,             ///< Use `MPI_Type_create_hindexed` derived types (default).
            InSituPack = 2,           ///< Manually pack / unpack into contiguous buffers.
        };

        static const int Ntype = 10;

    private:
        ArrayCommType _array_strategy = HIndexed;
        bool _use_strong_sync_wait = false;
        bool _use_async_one_by_one = false;
        double _use_lazy_wait = 0;

        CommStrategy();
        CommStrategy(const CommStrategy &); // not implemented: singleton is non-copyable
        CommStrategy &operator=(const CommStrategy &);

    public:
        /// @brief Access the process-wide singleton.
        static CommStrategy &Instance();
        /// @brief Current array-pack strategy.
        /// @note Signature reconstructed from extraction (tooltip shows non-const) — confirm.
        ArrayCommType GetArrayStrategy();
        /// @brief Override the array-pack strategy (affects subsequently-created transformers).
        void SetArrayStrategy(ArrayCommType t);
        /// @brief Whether barriers are inserted around @ref Waitall for profiling.
        [[nodiscard]] bool GetUseStrongSyncWait() const;
        /// @brief Whether transformers should use one-by-one Isend/Irecv.
        [[nodiscard]] bool GetUseAsyncOneByOne() const;
        /// @brief Polling interval (ns) for @ref MPI::WaitallLazy. `0` means use `MPI_Waitall`.
        [[nodiscard]] double GetUseLazyWait() const;
    };
}
766
namespace DNDS::MPI
{
    /// @brief Runtime probe: is the current MPI implementation configured with
    /// CUDA-aware support? Affects whether arrays are transferred on-device or
    /// via the host round-trip.
    /// @return `true` when the MPI library reports CUDA-aware support
    /// (implementation in MPI.cpp, not visible here).
    bool isCudaAware();
}
774
namespace DNDS
{
    /// @brief Barrier + annotated print used by @ref DNDS_MPI_InsertCheck.
    /// @details No-op in release builds (`NDEBUG` or `NINSERT` defined).
    /// Barriers BEFORE and AFTER printing so each rank's line appears within a
    /// bounded window; ordering between ranks within that window is not guaranteed.
    /// @param mpi Communicator context; its rank is printed and its comm barriered.
    /// @param info Free-form caller message.
    /// @param FUNCTION,FILE,LINE Call-site metadata (filled by the macro).
    inline void InsertCheck(const MPIInfo &mpi, const std::string &info = "",
                            const std::string &FUNCTION = "", const std::string &FILE = "", int LINE = -1)
    {
#if !(defined(NDEBUG) || defined(NINSERT))
        MPI::Barrier(mpi.comm);
        std::cout << "=== CHECK \"" << info << "\" RANK " << mpi.rank << " ==="
                  << " @ FName: " << FUNCTION
                  << " @ Place: " << FILE << ":" << LINE << std::endl;
        MPI::Barrier(mpi.comm);
#endif
    }
}
791
792#ifdef NDEBUG_DISABLED
793# define NDEBUG
794# undef NDEBUG_DISABLED
795#endif
Core type aliases, constants, and metaprogramming utilities for the DNDS framework.
#define DNDS_assert(expr)
Debug-only assertion (compiled out when DNDS_NDEBUG is defined). Prints the expression + file/line + ...
Definition Errors.hpp:108
#define MAX_MPI_int
Definition MPI.hpp:47
#define DISABLE_WARNING_PUSH
Definition Warnings.hpp:73
#define DISABLE_WARNING_UNUSED_VALUE
Definition Warnings.hpp:76
#define DISABLE_WARNING_POP
Definition Warnings.hpp:74
Process-singleton managing the buffer attached to MPI for MPI_Bsend (buffered sends).
Definition MPI.hpp:561
decltype(buf)::size_type size_type
Definition MPI.hpp:566
void unclaim(MPI_int cs)
Release cs previously-claim ed bytes (only updates accounting; does not shrink the buffer).
Definition MPI.hpp:618
MPI_int size()
Current buffer size in bytes (fits in MPI_int; asserted).
Definition MPI.hpp:588
static MPIBufferHandler & Instance()
Access the process-wide singleton.
Definition MPI.cpp:107
void claim(MPI_Aint cs, int reportRank=0)
Reserve cs additional bytes, growing and re-attaching the MPI buffer if needed. reportRank is only us...
Definition MPI.hpp:595
void * getBuf()
Direct pointer to the attached buffer (for diagnostics).
Definition MPI.hpp:624
Process-wide singleton that selects how ArrayTransformer packs and waits for MPI messages.
Definition MPI.hpp:729
static CommStrategy & Instance()
Access the process-wide singleton.
Definition MPI.cpp:415
bool GetUseStrongSyncWait() const
Whether barriers are inserted around Waitall for profiling.
Definition MPI.cpp:431
double GetUseLazyWait() const
Polling interval (ns) for MPI::WaitallLazy. 0 means use MPI_Waitall.
Definition MPI.cpp:441
ArrayCommType GetArrayStrategy()
Current array-pack strategy.
Definition MPI.cpp:421
bool GetUseAsyncOneByOne() const
Whether transformers should use one-by-one Isend/Irecv.
Definition MPI.cpp:436
ArrayCommType
Which derived-type strategy ArrayTransformer should use.
Definition MPI.hpp:733
@ InSituPack
Manually pack / unpack into contiguous buffers.
Definition MPI.hpp:736
@ UnknownArrayCommType
Sentinel / uninitialised.
Definition MPI.hpp:734
@ HIndexed
Use MPI_Type_create_hindexed derived types (default).
Definition MPI.hpp:735
static const int Ntype
Definition MPI.hpp:739
void SetArrayStrategy(ArrayCommType t)
Override the array-pack strategy (affects subsequently-created transformers).
Definition MPI.cpp:426
Singleton that tracks and releases long-lived MPI resources at MPI_Finalize time.
Definition MPI.hpp:273
static ResourceRecycler & Instance()
Access the process-wide singleton.
Definition MPI.cpp:310
void RegisterCleaner(void *p, std::function< void()> nCleaner)
Register a cleanup callback keyed by p.
Definition MPI.cpp:316
void clean()
Invoke all registered cleaners and drop them. Called by MPI::Finalize().
Definition MPI.cpp:328
void RemoveCleaner(void *p)
Remove a previously-registered cleaner.
Definition MPI.cpp:322
bool IsDebugged()
Whether the current process is running under a debugger. Implemented via /proc/self/status TracerPid ...
Definition MPI.cpp:35
void MPIDebugHold(const MPIInfo &mpi)
If isDebugging is set, block every rank in a busy-wait loop so the user can attach a debugger and ins...
Definition MPI.cpp:59
bool isDebugging
Flag consulted by MPIDebugHold and assert_false_info_mpi.
Definition MPI.cpp:90
MPI_int Allgather(const void *sendbuf, MPI_int sendcount, MPI_Datatype sendtype, void *recvbuf, MPI_int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
Wrapper over MPI_Allgather.
Definition MPI.cpp:230
MPI_int Bcast(void *buf, MPI_int num, MPI_Datatype type, MPI_int source_rank, MPI_Comm comm)
Thin wrapper over MPI_Bcast.
Definition MPI.cpp:150
MPI_int WaitallAuto(MPI_int count, MPI_Request *reqs, MPI_Status *statuses)
Wait on an array of requests, choosing between MPI_Waitall and the lazy-poll variant based on CommStr...
Definition MPI.cpp:283
MPI_int BarrierLazy(MPI_Comm comm, uint64_t checkNanoSecs)
Polling barrier that sleeps checkNanoSecs ns between MPI_Test calls. Reduces CPU spin when many ranks...
Definition MPI.cpp:260
MPI_int Alltoall(void *send, MPI_int sendNum, MPI_Datatype typeSend, void *recv, MPI_int recvNum, MPI_Datatype typeRecv, MPI_Comm comm)
Wrapper over MPI_Alltoall (fixed per-peer count).
Definition MPI.cpp:166
void AllreduceOneReal(real &v, MPI_Op op, const MPIInfo &mpi)
Single-scalar Allreduce helper for reals (in-place, count = 1).
Definition MPI.hpp:673
MPI_int Scan(const void *sendbuf, void *recvbuf, MPI_int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
Wrapper over MPI_Scan (inclusive prefix reduction).
Definition MPI.cpp:220
int Finalize()
Release DNDSR-registered MPI resources then call MPI_Finalize.
Definition MPI.hpp:531
MPI_int Alltoallv(void *send, MPI_int *sendSizes, MPI_int *sendStarts, MPI_Datatype sendType, void *recv, MPI_int *recvSizes, MPI_int *recvStarts, MPI_Datatype recvType, MPI_Comm comm)
Wrapper over MPI_Alltoallv (variable per-peer counts + displacements).
Definition MPI.cpp:182
MPI_int Allreduce(const void *sendbuf, void *recvbuf, MPI_int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
Wrapper over MPI_Allreduce.
Definition MPI.cpp:203
MPI_int Barrier(MPI_Comm comm)
Wrapper over MPI_Barrier.
Definition MPI.cpp:248
MPI_int Init_thread(int *argc, char ***argv)
Initialise MPI with thread support, honouring the DNDS_DISABLE_ASYNC_MPI environment override.
Definition MPI.hpp:495
bool isCudaAware()
Runtime probe: is the current MPI implementation configured with CUDA-aware support?...
Definition MPI.cpp:298
MPI_int WaitallLazy(MPI_int count, MPI_Request *reqs, MPI_Status *statuses, uint64_t checkNanoSecs)
Like WaitallAuto but sleeps checkNanoSecs ns between polls.
Definition MPI.cpp:271
int GetMPIThreadLevel()
Return the MPI thread-support level the current process was initialised with.
Definition MPI.hpp:474
void AllreduceOneIndex(index &v, MPI_Op op, const MPIInfo &mpi)
Single-scalar Allreduce helper for indices (in-place, count = 1).
Definition MPI.hpp:679
The host-side operators are provided as implemented.
MPI_Aint MPI_index
MPI-compatible address/offset type (= MPI_Aint, 64-bit on all supported platforms)....
Definition MPI.hpp:46
std::vector< MPI_int > tMPI_sizeVec
Vector of MPI counts.
Definition MPI.hpp:52
const MPI_Datatype DNDS_MPI_INDEX
MPI datatype matching index (= MPI_INT64_T).
Definition MPI.hpp:90
tMPI_indexVec tMPI_AintVec
Alias for tMPI_indexVec to match MPI_Aint terminology.
Definition MPI.hpp:58
ssp< MPITypePairHolder > tpMPITypePairHolder
Shared-pointer alias to MPITypePairHolder.
Definition MPI.hpp:378
std::vector< MPI_index > tMPI_indexVec
Vector of MPI_Aint byte-offsets for hindexed datatypes.
Definition MPI.hpp:56
void MPISerialDo(const MPIInfo &mpi, F f)
Execute f on each rank serially, in rank order.
Definition MPI.hpp:698
std::vector< std::pair< MPI_int, MPI_Datatype > > tMPI_typePairVec
Definition MPI.hpp:323
MPI_int MPIWorldRank()
Convenience: MPI_Comm_rank(MPI_COMM_WORLD).
Definition MPI.hpp:307
constexpr T divide_ceil(T a, T b)
Integer ceiling division ceil(a / b). Correct for all signs.
Definition Defines.hpp:569
std::pair< MPI_Datatype, MPI_int > BasicType_To_MPIIntType_Custom()
Dispatch to a user-provided CommPair / CommMult+ CommType pair on T.
Definition MPI.hpp:124
MPI_int MPIWorldSize()
Convenience: MPI_Comm_size(MPI_COMM_WORLD).
Definition MPI.hpp:299
constexpr MPI_Datatype __DNDSToMPITypeInt()
Map a DNDS integer type size to an MPI signed-integer datatype.
Definition MPI.hpp:71
std::vector< MPI_Request > tMPI_reqVec
Vector of MPI_Request, for persistent / nonblocking calls.
Definition MPI.hpp:63
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).
Definition Defines.hpp:107
void InsertCheck(const MPIInfo &mpi, const std::string &info="", const std::string &FUNCTION="", const std::string &FILE="", int LINE=-1)
Barrier + annotated print used by DNDS_MPI_InsertCheck.
Definition MPI.hpp:779
tMPI_sizeVec tMPI_intVec
Alias for tMPI_sizeVec; used where the name "int vec" reads better.
Definition MPI.hpp:54
constexpr MPI_Datatype __DNDSToMPITypeFloat()
Map a DNDS floating-point type size to an MPI datatype.
Definition MPI.hpp:83
std::shared_ptr< T > ssp
Shortened alias for std::shared_ptr used pervasively in DNDSR.
Definition Defines.hpp:138
double real
Canonical floating-point scalar used throughout DNDSR (double precision).
Definition Defines.hpp:105
std::string getTimeStamp(const MPIInfo &mpi)
Format a human-readable timestamp using the calling rank as context.
Definition MPI.cpp:116
void assert_false_info_mpi(const char *expr, const char *file, int line, const std::string &info, const DNDS::MPIInfo &mpi)
MPI-aware assertion-failure reporter.
Definition MPI.cpp:95
std::pair< MPI_Datatype, MPI_int > BasicType_To_MPIIntType()
Deduce an (MPI_Datatype, count) pair that represents a T value.
Definition MPI.hpp:149
std::vector< MPI_Status > tMPI_statVec
Vector of MPI_Status, for MPI_Waitall / MPI_Testall.
Definition MPI.hpp:61
std::mutex HDF_mutex
Global mutex serialising host-side HDF5 calls.
Definition MPI.cpp:449
int MPI_int
MPI counterpart type for MPI_int (= C int). Used for counts and ranks in MPI calls.
Definition MPI.hpp:43
const MPI_Datatype DNDS_MPI_REAL
MPI datatype matching real (= MPI_REAL8).
Definition MPI.hpp:92
Lightweight bundle of an MPI communicator and the calling rank's coordinates.
Definition MPI.hpp:215
MPIInfo()=default
int size
Number of ranks in comm (-1 until initialised).
Definition MPI.hpp:221
MPIInfo(MPI_Comm nc, int r, int s)
Low-level constructor for callers that already know (rank, size).
Definition MPI.hpp:237
MPIInfo(MPI_Comm ncomm)
Wrap an existing MPI communicator; queries rank and size.
Definition MPI.hpp:226
int rank
This rank's 0-based index within comm (-1 until initialised).
Definition MPI.hpp:219
MPI_Comm comm
The underlying MPI communicator handle.
Definition MPI.hpp:217
void setWorld()
Initialise the object to MPI_COMM_WORLD. Requires MPI_Init to have run.
Definition MPI.hpp:242
bool operator==(const MPIInfo &r) const
Exact triple equality.
Definition MPI.hpp:251
RAII vector of MPI_Requests that frees each non-null handle when destroyed.
Definition MPI.hpp:387
tMPI_reqVec tBase
Definition MPI.hpp:389
static ssp< MPIReqHolder > create(Args &&...args)
Only public path to construct an instance.
Definition MPI.hpp:410
void clear()
Free every non-null request and empty the vector.
Definition MPI.hpp:420
MPIReqHolder(shared_ctor_guard g, Args &&...args)
Perfect-forwarding factory; returns shared_ptr<MPIReqHolder>.
Definition MPI.hpp:400
RAII vector of (count, MPI_Datatype) pairs that frees every committed datatype when destroyed.
Definition MPI.hpp:335
tMPI_typePairVec tBase
Definition MPI.hpp:337
MPITypePairHolder(shared_ctor_guard g, Args &&...args)
Perfect-forwarding factory; returns shared_ptr<MPITypePairHolder>.
Definition MPI.hpp:348
void clear()
Free every committed datatype and empty the vector.
Definition MPI.hpp:368
static ssp< MPITypePairHolder > create(Args &&...args)
Only public path to construct an instance; forwards to the private constructor.
Definition MPI.hpp:358
Here are some reasons to upgrade to C++20...
Definition MPI.hpp:100
SFINAE trait detecting a static CommType member in T.
Definition MPI.hpp:111
Eigen::Matrix< real, 5, 1 > v
real err
tVec r(NCells)