57 template <
class T, rowsize _row_size = 1, rowsize _row_max = _row_size, rowsize _align = NoAlign>
112 fmt::format(
"ParArray MPI context (rank={}, size={}) doesn't match serializer (rank={}, size={})",
123 if (!serializerP->IsPerRank() && this->
_pRowStart)
158 Serializer::ArrayGlobalOffset_Parts);
188 fmt::format(
"ParArray MPI context (rank={}, size={}) doesn't match serializer (rank={}, size={})",
194 if (offset == Serializer::ArrayGlobalOffset_EvenSplit)
299 template <
typename...
Args>
385 "globalSize() requires global mapping. "
386 "Ensure createGlobalMapping() was called first (typically via ArrayPair operations).");
425 template <
class T, rowsize _row_size = 1, rowsize _row_max = _row_size, rowsize _align = NoAlign>
503 commTypeCurrent = R.commTypeCurrent;
595 template <
class TRArrayTrans>
601 DNDS_check_throw(RArrayTrans.pLGhostMapping && RArrayTrans.pLGlobalMapping);
605 father->pLGlobalMapping = RArrayTrans.pLGlobalMapping;
613 father->createGlobalMapping();
630 template <
class TPullSet>
641 std::forward<TPullSet>(pullingIndexGlobal),
663 template <
class TPushSet,
class TPushStart>
674 std::forward<TPushStart>(pushStarts),
715 auto fatherDataStart =
father->operator[](0);
741 auto do_son_resizing = [&]()
744 index ghostArraySiz = LGhostMapping.ghostStart[LGhostMapping.ghostStart.size() - 1];
748 son->Resize(ghostArraySiz);
753 son->Resize(ghostArraySiz,
father->RowSize());
758 son->Resize(ghostArraySiz,
father->RowSizeMax());
763 son->Resize(ghostArraySiz);
769 MPI_Alltoallv(
pushingSizes.data(), LGhostMapping.pushIndexSizes.data(), LGhostMapping.pushIndexStarts.data(), MPI_INT,
770 pullingSizes.data(), LGhostMapping.ghostSizes.data(), LGhostMapping.ghostStart.data(), MPI_INT,
775 son->Resize(ghostArraySiz, [&](
index i)
776 {
return pullingSizes[i] /
father->getTypeMult(); });
779 son->Resize(ghostArraySiz,
father->RowSizeMax());
780 for (
index i = 0; i <
son->Size(); i++)
781 son->ResizeRow(i, pullingSizes[i] /
father->getTypeMult());
785 son->Resize(ghostArraySiz);
786 for (
index i = 0; i <
son->Size(); i++)
787 son->ResizeRow(i, pullingSizes[i] /
father->getTypeMult());
806 i =
son->RowSizeField() *
father->getTypeMult();
821 index sumPushSizes = 0;
822 for (
MPI_int i = 0; i < pushNumber; i++)
823 sumPushSizes += pPushSizes[i];
824 if (sumPushSizes > 0)
839 int sizeof_T = MPI_UNDEFINED;
840 MPI_Type_size(
father->getDataType(), &sizeof_T);
842 auto [n_number, new_Sizes, new_Disps] =
844 MPI_Type_create_hindexed(n_number, new_Sizes.data(), new_Disps.data(),
father->getDataType(), &dtype);
847 MPI_Type_commit(&dtype);
853 std::array<MPI_Aint, 1> pullDisp;
855 std::array<MPI_int, 1> pullSizes;
858 auto gStartPtr =
son->operator[](
index(0));
859 auto ghostSpan = gRPtr - gLPtr;
860 auto ghostStart = gLPtr - gStartPtr;
863 pullDisp[0] = ghostStart *
sizeof(T);
864 if (pullSizes[0] > 0)
869 MPI_Type_create_hindexed(1, pullSizes.data(), pullDisp.data(),
father->getDataType(), &dtype);
872 MPI_Type_commit(&dtype);
882 else if (commTypeCurrent == MPI::CommStrategy::CommStrategy::InSituPack)
896 T *fatherData =
nullptr;
897 T *sonData =
nullptr;
900 fatherData =
father->data();
901 sonData =
son->data();
911 fatherData =
father->data(B);
912 sonData =
son->data(B);
916 case DeviceBackend::CUDA:
920 fatherData =
father->data(B);
921 sonData =
son->data(B);
931 return std::make_pair(fatherData, sonData);
968 auto dtypeInfo = (*pPushTypeVec)[ip];
969 MPI_int rankOther = dtypeInfo.first;
977 auto dtypeInfo = (*pPullTypeVec)[ip];
978 MPI_int rankOther = dtypeInfo.first;
981#ifndef ARRAY_COMM_USE_BUFFERED_SEND
987 (sonData, 1, dtypeInfo.second, rankOther, tag,
mpi.
comm,
PushReqVec->data() + ip);
998#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1002 else if (commTypeCurrent == MPI::CommStrategy::CommStrategy::InSituPack)
1042 auto dtypeInfo = (*pPullTypeVec)[ip];
1043 MPI_int rankOther = dtypeInfo.first;
1046 MPI_Recv_init(sonData, 1, dtypeInfo.second, rankOther, tag,
mpi.
comm,
PullReqVec->data() + ip);
1053 auto dtypeInfo = (*pPushTypeVec)[ip];
1054 MPI_int rankOther = dtypeInfo.first;
1057#ifndef ARRAY_COMM_USE_BUFFERED_SEND
1074#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1078 else if (commTypeCurrent == MPI::CommStrategy::CommStrategy::InSituPack)
1103 for (
index i = 0; i < pushNumber; i++)
1108 nPush =
father->RowSizeField(loc);
1110 nPush =
father->RowSize(loc);
1112 nPush =
father->RowSizeField();
1129 auto ghostSpan = gRPtr - gLPtr;
1130 pullSize =
MPI_int(ghostSpan);
1167#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1186 auto ghostSpan = gRPtr - gLPtr;
1187 pullSize =
MPI_int(ghostSpan);
1204 for (
index i = 0; i < pushNumber; i++)
1209 nPush =
father->RowSizeField(loc);
1211 nPush =
father->RowSize(loc);
1213 nPush =
father->RowSizeField();
1218 for (
index i = 0; i < pushNumber; i++)
1223 nPush =
father->RowSizeField(loc);
1225 nPush =
father->RowSize(loc);
1227 nPush =
father->RowSizeField();
1267#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1281#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1296 MPI_Wait(&
PushReqVec->operator[](iReq), MPI_STATUS_IGNORE);
1319 index nPushData = 0;
1320 for (
index i = 0; i < pushNumber; i++)
1325 nPush =
father->RowSizeField(loc);
1327 nPush =
father->RowSize(loc);
1329 nPush =
father->RowSizeField();
1330 std::copy(bufferVec->begin() + nPushData, bufferVec->begin() + nPushData + nPush, (*
father)[loc]);
1354#ifdef ARRAY_COMM_USE_BUFFERED_SEND
1369 MPI_Wait(&
PullReqVec->operator[](iReq), MPI_STATUS_IGNORE);
1456 bool clearedPull{
false}, clearedPush{
false};
1477 template <
class TArray>
1483 template <
class TArray>
Core 2D variable-length array container with five data layouts.
Device memory abstraction layer with backend-specific storage and factory creation.
Assertion / error-handling macros and supporting helper functions.
#define DNDS_assert_info(expr, info)
Debug-only assertion with an extra std::string info message.
#define DNDS_check_throw_info(expr, info)
Same as DNDS_check_throw but attaches a user-supplied info message to the thrown std::runtime_error.
#define DNDS_check_throw(expr)
Runtime check active in both debug and release builds. Throws std::runtime_error if expr evaluates to...
Global-to-local index mapping for distributed arrays.
Wall-clock performance timer and running scalar statistics utilities.
Small utilities for MPI-indexed type layouts (hindexed optimisation).
Non-owning device-callable view of an Array, specialised per DeviceBackend.
static const DataLayout _dataLayout
const T & at(index iRow, rowsize iCol) const
Bounds-checked element read (not device-callable because CSR decompressed uses std::vector::at which ...
DNDS_DEVICE_CALLABLE T * data()
Raw pointer to the start of the flat data buffer.
Core 2D variable-length array container, the storage foundation of DNDSR.
void WriteSerializer(Serializer::SerializerBaseSSP serializerP, const std::string &name, Serializer::ArrayGlobalOffset offset, Serializer::ArrayGlobalOffset dataOffset=Serializer::ArrayGlobalOffset_Unknown)
Serialize (write) array data to a serializer.
bool IfCompressed() const
(CSR only) Whether the array is in packed / flat form.
ArrayDeviceView< B, T, _row_size, _row_max, _align > t_deviceView
Array()=default
Default-constructed array: empty, no storage.
ssp< t_RowSizes > t_pRowSizes
void Compress()
Layout-polymorphic compress: no-op for non-CSR, calls CSRCompress for CSR.
index Size() const
Number of rows currently stored. O(1).
rowsize RowSizeField() const
"Logical" row-field width used by derived (Eigen) arrays: max for padded layouts, uniform width for f...
iterator< B > end()
Iterator one past the last row, viewed on device backend B.
void ReadSerializer(Serializer::SerializerBaseSSP serializerP, const std::string &name, Serializer::ArrayGlobalOffset &offset)
Convenience overload that discards the dataOffset output.
void unclaim(MPI_int cs)
Release cs previously-claimed bytes (only updates accounting; does not shrink the buffer).
static MPIBufferHandler & Instance()
Access the process-wide singleton.
void claim(MPI_Aint cs, int reportRank=0)
Reserve cs additional bytes, growing and re-attaching the MPI buffer if needed. reportRank is only us...
static CommStrategy & Instance()
Access the process-wide singleton.
ArrayCommType GetArrayStrategy()
Current array-pack strategy.
ArrayCommType
Which derived-type strategy ArrayTransformer should use.
@ InSituPack
Manually pack / unpack into contiguous buffers.
@ UnknownArrayCommType
Sentinel / uninitialised.
@ HIndexed
Use MPI_Type_create_hindexed derived types (default).
void setObjectName(const std::string &name)
MPI-aware Array: adds a communicator, rank, and global index mapping.
void setDataType(MPI_Datatype n_dType, MPI_int n_TypeMult)
Override the deduced MPI datatype and element multiplier (advanced; needed for custom compound elemen...
ParArray()=default
Default-construct an uninitialised ParArray; call setMPI and Resize later.
t_self & operator=(const t_self &R)=default
void setMPI(const MPIInfo &n_mpi)
Install the MPI context after default construction.
typename TArray::t_pRowSizes t_pRowSizes
void clone(const t_self &R)
Copy-assign from another ParArray. Shallow copy semantics (mirrors Array::clone): shares structural/dat...
static const DataLayout _dataLayout
MPI_int getTypeMult()
Per-element count multiplier that goes with getDataType.
void AssertDataType()
Assert the MPI datatype matches sizeof(T) exactly.
index globalSize() const
Returns the total global size (sum of sizes across all ranks).
bool AssertConsistent()
Check array consistency across all ranks.
const MPIInfo & getMPI() const
Read-only MPI context accessor.
MPI_Datatype getDataType()
MPI element datatype used for ghost exchange (deduced from T).
t_pLGlobalMapping pLGlobalMapping
Shared pointer to the global-offsets table. Populated by createGlobalMapping; may be pointed at an ex...
void WriteSerializer(Serializer::SerializerBaseSSP serializerP, const std::string &name, Serializer::ArrayGlobalOffset offset)
Serialize (write) the parallel array with MPI-aware metadata.
void createGlobalMapping()
Collective: build the global offsets table.
ParArray(const t_self &R)=default
ParArray(ObjName objName, Args &&...args)
Named constructor: sets the object name for tracing/debugging. All existing constructor overloads are...
MPIInfo mpi
MPI context associated with this array (must be set before collectives).
ParArray(MPI_Datatype n_dType, MPI_int n_TypeMult, const MPIInfo &n_mpi)
Construct with a custom (MPI datatype, multiplier) pair.
ParArray(const MPIInfo &n_mpi)
Construct a ParArray bound to the given MPI context.
MPIInfo & getMPI()
Mutable MPI context accessor.
void ReadSerializer(Serializer::SerializerBaseSSP serializerP, const std::string &name, Serializer::ArrayGlobalOffset &offset)
Deserialize (read) the parallel array with MPI-aware metadata.
Describes one rank's window into a globally-distributed dataset.
bool isDist() const
Whether this descriptor carries a real distributed offset (rather than a sentinel like ArrayGlobalOffset_Parts).
MPI_int Allgather(const void *sendbuf, MPI_int sendcount, MPI_Datatype sendtype, void *recvbuf, MPI_int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
Wrapper over MPI_Allgather.
MPI_int WaitallAuto(MPI_int count, MPI_Request *reqs, MPI_Status *statuses)
Wait on an array of requests, choosing between MPI_Waitall and the lazy-poll variant based on CommStr...
MPI_int Scan(const void *sendbuf, void *recvbuf, MPI_int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
Wrapper over MPI_Scan (inclusive prefix reduction).
MPI_int Allreduce(const void *sendbuf, void *recvbuf, MPI_int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
Wrapper over MPI_Allreduce.
MPI_int Barrier(MPI_Comm comm)
Wrapper over MPI_Barrier.
bool isCudaAware()
Runtime probe: is the current MPI implementation configured with CUDA-aware support?...
ssp< SerializerBase > SerializerBaseSSP
the host-side operators are provided as implemented
ssp< GlobalOffsetsMapping > t_pLGlobalMapping
Shared pointer to a GlobalOffsetsMapping (globally replicated).
std::vector< rowsize > t_RowsizeVec
Vector of row widths (one rowsize per row).
ssp< OffsetAscendIndexMapping > t_pLGhostMapping
Shared pointer to an OffsetAscendIndexMapping (per-rank ghost layout).
const MPI_Datatype DNDS_MPI_INDEX
MPI datatype matching index (= MPI_INT64_T).
constexpr bool isTABLE_Fixed(DataLayout lo)
Whether the layout has a uniform row width (no per-row size needed).
DeviceBackend
Enumerates the backends a DeviceStorage / Array can live on.
@ Unknown
Unset / sentinel.
tMPI_indexVec tMPI_AintVec
Alias for tMPI_indexVec to match MPI_Aint terminology.
int32_t rowsize
Row-width / per-row element-count type (signed 32-bit).
DataLayout
Enumeration of the five concrete data layouts supported by Array.
@ TABLE_StaticFixed
Fixed row width, known at compile time.
@ TABLE_Max
Padded variable rows; max width set at runtime.
@ CSR
Compressed Sparse Row (flat buffer + row-start index).
@ TABLE_StaticMax
Padded variable rows; max width fixed at compile time.
@ TABLE_Fixed
Fixed row width, set at runtime (uniform across rows).
std::pair< index, index > EvenSplitRange(int rank, int nRanks, index nGlobal)
Split a global range [0, nGlobal) evenly among nRanks workers.
constexpr bool isTABLE_Max(DataLayout lo)
Whether the layout is a padded-max variant (uses _pRowSizes).
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).
tMPI_sizeVec tMPI_intVec
Alias for tMPI_sizeVec; used where the name "int vec" reads better.
std::shared_ptr< T > ssp
Shortened alias for std::shared_ptr used pervasively in DNDSR.
auto optimize_hindexed_layout(index o_size, TBlkSiz *blk_sizes, TDisp *disps, TSizeof sizeofElem)
Coalesce contiguous blocks in an MPI_Type_create_hindexed layout.
std::vector< MPI_Status > tMPI_statVec
Vector of MPI_Status, for MPI_Waitall / MPI_Testall.
typename ArrayTransformerType< TArray >::Type ArrayTransformerType_t
int MPI_int
MPI-facing integer alias (= C int). Used for counts and ranks in MPI calls.
Lightweight bundle of an MPI communicator and the calling rank's coordinates.
int size
Number of ranks in comm (-1 until initialised).
int rank
This rank's 0-based index within comm (-1 until initialised).
MPI_Comm comm
The underlying MPI communicator handle.
static ssp< MPIReqHolder > create(Args &&...args)
Only public path to construct an instance.
static ssp< MPITypePairHolder > create(Args &&...args)
Only public path to construct an instance; forwards to the private constructor.
Tag type for naming objects created via make_ssp.