DNDSR 0.1.0.dev1+gcd065ad
Distributed Numeric Data Structure for CFV
Loading...
Searching...
No Matches
EulerP_Evaluator_impl_utils.hpp
Go to the documentation of this file.
1/**
2 * @file EulerP_Evaluator_impl_utils.hpp
3 * @brief Internal utility types for the EulerP Evaluator kernel implementations.
4 *
5 * Provides:
6 * - @c FLocalAccessor_noOp / @c FGlobalAccessor_noOp: No-op accessor stubs used as template
7 * arguments when no local/global accumulation is needed in a kernel.
8 * - @c CUDA_Local2GlobalAssign: CUDA shared-memory helper for bank-conflict-free coalesced
9 * write-back from thread-local storage to global memory.
10 *
11 * @note This header is an internal implementation detail of the EulerP Evaluator.
12 * Contents reside in the @c DNDS::EulerP::detail namespace.
13 */
14#pragma once
15
17#include "DNDS/CUDA_Utils.hpp"
18
19/**
20 * @brief Internal implementation detail namespace for EulerP Evaluator utilities.
21 */
23{
24 /**
25 * @brief No-op local accessor returning a dummy real value.
26 *
27 * Used as a template argument for kernels that do not require
28 * thread-local accumulation. All writes go to a discarded dummy member.
29 */
31 {
33 real dummy_; ///< Discarded dummy value; all accesses read/write this.
34
35 /// @brief Returns a mutable reference to the single @c dummy_ member, regardless of the index.
36 /// @param i Component index; never read, so every index aliases the same storage.
37 /// @return Non-const reference to @c dummy_.
38 DNDS_FORCEINLINE DNDS_DEVICE real &operator()(int i) { return dummy_; } // i is intentionally ignored
39 };
40
41 /**
42 * @brief No-op global accessor returning a dummy real value.
43 *
44 * Used as a template argument for kernels that do not require
45 * global memory write-back. All writes go to a discarded dummy member.
46 */
48 {
49 real dummy_; ///< Discarded dummy value; all accesses read/write this.
51
52 /// @brief Returns a mutable reference to the single @c dummy_ member, regardless of point or component.
53 /// @param iPnt Point index; never read.
54 /// @param i Component index; never read, so every (iPnt, i) pair aliases the same storage.
55 /// @return Non-const reference to @c dummy_.
56 DNDS_FORCEINLINE DNDS_DEVICE real &operator()(index iPnt, int i) { return dummy_; } // both indices are intentionally ignored
57 };
58
59#ifdef DNDS_USE_CUDA
60
61 /**
62 * @brief CUDA shared-memory helper for bank-conflict-free coalesced write-back.
63 *
64 * Transfers per-thread local accumulation buffers to global memory through CUDA
65 * shared memory, using a stride-padded layout to avoid shared memory bank conflicts.
66 *
67 * The algorithm:
68 * 1. Each thread writes its @c local_stride_fixed elements into shared memory at
69 * a padded stride @c local_stride_buf = (local_stride/2)*2+1 (integer division: an even stride is padded by one element, an odd stride is left unchanged) to avoid bank conflicts.
70 * 2. After a thread synchronization barrier, threads cooperatively read from shared
71 * memory and write to global memory in a coalesced pattern: on pass @c i, thread @c tid
72 * handles flattened element @c e = (i * blockDim + tid), i.e. component @c (e % local_stride) of the point stored by in-block thread @c (e / local_stride).
73 *
74 * @tparam local_stride_fixed Number of elements per point (must be > 0, compile-time constant).
75 * @tparam max_tid_fixed Maximum number of threads per block (must be > 0, compile-time constant).
76 * @tparam TFLocalAccessor Thread-local accessor type: operator()(int i) -> real&.
77 * @tparam TFGlobalAccessor Global accessor type: operator()(index iPnt, int i) -> real&.
78 * @tparam bufferSize_idx Size of the shared index buffer (must be >= max_tid_fixed).
79 * @tparam bufferSize_val Size of the shared value buffer (must be >= local_stride_buf * max_tid_fixed).
80 * @param FLocalAccessor Functor providing access to thread-local data.
81 * @param FGLobalAccessor Functor providing access to global output data.
82 * @param shared_buf_idx Shared memory buffer for point indices.
83 * @param shared_buf_val Shared memory buffer for intermediate values.
84 * @param iPnt Global point index for this thread.
85 * @param iPntMax Exclusive upper bound on valid point indices; an element is written to global memory only if its point index is < @c iPntMax.
86 */
87 template <int local_stride_fixed, int max_tid_fixed,
88 class TFLocalAccessor, class TFGlobalAccessor,
89 int bufferSize_idx, int bufferSize_val>
90 DNDS_FORCEINLINE DNDS_DEVICE void CUDA_Local2GlobalAssign(
91 TFLocalAccessor &&FLocalAccessor,
92 TFGlobalAccessor &&FGLobalAccessor,
93 CUDA::SharedBuffer<index, bufferSize_idx> &shared_buf_idx,
94 CUDA::SharedBuffer<real, bufferSize_val> &shared_buf_val,
95 index iPnt, index iPntMax)
96 {
97# ifndef __CUDA_ARCH__
98 static_assert(local_stride_fixed > 0 && local_stride_fixed < 0); // condition can never hold: instantiating this template while __CUDA_ARCH__ is undefined is a compile error (presumably to forbid host-side instantiation)
99# endif
100 static_assert(local_stride_fixed > 0);
101 static_assert(max_tid_fixed > 0);
102 static_assert(bufferSize_idx >= max_tid_fixed); // one index slot per possible thread id
103 // TODO: support dynamic (runtime) sizes?
104
105 constexpr int local_stride = local_stride_fixed;
106 constexpr int local_stride_buf = (local_stride / 2) * 2 + 1; // integer division: even stride -> stride + 1, odd stride -> unchanged
107 static_assert(bufferSize_val >= local_stride_buf * max_tid_fixed); // value buffer must hold one padded row per possible thread id
108
109 real *buf_data = shared_buf_val.buffer;
110 index *iPntThread = shared_buf_idx.buffer;
111
112 int tid = CUDA::tid_x();
113 int bDim = CUDA::bDim_x();
114 DNDS_HD_assert(tid < max_tid_fixed && tid >= 0);
115 iPntThread[tid] = iPnt; //! would index out of bounds if tid >= bufferSize_idx; relies on the assert above together with bufferSize_idx >= max_tid_fixed
116 for (int i = 0; i < local_stride; i++)
117 buf_data[tid * local_stride_buf + i] = FLocalAccessor(i); // stage this thread's values in its own padded row
118
119 CUDA::sync_threads(); // NOTE(review): presumably a block-wide barrier (__syncthreads); if so, every thread of the block must reach this call, including threads with iPnt >= iPntMax -- confirm
120 for (int i = 0; i < local_stride; i++)
121 {
122 int iComp = (i * bDim + tid); // flattened element index handled by this thread on pass i
123 int iPntInBlock = iComp / local_stride; // which staged row (i.e. which thread's point) the element belongs to
124 int iCompSub = iComp % local_stride; // component within that point
125 int iCompBuf = (local_stride == local_stride_buf) ? iComp : (iPntInBlock * local_stride_buf + iCompSub); // translate to the padded layout; identical to iComp when no padding was added
126 // int iComp = (i * bDim + tid) % local_stride;
127 index iPntC = iPntThread[iPntInBlock]; // global point index recorded by the thread that staged this row
128 if (iPntC < iPntMax) // skip rows whose point index is not below the bound
129 FGLobalAccessor(iPntC, iCompSub) = buf_data[iCompBuf];
130 }
131 }
132#endif
133}
CUDA helpers: driver/runtime error macros, device sync primitives, thrust-backed allocators,...
#define DNDS_DEVICE
Definition Defines.hpp:77
#define DNDS_DEVICE_TRIVIAL_COPY_DEFINE(T, T_Self)
Definition Defines.hpp:83
#define DNDS_FORCEINLINE
Definition Defines.hpp:978
Device memory abstraction layer with backend-specific storage and factory creation.
#define DNDS_HD_assert(cond)
Host-only expansion of DNDS_HD_assert (equivalent to DNDS_assert).
Definition Errors.hpp:189
Internal implementation detail namespace for EulerP Evaluator utilities.
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).
Definition Defines.hpp:107
double real
Canonical floating-point scalar used throughout DNDSR (double precision).
Definition Defines.hpp:105
No-op global accessor returning a dummy real value.
real dummy_
Discarded dummy value; all accesses read/write this.
No-op local accessor returning a dummy real value.
real dummy_
Discarded dummy value; all accesses read/write this.