87 template <
int local_stride_fixed,
int max_tid_fixed,
88 class TFLocalAccessor,
class TFGlobalAccessor,
89 int bufferSize_idx,
int bufferSize_val>
91 TFLocalAccessor &&FLocalAccessor,
92 TFGlobalAccessor &&FGLobalAccessor,
93 CUDA::SharedBuffer<index, bufferSize_idx> &shared_buf_idx,
94 CUDA::SharedBuffer<real, bufferSize_val> &shared_buf_val,
98 static_assert(local_stride_fixed > 0 && local_stride_fixed < 0);
100 static_assert(local_stride_fixed > 0);
101 static_assert(max_tid_fixed > 0);
102 static_assert(bufferSize_idx >= max_tid_fixed);
105 constexpr int local_stride = local_stride_fixed;
106 constexpr int local_stride_buf = (local_stride / 2) * 2 + 1;
107 static_assert(bufferSize_val >= local_stride_buf * max_tid_fixed);
109 real *buf_data = shared_buf_val.buffer;
110 index *iPntThread = shared_buf_idx.buffer;
112 int tid = CUDA::tid_x();
113 int bDim = CUDA::bDim_x();
115 iPntThread[tid] = iPnt;
116 for (
int i = 0; i < local_stride; i++)
117 buf_data[tid * local_stride_buf + i] = FLocalAccessor(i);
119 CUDA::sync_threads();
120 for (
int i = 0; i < local_stride; i++)
122 int iComp = (i * bDim + tid);
123 int iPntInBlock = iComp / local_stride;
124 int iCompSub = iComp % local_stride;
125 int iCompBuf = (local_stride == local_stride_buf) ? iComp : (iPntInBlock * local_stride_buf + iCompSub);
127 index iPntC = iPntThread[iPntInBlock];
129 FGLobalAccessor(iPntC, iCompSub) = buf_data[iCompBuf];