16#include <unordered_set>
19# include <thrust/device_malloc_allocator.h>
20# include <thrust/device_malloc.h>
21# include <thrust/device_free.h>
22# include <thrust/copy.h>
23# include <thrust/host_vector.h>
24# include <thrust/device_vector.h>
25# include <Eigen/Dense>
// Flat 1-D global thread index for a 1-D launch. Each factor is widened to the
// 64-bit `index` type BEFORE the multiply, so grids with more than 2^31 total
// threads do not overflow 32-bit arithmetic (a classic silent CUDA bug).
# define DNDS_CUDA_1D_TID_GLOBAL_INDEX ((index)blockIdx.x * (index)blockDim.x + (index)threadIdx.x)
// Check a CUDA *runtime* API call. On failure, builds a diagnostic carrying the
// error string, numeric code, file:line, and the stringified expression, then
// reports it through the project's assert-style checker. Wrapped in
// do { } while (0) so the macro behaves as a single statement under if/else.
//
// NOTE(review): DNDS_check_throw_info is assert-style (throws when its condition
// is false, cf. DNDS_assert_info). Inside the error branch the condition must
// therefore be false; the previous code passed `_err != cudaSuccess`, which is
// true at that point and so the check could never fire. Fixed below — confirm
// against DNDS_check_throw_info's definition.
# define DNDS_CUDA_CHECKED(expr)                                         \
    do                                                                   \
    {                                                                    \
        cudaError_t _err = (expr);                                       \
        if (_err != cudaSuccess)                                         \
        {                                                                \
            std::stringstream ss;                                        \
            ss << "CUDA Error: " << cudaGetErrorString(_err)             \
               << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
               << " in " << #expr << std::endl;                          \
            DNDS_check_throw_info(_err == cudaSuccess, ss.str());        \
        }                                                                \
    } while (0)
// Check a CUDA *driver* API call (CUresult), mirroring DNDS_CUDA_CHECKED.
// On failure, resolves the error string via cuGetErrorString and reports
// through the project's assert-style checker.
//
// NOTE(review): two fixes vs. the reviewed copy — (1) the checker condition was
// `_err`, which is truthy inside the error branch, so an assert-style checker
// could never fire; (2) cuGetErrorString can itself fail and leave errStr
// unset, so guard against a null message pointer.
# define DNDS_CUDA_DRIVER_CHECKED(expr)                                  \
    do                                                                   \
    {                                                                    \
        CUresult _err = (expr);                                          \
        if (_err != CUDA_SUCCESS)                                        \
        {                                                                \
            const char *errStr = nullptr;                                \
            cuGetErrorString(_err, &errStr);                             \
            std::stringstream ss;                                        \
            ss << "CUDA Driver Error: "                                  \
               << (errStr ? errStr : "<unknown CUresult>")               \
               << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
               << " in " << #expr << std::endl;                          \
            DNDS_check_throw_info(_err == CUDA_SUCCESS, ss.str());       \
        }                                                                \
    } while (0)
75 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
85 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
96 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
101 template <
class T,
int N>
104 static_assert(
N >= 0);
106 template <
class TInd>
110 template <
class T,
int N>
114 __shared__ SharedBuffer<T, N> buf;
118 static SharedBuffer<T, N> buf;
// RAII owner of a single device-resident copy of a trivially-copyable host
// object. Allocates one T on the device at construction, uploads the host
// value, and frees the allocation on destruction.
template <
    typename T>
struct DeviceObject
{
    // A raw bitwise upload (cudaMemcpy) is only meaningful for trivially
    // copyable types — vtables/owning members would become dangling on device.
    static_assert(std::is_trivially_copyable_v<T>);
    thrust::device_ptr<T> dev;

    // Allocate device storage for one T and copy `host` into it.
    DeviceObject(
        const T &host)
    {
        dev = thrust::device_malloc<T>(1);
        DNDS_CUDA_CHECKED(cudaMemcpy(thrust::raw_pointer_cast(dev), &host,
                                     sizeof(T), cudaMemcpyHostToDevice));
    }

    // This type owns a raw device allocation: the implicitly generated copy
    // operations would lead to a double thrust::device_free. Delete them.
    // (Construction from a prvalue, as the DNDS_CUDA_DEVICE_VIEW_* macros do,
    // still works via guaranteed copy elision.)
    DeviceObject(const DeviceObject &) = delete;
    DeviceObject &operator=(const DeviceObject &) = delete;

    ~DeviceObject() { thrust::device_free(dev); }

    // Raw device pointer; valid only while this DeviceObject is alive.
    T *get() { return dev.get(); }
};
// Create a named, scope-lived device-side copy of `obj`:
// declares `obj_device_copy`, whose device allocation lives until the end of
// the enclosing scope; read the device pointer with obj_device_copy.get().
# define DNDS_CUDA_DEVICE_VIEW_COPY_OBJ(obj) \
    auto obj##_device_copy = ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj);
// Expression form: uploads `obj` into a TEMPORARY DeviceObject and yields its
// device pointer. The temporary is destroyed — and the device memory freed —
// at the end of the full expression.
// NOTE(review): if the consumer is an asynchronous kernel launch, the kernel
// may still be reading this allocation after it is freed; confirm that call
// sites either synchronize or only use this within synchronous calls.
# define DNDS_CUDA_DEVICE_VIEW_TMP_COPY(obj) \
    ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj).get()
// Compute a simple 1-D launch configuration: ceil(total_threads / threadsPerBlock)
// blocks of threadsPerBlock threads each; returns {blocksPerGrid, threadsPerBlock}.
// blocksPerGrid stays 0 when total_threads <= 0 or the block count would not
// fit a 32-bit grid dimension.
// NOTE(review): the interior of this function was garbled in the reviewed copy
// (`result` appeared without a declaration); the 64-bit ceil-division below is
// reconstructed from the surviving range check — confirm against history.
inline auto calckernelSizeSimple(index total_threads, uint32_t threadsPerBlock)
{
    uint32_t blocksPerGrid = 0;
    // Ceil division in 64 bits so large thread counts cannot overflow.
    index result = (total_threads + index(threadsPerBlock) - 1) / index(threadsPerBlock);
    if (
        result > 0 &&
        result <= std::numeric_limits<uint32_t>::max())
        blocksPerGrid = uint32_t(result);
    return std::make_tuple(blocksPerGrid, threadsPerBlock);
}
165 CudaEvent(
unsigned flags = cudaEventDisableTiming)
167 if (cudaEventCreateWithFlags(&ev, flags) != cudaSuccess)
168 throw std::runtime_error(
"Failed to create CUDA event");
171 ~CudaEvent() { DNDS_CUDA_CHECKED(cudaEventDestroy(ev)); }
173 void record(cudaStream_t stream = 0)
175 if (cudaEventRecord(ev, stream) != cudaSuccess)
176 throw std::runtime_error(
"Failed to record CUDA event");
181 if (cudaEventSynchronize(ev) != cudaSuccess)
182 throw std::runtime_error(
"Failed to synchronize CUDA event");
185 cudaEvent_t get() {
return ev; };
    // Events this stream has been told to wait on (see waitForEvent). Holding
    // shared ownership keeps each cudaEvent_t alive while the dependency may
    // still be pending; the set is cleared after the stream is synchronized.
    std::unordered_set<ssp<CudaEvent>> waiting_events;
194 CudaStream(
unsigned flags = cudaStreamNonBlocking)
196 if (cudaStreamCreateWithFlags(&stream, flags) != cudaSuccess)
197 throw std::runtime_error(
"Failed to create CUDA stream");
    // Accessor for a shared default CudaStream instance (defined out of line).
    static CudaStream &DefaultStream();
202 ~CudaStream() { DNDS_CUDA_CHECKED(cudaStreamDestroy(stream)); }
204 cudaStream_t get()
const {
return stream; }
216 cudaStreamSynchronize(stream);
217 waiting_events.clear();
220 void waitForEvent(
const ssp<CudaEvent> &e)
222 cudaStreamWaitEvent(stream, e->get());
223 waiting_events.insert(e);
226 void makeOtherStreamWait(CudaStream &s_other)
228 auto e = std::make_shared<CudaEvent>();
230 s_other.waitForEvent(e);
Array layout descriptors, non-owning views, row views, and iterator base.
Core type aliases, constants, and metaprogramming utilities for the DNDS framework.
#define DNDS_assert_info(expr, info)
Debug-only assertion with an extra std::string info message.
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).