DNDSR 0.1.0.dev1+gcd065ad
Distributed Numeric Data Structure for CFV
Loading...
Searching...
No Matches
CUDA_Utils.hpp
Go to the documentation of this file.
1#pragma once
2/// @file CUDA_Utils.hpp
3/// @brief CUDA helpers: driver/runtime error macros, device sync primitives,
4/// thrust-backed allocators, kernel launch utilities.
5///
6/// The whole file is gated on @ref DNDS_USE_CUDA; a no-op stub is exposed when
7/// CUDA is not compiled in so callers can include it unconditionally.
8
9#include "Defines.hpp"
10#include "ArrayBasic.hpp"
11#include <cstdint>
12#include <limits>
13#include <string>
14#include <tuple>
15#include <type_traits>
16#include <unordered_set>
17
18#ifdef DNDS_USE_CUDA
19# include <thrust/device_malloc_allocator.h>
20# include <thrust/device_malloc.h>
21# include <thrust/device_free.h>
22# include <thrust/copy.h>
23# include <thrust/host_vector.h>
24# include <thrust/device_vector.h>
25# include <Eigen/Dense>
26# include <sstream>
27# include <cuda.h>
28
29/// @brief Global 1D thread id for a CUDA kernel; expects `blockDim.x` blocks.
30# define DNDS_CUDA_1D_TID_GLOBAL_INDEX ((index)blockIdx.x * (index)blockDim.x + (index)threadIdx.x)
31
32/// @brief Evaluate a `cudaError_t`-returning expression and throw on failure.
33/// @details Uses @ref DNDS_check_throw_info so the failure site and CUDA error
34/// string are included in the thrown exception.
35# define DNDS_CUDA_CHECKED(expr) \
36 do \
37 { \
38 cudaError_t _err = (expr); \
39 if (_err != cudaSuccess) \
40 { \
41 std::stringstream ss; \
42 ss << "CUDA Error: " << cudaGetErrorString(_err) \
43 << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
44 << " in " << #expr << std::endl; \
45 DNDS_check_throw_info(_err != cudaSuccess, ss.str()); \
46 } \
47 } while (0)
48
49/// @brief Same as @ref DNDS_CUDA_CHECKED but for CUDA driver API (`CUresult`).
50# define DNDS_CUDA_DRIVER_CHECKED(expr) \
51 do \
52 { \
53 CUresult _err = (expr); \
54 if (_err) \
55 { \
56 const char *errStr; \
57 cuGetErrorString(_err, &errStr); \
58 std::stringstream ss; \
59 ss << "CUDA Driver Error: " << errStr \
60 << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
61 << " in " << #expr << std::endl; \
62 DNDS_check_throw_info(_err, ss.str()); \
63 } \
64 } while (0)
65
66namespace DNDS::CUDA
67{
68
69 template <int _ = 1>
70 DNDS_FORCEINLINE DNDS_DEVICE void sync_threads()
71 {
72# ifdef __CUDA_ARCH__
73 __syncthreads();
74# else
75 static_assert(_ == 0 && _ == 1, "not allowed");
76# endif
77 }
78
79 template <int _ = 1>
80 DNDS_FORCEINLINE DNDS_DEVICE uint32_t tid_x()
81 {
82# ifdef __CUDA_ARCH__
83 return threadIdx.x;
84# else
85 static_assert(_ == 0 && _ == 1, "not allowed");
86 return -1;
87# endif
88 }
89
90 template <int _ = 1>
91 DNDS_FORCEINLINE DNDS_DEVICE uint32_t bDim_x()
92 {
93# ifdef __CUDA_ARCH__
94 return blockDim.x;
95# else
96 static_assert(_ == 0 && _ == 1, "not allowed");
97 return -1;
98# endif
99 }
100
101 template <class T, int N>
102 struct SharedBuffer
103 {
104 static_assert(N >= 0);
105 T buffer[N];
106 template <class TInd>
107 DNDS_FORCEINLINE DNDS_DEVICE T &operator[](TInd &&i) { return buffer[i]; }
108 };
109
110 template <class T, int N>
111 DNDS_FORCEINLINE DNDS_DEVICE SharedBuffer<T, N> &DeclareSharedBuffer()
112 {
113# ifdef __CUDA_ARCH__
114 __shared__ SharedBuffer<T, N> buf;
115 return buf;
116# else
117 // static_assert(N == 0 && N == 1, "not allowed");
118 static SharedBuffer<T, N> buf;
119 return buf;
120# endif
121 }
122
123}
124
125namespace DNDS::CUDA
126{
127 template <typename T>
128 struct DeviceObject
129 {
130 static_assert(std::is_trivially_copyable_v<T>);
131 thrust::device_ptr<T> dev;
132 DeviceObject(const T &host)
133 {
134
135 dev = thrust::device_malloc<T>(1);
136 // cudaMemcpy(dev.get(), &host, sizeof(T), cudaMemcpyHostToDevice);
137 DNDS_CUDA_CHECKED(cudaMemcpy(thrust::raw_pointer_cast(dev), &host, sizeof(T), cudaMemcpyHostToDevice));
138 // thrust::copy(&host, (&host) + 1, dev);
139 }
140 ~DeviceObject() { thrust::device_free(dev); }
141 T *get() { return dev.get(); }
142 };
143
144# define DNDS_CUDA_DEVICE_VIEW_COPY_OBJ(obj) \
145 auto obj##_device_copy = ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj);
146# define DNDS_CUDA_DEVICE_VIEW_TMP_COPY(obj) \
147 ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj).get()
148
149 inline auto calckernelSizeSimple(index total_threads, uint32_t threadsPerBlock)
150 {
151 index result = index(total_threads + threadsPerBlock - 1) / index(threadsPerBlock);
152 uint32_t blocksPerGrid = 0;
153 if (result > 0 && result <= std::numeric_limits<uint32_t>::max())
154 blocksPerGrid = result;
155 else
156 DNDS_assert_info(false, "too many blocks: " + std::to_string(result));
157 return std::make_tuple(blocksPerGrid, threadsPerBlock);
158 }
159
160 class CudaEvent
161 {
162 public:
163 cudaEvent_t ev;
164
165 CudaEvent(unsigned flags = cudaEventDisableTiming)
166 {
167 if (cudaEventCreateWithFlags(&ev, flags) != cudaSuccess)
168 throw std::runtime_error("Failed to create CUDA event");
169 }
170
171 ~CudaEvent() { DNDS_CUDA_CHECKED(cudaEventDestroy(ev)); }
172
173 void record(cudaStream_t stream = 0)
174 {
175 if (cudaEventRecord(ev, stream) != cudaSuccess)
176 throw std::runtime_error("Failed to record CUDA event");
177 }
178
179 void sync() const
180 {
181 if (cudaEventSynchronize(ev) != cudaSuccess)
182 throw std::runtime_error("Failed to synchronize CUDA event");
183 }
184
185 cudaEvent_t get() { return ev; };
186 };
187
188 class CudaStream
189 {
190 cudaStream_t stream;
191 std::unordered_set<ssp<CudaEvent>> waiting_events;
192
193 public:
194 CudaStream(unsigned flags = cudaStreamNonBlocking)
195 {
196 if (cudaStreamCreateWithFlags(&stream, flags) != cudaSuccess)
197 throw std::runtime_error("Failed to create CUDA stream");
198 }
199
200 static CudaStream &DefaultStream();
201
202 ~CudaStream() { DNDS_CUDA_CHECKED(cudaStreamDestroy(stream)); }
203
204 cudaStream_t get() const { return stream; }
205
206 /// Wait for this stream to finish, using a temporary event
207 void wait() const
208 {
209 CudaEvent ev;
210 ev.record(stream);
211 ev.sync(); // blocks CPU until stream is done
212 }
213
214 void sync()
215 {
216 cudaStreamSynchronize(stream);
217 waiting_events.clear();
218 }
219
220 void waitForEvent(const ssp<CudaEvent> &e)
221 {
222 cudaStreamWaitEvent(stream, e->get());
223 waiting_events.insert(e);
224 }
225
226 void makeOtherStreamWait(CudaStream &s_other)
227 {
228 auto e = std::make_shared<CudaEvent>();
229 e->record(stream);
230 s_other.waitForEvent(e);
231 }
232 };
233}
234
235#endif
Array layout descriptors, non-owning views, row views, and iterator base.
Core type aliases, constants, and metaprogramming utilities for the DNDS framework.
#define DNDS_DEVICE
Definition Defines.hpp:77
#define DNDS_FORCEINLINE
Definition Defines.hpp:978
#define DNDS_assert_info(expr, info)
Debug-only assertion with an extra std::string info message.
Definition Errors.hpp:113
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).
Definition Defines.hpp:107
constexpr DNDS::index N
auto result