16#include <unordered_set>
19# include <thrust/device_malloc_allocator.h>
20# include <thrust/device_malloc.h>
21# include <thrust/device_free.h>
22# include <thrust/copy.h>
23# include <thrust/host_vector.h>
24# include <thrust/device_vector.h>
25# include <Eigen/Dense>
// Flat 1-D global thread index for a 1-D launch. Each factor is widened to the
// 64-bit `index` type BEFORE the multiply, so grids with more than 2^31 total
// threads do not overflow 32-bit arithmetic (a classic silent CUDA bug).
# define DNDS_CUDA_1D_TID_GLOBAL_INDEX ((index)blockIdx.x * (index)blockDim.x + (index)threadIdx.x)
// Check a CUDA *runtime* API call. On failure, builds a diagnostic carrying the
// error string, numeric code, file:line, and the stringified expression, then
// reports it through the project's assert-style checker. Wrapped in
// do { } while (0) so the macro behaves as a single statement under if/else.
//
// NOTE(review): DNDS_check_throw_info is assert-style (throws when its condition
// is false, cf. DNDS_assert_info). Inside the error branch the condition must
// therefore be false; the previous code passed `_err != cudaSuccess`, which is
// true at that point and so the check could never fire. Fixed below — confirm
// against DNDS_check_throw_info's definition.
# define DNDS_CUDA_CHECKED(expr)                                         \
    do                                                                   \
    {                                                                    \
        cudaError_t _err = (expr);                                       \
        if (_err != cudaSuccess)                                         \
        {                                                                \
            std::stringstream ss;                                        \
            ss << "CUDA Error: " << cudaGetErrorString(_err)             \
               << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
               << " in " << #expr << std::endl;                          \
            DNDS_check_throw_info(_err == cudaSuccess, ss.str());        \
        }                                                                \
    } while (0)
// Check a CUDA *driver* API call (CUresult), mirroring DNDS_CUDA_CHECKED.
// On failure, resolves the error string via cuGetErrorString and reports
// through the project's assert-style checker.
//
// NOTE(review): two fixes vs. the reviewed copy — (1) the checker condition was
// `_err`, which is truthy inside the error branch, so an assert-style checker
// could never fire; (2) cuGetErrorString can itself fail and leave errStr
// unset, so guard against a null message pointer.
# define DNDS_CUDA_DRIVER_CHECKED(expr)                                  \
    do                                                                   \
    {                                                                    \
        CUresult _err = (expr);                                          \
        if (_err != CUDA_SUCCESS)                                        \
        {                                                                \
            const char *errStr = nullptr;                                \
            cuGetErrorString(_err, &errStr);                             \
            std::stringstream ss;                                        \
            ss << "CUDA Driver Error: "                                  \
               << (errStr ? errStr : "<unknown CUresult>")               \
               << " (" << _err << ") at " << __FILE__ << ":" << __LINE__ \
               << " in " << #expr << std::endl;                          \
            DNDS_check_throw_info(_err == CUDA_SUCCESS, ss.str());       \
        }                                                                \
    } while (0)
75 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
85 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
96 static_assert(
_ == 0 &&
_ == 1,
"not allowed");
101 template <
class T,
int N>
104 static_assert(
N >= 0);
106 template <
class TInd>
110 template <
class T,
int N>
114 __shared__ SharedBuffer<T, N> buf;
118 static SharedBuffer<T, N> buf;
// RAII owner of a single device-resident copy of a trivially-copyable host
// object. Allocates one T on the device at construction, uploads the host
// value, and frees the allocation on destruction.
template <
    typename T>
struct DeviceObject
{
    // A raw bitwise upload (cudaMemcpy) is only meaningful for trivially
    // copyable types — vtables/owning members would become dangling on device.
    static_assert(std::is_trivially_copyable_v<T>);
    thrust::device_ptr<T> dev;

    // Allocate device storage for one T and copy `host` into it.
    DeviceObject(
        const T &host)
    {
        dev = thrust::device_malloc<T>(1);
        DNDS_CUDA_CHECKED(cudaMemcpy(thrust::raw_pointer_cast(dev), &host,
                                     sizeof(T), cudaMemcpyHostToDevice));
    }

    // This type owns a raw device allocation: the implicitly generated copy
    // operations would lead to a double thrust::device_free. Delete them.
    // (Construction from a prvalue, as the DNDS_CUDA_DEVICE_VIEW_* macros do,
    // still works via guaranteed copy elision.)
    DeviceObject(const DeviceObject &) = delete;
    DeviceObject &operator=(const DeviceObject &) = delete;

    ~DeviceObject() { thrust::device_free(dev); }

    // Raw device pointer; valid only while this DeviceObject is alive.
    T *get() { return dev.get(); }
};
// Create a named, scope-lived device-side copy of `obj`:
// declares `obj_device_copy`, whose device allocation lives until the end of
// the enclosing scope; read the device pointer with obj_device_copy.get().
# define DNDS_CUDA_DEVICE_VIEW_COPY_OBJ(obj) \
    auto obj##_device_copy = ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj);
// Expression form: uploads `obj` into a TEMPORARY DeviceObject and yields its
// device pointer. The temporary is destroyed — and the device memory freed —
// at the end of the full expression.
// NOTE(review): if the consumer is an asynchronous kernel launch, the kernel
// may still be reading this allocation after it is freed; confirm that call
// sites either synchronize or only use this within synchronous calls.
# define DNDS_CUDA_DEVICE_VIEW_TMP_COPY(obj) \
    ::DNDS::CUDA::DeviceObject<std::remove_cv_t<std::remove_reference_t<decltype(obj)>>>(obj).get()
// Compute a simple 1-D launch configuration: ceil(total_threads / threadsPerBlock)
// blocks of threadsPerBlock threads each; returns {blocksPerGrid, threadsPerBlock}.
// blocksPerGrid stays 0 when total_threads <= 0 or the block count would not
// fit a 32-bit grid dimension.
// NOTE(review): the interior of this function was garbled in the reviewed copy
// (`result` appeared without a declaration); the 64-bit ceil-division below is
// reconstructed from the surviving range check — confirm against history.
inline auto calckernelSizeSimple(index total_threads, uint32_t threadsPerBlock)
{
    uint32_t blocksPerGrid = 0;
    // Ceil division in 64 bits so large thread counts cannot overflow.
    index result = (total_threads + index(threadsPerBlock) - 1) / index(threadsPerBlock);
    if (
        result > 0 &&
        result <= std::numeric_limits<uint32_t>::max())
        blocksPerGrid = uint32_t(result);
    return std::make_tuple(blocksPerGrid, threadsPerBlock);
}
165 CudaEvent(
unsigned flags = cudaEventDisableTiming)
167 if (cudaEventCreateWithFlags(&ev, flags) != cudaSuccess)
168 throw std::runtime_error(
"Failed to create CUDA event");
171 ~CudaEvent() { DNDS_CUDA_CHECKED(cudaEventDestroy(ev)); }
173 void record(cudaStream_t stream = 0)
175 if (cudaEventRecord(ev, stream) != cudaSuccess)
176 throw std::runtime_error(
"Failed to record CUDA event");
181 if (cudaEventSynchronize(ev) != cudaSuccess)
182 throw std::runtime_error(
"Failed to synchronize CUDA event");
185 cudaEvent_t get() {
return ev; };
    // Events this stream has been told to wait on (see waitForEvent). Holding
    // shared ownership keeps each cudaEvent_t alive while the dependency may
    // still be pending; the set is cleared after the stream is synchronized.
    std::unordered_set<ssp<CudaEvent>> waiting_events;
194 CudaStream(
unsigned flags = cudaStreamNonBlocking)
196 if (cudaStreamCreateWithFlags(&stream, flags) != cudaSuccess)
197 throw std::runtime_error(
"Failed to create CUDA stream");
    // Accessor for a shared default CudaStream instance (defined out of line).
    static CudaStream &DefaultStream();
202 ~CudaStream() { DNDS_CUDA_CHECKED(cudaStreamDestroy(stream)); }
204 cudaStream_t get()
const {
return stream; }
216 cudaStreamSynchronize(stream);
217 waiting_events.clear();
220 void waitForEvent(
const ssp<CudaEvent> &e)
222 cudaStreamWaitEvent(stream, e->get());
223 waiting_events.insert(e);
226 void makeOtherStreamWait(CudaStream &s_other)
228 auto e = std::make_shared<CudaEvent>();
230 s_other.waitForEvent(e);
Array layout descriptors, non-owning views, row views, and iterator base.
Core type aliases, constants, and metaprogramming utilities for the DNDS framework.
#define DNDS_assert_info(expr, info)
Debug-only assertion with an extra std::string info message.
int64_t index
Global row / DOF index type (signed 64-bit; handles multi-billion-cell meshes).